Shortcuts

Source code for mmdet.apis.train

# Copyright (c) OpenMMLab. All rights reserved.
import os
import random

import numpy as np
import torch
import torch.distributed as dist
from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner,
                         Fp16OptimizerHook, OptimizerHook, build_optimizer,
                         build_runner, get_dist_info)

from mmdet.core import DistEvalHook, EvalHook
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.utils import (build_ddp, build_dp, compat_cfg,
                         find_latest_checkpoint, get_root_logger)


[docs]def init_random_seed(seed=None, device='cuda'): """Initialize random seed. If the seed is not set, the seed will be automatically randomized, and then broadcast to all processes to prevent some potential bugs. Args: seed (int, Optional): The seed. Default to None. device (str): The device where the seed will be put on. Default to 'cuda'. Returns: int: Seed to be used. """ if seed is not None: return seed # Make sure all ranks share the same random seed to prevent # some potential bugs. Please refer to # https://github.com/open-mmlab/mmdetection/issues/6339 rank, world_size = get_dist_info() seed = np.random.randint(2**31) if world_size == 1: return seed if rank == 0: random_num = torch.tensor(seed, dtype=torch.int32, device=device) else: random_num = torch.tensor(0, dtype=torch.int32, device=device) dist.broadcast(random_num, src=0) return random_num.item()
[docs]def set_random_seed(seed, deterministic=False): """Set random seed. Args: seed (int): Seed to be used. deterministic (bool): Whether to set the deterministic option for CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` to True and `torch.backends.cudnn.benchmark` to False. Default: False. """ random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) if deterministic: torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False
def auto_scale_lr(cfg, distributed, logger): """Automatically scaling LR according to GPU number and sample per GPU. Args: cfg (config): Training config. distributed (bool): Using distributed or not. logger (logging.Logger): Logger. """ # Get flag from config if ('auto_scale_lr' not in cfg) or \ (not cfg.auto_scale_lr.get('enable', False)): logger.info('Automatic scaling of learning rate (LR)' ' has been disabled.') return # Get base batch size from config base_batch_size = cfg.auto_scale_lr.get('base_batch_size', None) if base_batch_size is None: return # Get gpu number if distributed: _, world_size = get_dist_info() num_gpus = len(range(world_size)) else: num_gpus = len(cfg.gpu_ids) # calculate the batch size samples_per_gpu = cfg.data.train_dataloader.samples_per_gpu batch_size = num_gpus * samples_per_gpu logger.info(f'Training with {num_gpus} GPU(s) with {samples_per_gpu} ' f'samples per GPU. The total batch size is {batch_size}.') if batch_size != base_batch_size: # scale LR with # [linear scaling rule](https://arxiv.org/abs/1706.02677) scaled_lr = (batch_size / base_batch_size) * cfg.optimizer.lr logger.info('LR has been automatically scaled ' f'from {cfg.optimizer.lr} to {scaled_lr}') cfg.optimizer.lr = scaled_lr else: logger.info('The batch size match the ' f'base batch size: {base_batch_size}, ' f'will not scaling the LR ({cfg.optimizer.lr}).') def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): cfg = compat_cfg(cfg) logger = get_root_logger(log_level=cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] runner_type = 'EpochBasedRunner' if 'runner' not in cfg else cfg.runner[ 'type'] train_dataloader_default_args = dict( samples_per_gpu=2, workers_per_gpu=2, # `num_gpus` will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, runner_type=runner_type, persistent_workers=False) train_loader_cfg = { **train_dataloader_default_args, **cfg.data.get('train_dataloader', {}) } data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = build_ddp( model, cfg.device, device_ids=[int(os.environ['LOCAL_RANK'])], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids) # build optimizer auto_scale_lr(cfg, distributed, logger) optimizer = build_optimizer(model, cfg.optimizer) runner = build_runner( cfg.runner, default_args=dict( model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks( cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None), custom_hooks_config=cfg.get('custom_hooks', None)) if distributed: if isinstance(runner, EpochBasedRunner): runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataloader_default_args = dict( samples_per_gpu=1, workers_per_gpu=2, dist=distributed, shuffle=False, persistent_workers=False) val_dataloader_args = { **val_dataloader_default_args, **cfg.data.get('val_dataloader', {}) } # Support batch_size > 1 in validation if val_dataloader_args['samples_per_gpu'] > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader(val_dataset, **val_dataloader_args) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'. runner.register_hook( eval_hook(val_dataloader, **eval_cfg), priority='LOW') resume_from = None if cfg.resume_from is None and cfg.get('auto_resume'): resume_from = find_latest_checkpoint(cfg.work_dir) if resume_from is not None: cfg.resume_from = resume_from if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow)
Read the Docs v: v2.24.1
Versions
latest
stable
v2.24.1
v2.24.0
v2.23.0
v2.22.0
v2.21.0
v2.20.0
v2.19.1
v2.19.0
v2.18.1
v2.18.0
v2.17.0
v2.16.0
v2.15.1
v2.15.0
v2.14.0
v2.13.0
v2.12.0
v2.11.0
v2.10.0
v2.9.0
v2.8.0
v2.7.0
v2.6.0
v2.5.0
v2.4.0
v2.3.0
v2.2.1
v2.2.0
v2.1.0
v2.0.0
v1.2.0
dev
Downloads
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.