import warnings
from typing import Callable, List, Optional, Union

import torch

from ..dist_utils import master_only
from .hook import HOOKS, Hook


@HOOKS.register_module()
class ProfilerHook(Hook):
    """Profiler to analyze performance during training.

    PyTorch Profiler is a tool that collects performance metrics during
    training. More details on the Profiler can be found at
    https://pytorch.org/docs/1.8.1/profiler.html#torch.profiler.profile

    Args:
        by_epoch (bool): Profile performance by epoch or by iteration.
            Default: True.
        profile_iters (int): Number of epochs or iterations to profile.
            If ``by_epoch=True``, the first ``profile_iters`` epochs at the
            beginning of training are profiled; otherwise, the first
            ``profile_iters`` iterations are profiled. Default: 1.
        activities (list[str]): List of activity groups (CPU, CUDA) to use
            in profiling. Default: ['cpu', 'cuda'].
        schedule (dict, optional): Config for generating the callable
            schedule. If ``schedule`` is None, the profiler will not add
            step markers into the trace and table view. Default: None.
        on_trace_ready (callable, dict, optional): Either a handler callable
            or a dict config used to generate a handler. Default: None.
        record_shapes (bool): Save information about operators' input
            shapes. Default: False.
        profile_memory (bool): Track tensor memory allocation/deallocation.
            Default: False.
        with_stack (bool): Record source information (file and line number)
            for the ops. Default: False.
        with_flops (bool): Use a formula to estimate the FLOPs of specific
            operators (matrix multiplication and 2D convolution).
            Default: False.
        json_trace_path (str, optional): Path for exporting the collected
            trace in Chrome JSON format. Default: None.

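    Note:
        Traces exported via ``json_trace_path`` use the Chrome trace format
        and can be inspected by loading the file in chrome://tracing.
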
    Example:
        >>> runner = ...  # instantiate a Runner
        >>> # tensorboard trace
        >>> trace_config = dict(type='tb_trace', dir_name='work_dir')
        >>> profiler_config = dict(on_trace_ready=trace_config)
        >>> runner.register_profiler_hook(profiler_config)
        >>> runner.run(data_loaders=[trainloader], workflow=[('train', 1)])
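        >>> # Illustrative values only: ``schedule`` kwargs are forwarded to
        >>> # torch.profiler.schedule, and the remaining keys of a
        >>> # 'log_trace' config are forwarded to prof.key_averages().table()
        >>> profiler_config = dict(
        ...     schedule=dict(wait=1, warmup=1, active=2),
        ...     on_trace_ready=dict(type='log_trace',
        ...                         sort_by='self_cpu_time_total',
        ...                         row_limit=10))
        >>> runner.register_profiler_hook(profiler_config)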
| | """ |
| |
|
    def __init__(self,
                 by_epoch: bool = True,
                 profile_iters: int = 1,
                 activities: List[str] = ['cpu', 'cuda'],
                 schedule: Optional[dict] = None,
                 on_trace_ready: Optional[Union[Callable, dict]] = None,
                 record_shapes: bool = False,
                 profile_memory: bool = False,
                 with_stack: bool = False,
                 with_flops: bool = False,
                 json_trace_path: Optional[str] = None) -> None:
        try:
            from torch import profiler  # available since torch 1.8.1
        except ImportError:
            raise ImportError('profiler is a feature of torch>=1.8.1, '
                              f'but your version is {torch.__version__}')

        assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.'
        self.by_epoch = by_epoch

        if profile_iters < 1:
            raise ValueError('profile_iters should be greater than 0, but '
                             f'got {profile_iters}')
        self.profile_iters = profile_iters

        if not isinstance(activities, list):
            raise ValueError(
                f'activities should be a list, but got {type(activities)}')
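        # Map activity names onto torch.profiler.ProfilerActivity values.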
        self.activities = []
        for activity in activities:
            activity = activity.lower()
            if activity == 'cpu':
                self.activities.append(profiler.ProfilerActivity.CPU)
            elif activity == 'cuda':
                self.activities.append(profiler.ProfilerActivity.CUDA)
            else:
                raise ValueError(
                    f'activity should be "cpu" or "cuda", but got {activity}')

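        # ``schedule`` kwargs (e.g. wait/warmup/active) are forwarded
        # verbatim to torch.profiler.schedule.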
        if schedule is not None:
            self.schedule = profiler.schedule(**schedule)
        else:
            self.schedule = None

        self.on_trace_ready = on_trace_ready
        self.record_shapes = record_shapes
        self.profile_memory = profile_memory
        self.with_stack = with_stack
        self.with_flops = with_flops
        self.json_trace_path = json_trace_path

    @master_only
    def before_run(self, runner):
        if self.by_epoch and runner.max_epochs < self.profile_iters:
            raise ValueError(
                f'profile_iters ({self.profile_iters}) should not be '
                f'greater than max_epochs ({runner.max_epochs})')

        if not self.by_epoch and runner.max_iters < self.profile_iters:
            raise ValueError(
                f'profile_iters ({self.profile_iters}) should not be '
                f'greater than max_iters ({runner.max_iters})')

        if callable(self.on_trace_ready):  # handler given directly
            _on_trace_ready = self.on_trace_ready
        elif isinstance(self.on_trace_ready, dict):  # config of a handler
            trace_cfg = self.on_trace_ready.copy()
            trace_type = trace_cfg.pop('type')
            if trace_type == 'log_trace':
                # Print a summary table; the remaining keys of trace_cfg
                # are forwarded to ``prof.key_averages().table()``.

                def _log_handler(prof):
                    print(prof.key_averages().table(**trace_cfg))

                _on_trace_ready = _log_handler
            elif trace_type == 'tb_trace':  # tensorboard trace handler
                try:
                    import torch_tb_profiler  # noqa: F401
                except ImportError:
                    raise ImportError('please run "pip install '
                                      'torch-tb-profiler" to install '
                                      'torch_tb_profiler')
                _on_trace_ready = torch.profiler.tensorboard_trace_handler(
                    **trace_cfg)
            else:
                raise ValueError('trace_type should be "log_trace" or '
                                 f'"tb_trace", but got {trace_type}')
        elif self.on_trace_ready is None:
            _on_trace_ready = None
        else:
            raise ValueError('on_trace_ready should be a handler, a dict or '
                             f'None, but got {type(self.on_trace_ready)}')

        # ``runner.max_epochs`` is None for iteration-based runners, so only
        # check it when profiling by epoch.
        if self.by_epoch and runner.max_epochs > 1:
            warnings.warn(f'profiler will profile {runner.max_epochs} epochs '
                          'instead of 1 epoch. Since the profiler slows down '
                          'training, it is recommended to train for 1 epoch '
                          'with ProfilerHook, adjust your settings according '
                          'to the profiler summary, and disable the '
                          'ProfilerHook during normal training (epoch > 1).')

        self.profiler = torch.profiler.profile(
            activities=self.activities,
            schedule=self.schedule,
            on_trace_ready=_on_trace_ready,
            record_shapes=self.record_shapes,
            profile_memory=self.profile_memory,
            with_stack=self.with_stack,
            with_flops=self.with_flops)

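        # Enter the profiler context manually; it is exited in
        # after_train_epoch (by_epoch=True) or after_train_iter
        # (by_epoch=False) once profiling finishes.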
        self.profiler.__enter__()
        self._profiler_exited = False
        runner.logger.info('profiler is profiling...')

    @master_only
    def after_train_epoch(self, runner):
        if self.by_epoch and runner.epoch == self.profile_iters - 1:
            runner.logger.info('profiler may take a few minutes...')
            self.profiler.__exit__(None, None, None)
            self._profiler_exited = True
            if self.json_trace_path is not None:
                self.profiler.export_chrome_trace(self.json_trace_path)

    @master_only
    def after_train_iter(self, runner):
        if self._profiler_exited:
            # Profiling has finished; do not step a stopped profiler.
            return
        self.profiler.step()
        if not self.by_epoch and runner.iter == self.profile_iters - 1:
            runner.logger.info('profiler may take a few minutes...')
            self.profiler.__exit__(None, None, None)
            self._profiler_exited = True
            if self.json_trace_path is not None:
                self.profiler.export_chrome_trace(self.json_trace_path)