from typing import Any, Dict, List, Optional
from warnings import warn

import torch
import torch.cuda
from torch._C._profiler import _ExperimentalConfig

from torch.autograd import (
    _disable_profiler,
    _enable_profiler,
    _kineto_step,
    _prepare_profiler,
    _ProfilerResult,
    _supported_activities,
    DeviceType,
    kineto_available,
    ProfilerActivity,
    ProfilerConfig,
    ProfilerState,
)
from torch.autograd.profiler_util import (
    _filter_name,
    _filter_stack_entry,
    _rewrite_name,
    EventList,
    FunctionEvent,
    MEMORY_EVENT_NAME,
    MemRecordsAcc,
    OUT_OF_MEMORY_EVENT_NAME,
)
from torch.futures import Future


try:
    # ContextDecorator is available in the standard library since Python 3.2;
    # fall back to a minimal shim on older interpreters.
    from contextlib import ContextDecorator
except ImportError:
    import functools

    class ContextDecorator(object):

        def __enter__(self):
            raise NotImplementedError

        def __exit__(self, exc_type, exc_val, exc_tb):
            raise NotImplementedError

        def __call__(self, func):
            @functools.wraps(func)
            def wrapped(*args, **kwargs):
                with self:
                    return func(*args, **kwargs)

            return wrapped
|
| |
|
| | class profile(object): |
| | """Context manager that manages autograd profiler state and holds a summary of results. |
| | Under the hood it just records events of functions being executed in C++ and |
| | exposes those events to Python. You can wrap any code into it and it will |
| | only report runtime of PyTorch functions. |
| | Note: profiler is thread local and is automatically propagated into the async tasks |
| | |
| | Args: |
| | enabled (bool, optional): Setting this to False makes this context manager a no-op. |
| | |
| | use_cuda (bool, optional): Enables timing of CUDA events as well using the cudaEvent API. |
| | Adds approximately 4us of overhead to each tensor operation. |
| | |
| | record_shapes (bool, optional): If shapes recording is set, information |
| | about input dimensions will be collected. This allows one to see which |
| | dimensions have been used under the hood and further group by them |
| | using prof.key_averages(group_by_input_shape=True). Please note that |
| | shape recording might skew your profiling data. It is recommended to |
| | use separate runs with and without shape recording to validate the timing. |
| | Most likely the skew will be negligible for bottom most events (in a case |
| | of nested function calls). But for higher level functions the total |
| | self cpu time might be artificially increased because of the shape |
| | collection. |
| | |
| | with_flops (bool, optional): If with_flops is set, the profiler will estimate |
| | the FLOPs (floating point operations) value using the operator's input shape. |
| | This allows one to estimate the hardware performance. Currently, |
| | this option only works for the matrix multiplication and 2D convolution operators. |
| | |
| | profile_memory (bool, optional): track tensor memory allocation/deallocation. |
| | |
| | with_stack (bool, optional): record source information (file and line number) for the ops. |
| | |
| | with_modules (bool): record module hierarchy (including function names) |
| | corresponding to the callstack of the op. e.g. If module A's forward call's |
| | module B's forward which contains an aten::add op, |
| | then aten::add's module hierarchy is A.B |
| | Note that this support exist, at the moment, only for TorchScript models |
| | and not eager mode models. |
| | |
| | use_kineto (bool, optional): experimental, enable profiling with Kineto profiler. |
| | |
| | use_cpu (bool, optional): profile CPU events; setting to ``False`` requires |
| | ``use_kineto=True`` and can be used to lower the overhead for GPU-only profiling. |
| | |
| | experimental_config (_ExperimentalConfig) : A set of experimental options |
| | used by profiler libraries like Kineto. Note, backward compatibility is not guaranteed. |
| | |
| | |
| | .. warning: |
| | Enabling memory profiling or source attribution incurs additional profiler |
| | overhead |
| | |
| | .. warning: |
| | This context managers should not be called recursively, i.e. no nested |
| | instances are allowed |
| | |
| | .. warning: |
| | Due to some CUDA multiprocessing limitations (multiprocessing-cuda-note_), |
| | one cannot use the profiler with ``use_cuda = True`` to benchmark |
| | DataLoaders with ``num_workers > 0``. If you wish to benchmark data loading, |
| | please use ``use_cuda = False`` or ``num_workers = 0``. |
| | |
| | Example: |
| | >>> # xdoctest: +SKIP |
| | >>> x = torch.randn((1, 1), requires_grad=True) |
| | >>> with torch.autograd.profiler.profile() as prof: |
| | >>> for _ in range(100): # any normal python code, really! |
| | >>> y = x ** 2 |
| | >>> y.backward() |
| | >>> # NOTE: some columns were removed for brevity |
| | >>> print(prof.key_averages().table(sort_by="self_cpu_time_total")) |
| | ----------------------------------- --------------- --------------- --------------- |
| | Name Self CPU total CPU time avg Number of Calls |
| | ----------------------------------- --------------- --------------- --------------- |
| | mul 32.048ms 32.048ms 200 |
| | pow 27.041ms 27.041ms 200 |
| | PowBackward0 9.727ms 55.483ms 100 |
| | torch::autograd::AccumulateGrad 9.148ms 9.148ms 100 |
| | torch::autograd::GraphRoot 691.816us 691.816us 100 |
| | ----------------------------------- --------------- --------------- --------------- |
| | |
| | """ |
    def __init__(
            self,
            enabled=True,
            *,
            use_cuda=False,
            record_shapes=False,
            with_flops=False,
            profile_memory=False,
            with_stack=False,
            with_modules=False,
            use_kineto=False,
            use_cpu=True,
            experimental_config=None):
        self.enabled: bool = enabled
        if not self.enabled:
            return
        self.use_cuda = use_cuda
        self.function_events: Optional[EventList] = None
        self.entered = False
        self.record_shapes = record_shapes
        self.with_flops = with_flops
        self.record_shapes |= self.with_flops
        self.profile_memory = profile_memory
        self.with_stack = with_stack
        self.with_modules = with_modules
        self.use_cpu = use_cpu
        if experimental_config is None:
            experimental_config = _ExperimentalConfig()
        self.experimental_config = experimental_config
        self.kineto_results: Optional[_ProfilerResult] = None

        if not self.use_cpu:
            assert use_kineto, \
                "Device-only events supported only with Kineto (use_kineto=True)"

        if self.use_cuda and not torch.cuda.is_available():
            warn("CUDA is not available, disabling CUDA profiling")
            self.use_cuda = False

        self.kineto_activities = set()
        if self.use_cpu:
            self.kineto_activities.add(ProfilerActivity.CPU)

        self.profiler_kind = ProfilerState.KINETO
        if self.use_cuda:
            if (not use_kineto or ProfilerActivity.CUDA not in
                    _supported_activities()):
                # fall back to legacy cudaEvent-based CUDA profiling
                assert self.use_cpu, "Legacy CUDA profiling requires use_cpu=True"
                self.profiler_kind = ProfilerState.KINETO_GPU_FALLBACK
            else:
                self.kineto_activities.add(ProfilerActivity.CUDA)

        assert len(self.kineto_activities) > 0, \
            "No activities specified for the profiler"

    def config(self):
        return ProfilerConfig(
            self.profiler_kind,
            self.record_shapes,
            self.profile_memory,
            self.with_stack,
            self.with_flops,
            self.with_modules,
            self.experimental_config)

    def __enter__(self):
        if not self.enabled:
            return
        if self.entered:
            raise RuntimeError("Profiler context manager is not reentrant")
        self._prepare_trace()
        self._start_trace()
        return self

    def _prepare_trace(self):
        self.entered = True
        _prepare_profiler(self.config(), self.kineto_activities)

    def _start_trace(self):
        self.entered = True
        _enable_profiler(self.config(), self.kineto_activities)

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not self.enabled:
            return
        if self.use_cuda:
            torch.cuda.synchronize()
        self.kineto_results = _disable_profiler()
        parsed_results = self._parse_kineto_results(self.kineto_results)
        self.function_events = EventList(
            parsed_results,
            use_cuda=self.use_cuda,
            profile_memory=self.profile_memory,
            with_flops=self.with_flops)
        self.function_events._build_tree()
        return False

    def __repr__(self):
        if self.function_events is None:
            return '<unfinished torch.autograd.profile>'
        return repr(self.function_events)

    def __str__(self):
        if self.function_events is None:
            return '<unfinished torch.autograd.profile>'
        return str(self.function_events)

    def _check_finish(self):
        if self.function_events is None:
            raise RuntimeError("Profiler didn't finish running")

    def table(
            self,
            sort_by=None,
            row_limit=100,
            max_src_column_width=75,
            max_name_column_width=55,
            max_shapes_column_width=80,
            header=None,
            top_level_events_only=False
    ):
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.table(
            sort_by=sort_by,
            row_limit=row_limit,
            max_src_column_width=max_src_column_width,
            max_name_column_width=max_name_column_width,
            max_shapes_column_width=max_shapes_column_width,
            header=header,
            top_level_events_only=top_level_events_only
        )
    table.__doc__ = EventList.table.__doc__

    def export_chrome_trace(self, path):
        self._check_finish()
        if kineto_available():
            self.kineto_results.save(path)
        else:
            return self.function_events.export_chrome_trace(path)
    export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__

    def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        assert self.with_stack, "export_stacks() requires with_stack=True"
        return self.function_events.export_stacks(path, metric)
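
    # Usage sketch (illustrative, not part of the API): the exported file can be
    # fed to a flamegraph tool such as Brendan Gregg's flamegraph.pl to visualize
    # where self CPU time is spent; "profiler.stacks" is a hypothetical path.
    #
    #   with torch.autograd.profiler.profile(with_stack=True) as prof:
    #       model(inputs)
    #   prof.export_stacks("profiler.stacks", metric="self_cpu_time_total")
    #   # flamegraph.pl --title "CPU stacks" --countname us. profiler.stacks > perf.svg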

    def key_averages(self, group_by_input_shape=False, group_by_stack_n=0):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        return self.function_events.key_averages(group_by_input_shape, group_by_stack_n)
    key_averages.__doc__ = EventList.key_averages.__doc__

    def total_average(self):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        return self.function_events.total_average()
    total_average.__doc__ = EventList.total_average.__doc__

    @property
    def self_cpu_time_total(self):
        """Returns total time spent on CPU, obtained as a sum of
        all self times across all the events.
        """
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.self_cpu_time_total

    def _parse_kineto_results(self, result):
        # result.events() contains both PyTorch op-level and device-level events
        trace_start_us = result.trace_start_us()
        mem_records = [[evt, False] for evt in result.events() if evt.name() == MEMORY_EVENT_NAME]
        oom_records = [evt for evt in result.events() if evt.name() == OUT_OF_MEMORY_EVENT_NAME]
        mem_records_acc = MemRecordsAcc(mem_records)

        def _cpu_memory_usage(mem_record):
            return mem_record.nbytes() if \
                mem_record.device_type() in [DeviceType.CPU, DeviceType.MKLDNN, DeviceType.IDEEP] \
                else 0

        def _cuda_memory_usage(mem_record):
            return mem_record.nbytes() if \
                mem_record.device_type() in [DeviceType.CUDA, DeviceType.HIP] \
                else 0

        # Create the FunctionEvent list
        function_events = []
        cuda_corr_map: Dict[int, List[FunctionEvent]] = {}
        max_evt_id = 0
        for kineto_event in result.events():
            if _filter_name(kineto_event.name()):
                continue
            rel_start_us = kineto_event.start_us() - trace_start_us
            rel_end_us = rel_start_us + kineto_event.duration_us()
            abs_end_us = kineto_event.start_us() + kineto_event.duration_us()

            cpu_memory_usage = 0
            cuda_memory_usage = 0
            if kineto_event.device_type() == DeviceType.CPU:
                # find the corresponding memory allocation events
                for mem_record in mem_records_acc.in_interval(kineto_event.start_us(), abs_end_us):
                    cpu_memory_usage += _cpu_memory_usage(mem_record[0])
                    cuda_memory_usage += _cuda_memory_usage(mem_record[0])
                    mem_record[1] = True

            is_async = kineto_event.is_async() or (
                kineto_event.start_thread_id() != kineto_event.end_thread_id()
            )

            fe = FunctionEvent(
                id=kineto_event.correlation_id(),
                name=_rewrite_name(name=kineto_event.name(), with_wildcard=True),
                trace_name=_rewrite_name(name=kineto_event.name(), with_wildcard=False),
                thread=kineto_event.start_thread_id(),
                start_us=rel_start_us,
                end_us=rel_end_us,
                fwd_thread=kineto_event.fwd_thread_id(),
                input_shapes=kineto_event.shapes(),
                stack=[entry for entry in kineto_event.stack() if _filter_stack_entry(entry)],
                scope=kineto_event.scope(),
                cpu_memory_usage=cpu_memory_usage,
                cuda_memory_usage=cuda_memory_usage,
                is_async=is_async,
                sequence_nr=kineto_event.sequence_nr(),
                device_type=kineto_event.device_type(),
                device_index=kineto_event.device_index(),
                flops=kineto_event.flops(),
            )
            max_evt_id = fe.id if fe.id > max_evt_id else max_evt_id
            if fe.device_type == DeviceType.CPU and not fe.is_async:
                # Check if we have CUDA time as a fallback
                cuda_time = kineto_event.cuda_elapsed_us()
                if cuda_time > 0:
                    fe.append_kernel(
                        fe.name,
                        fe.device_index,
                        cuda_time)
                    fe.is_legacy = True
            function_events.append(fe)
            corr_id = kineto_event.linked_correlation_id()
            if corr_id > 0:
                if corr_id not in cuda_corr_map:
                    cuda_corr_map[corr_id] = []
                cuda_corr_map[corr_id].append(fe)

        # associate CUDA kernels and CUDA runtime (CPU) events with CPU events
        for fe in function_events:
            if (fe.device_type == DeviceType.CPU and not fe.is_async and
                    fe.id in cuda_corr_map):
                for f_evt in cuda_corr_map[fe.id]:
                    if f_evt.device_type == DeviceType.CUDA:
                        fe.append_kernel(
                            f_evt.name,
                            f_evt.device_index,
                            f_evt.time_range.end - f_evt.time_range.start)
                    elif f_evt.device_type == DeviceType.CPU:
                        # make sure that 'thread' of a CPU Kineto (e.g. CUDA Runtime) event is
                        # associated with the 'thread' of the corresponding linked PyTorch event
                        # to properly track parents and children
                        f_evt.thread = fe.thread

        def createFunctionEventForMemoryEvents(evt):
            rel_start_us = evt.start_us() - trace_start_us
            fe = FunctionEvent(
                id=max_evt_id,
                name=evt.name(),
                trace_name=None,  # not outputting in the trace
                thread=evt.start_thread_id(),
                start_us=rel_start_us,
                end_us=rel_start_us,  # no duration
                fwd_thread=evt.start_thread_id(),
                input_shapes=[],
                stack=[],
                scope=0,
                cpu_memory_usage=_cpu_memory_usage(evt),
                cuda_memory_usage=_cuda_memory_usage(evt),
                is_async=False,
                sequence_nr=-1,
                device_type=DeviceType.CPU,
                device_index=0,
            )
            return fe

        # output top-level memory events
        for mem_record in mem_records:
            if not mem_record[1]:
                max_evt_id += 1
                fe = createFunctionEventForMemoryEvents(mem_record[0])
                function_events.append(fe)

        for oom_record in oom_records:
            max_evt_id += 1
            fe = createFunctionEventForMemoryEvents(oom_record)
            function_events.append(fe)

        function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end])
        return function_events


class record_function(ContextDecorator):
    """Context manager/function decorator that adds a label to a block of
    Python code (or function) when running the autograd profiler. It is
    useful when tracing the code profile.

    Args:
        name (str): Label assigned to the block of code.
        node_id (int): ID of node, for distributed profiling. Unset in
            non-distributed cases.

    Example:
        >>> x = torch.randn((1, 1), requires_grad=True)
        >>> with torch.autograd.profiler.profile() as prof:
        ...     y = x ** 2
        ...     with torch.autograd.profiler.record_function("label-z"):  # label the block
        ...         z = y ** 3
        ...     y.backward()
        ...
        >>> # xdoctest: +IGNORE_WANT
        >>> # NOTE: some columns were removed for brevity
        >>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
        -----------------------------------  ----------------  ---------------  ---------------
        Name                                 Self CPU total %  CPU time avg     Number of Calls
        -----------------------------------  ----------------  ---------------  ---------------
        pow                                  60.77%            47.470us         3
        mul                                  21.73%            25.465us         2
        PowBackward0                         12.03%            121.891us        1
        torch::autograd::AccumulateGrad      2.70%             6.324us          1
        label-z                              2.13%             12.421us         1
        torch::autograd::GraphRoot           0.64%             1.503us          1
        -----------------------------------  ----------------  ---------------  ---------------
        Self CPU time total: 234.344us
        CUDA time total: 0.000us

    """
    def __init__(self, name: str, args: Optional[str] = None):
        self.name: str = name
        self.args: Optional[str] = args
        # Whether or not we should run record function's end callbacks when exiting.
        self.run_callbacks_on_exit: bool = True
        # Stores the underlying RecordFunction handle as a tensor.
        # TODO: move to a custom class.
        self.handle: torch.Tensor = torch.zeros(1)

    def __enter__(self):
        self.handle = torch.ops.profiler._record_function_enter(self.name, self.args)
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any):
        if self.run_callbacks_on_exit:
            torch.ops.profiler._record_function_exit(self.handle)

    def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]:
        """
        _call_end_callbacks_on_future is meant to be used for profiling async
        calls that return a future. Calling this function will extend recording
        beyond this scope, until the future is satisfied. It is useful for profiling
        the end-to-end time of asynchronous calls. This function should only be called
        once to attach the callback onto the future, and will throw if called multiple
        times.

        Args:
            fut (torch._C.Future): future for which to schedule a
                callback.

        Returns:
            A future that completes with the value of the passed-in future when
            the profiling callbacks have run.

        """
        # Throw if we have already attached a callback onto the future.
        if not self.run_callbacks_on_exit:
            raise RuntimeError("_call_end_callbacks_on_future can only be called once.")

        # We are scheduling to run this RecordFunction's end callbacks when the
        # future completes, so don't run the end callbacks on exit.
        self.run_callbacks_on_exit = False
        profiled_future = torch.ops.profiler._call_end_callbacks_on_jit_fut(self.handle, fut)
        return profiled_future
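
    # Usage sketch (illustrative): profiling the end-to-end time of an async
    # task. `slow_fn` is a hypothetical TorchScript-able function; torch.jit.fork
    # returns a future, and the "async-block" range is extended until that
    # future completes rather than ending when the `with` block exits.
    #
    #   with record_function("async-block") as rf:
    #       fut = torch.jit.fork(slow_fn, x)
    #       fut = rf._call_end_callbacks_on_future(fut)
    #   result = fut.wait()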


class emit_itt(object):
    """Context manager that makes every autograd operation emit an ITT range.

    It is useful when running the program under Intel(R) VTune Profiler::

        vtune <--vtune_flags> <regular command here>

    The Instrumentation and Tracing Technology (ITT) API enables your application to generate and
    control the collection of trace data during its execution across different Intel tools.
    This context manager annotates the Intel(R) VTune Profiler trace. With the help of this
    context manager, you will be able to see labeled ranges in the Intel(R) VTune Profiler GUI.

    .. warning::
        This context manager should not be entered recursively, i.e. at most one
        instance should be enabled at any given time.

    Args:
        enabled (bool, optional): Setting ``enabled=False`` makes this context manager a no-op.
            Default: ``True``.
        record_shapes (bool, optional): If ``record_shapes=True``, the itt range wrapping
            each autograd op will append information about the sizes of Tensor arguments received
            by that op, in the following format:
            ``[[arg0.size(0), arg0.size(1), ...], [arg1.size(0), arg1.size(1), ...], ...]``
            Non-tensor arguments will be represented by ``[]``.
            Arguments will be listed in the order they are received by the backend op.
            Please note that this order may not match the order in which those arguments were passed
            on the Python side. Also note that shape recording may increase the overhead of itt range creation.
            Default: ``False``

    Example:
        >>> # xdoctest: +SKIP("Undefined variables")
        >>> with torch.autograd.profiler.emit_itt():
        ...     model(x)

    """
    def __init__(self, enabled=True, record_shapes=False):
        self.enabled = enabled
        self.entered = False
        self.record_shapes = record_shapes

    def __enter__(self):
        if not self.enabled:
            return
        if self.entered:
            raise RuntimeError("ITT annotation context manager is not reentrant")
        self.entered = True
        _enable_profiler(
            ProfilerConfig(
                ProfilerState.ITT,
                self.record_shapes,
                False,
                False,
                False,
                False,
                _ExperimentalConfig()),
            set()
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not self.enabled:
            return
        _disable_profiler()
        return False


class emit_nvtx(object):
    """Context manager that makes every autograd operation emit an NVTX range.

    It is useful when running the program under nvprof::

        nvprof --profile-from-start off -o trace_name.prof -- <regular command here>

    Unfortunately, there's no way to force nvprof to flush the data it collected
    to disk, so for CUDA profiling one has to use this context manager to annotate
    nvprof traces and wait for the process to exit before inspecting them.
    Then, either NVIDIA Visual Profiler (nvvp) can be used to visualize the timeline, or
    :func:`torch.autograd.profiler.load_nvprof` can load the results for inspection,
    e.g. in a Python REPL.

    .. warning::
        This context manager should not be entered recursively, i.e. at most one
        instance should be enabled at any given time.

    Args:
        enabled (bool, optional): Setting ``enabled=False`` makes this context manager a no-op.
            Default: ``True``.
        record_shapes (bool, optional): If ``record_shapes=True``, the nvtx range wrapping
            each autograd op will append information about the sizes of Tensor arguments received
            by that op, in the following format:
            ``[[arg0.size(0), arg0.size(1), ...], [arg1.size(0), arg1.size(1), ...], ...]``
            Non-tensor arguments will be represented by ``[]``.
            Arguments will be listed in the order they are received by the backend op.
            Please note that this order may not match the order in which those arguments were passed
            on the Python side. Also note that shape recording may increase the overhead of nvtx range creation.
            Default: ``False``

    Example:
        >>> # xdoctest: +SKIP("undefined variables")
        >>> with torch.cuda.profiler.profile():
        ...     model(x)  # Warmup CUDA memory allocator and profiler
        ...     with torch.autograd.profiler.emit_nvtx():
        ...         model(x)

    **Forward-backward correlation**

    When viewing a profile created using :class:`emit_nvtx` in the Nvidia Visual Profiler,
    correlating each backward-pass op with the corresponding forward-pass op can be difficult.
    To ease this task, :class:`emit_nvtx` appends sequence number information to the ranges it
    generates.

    During the forward pass, each function range is decorated with ``seq=<N>``. ``seq`` is a running
    counter, incremented each time a new backward Function object is created and stashed for backward.
    Thus, the ``seq=<N>`` annotation associated with each forward function range tells you that
    if a backward Function object is created by this forward function,
    the backward object will receive sequence number N.
    During the backward pass, the top-level range wrapping each C++ backward Function's
    ``apply()`` call is decorated with ``stashed seq=<M>``. ``M`` is the sequence number that
    the backward object was created with. By comparing ``stashed seq`` numbers in backward with ``seq``
    numbers in forward, you can track down which forward op created each backward Function.

    Any functions executed during the backward pass are also decorated with ``seq=<N>``. During
    default backward (with ``create_graph=False``) this information is irrelevant, and in fact,
    ``N`` may simply be 0 for all such functions. Only the top-level ranges associated with
    backward Function objects' ``apply()`` methods are useful, as a way to correlate these Function
    objects with the earlier forward pass.

    **Double-backward**

    If, on the other hand, a backward pass with ``create_graph=True`` is underway (in other words,
    if you are setting up for a double-backward), each function's execution during backward
    is given a nonzero, useful ``seq=<N>``. Those functions may themselves create Function objects
    to be executed later during double-backward, just as the original functions in the forward pass did.
    The relationship between backward and double-backward is conceptually the same as the relationship
    between forward and backward: the functions still emit current-sequence-number-tagged ranges,
    the Function objects they create still stash those sequence numbers, and during the eventual
    double-backward, the Function objects' ``apply()`` ranges are still tagged with ``stashed seq``
    numbers, which can be compared to ``seq`` numbers from the backward pass.

    .. warning::
        The sequence number is thread-local, and some forward functions don't create an associated
        backward Function object (instead delegating that to sub-functions further down the call chain).
        For these reasons, the correspondence of stashed sequence numbers in
        backward Function ``apply()`` ranges with ``seq`` numbers in forward-pass ranges is
        not guaranteed to be 1 to 1. The sequence numbers alone may not be enough to fully
        disambiguate which forward function created which
        backward Function object. You may need to make a judgment based on analytic knowledge of what
        the expected correspondence should be.
    """
    def __init__(self, enabled=True, record_shapes=False):
        self.enabled = enabled
        self.entered = False
        self.record_shapes = record_shapes

    def __enter__(self):
        if not self.enabled:
            return
        if self.entered:
            raise RuntimeError("NVTX annotation context manager is not reentrant")
        self.entered = True
        torch.cuda.synchronize()
        _enable_profiler(
            ProfilerConfig(
                ProfilerState.NVTX,
                self.record_shapes,
                False,
                False,
                False,
                False,
                _ExperimentalConfig()),
            set()
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not self.enabled:
            return
        torch.cuda.synchronize()
        _disable_profiler()
        return False


def load_nvprof(path):
    """Opens an nvprof trace file and parses autograd annotations.

    Args:
        path (str): path to nvprof trace
    """
    return EventList(parse_nvprof_trace(path))
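
# Usage sketch (illustrative): run the script under nvprof with an emit_nvtx()
# block inside, wait for the process to exit, then load the resulting trace;
# "trace_name.prof" is a hypothetical output path passed to nvprof.
#
#   prof = torch.autograd.profiler.load_nvprof("trace_name.prof")
#   print(prof.table())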


class EnforceUnique(object):
    """Raises an error if a key is seen more than once."""
    def __init__(self):
        self.seen = set()

    def see(self, *key):
        if key in self.seen:
            raise RuntimeError('duplicate key: ' + str(key))
        self.seen.add(key)


def parse_nvprof_trace(path):
    import sqlite3
    conn = sqlite3.connect(path)
    conn.row_factory = sqlite3.Row

    # Parse strings table
    strings = {}
    for r in conn.execute("SELECT _id_ as id, value FROM StringTable"):
        strings[r["id"]] = torch._C._demangle(r["value"])

    # First, find all functions and create FunctionEvents for them
    marker_query = """
    SELECT
        start.id AS marker_id, start.name, start.timestamp AS start_time, end.timestamp AS end_time
    FROM
        CUPTI_ACTIVITY_KIND_MARKER AS start INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
        ON start.id = end.id
    WHERE
        start.name != 0 AND end.name = 0
    """
    functions = []
    functions_map = {}
    unique = EnforceUnique()
    for row in conn.execute(marker_query):
        unique.see(row['marker_id'])
        evt = FunctionEvent(id=row['marker_id'],
                            node_id=0,  # a node_id is not available here; 0 just ensures
                                        # the FunctionEvent can be constructed
                            name=strings[row['name']],
                            start_us=row['start_time'],
                            end_us=row['end_time'],
                            thread=0)
        functions.append(evt)
        functions_map[evt.id] = evt

    # Now, correlate all kernels with FunctionEvents
    kernel_query = """
    SELECT
        start.id AS marker_id, start.name, start.timestamp, end.timestamp,
        runtime._id_ AS runtime_id, runtime.cbid, runtime.start AS runtime_start, runtime.end AS runtime_end,
        kernel.start AS kernel_start, kernel.end AS kernel_end, kernel.name AS kernel_name
    FROM
        CUPTI_ACTIVITY_KIND_MARKER AS start
        INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
            ON start.id = end.id
        INNER JOIN CUPTI_ACTIVITY_KIND_RUNTIME as runtime
            ON (start.timestamp < runtime.start AND runtime.end < end.timestamp)
        INNER JOIN CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL AS kernel
            ON kernel.correlationId = runtime.correlationId
    """
    unique = EnforceUnique()
    for row in conn.execute(kernel_query):
        unique.see(row['marker_id'], row['runtime_id'])
        # 211 is cudaKernelLaunch for cuda >= 9.2
        assert (row['cbid'] == 211)
        evt = functions_map[row['marker_id']]
        evt.append_kernel(row['kernel_name'],
                          0,
                          row['kernel_end'] - row['kernel_start'])

    functions.sort(key=lambda evt: evt.time_range.start)
    return functions


def kineto_step():
    """Notify Kineto so it is aware of iteration boundaries for asynchronous
    trace requests.
    """
    _kineto_step()
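
# Usage sketch (illustrative): when an asynchronous Kineto trace request is
# active, calling kineto_step() once per training iteration marks iteration
# boundaries in the trace; `train_step` is a hypothetical per-batch function.
#
#   for batch in loader:
#       train_step(batch)
#       torch.autograd.profiler.kineto_step()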