diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..701035b7b2453910a45df00470c521178b2a6c7b Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_memory_viz.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_memory_viz.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..631f443a5e7b891e8071b7eb086251f25fdf70ba Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_memory_viz.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00a65c53fca2def72ca9c16b610f7dda404eeed3 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..722ada8e038f3905c8130265c1468e8b1f6327d7 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/comm.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/comm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..575912000bef53c7862e759ddf94dfe24e176e1a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/comm.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/nvtx.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/nvtx.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5517ebbb453d3ccb03eb8b0fa83b1198c56bd469 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/nvtx.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/profiler.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/profiler.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8bb6f72860188fc180630fb42f1c03c298e46e1 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/profiler.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/sparse.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/sparse.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d08365e63a74ea1b351a5635078ee211861fa3f Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/sparse.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/autocast_mode.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/autocast_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..88ff04d86648806a21b180ae79e6a58bf5b22685 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/autocast_mode.py @@ -0,0 +1,144 @@ +import collections +import functools + +import torch + +try: + import numpy as np + + HAS_NUMPY = True +except ModuleNotFoundError: + np = None # type: ignore[assignment] +from typing import Any + +__all__ = ["autocast", "custom_fwd", "custom_bwd"] + + +class autocast(torch.amp.autocast_mode.autocast): + r"""See :class:`torch.autocast`. + + ``torch.cuda.amp.autocast(args...)`` is equivalent to ``torch.autocast("cuda", args...)`` + """ + + def __init__( + self, + enabled: bool = True, + dtype: torch.dtype = torch.float16, + cache_enabled: bool = True, + ): + if torch._jit_internal.is_scripting(): + self._enabled = enabled + self.device = "cuda" + self.fast_dtype = dtype + return + super().__init__( + "cuda", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled + ) + + def __enter__(self): + if torch._jit_internal.is_scripting(): + return self + return super().__enter__() + + # TODO: discuss a unified TorchScript-friendly API for autocast + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): # type: ignore[override] + if torch._jit_internal.is_scripting(): + return + return super().__exit__(exc_type, exc_val, exc_tb) + + def __call__(self, func): + if torch._jit_internal.is_scripting(): + return func + return super().__call__(func) + + +# Casts Tensors and containers of Tensors. Special-cases passthroughs for strings and np.ndarrays, which +# may be falsely detected as "Iterables." +def _cast(value, dtype): + if isinstance(value, torch.Tensor): + is_eligible = ( + value.is_floating_point() + and value.is_cuda + and (value.dtype is not torch.float64) + ) + return value.to(dtype) if is_eligible else value + elif isinstance(value, (str, bytes)): + return value + elif HAS_NUMPY and isinstance(value, np.ndarray): + return value + elif isinstance(value, collections.abc.Mapping): + return {_cast(k, dtype): _cast(v, dtype) for k, v in value.items()} + elif isinstance(value, collections.abc.Iterable): + iterable = (_cast(v, dtype) for v in value) + if isinstance(value, (list, tuple)): + return type(value)(iterable) + else: + return iterable + else: + return value + + +# custom_fwd is a decorator that may or may not be used with arguments, following +# https://github.com/dabeaz/python-cookbook/tree/master/src/9/defining_a_decorator_that_takes_an_optional_argument. +# this works: +# @custom_fwd +# def forward(...): +# this also works: +# @custom_fwd(cast_inputs=torch.float) +# def forward(...): +def custom_fwd(fwd=None, *, cast_inputs=None): + """ + Create a helper decorator for ``forward`` methods of custom autograd functions. + + Autograd functions are subclasses of :class:`torch.autograd.Function`. + See the :ref:`example page` for more detail. + + Args: + cast_inputs (:class:`torch.dtype` or None, optional, default=None): If not ``None``, + when ``forward`` runs in an autocast-enabled region, casts incoming + floating-point CUDA Tensors to the target dtype (non-floating-point Tensors are not affected), + then executes ``forward`` with autocast disabled. + If ``None``, ``forward``'s internal ops execute with the current autocast state. + + .. note:: + If the decorated ``forward`` is called outside an autocast-enabled region, + :func:`custom_fwd` is a no-op and ``cast_inputs`` has no effect. + """ + if fwd is None: + return functools.partial(custom_fwd, cast_inputs=cast_inputs) + + @functools.wraps(fwd) + def decorate_fwd(*args, **kwargs): + args[0]._dtype = torch.get_autocast_gpu_dtype() + if cast_inputs is None: + args[0]._fwd_used_autocast = torch.is_autocast_enabled() + return fwd(*args, **kwargs) + else: + autocast_context = torch.is_autocast_enabled() + args[0]._fwd_used_autocast = False + if autocast_context: + with autocast(enabled=False): + return fwd(*_cast(args, cast_inputs), **_cast(kwargs, cast_inputs)) + else: + return fwd(*args, **kwargs) + + return decorate_fwd + + +# Autograd ensures incoming gradients are the same type as forward outputs. Allowing a separate +# cast_inputs argument on custom_bwd is unnecessary and could cause errors if it doesn't match +# cast_inputs supplied to custom_fwd. +def custom_bwd(bwd): + """Create a helper decorator for backward methods of custom autograd functions. + + Autograd functions are subclasses of :class:`torch.autograd.Function`. + Ensures that ``backward`` executes with the same autocast state as ``forward``. + See the :ref:`example page` for more detail. + """ + + @functools.wraps(bwd) + def decorate_bwd(*args, **kwargs): + with autocast(enabled=args[0]._fwd_used_autocast, dtype=args[0]._dtype): + return bwd(*args, **kwargs) + + return decorate_bwd diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/error.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/error.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/graphs.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/graphs.py new file mode 100644 index 0000000000000000000000000000000000000000..b3bfbab6ad1617386d77127c24450f46302e0ed6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/graphs.py @@ -0,0 +1,479 @@ +import gc +from typing import Optional + +import torch +from torch.utils import _pytree +from .._utils import _dummy_type + +if not hasattr(torch._C, "_CudaStreamBase"): + # Define dummy base classes + torch._C.__dict__["_CUDAGraph"] = _dummy_type("_CUDAGraph") + torch._C.__dict__["_graph_pool_handle"] = _dummy_type("_graph_pool_handle") + torch._C.__dict__["_cuda_isCurrentStreamCapturing"] = _dummy_type( + "_cuda_isCurrentStreamCapturing" + ) + +from torch._C import ( # noqa: F401 + _cuda_isCurrentStreamCapturing, + _CUDAGraph, + _graph_pool_handle, +) + + +def is_current_stream_capturing(): + r"""Return True if CUDA graph capture is underway on the current CUDA stream, False otherwise. + + If a CUDA context does not exist on the current device, returns False without initializing the context. + """ + return _cuda_isCurrentStreamCapturing() + + +# Python shim helps Sphinx process docstrings more reliably. +def graph_pool_handle(): + r"""Return an opaque token representing the id of a graph memory pool. + + See :ref:`Graph memory management`. + + .. warning:: + This API is in beta and may change in future releases. + """ + return _graph_pool_handle() + + +# Python shim helps Sphinx process docstrings more reliably. +class CUDAGraph(torch._C._CUDAGraph): + r"""Wrapper around a CUDA graph. + + .. warning:: + This API is in beta and may change in future releases. + """ + + def __new__(cls): + return super().__new__(cls) + + def capture_begin(self, pool=None, capture_error_mode="global"): + r"""Begin capturing CUDA work on the current stream. + + Typically, you shouldn't call ``capture_begin`` yourself. + Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`, + which call ``capture_begin`` internally. + + Arguments: + pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or + :meth:`other_Graph_instance.pool()`) that hints this graph may share memory + with the indicated pool. See :ref:`Graph memory management`. + capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream. + Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc, + may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for + actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting + unless you're familiar with `cudaStreamCaptureMode `_ + """ # noqa: B950 + super().capture_begin(pool=pool, capture_error_mode=capture_error_mode) + + def capture_end(self): + r"""End CUDA graph capture on the current stream. + + After ``capture_end``, ``replay`` may be called on this instance. + + Typically, you shouldn't call ``capture_end`` yourself. + Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`, + which call ``capture_end`` internally. + """ + super().capture_end() + + def replay(self): + r"""Replay the CUDA work captured by this graph.""" + super().replay() + + def reset(self): + r"""Delete the graph currently held by this instance.""" + super().reset() + + def pool(self): + r"""Return an opaque token representing the id of this graph's memory pool. + + This id can optionally be passed to another graph's ``capture_begin``, + which hints the other graph may share the same memory pool. + """ + return super().pool() + + def enable_debug_mode(self): + r"""Enable debugging mode for CUDAGraph.debug_dump.""" + return super().enable_debug_mode() + + def debug_dump(self, debug_path): + r""" + Arguments: + debug_path (required): Path to dump the graph to. + + Calls a debugging function to dump the graph if the debugging is + enabled via CUDAGraph.enable_debug_mode() + """ + return super().debug_dump(debug_path) + + +class graph: + r"""Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay. + + See :ref:`CUDA Graphs ` for a general introduction, + detailed use, and constraints. + + Arguments: + cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture. + pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or + :meth:`other_Graph_instance.pool()`) hinting this graph's capture + may share memory from the specified pool. See :ref:`Graph memory management`. + stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context. + If not supplied, ``graph`` sets its own internal side stream as the current stream in the context. + capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream. + Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc, + may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for + actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting + unless you're familiar with `cudaStreamCaptureMode `_ + + .. note:: + For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture + used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture. + + .. warning:: + This API is in beta and may change in future releases. + + .. _cudaStreamCaptureMode: + https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85 + """ # noqa: B950 + + default_capture_stream: Optional["torch.cuda.Stream"] = None + + def __init__( + self, + cuda_graph, + pool=None, + stream=None, + capture_error_mode: str = "global", + ): + # Lazy-init of default_capture_stream helps avoid circular-import errors. + # Not thread safe, but graphs already have the general (explicitly documented) + # restriction that only one capture may be underway at a time in the process. + if self.__class__.default_capture_stream is None: + self.__class__.default_capture_stream = torch.cuda.Stream() + + self.pool = () if pool is None else (pool,) + self.capture_stream = ( + stream if stream is not None else self.__class__.default_capture_stream + ) + assert self.capture_stream is not None + self.stream_ctx = torch.cuda.stream(self.capture_stream) + self.cuda_graph = cuda_graph + self.capture_error_mode = capture_error_mode + + def __enter__(self): + # Free as much memory as we can for the graph + torch.cuda.synchronize() + gc.collect() + torch.cuda.empty_cache() + + # Stackoverflow seems comfortable with this pattern + # https://stackoverflow.com/questions/26635684/calling-enter-and-exit-manually#39172487 + self.stream_ctx.__enter__() + + self.cuda_graph.capture_begin( + *self.pool, capture_error_mode=self.capture_error_mode + ) + + def __exit__(self, exc_type, exc_value, traceback): + self.cuda_graph.capture_end() + self.stream_ctx.__exit__(exc_type, exc_value, traceback) + # returning None should propagate exceptions from either capture_end or stream_ctx.__exit__() + + +def make_graphed_callables( + callables, sample_args, num_warmup_iters=3, allow_unused_input=False, pool=None +): + r"""Accept callables (functions or :class:`nn.Module`\ s) and returns graphed versions. + + Each graphed callable's forward pass runs its source callable's + forward CUDA work as a CUDA graph inside a single autograd node. + + The graphed callable's forward pass also appends + a backward node to the autograd graph. During backward, this node runs the + callable's backward work as a CUDA graph. + + Therefore, each graphed callable should be a drop-in replacement for its source callable + in an autograd-enabled training loop. + + See :ref:`Partial-network capture` for detailed use and constraints. + + If you pass a tuple of several callables, their captures will use the same memory pool. + See :ref:`Graph memory management` for when this is appropriate. + + Arguments: + callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph. + See :ref:`Graph memory management` for when passing a tuple of callables + is appropriate. If you pass a tuple of callables, their order in the tuple must be the same order + they'll run in the live workload. + sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable. + If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors. + If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors. + num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs + 11 iterations for warm up. Default: ``3``. + allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs + (and therefore their grad is always zero) is an error. Defaults to False. + pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or + :meth:`other_Graph_instance.pool()`) that hints this graph may share memory + with the indicated pool. See :ref:`Graph memory management`. + .. note:: + The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state + that's expected for the corresponding real input in the training loop. + + .. warning:: + This API is in beta and may change in future releases. + + .. warning:: + ``sample_args`` for each callable must contain only Tensors. Other types are not allowed. + + .. warning:: + Returned callables do not support higher order differentiation (e.g., double backward). + + .. warning:: + In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters + may be trainable. Buffers must have ``requires_grad=False``. + + .. warning:: + After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`, + you may not add or remove any of that Module's parameters or buffers. + + .. warning:: + :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks + registered on them at the time they are passed. However, registering hooks on modules *after* passing them + through :func:`~torch.cuda.make_graphed_callables` is allowed. + + .. warning:: + When running a graphed callable, you must pass its arguments in the same order and format + they appeared in that callable's ``sample_args``. + + .. warning:: + The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled + caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`. + """ + if torch.is_autocast_enabled() and torch.is_autocast_cache_enabled(): + raise RuntimeError( + "make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`." + ) + + just_one_callable = False + + if not isinstance(callables, tuple): + just_one_callable = True + callables = (callables,) + sample_args = (sample_args,) + + flatten_sample_args = [] + + for c, args in zip(callables, sample_args): + if isinstance(c, torch.nn.Module): + assert ( + len(c._backward_hooks) == 0 + and len(c._forward_hooks) == 0 + and len(c._forward_pre_hooks) == 0 + ), ( + "Modules must not have hooks registered at the time they are passed. However, registering hooks " + + "on modules after passing them through make_graphed_callables is allowed." + ) + assert all(b.requires_grad is False for b in c.buffers()), ( + "In any :class:`~torch.nn.Module` passed to " + + ":func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have " + + "``requires_grad=False``." + ) + flatten_arg = _pytree.arg_tree_leaves(*args) + flatten_sample_args.append(tuple(flatten_arg)) + assert all(isinstance(arg, torch.Tensor) for arg in flatten_arg), ( + "In the beta API, sample_args " + + "for each callable must contain only Tensors. Other types are not allowed." + ) + + # If a callable is an nn.Module, its graph's full input surface is the args the user explicitly + # passes to forward (ie, its sample_args) AND the module's parameter attributes. + per_callable_len_user_args = [len(args) for args in flatten_sample_args] + per_callable_module_params = [ + tuple(c.parameters()) if isinstance(c, torch.nn.Module) else () + for c in callables + ] + per_callable_static_input_surfaces = [ + flatten_sample_args[i] + per_callable_module_params[i] + for i in range(len(callables)) + ] + + fwd_graphs = [torch.cuda.CUDAGraph() for _ in range(len(callables))] + bwd_graphs = [torch.cuda.CUDAGraph() for _ in range(len(callables))] + + mempool = graph_pool_handle() if pool is None else pool + + # Warmup + # Hopefully prevents cudnn benchmarking and other lazy-initialization cuda work + # from ending up in any captures. + torch.cuda.synchronize() + with torch.cuda.stream(torch.cuda.Stream()): + for func, args, static_input_surface in zip( + callables, sample_args, per_callable_static_input_surfaces + ): + for _ in range(num_warmup_iters): + outputs = _pytree.tree_leaves(func(*args)) + grad_inputs = torch.autograd.grad( + outputs=tuple(o for o in outputs if o.requires_grad), + inputs=tuple(i for i in static_input_surface if i.requires_grad), + grad_outputs=tuple( + torch.empty_like(o) for o in outputs if o.requires_grad + ), + only_inputs=True, + allow_unused=allow_unused_input, + ) + del outputs, grad_inputs # type: ignore[possibly-undefined] + torch.cuda.synchronize() + + # All captures here share a mempool. To avoid replays corrupting each other's memory, + # the safest approach is to capture all passes in the same order they'll run: + # fwd 1, fwd 2, ... fwd N, then bwd N, bwd N-1, ... bwd 1. + + # Capture forward graphs + per_callable_static_outputs = [] + per_callable_output_unflatten_spec = [] + for func, args, fwd_graph in zip(callables, sample_args, fwd_graphs): + with torch.cuda.graph(fwd_graph, pool=mempool): + outputs = func(*args) + + flatten_outputs, spec = _pytree.tree_flatten(outputs) + per_callable_static_outputs.append(tuple(flatten_outputs)) + per_callable_output_unflatten_spec.append(spec) + + # Capture backward graphs in reverse order + per_callable_static_grad_outputs = [] + per_callable_static_grad_inputs = [] + for static_input_surface, static_outputs, bwd_graph, module_params in zip( + reversed(per_callable_static_input_surfaces), + reversed(per_callable_static_outputs), + reversed(bwd_graphs), + reversed(per_callable_module_params), + ): + # For now, assumes all static_outputs require grad + # assert all(o.requires_grad for o in static_outputs), "Outputs of graphed callables must require grad." + static_grad_outputs = tuple( + torch.empty_like(o) if o.requires_grad else None for o in static_outputs + ) + + with torch.cuda.graph(bwd_graph, pool=mempool): + grad_inputs = torch.autograd.grad( + outputs=tuple(o for o in static_outputs if o.requires_grad), + inputs=tuple(i for i in static_input_surface if i.requires_grad), + grad_outputs=tuple(o for o in static_grad_outputs if o is not None), + only_inputs=True, + allow_unused=allow_unused_input, + ) + + # Constructs a tuple suitable for returning from Graphed.backward: + # Pads out the actually-needed grads with Nones in gradient slots for inputs that don't require grad. + # I couldn't think of a slick one-liner for this pattern. + static_grad_inputs = [] + grad_idx = 0 + for arg in static_input_surface: + if arg.requires_grad: + static_grad_inputs.append(grad_inputs[grad_idx]) + grad_idx += 1 + else: + static_grad_inputs.append(None) # type: ignore[arg-type] + static_grad_inputs = tuple(static_grad_inputs) # type: ignore[assignment] + + per_callable_static_grad_outputs.append(static_grad_outputs) + per_callable_static_grad_inputs.append(static_grad_inputs) + + # Reverses the most recent two lists + per_callable_static_grad_outputs.reverse() + per_callable_static_grad_inputs.reverse() + # Now for every per_callable list, per_callable_*[i] holds the stuff for the ith callable. + + def make_graphed_autograd_function( + fwd_graph, + bwd_graph, + module_params, + len_user_args, + output_unflatten_spec, + static_input_surface, + static_outputs, + static_grad_outputs, + static_grad_inputs, + ): + class Graphed(torch.autograd.Function): + @staticmethod + def forward(ctx, *inputs): + # At this stage, only the user args may (potentially) be new tensors. + for i in range(len_user_args): + if static_input_surface[i].data_ptr() != inputs[i].data_ptr(): + static_input_surface[i].copy_(inputs[i]) + fwd_graph.replay() + assert isinstance(static_outputs, tuple) + return tuple(o.detach() for o in static_outputs) + + @staticmethod + @torch.autograd.function.once_differentiable + def backward(ctx, *grads): + assert len(grads) == len(static_grad_outputs) + for g, grad in zip(static_grad_outputs, grads): + if g is not None: + # don't copy if autograd gods have been kind and the + # incoming grad is already in the right place + if g.data_ptr() != grad.data_ptr(): + g.copy_(grad) + bwd_graph.replay() + + # Input args that didn't require grad expect a None gradient. + assert isinstance(static_grad_inputs, tuple) + return tuple( + b.detach() if b is not None else b for b in static_grad_inputs + ) + + def functionalized(*user_args): + # Runs the autograd function with inputs == all inputs to the graph that might require grad + # (explicit user args + module parameters) + # Assumes module params didn't change since capture. + flatten_user_args = _pytree.arg_tree_leaves(*user_args) + out = Graphed.apply(*(tuple(flatten_user_args) + module_params)) + return _pytree.tree_unflatten(out, output_unflatten_spec) + + return functionalized + + # Put together the final graphed callables + ret = [] + for i, func in enumerate(callables): + graphed = make_graphed_autograd_function( + fwd_graphs[i], + bwd_graphs[i], + per_callable_module_params[i], + per_callable_len_user_args[i], + per_callable_output_unflatten_spec[i], + per_callable_static_input_surfaces[i], + per_callable_static_outputs[i], + per_callable_static_grad_outputs[i], + per_callable_static_grad_inputs[i], + ) + + if isinstance(func, torch.nn.Module): + + def make_graphed_forward(func, graph_training_state, graphed, orig_fwd): + def new_fwd(*user_args): + # If the module's training-or-eval state matches what we graphed, + # run the graph, otherwise run the original forward method + if func.training == graph_training_state: + return graphed(*user_args) + else: + return orig_fwd(*user_args) + + return new_fwd + + func.forward = make_graphed_forward(func, func.training, graphed, func.forward) # type: ignore[assignment] + ret.append(func) + else: + ret.append(graphed) + + if just_one_callable: + return ret[0] + + return tuple(ret) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/jiterator.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/jiterator.py new file mode 100644 index 0000000000000000000000000000000000000000..25d25482419e635612855ed402fd02ef58709417 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/jiterator.py @@ -0,0 +1,185 @@ +import re +from typing import Callable, List + +import torch +from torch import Tensor + +__all__: List[str] = [] + + +class _CodeParser: + def __init__(self, code_string: str): + optional_ws = r"\s*" + required_ws = r"\s+" + template_params = r"(?P\<.+\>)" + return_type = r"(?P\w+)" + function_name = r"(?P\w+)" + function_params = r"(?P\(.+\))" + function_body = r"(?P\{.+\})" + + pattern = ( + optional_ws + + "template" + + optional_ws + + template_params + + optional_ws + + return_type + + required_ws + + function_name + + optional_ws + + function_params + + optional_ws + + function_body + + optional_ws + ) + + result = re.match( + pattern, code_string, re.DOTALL + ) # DOTALL for matching multiline + + if result is None: + raise Exception( + f"Couldn't parse code, please check correctness:\n {code_string}" + ) + + self.template_params = result["template_params"] + self.return_type = result["return_type"] + self.function_name = result["function_name"] + self.function_params = result["function_params"] + self.function_body = result["function_body"] + + +class _JittedFunction: + def __init__( + self, code_string: str, return_by_ref: bool, num_outputs: int, **kwargs + ): + self.code_string = code_string + + assert ( + return_by_ref or num_outputs == 1 + ), "Return by value only works for single output. " + self.return_by_ref = return_by_ref + self.num_outputs = num_outputs + + parsed_code = _CodeParser(code_string) + self.kernel_name = parsed_code.function_name + + self.kwargs_dict = kwargs + self.is_cuda_available = torch.cuda.is_available() + + def __call__(self, *tensors: Tensor, **kwargs): + # Jiterator follow torch.cuda's lazy initialization behavior + # Defer checking cuda's availability at the function invocation time + assert ( + self.is_cuda_available + ), "Jiterator is only supported on CUDA and ROCm GPUs, none are available." + + assert len(tensors) <= 8, "jiterator only supports up to 8 tensor inputs." + + expanded_kwargs = self.kwargs_dict.copy() + for key, value in kwargs.items(): + if key in self.kwargs_dict: + expanded_kwargs[key] = value + else: + raise KeyError(f"{key} is not declared in function definition") + + return torch._C._cuda_jiterator_compile_and_launch_kernel( + self.code_string, + self.kernel_name, + self.return_by_ref, + self.num_outputs, + tensors, + expanded_kwargs, + ) + + +def _create_jit_fn(code_string: str, **kwargs) -> Callable: + """ + Create a jiterator-generated cuda kernel for an elementwise op. + + The code string has to be a valid CUDA function that describes the computation for a single element. The code + string has to follow the c++ template pattern, as shown in the example below. This function will be inlined + into elementwise kernel template, and compiled on the fly. Compiled kernel will be cached in memory, as well as + local temp dir. + + Jiterator-generated kernels accepts noncontiguous tensors, and supports broadcasting and type promotion. + + Args: + code_string (str): CUDA code string to be compiled by jiterator. The entry functor must return by value. + kwargs (Dict, optional): Keyword arguments for generated function + + Example:: + + code_string = "template T my_kernel(T x, T y, T alpha) { return -x + alpha * y; }" + jitted_fn = create_jit_fn(code_string, alpha=1.0) + a = torch.rand(3, device='cuda') + b = torch.rand(3, device='cuda') + # invoke jitted function like a regular python function + result = jitted_fn(a, b, alpha=3.14) + + code_string also allows multiple function definitions, and the last function will be treated as the entry function. + + Example:: + + code_string = "template T util_fn(T x, T y) { return ::sin(x) + ::cos(y); }" + code_string += "template T my_kernel(T x, T y, T val) { return ::min(val, util_fn(x, y)); }" + jitted_fn = create_jit_fn(code_string, val=0.0) + a = torch.rand(3, device='cuda') + b = torch.rand(3, device='cuda') + # invoke jitted function like a regular python function + result = jitted_fn(a, b) # using default val=0.0 + + Jiterator can be used together with python registration to override an operator's cuda kernel. + Following example is overriding gelu's cuda kernel with relu. + + Example:: + + code_string = "template T my_gelu(T a) { return a > 0 ? a : 0; }" + my_gelu = create_jit_fn(code_string) + my_lib = torch.library.Library("aten", "IMPL") + my_lib.impl('aten::gelu', my_gelu, "CUDA") + # torch.nn.GELU and torch.nn.function.gelu are now overridden + a = torch.rand(3, device='cuda') + torch.allclose(torch.nn.functional.gelu(a), torch.nn.functional.relu(a)) + + .. warning:: + This API is in beta and may change in future releases. + + .. warning:: + This API only supports up to 8 inputs and 1 output + + .. warning:: + All input tensors must live in CUDA device + """ + return _JittedFunction(code_string, return_by_ref=False, num_outputs=1, **kwargs) + + +def _create_multi_output_jit_fn( + code_string: str, num_outputs: int, **kwargs +) -> Callable: + """ + Create a jiterator-generated cuda kernel for an elementwise op that supports returning one or more outputs. + + Args: + code_string (str): CUDA code string to be compiled by jiterator. The entry functor must return value by reference. + num_outputs(int): number of outputs return by the kernel + kwargs (Dict, optional): Keyword arguments for generated function + + Example:: + + code_string = "template void my_kernel(T x, T y, T alpha, T& out) { out = -x + alpha * y; }" + jitted_fn = create_jit_fn(code_string, alpha=1.0) + a = torch.rand(3, device='cuda') + b = torch.rand(3, device='cuda') + # invoke jitted function like a regular python function + result = jitted_fn(a, b, alpha=3.14) + + .. warning:: + This API is in beta and may change in future releases. + + .. warning:: + This API only supports up to 8 inputs and 8 outputs + """ + return _JittedFunction( + code_string, return_by_ref=True, num_outputs=num_outputs, **kwargs + ) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/nccl.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/nccl.py new file mode 100644 index 0000000000000000000000000000000000000000..05751ab5f87b7042426454e83541d3bebe1861fc --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/nccl.py @@ -0,0 +1,137 @@ +import collections +import warnings +from typing import Optional, Sequence, Union + +import torch.cuda + + +__all__ = ["all_reduce", "reduce", "broadcast", "all_gather", "reduce_scatter"] + +SUM = 0 # ncclRedOp_t + + +def is_available(tensors): + if not hasattr(torch._C, "_nccl_all_reduce"): + warnings.warn("PyTorch is not compiled with NCCL support") + return False + + devices = set() + for tensor in tensors: + if tensor.is_sparse: + return False + if not tensor.is_contiguous(): + return False + if not tensor.is_cuda: + return False + device = tensor.get_device() + if device in devices: + return False + devices.add(device) + + return True + + +def version(): + ver = torch._C._nccl_version() + major = ver >> 32 + minor = (ver >> 16) & 65535 + patch = ver & 65535 + suffix = torch._C._nccl_version_suffix().decode("utf-8") + if suffix == "": + return (major, minor, patch) + else: + return (major, minor, patch, suffix) + + +def unique_id(): + return torch._C._nccl_unique_id() + + +def init_rank(num_ranks, uid, rank): + return torch._C._nccl_init_rank(num_ranks, uid, rank) + + +def _check_sequence_type(inputs: Union[torch.Tensor, Sequence[torch.Tensor]]) -> None: + if not isinstance(inputs, collections.abc.Container) or isinstance( + inputs, torch.Tensor + ): + raise TypeError("Inputs should be a collection of tensors") + + +def all_reduce(inputs, outputs=None, op=SUM, streams=None, comms=None): + _check_sequence_type(inputs) + if outputs is None: + outputs = inputs + _check_sequence_type(outputs) + torch._C._nccl_all_reduce(inputs, outputs, op, streams, comms) + + +# `output` used to be `outputs`, taking in a list of tensors. So we have two +# arguments for BC reasons. +def reduce( + inputs: Sequence[torch.Tensor], + output: Optional[Union[torch.Tensor, Sequence[torch.Tensor]]] = None, + root: int = 0, + op: int = SUM, + streams: Optional[Sequence[torch.cuda.Stream]] = None, + comms=None, + *, + outputs: Optional[Sequence[torch.Tensor]] = None, +) -> None: + _check_sequence_type(inputs) + _output: torch.Tensor + if outputs is not None: + if output is not None: + raise ValueError( + "'output' and 'outputs' can not be both specified. 'outputs' is deprecated in " + "favor of 'output', taking in a single output tensor. The signature of reduce is: " + "reduce(inputs, output=None, root=0, op=SUM, streams=None, comms=None)." + ) + else: + warnings.warn( + "nccl.reduce with an output tensor list is deprecated. " + "Please specify a single output tensor with argument 'output' instead instead." + ) + _output = outputs[root] + elif not isinstance(output, torch.Tensor) and isinstance( + output, collections.abc.Sequence + ): + # User called old API with positional arguments of list of output tensors. + warnings.warn( + "nccl.reduce with an output tensor list is deprecated. " + "Please specify a single output tensor." + ) + _output = output[root] + else: + _output = inputs[root] if output is None else output + torch._C._nccl_reduce(inputs, _output, root, op, streams, comms) + + +def broadcast( + inputs: Sequence[torch.Tensor], root: int = 0, streams=None, comms=None +) -> None: + _check_sequence_type(inputs) + torch._C._nccl_broadcast(inputs, root, streams, comms) + + +def all_gather( + inputs: Sequence[torch.Tensor], + outputs: Sequence[torch.Tensor], + streams=None, + comms=None, +) -> None: + _check_sequence_type(inputs) + _check_sequence_type(outputs) + torch._C._nccl_all_gather(inputs, outputs, streams, comms) + + +def reduce_scatter( + inputs: Sequence[torch.Tensor], + outputs: Sequence[torch.Tensor], + op: int = SUM, + streams=None, + comms=None, +) -> None: + _check_sequence_type(inputs) + _check_sequence_type(outputs) + torch._C._nccl_reduce_scatter(inputs, outputs, op, streams, comms) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/profiler.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..51c8aa46f714b6a9fd30857c9edb575614d52420 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/profiler.py @@ -0,0 +1,61 @@ +import contextlib +import tempfile + +import torch +from . import check_error, cudart + +__all__ = ["init", "start", "stop", "profile"] + +DEFAULT_FLAGS = [ + "gpustarttimestamp", + "gpuendtimestamp", + "gridsize3d", + "threadblocksize", + "streamid", + "enableonstart 0", + "conckerneltrace", +] + + +def init(output_file, flags=None, output_mode="key_value"): + rt = cudart() + if not hasattr(rt, "cudaOutputMode"): + raise AssertionError("HIP does not support profiler initialization!") + if ( + hasattr(torch.version, "cuda") + and torch.version.cuda is not None + and int(torch.version.cuda.split(".")[0]) >= 12 + ): + # Check https://github.com/pytorch/pytorch/pull/91118 + # cudaProfilerInitialize is no longer needed after CUDA 12 + raise AssertionError("CUDA12+ does not need profiler initialization!") + flags = DEFAULT_FLAGS if flags is None else flags + if output_mode == "key_value": + output_mode_enum = rt.cudaOutputMode.KeyValuePair + elif output_mode == "csv": + output_mode_enum = rt.cudaOutputMode.CSV + else: + raise RuntimeError( + "supported CUDA profiler output modes are: key_value and csv" + ) + with tempfile.NamedTemporaryFile(delete=True) as f: + f.write(b"\n".join(f.encode("ascii") for f in flags)) + f.flush() + check_error(rt.cudaProfilerInitialize(f.name, output_file, output_mode_enum)) + + +def start(): + check_error(cudart().cudaProfilerStart()) + + +def stop(): + check_error(cudart().cudaProfilerStop()) + + +@contextlib.contextmanager +def profile(): + try: + start() + yield + finally: + stop() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/random.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/random.py new file mode 100644 index 0000000000000000000000000000000000000000..1cf33114d17bd1867dfc5e5bb9179670291878a2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/random.py @@ -0,0 +1,179 @@ +from typing import Iterable, List, Union + +import torch +from .. import Tensor +from . import _lazy_call, _lazy_init, current_device, device_count + +__all__ = [ + "get_rng_state", + "get_rng_state_all", + "set_rng_state", + "set_rng_state_all", + "manual_seed", + "manual_seed_all", + "seed", + "seed_all", + "initial_seed", +] + + +def get_rng_state(device: Union[int, str, torch.device] = "cuda") -> Tensor: + r"""Return the random number generator state of the specified GPU as a ByteTensor. + + Args: + device (torch.device or int, optional): The device to return the RNG state of. + Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device). + + .. warning:: + This function eagerly initializes CUDA. + """ + _lazy_init() + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device("cuda", device) + idx = device.index + if idx is None: + idx = current_device() + default_generator = torch.cuda.default_generators[idx] + return default_generator.get_state() + + +def get_rng_state_all() -> List[Tensor]: + r"""Return a list of ByteTensor representing the random number states of all devices.""" + results = [] + for i in range(device_count()): + results.append(get_rng_state(i)) + return results + + +def set_rng_state( + new_state: Tensor, device: Union[int, str, torch.device] = "cuda" +) -> None: + r"""Set the random number generator state of the specified GPU. + + Args: + new_state (torch.ByteTensor): The desired state + device (torch.device or int, optional): The device to set the RNG state. + Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device). + """ + with torch._C._DisableFuncTorch(): + new_state_copy = new_state.clone(memory_format=torch.contiguous_format) + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device("cuda", device) + + def cb(): + idx = device.index + if idx is None: + idx = current_device() + default_generator = torch.cuda.default_generators[idx] + default_generator.set_state(new_state_copy) + + _lazy_call(cb) + + +def set_rng_state_all(new_states: Iterable[Tensor]) -> None: + r"""Set the random number generator state of all devices. + + Args: + new_states (Iterable of torch.ByteTensor): The desired state for each device. + """ + for i, state in enumerate(new_states): + set_rng_state(state, i) + + +def manual_seed(seed: int) -> None: + r"""Set the seed for generating random numbers for the current GPU. + + It's safe to call this function if CUDA is not available; in that + case, it is silently ignored. + + Args: + seed (int): The desired seed. + + .. warning:: + If you are working with a multi-GPU model, this function is insufficient + to get determinism. To seed all GPUs, use :func:`manual_seed_all`. + """ + seed = int(seed) + + def cb(): + idx = current_device() + default_generator = torch.cuda.default_generators[idx] + default_generator.manual_seed(seed) + + _lazy_call(cb, seed=True) + + +def manual_seed_all(seed: int) -> None: + r"""Set the seed for generating random numbers on all GPUs. + + It's safe to call this function if CUDA is not available; in that + case, it is silently ignored. + + Args: + seed (int): The desired seed. + """ + seed = int(seed) + + def cb(): + for i in range(device_count()): + default_generator = torch.cuda.default_generators[i] + default_generator.manual_seed(seed) + + _lazy_call(cb, seed_all=True) + + +def seed() -> None: + r"""Set the seed for generating random numbers to a random number for the current GPU. + + It's safe to call this function if CUDA is not available; in that + case, it is silently ignored. + + .. warning:: + If you are working with a multi-GPU model, this function will only initialize + the seed on one GPU. To initialize all GPUs, use :func:`seed_all`. + """ + + def cb(): + idx = current_device() + default_generator = torch.cuda.default_generators[idx] + default_generator.seed() + + _lazy_call(cb) + + +def seed_all() -> None: + r"""Set the seed for generating random numbers to a random number on all GPUs. + + It's safe to call this function if CUDA is not available; in that + case, it is silently ignored. + """ + + def cb(): + random_seed = 0 + seeded = False + for i in range(device_count()): + default_generator = torch.cuda.default_generators[i] + if not seeded: + default_generator.seed() + random_seed = default_generator.initial_seed() + seeded = True + else: + default_generator.manual_seed(random_seed) + + _lazy_call(cb) + + +def initial_seed() -> int: + r"""Return the current random seed of the current GPU. + + .. warning:: + This function eagerly initializes CUDA. + """ + _lazy_init() + idx = current_device() + default_generator = torch.cuda.default_generators[idx] + return default_generator.initial_seed() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/sparse.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/sparse.py new file mode 100644 index 0000000000000000000000000000000000000000..f37a34118d2d8f73437dee54337a666df1b99a09 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/sparse.py @@ -0,0 +1 @@ +# The Tensor classes are added to this module by python_tensor.cpp diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/streams.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/streams.py new file mode 100644 index 0000000000000000000000000000000000000000..22d541f4e2879ace0cc78766d8a8c1795b0bc4a1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/streams.py @@ -0,0 +1,241 @@ +import ctypes + +import torch +from torch._streambase import _EventBase, _StreamBase +from .._utils import _dummy_type + + +if not hasattr(torch._C, "_CudaStreamBase"): + # Define dummy base classes + torch._C.__dict__["_CudaStreamBase"] = _dummy_type("_CudaStreamBase") + torch._C.__dict__["_CudaEventBase"] = _dummy_type("_CudaEventBase") + + +class Stream(torch._C._CudaStreamBase, _StreamBase): + r"""Wrapper around a CUDA stream. + + A CUDA stream is a linear sequence of execution that belongs to a specific + device, independent from other streams. See :ref:`cuda-semantics` for + details. + + Args: + device(torch.device or int, optional): a device on which to allocate + the stream. If :attr:`device` is ``None`` (default) or a negative + integer, this will use the current device. + priority(int, optional): priority of the stream, should be 0 or + negative, where negative numbers indicate higher priority. By default, + streams have priority 0. + + """ + + def __new__(cls, device=None, priority=0, **kwargs): + # setting device manager is expensive, so we avoid it unless necessary + if device is None or ("stream_id" in kwargs and "device_index" in kwargs): + return super().__new__(cls, priority=priority, **kwargs) + else: + with torch.cuda.device(device): + return super().__new__(cls, priority=priority, **kwargs) + + def wait_event(self, event): + r"""Make all future work submitted to the stream wait for an event. + + Args: + event (torch.cuda.Event): an event to wait for. + + .. note:: This is a wrapper around ``cudaStreamWaitEvent()``: see + `CUDA Stream documentation`_ for more info. + + This function returns without waiting for :attr:`event`: only future + operations are affected. + + .. _CUDA Stream documentation: + https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html + """ + event.wait(self) + + def wait_stream(self, stream): + r"""Synchronize with another stream. + + All future work submitted to this stream will wait until all kernels + submitted to a given stream at the time of call complete. + + Args: + stream (Stream): a stream to synchronize. + + .. note:: This function returns without waiting for currently enqueued + kernels in :attr:`stream`: only future operations are affected. + """ + self.wait_event(stream.record_event()) + + def record_event(self, event=None): + r"""Record an event. + + Args: + event (torch.cuda.Event, optional): event to record. If not given, a new one + will be allocated. + + Returns: + Recorded event. + """ + if event is None: + event = Event() + event.record(self) + return event + + def query(self): + r"""Check if all the work submitted has been completed. + + Returns: + A boolean indicating if all kernels in this stream are completed. + """ + return super().query() + + def synchronize(self): + r"""Wait for all the kernels in this stream to complete. + + .. note:: This is a wrapper around ``cudaStreamSynchronize()``: see + `CUDA Stream documentation`_ for more info. + """ + super().synchronize() + + @property + def _as_parameter_(self): + return ctypes.c_void_p(self.cuda_stream) + + def __eq__(self, o): + if isinstance(o, Stream): + return super().__eq__(o) + return False + + def __hash__(self): + return hash((self.cuda_stream, self.device)) + + def __repr__(self): + return f"" + + +class ExternalStream(Stream): + r"""Wrapper around an externally allocated CUDA stream. + + This class is used to wrap streams allocated in other libraries in order + to facilitate data exchange and multi-library interactions. + + .. note:: This class doesn't manage the stream life-cycle, it is the user + responsibility to keep the referenced stream alive while this class is + being used. + + Args: + stream_ptr(int): Integer representation of the `cudaStream_t` value. + allocated externally. + device(torch.device or int, optional): the device where the stream + was originally allocated. if device is specified incorrectly, + subsequent launches using this stream may fail. + """ + + def __new__(cls, stream_ptr, device=None, **kwargs): + with torch.cuda.device(device): + return super().__new__(cls, stream_ptr=stream_ptr, **kwargs) + + +class Event(torch._C._CudaEventBase, _EventBase): + r"""Wrapper around a CUDA event. + + CUDA events are synchronization markers that can be used to monitor the + device's progress, to accurately measure timing, and to synchronize CUDA + streams. + + The underlying CUDA events are lazily initialized when the event is first + recorded or exported to another process. After creation, only streams on the + same device may record the event. However, streams on any device can wait on + the event. + + Args: + enable_timing (bool, optional): indicates if the event should measure time + (default: ``False``) + blocking (bool, optional): if ``True``, :meth:`wait` will be blocking (default: ``False``) + interprocess (bool): if ``True``, the event can be shared between processes + (default: ``False``) + + .. _CUDA Event Documentation: + https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html + """ + + def __new__(cls, enable_timing=False, blocking=False, interprocess=False): + return super().__new__( + cls, + enable_timing=enable_timing, + blocking=blocking, + interprocess=interprocess, + ) + + @classmethod + def from_ipc_handle(cls, device, handle): + r"""Reconstruct an event from an IPC handle on the given device.""" + return super().from_ipc_handle(device, handle) + + def record(self, stream=None): + r"""Record the event in a given stream. + + Uses ``torch.cuda.current_stream()`` if no stream is specified. The + stream's device must match the event's device. + """ + if stream is None: + stream = torch.cuda.current_stream() + super().record(stream) + + def wait(self, stream=None): + r"""Make all future work submitted to the given stream wait for this event. + + Use ``torch.cuda.current_stream()`` if no stream is specified. + + .. note:: This is a wrapper around ``cudaStreamWaitEvent()``: see + `CUDA Event documentation`_ for more info. + """ + if stream is None: + stream = torch.cuda.current_stream() + super().wait(stream) + + def query(self): + r"""Check if all work currently captured by event has completed. + + Returns: + A boolean indicating if all work currently captured by event has + completed. + """ + return super().query() + + def elapsed_time(self, end_event): + r"""Return the time elapsed. + + Time reported in milliseconds after the event was recorded and + before the end_event was recorded. + """ + return super().elapsed_time(end_event) + + def synchronize(self): + r"""Wait for the event to complete. + + Waits until the completion of all work currently captured in this event. + This prevents the CPU thread from proceeding until the event completes. + + .. note:: This is a wrapper around ``cudaEventSynchronize()``: see + `CUDA Event documentation`_ for more info. + """ + super().synchronize() + + def ipc_handle(self): + r"""Return an IPC handle of this event. + + If not recorded yet, the event will use the current device. + """ + return super().ipc_handle() + + @property + def _as_parameter_(self): + return ctypes.c_void_p(self.cuda_event) + + def __repr__(self): + if self.cuda_event: + return f"" + else: + return "" diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d705a9fb249d7cdce9053464a2e60a05465676d Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_lazy_graph_module.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_lazy_graph_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ffc0c68d05004bf90d8fc1df6986361e58bbca7 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_lazy_graph_module.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/graph.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/graph.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53c8e3715798203db22746633ead7109dca0f29e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/graph.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/interpreter.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/interpreter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..457c770debb164ef3d4f97112c4decb46228cfcd Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/interpreter.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/operator_schemas.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/operator_schemas.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df824888a7ee5470401861115fb55164a2f50431 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/operator_schemas.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/subgraph_rewriter.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/subgraph_rewriter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e481f445b31589a93acf94bc403c2ee41fce58ae Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/subgraph_rewriter.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ATen.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ATen.h new file mode 100644 index 0000000000000000000000000000000000000000..effdd469d19b91316aa21ae99d43055f49c950eb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ATen.h @@ -0,0 +1,37 @@ +#pragma once + +#if !defined(_MSC_VER) && __cplusplus < 201703L +#error C++17 or later compatible compiler is required to use ATen. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// TODO: try to remove this +// There is some back story, see https://github.com/pytorch/pytorch/issues/48684 +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/AccumulateType.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/AccumulateType.h new file mode 100644 index 0000000000000000000000000000000000000000..0275ef099b03d714b916b9d0d09c4827724bf58c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/AccumulateType.h @@ -0,0 +1,153 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Defines the accumulation type for a scalar type. +// Example: +// using accscalar_t = acc_type; +// +// Accumulation types are an important concept in numeric computing +// because you frequently want to perform intermediate computations +// at a higher precision than the input and output precision, to avoid +// compounding internal rounding errors. Accumulation is the most +// well-known intermediate computation (it is of great importance for +// sum reduction and matrix multiply, for example), but in PyTorch +// acc_type ends up getting used for all sorts of other intermediate +// computations, so it perhaps would be more accurately (ahem) called an +// "accurate" type. acc_type is especially important for reduced +// precision operations like float16 and bfloat16, where relatively +// benign looking inputs can easily end up overflowing/underflowing. +// +// acc_type is parametrized by whether or not you are running on CUDA +// or not, because on CUDA double precision operations are expensive +// and so by default, we don't actually want to use double as an +// acc_type on CUDA. A lot of things are typed out below, but +// basically, the table is generated by a few rules: +// +// If bool: +// Use 'bool' as acc_type. +// If floating point: +// If CUDA, use 'float' as acc_type (unless scalar_t is double), +// otherwise (CPU) use 'double' +// If integral: +// Use 'int64_t' as acc_type +// +// You're not forced to use this template; if you happen to know +// something specific about your use case, you can specify your own +// desired behavior. This template, however, will give you a reasonable +// default that will work for all dtypes supported in PyTorch. + +#if defined(__CUDACC__) +#include +#include +#elif defined(__HIPCC__) +#include +#include +#endif + +namespace at { + +template +struct AccumulateTypeDevice {}; + +template +struct AccumulateType {}; + +template +struct AccumulateType { + using type = typename AccumulateTypeDevice::type; +}; + +template +struct AccumulateType { + using type = typename AccumulateTypeDevice::type; +}; + +template +using acc_type_device = typename AccumulateTypeDevice::type; + +template +using acc_type = typename AccumulateType::type; + +#define ACC_TYPE(t, acc_t, device_type) \ + template <> \ + struct AccumulateTypeDevice { \ + using type = acc_t; \ + }; +#define MPS_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::MPS) +#define CUDA_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CUDA) +#define CPU_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CPU) + +MPS_ACC_TYPE(BFloat16, float); +MPS_ACC_TYPE(Half, float); +MPS_ACC_TYPE(Float8_e5m2, float); +MPS_ACC_TYPE(Float8_e4m3fn, float); +MPS_ACC_TYPE(Float8_e5m2fnuz, float); +MPS_ACC_TYPE(Float8_e4m3fnuz, float); +MPS_ACC_TYPE(float, float); +MPS_ACC_TYPE(double, float); +MPS_ACC_TYPE(int8_t, int64_t); +MPS_ACC_TYPE(uint8_t, int64_t); +MPS_ACC_TYPE(char, int64_t); +MPS_ACC_TYPE(int16_t, int64_t); +MPS_ACC_TYPE(int32_t, int64_t); +MPS_ACC_TYPE(int64_t, int64_t); +MPS_ACC_TYPE(bool, bool); +MPS_ACC_TYPE(c10::complex, c10::complex); +MPS_ACC_TYPE(c10::complex, c10::complex); +MPS_ACC_TYPE(c10::complex, c10::complex); + +#if defined(__CUDACC__) || defined(__HIPCC__) +CUDA_ACC_TYPE(half, float); +#endif +CUDA_ACC_TYPE(BFloat16, float); +CUDA_ACC_TYPE(Half, float); +CUDA_ACC_TYPE(Float8_e5m2, float); +CUDA_ACC_TYPE(Float8_e4m3fn, float); +CUDA_ACC_TYPE(Float8_e5m2fnuz, float); +CUDA_ACC_TYPE(Float8_e4m3fnuz, float); +CUDA_ACC_TYPE(float, float); +CUDA_ACC_TYPE(double, double); +CUDA_ACC_TYPE(int8_t, int64_t); +CUDA_ACC_TYPE(uint8_t, int64_t); +CUDA_ACC_TYPE(char, int64_t); +CUDA_ACC_TYPE(int16_t, int64_t); +CUDA_ACC_TYPE(int32_t, int64_t); +CUDA_ACC_TYPE(int64_t, int64_t); +CUDA_ACC_TYPE(bool, bool); +CUDA_ACC_TYPE(c10::complex, c10::complex); +CUDA_ACC_TYPE(c10::complex, c10::complex); +CUDA_ACC_TYPE(c10::complex, c10::complex); + +CPU_ACC_TYPE(BFloat16, float); +CPU_ACC_TYPE(Half, float); +CPU_ACC_TYPE(Float8_e5m2, float); +CPU_ACC_TYPE(Float8_e4m3fn, float); +CPU_ACC_TYPE(Float8_e5m2fnuz, float); +CPU_ACC_TYPE(Float8_e4m3fnuz, float); +CPU_ACC_TYPE(float, double); +CPU_ACC_TYPE(double, double); +CPU_ACC_TYPE(int8_t, int64_t); +CPU_ACC_TYPE(uint8_t, int64_t); +CPU_ACC_TYPE(char, int64_t); +CPU_ACC_TYPE(int16_t, int64_t); +CPU_ACC_TYPE(int32_t, int64_t); +CPU_ACC_TYPE(int64_t, int64_t); +CPU_ACC_TYPE(bool, bool); +CPU_ACC_TYPE(c10::complex, c10::complex); +CPU_ACC_TYPE(c10::complex, c10::complex); +CPU_ACC_TYPE(c10::complex, c10::complex); + +TORCH_API c10::ScalarType toAccumulateType( + c10::ScalarType type, + c10::DeviceType device); +TORCH_API c10::ScalarType toAccumulateType(c10::ScalarType type, bool is_cuda); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Backend.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Backend.h new file mode 100644 index 0000000000000000000000000000000000000000..9651469e190085d913ba9b5d1ca02085886fc4e1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Backend.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFixedAllocator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFixedAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..cf621f34cc63735d7f7557f48146bb76467b8afc --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFixedAllocator.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + +// This file creates a fake allocator that just throws exceptions if +// it is actually used. + +// state passed to the allocator is the std::function called +// when the blob is release by ATen + +namespace at { + +static cpu_fixed_malloc(void*, ptrdiff_t) { + AT_ERROR("attempting to resize a tensor view of an external blob"); +} + +static cpu_fixed_realloc(void*, void*, ptrdiff_t) { + AT_ERROR("attempting to resize a tensor view of an external blob"); +} + +static cpu_fixed_free(void* state, void* allocation) { + auto on_release = static_cast*>(state); + (*on_release)(allocation); + delete on_release; +} + +static Allocator CPU_fixed_allocator = { + cpu_fixed_malloc, + cpu_fixed_realloc, + cpu_fixed_free}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CollapseDims.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CollapseDims.h new file mode 100644 index 0000000000000000000000000000000000000000..4e25112e7d4490096e6340184a3a4813511f93b6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CollapseDims.h @@ -0,0 +1,94 @@ +#include +#include + +namespace at { + +/* +[collapse dims] Updates sizes, and strides to reflect a "collapse" of +the info, possibly excluding the optional excludeDim. A "collapsed" version +of the info is the fewest dims that order the tensor's elements in the same +way as the original info. If excludeDim is specified, the collapse is the +fewest dims that order the tensor's elements as the original and preserve the +excluded dimension, unless the tensor collapses to a point. + +This function returns a pair of values. + +1) The (new) index of the preserved dimension if excludeDim is +specified. 0 if the tensor is collapsed to a point. -1 +otherwise. + +2) The new number of dimensions. +*/ +template +inline std::pair collapse_dims( + T* sizes, + T* strides, + int64_t dims, + const int excludeDim = -1) { + TORCH_CHECK( + excludeDim >= -1 && excludeDim < dims, + "expected excluded dim between -1 and dims - 1"); + + int64_t stopDim = (excludeDim == -1) ? dims : excludeDim; + int64_t newIndex = -1; + int64_t oldIndex = 0; + int64_t remappedExcludedDim = -1; + + while (oldIndex < dims) { + // Finds a dimension to collapse into + for (; oldIndex < stopDim; ++oldIndex) { + if (sizes[oldIndex] == 1) { + continue; + } + + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + ++oldIndex; + break; + } + + // Collapses dims + for (; oldIndex < stopDim; ++oldIndex) { + if (sizes[oldIndex] == 1) { + continue; + } + + if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) { + sizes[newIndex] *= sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + } else { + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + } + } + + // Handles excludeDim being set (oldIndex == excludeDim) + if (oldIndex != dims) { + // Preserves excluded dimension + ++newIndex; + sizes[newIndex] = sizes[oldIndex]; + strides[newIndex] = strides[oldIndex]; + remappedExcludedDim = newIndex; + + // Restarts iteration after excludeDim + ++oldIndex; + stopDim = dims; + } + } + + // Handles special case of all dims size 1 + if (newIndex == -1 || (newIndex == 0 && sizes[0] == 1)) { + dims = 1; + sizes[0] = 1; + strides[0] = 1; + + return std::pair(0, 1); + } + + dims = newIndex + 1; + return std::pair(remappedExcludedDim, dims); +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..fde0a471ac0135f1dcb55f78e10d0818c5cff2e2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions.h @@ -0,0 +1,29 @@ +#include + +// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch] +// Code introduced to avoid cyclic dependency in static dispatch is no longer +// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place, +// to Operators.cpp for supporting multiple backends with multiple kernels. +// +// Note [Avoiding Include Cycles In Static Dispatch] +// In order to avoid #include cycles in the static dispatch build, we've carefully split out +// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h. +// +// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h. +// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods +// all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all +// directly inlined into TensorBody.h. +// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API, +// which include functions that have defaultable optional arguments. +// That requires knowing the full Tensor class definition. +// +// We break the cycle by doing the following: +// - Split out CPUFunction.h into two files: CPUFunctions.h and CPUFunctions_inl.h +// - CPUFunction.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl., +// - CPUFunctions_inl.h includes everything else +// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class, +// and then it includes CPUFunctions_inl.h. +// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly. +// - This also means that static dispatch build, CPUFunctions.h only needs to +// #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h. +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h new file mode 100644 index 0000000000000000000000000000000000000000..90ffa6b1eb4a9cc5d64851784113a739e385a77e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS) +#error This change adds a dependency on all pytorch operators, meaning the \ + file will need to be re-compiled every time an operator is changed or added. \ + Consider including a specific operator from \ + . \ + See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. +#endif + +#include +#include +#include +#include + + + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..35ba9f5284376f0d873b33b3f75f7d0608b0dae7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h @@ -0,0 +1,808 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include // For CUDA_VERSION +#endif + +#ifdef TEMPLATE_SELECTIVE_BUILD +#include +#else +namespace at { +/** + * The method should_include_kernel_dtype() returns true/false + * based on whether the switching code for a specific dtype should be + * included based on build time constants generated from tracing model + * execution. This method will be implmeneted via code-generation and + * included in this file when code-gen is ready. + */ +inline constexpr bool should_include_kernel_dtype( + const char* /*kernel_tag_str*/, + at::ScalarType /*scalar_type*/ +) { + return true; +} +} // namespace at +#endif + +/** + * In the Facebook internal build (using BUCK), this macro is enabled by + * passing in -c pt.enable_record_kernel_dtype=1 when building the tracer + * binary. + */ +#if defined ENABLE_RECORD_KERNEL_FUNCTION_DTYPE +namespace at { +namespace detail { +TORCH_API void record_kernel_function_dtype(std::string name); +} +} // namespace at + +#define RECORD_KERNEL_FUNCTION_DTYPE(NAME, enum_type) \ + at::detail::record_kernel_function_dtype( \ + std::string(NAME) + "$" + toString(enum_type)); +#else +#define RECORD_KERNEL_FUNCTION_DTYPE(NAME, enum_type) +#endif + +#define AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type) \ + do { \ + if constexpr (!at::should_include_kernel_dtype( \ + at_dispatch_name, enum_type)) { \ + AT_ERROR( \ + "dtype '", \ + toString(enum_type), \ + "' not selected for kernel tag ", \ + at_dispatch_name); \ + } \ + } while (0) + +#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \ + case enum_type: { \ + AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \ + using HINT C10_UNUSED = c10::impl::ScalarTypeToCPPTypeT; \ + return __VA_ARGS__(); \ + } + +#define AT_DISPATCH_CASE(enum_type, ...) \ + AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__) + +#define AT_DISPATCH_CASE_QINT(enum_type, scalar_type, ...) \ + case enum_type: { \ + AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \ + using scalar_t = scalar_type; \ + using underlying_t C10_UNUSED = typename scalar_t::underlying; \ + const auto& SCALAR_TYPE C10_UNUSED = enum_type; \ + const auto& UNDERLYING_TYPE C10_UNUSED = toUnderlying(enum_type); \ + return __VA_ARGS__(); \ + } + +#define AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + enum_type, scalar_type, bitwidth, qmin, qmax, ...) \ + case enum_type: { \ + AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \ + using scalar_t = scalar_type; \ + using underlying_t C10_UNUSED = typename scalar_t::underlying; \ + const auto& SCALAR_TYPE C10_UNUSED = enum_type; \ + const auto& UNDERLYING_TYPE C10_UNUSED = toUnderlying(enum_type); \ + C10_UNUSED int bit_width = bitwidth; \ + C10_UNUSED int64_t quant_min = qmin; \ + C10_UNUSED int64_t quant_max = qmax; \ + return __VA_ARGS__(); \ + } + +namespace detail { + +inline at::ScalarType scalar_type(at::ScalarType s) { + return s; +} + +C10_DEPRECATED_MESSAGE( + "passing at::DeprecatedTypeProperties to an AT_DISPATCH macro is deprecated, " + "pass an at::ScalarType instead") +inline at::ScalarType scalar_type(const at::DeprecatedTypeProperties& t) { + return t.scalarType(); +} + +C10_DEPRECATED_MESSAGE( + "AT_DISPATCH_ALL_TYPES_AND_HALF is deprecated, " + "use AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, ...) instead") +inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF() {} + +C10_DEPRECATED_MESSAGE( + "AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX is deprecated, " + "use AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, ...) " + "instead") +inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} + +} // namespace detail + +// The AT_DISPATCH_* family of macros provides the ability to +// conveniently generate specializations of a kernel over all of the +// dtypes we care about in PyTorch. We call it "dispatch" because +// we are "dispatching" to the correct, dtype-specific kernel. +// +// A standard usage looks like: +// +// AT_DISPATCH_ALL_TYPES(self.scalar_type(), "op_name", [&] { +// // Your code here, with 'scalar_t' now defined to +// // be the dtype in question +// }); +// +// There are many variations of this macro, so it's important to +// understand exactly /which/ dtypes you want to get instantiated, as +// well as what the "default" set is. +// +// The default set of dtypes that are instantiated (e.g., by +// AT_DISPATCH_ALL_TYPES) are floating point types (float, double), +// and integral types (int32_t, int64_t, int16_t, int8_t, uint8_t), +// but NOT booleans (bool), half-precision floats (Half) or +// complex number (c10::complex, c10::complex). +// This "cut" is somewhat historical (the default types are the +// ones that TH historically supported), but it also reflects the +// fact that the non-default types are "poorly" behaved (booleans +// are NOT integers mod 2, half precision operations ~essentially +// don't exist on CPU, complex numbers are an experimental application). +// +// Here are the questions you should generally ask to decide which +// dispatch you want: +// +// 1. Is this an integral or floating point specific operation? +// (If so, you'll want one of the FLOATING or INTEGRAL macros.) +// +// 2. Should half be supported? (If you're on CPU, the answer is almost +// definitely no. If you do want support, use one of the AND_HALF +// macros) +// +// Much rarer situations: +// +// 3. Should bool be supported? (You often have to write your kernel +// differently if arithmetic operations are involved.) If so, +// Use AT_DISPATCH_ALL_TYPES_AND along with ScalarType::Bool +// +// 4. Should complex be supported? The answer is almost always no, +// unless you are working on "generic" code that should work on +// all dtypes. +// +// Parameters: +// ----------- +// +// 1. The NAME argument is a "tag" that is used to trace and then +// conditionally compile fragments of the case statements such +// that the kernel functions are specialized only for the dtypes +// that are needed. The NAME parameter *must* be a build time +// const char* (can't be std::string, etc...) +// +// Please ensure that the NAME is unique for every implementation +// or you run the risk of over-including code for the kernel +// functions. There is no risk of missing out on any code, so +// it's mostly a risk of a Type-2 error, and not a Type-1 error. +// +// Switch-like syntax: +// ------------------- +// There is also a switch-case like syntax which is useful if a kernel +// needs to be specialized for particular scalar types +// +// AT_DISPATCH_SWITCH(self.scalar_type(), "op_name", +// AT_DISPATCH_CASE_INTEGRAL_TYPES([&] { +// op_integral(iter); +// }) +// AT_DISPATCH_CASE_FLOATING_TYPES([&] { +// op_floating(iter); +// }) +// AT_DISPATCH_CASE(kBool, [&] { +// op_bool(iter); +// }) +// ); +// +// For each AT_DISPATCH_FOO macro, there is a corresponding +// AT_DISPATCH_CASE_FOO macro which can be used inside of an +// AT_DISPATCH_SWITCH block. + +// NB: the the_type variable is not used, but we have kept it for +// backwards compatibility. It's probably not used by anyone though; +// but we're just being safe (and it doesn't hurt.) Note we must +// use it to shut up warnings about unused store. + +#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + constexpr const char* at_dispatch_name = NAME; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st); \ + switch (_st) { \ + __VA_ARGS__ \ + default: \ + AT_ERROR( \ + '"', \ + at_dispatch_name, \ + "\" not implemented for '", \ + toString(_st), \ + "'"); \ + } \ + }() + +#define AT_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_TYPES_AND_HALF(...) \ + AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES_AND_HALF(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_REDUCED_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define AT_DISPATCH_REDUCED_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, AT_DISPATCH_CASE_REDUCED_FLOATING_TYPES(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_TYPES_AND(SCALARTYPE, ...) \ + AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_FLOATING_TYPES_AND(SCALARTYPE, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, ...) \ + AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_TYPES_AND2( \ + SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_FLOATING_TYPES_AND2( \ + SCALARTYPE1, SCALARTYPE2, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \ + AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_FLOATING_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_TYPES_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \ + AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_TYPES_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_FLOATING_TYPES_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_COMPLEX_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::ComplexDouble, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::ComplexFloat, __VA_ARGS__) + +#define AT_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_COMPLEX_TYPES_AND(SCALARTYPE, ...) \ + AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__) + +#define AT_DISPATCH_COMPLEX_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, AT_DISPATCH_CASE_COMPLEX_TYPES_AND(SCALARTYPE, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(...) \ + AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__) + +#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND1(SCALARTYPE, ...) \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( \ + SCALARTYPE, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND1( \ + SCALARTYPE, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND2( \ + SCALARTYPE1, SCALARTYPE2, ...) \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( \ + SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND2( \ + SCALARTYPE1, SCALARTYPE2, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND5( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + TYPE, \ + NAME, \ + ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND5( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + __VA_ARGS__)) + +#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND6( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + ...) \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE6, __VA_ARGS__) + +#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND6( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + TYPE, \ + NAME, \ + ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND6( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + __VA_ARGS__)) + +#define AT_DISPATCH_CASE_INTEGRAL_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) + +#define AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_INTEGRAL_TYPES_AND(SCALARTYPE, ...) \ + AT_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__) + +#define AT_DISPATCH_INTEGRAL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_INTEGRAL_TYPES_AND(SCALARTYPE, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES(...) \ + AT_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_QINT_TYPES(...) \ + AT_DISPATCH_CASE_QINT(at::kQInt8, at::qint8, __VA_ARGS__) \ + AT_DISPATCH_CASE_QINT(at::kQUInt8, at::quint8, __VA_ARGS__) \ + AT_DISPATCH_CASE_QINT(at::kQInt32, at::qint32, __VA_ARGS__) + +#define AT_DISPATCH_QINT_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_QINT_TYPES(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_QINT_TYPES_AND(SCALARTYPE, ...) \ + AT_DISPATCH_CASE_QINT_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__) + +#define AT_DISPATCH_QINT_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, AT_DISPATCH_CASE_QINT_TYPES_AND(SCALARTYPE, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_QINT_BYTE_TYPES(...) \ + AT_DISPATCH_CASE_QINT(at::kQInt8, at::qint8, __VA_ARGS__) \ + AT_DISPATCH_CASE_QINT(at::kQUInt8, at::quint8, __VA_ARGS__) + +#define AT_DISPATCH_QINT_BYTE_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_QINT_BYTE_TYPES(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_QINT_AND_SUB_BYTE_TYPES(...) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQInt8, at::qint8, CHAR_BIT, SCHAR_MIN, SCHAR_MAX, __VA_ARGS__) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQUInt8, at::quint8, CHAR_BIT, 0, UCHAR_MAX, __VA_ARGS__) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQInt32, \ + at::qint32, \ + CHAR_BIT * sizeof(int), \ + INT_MIN, \ + INT_MAX, \ + __VA_ARGS__) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQUInt4x2, at::quint4x2, 4, 0, 15, __VA_ARGS__) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQUInt2x4, at::quint2x4, 2, 0, 3, __VA_ARGS__) + +#define AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, AT_DISPATCH_CASE_QINT_AND_SUB_BYTE_TYPES(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(...) \ + AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND(SCALARTYPE, ...) \ + AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, AT_DISPATCH_CASE_ALL_TYPES_AND(SCALARTYPE, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, ...) \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, ...) \ + AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND2( \ + SCALARTYPE1, SCALARTYPE2, ...) \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( \ + SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND2( \ + SCALARTYPE1, SCALARTYPE2, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \ + AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND5( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + TYPE, \ + NAME, \ + ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND5( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND6( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + ...) \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE6, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND6( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + TYPE, \ + NAME, \ + ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND6( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND7( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + SCALARTYPE7, \ + ...) \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE6, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE7, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND7( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + SCALARTYPE7, \ + TYPE, \ + NAME, \ + ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND7( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + SCALARTYPE7, \ + __VA_ARGS__)) + +#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND8( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + SCALARTYPE7, \ + SCALARTYPE8, \ + ...) \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE6, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE7, __VA_ARGS__) \ + AT_DISPATCH_CASE(SCALARTYPE8, __VA_ARGS__) + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND8( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + SCALARTYPE7, \ + SCALARTYPE8, \ + TYPE, \ + NAME, \ + ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND8( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + SCALARTYPE7, \ + SCALARTYPE8, \ + __VA_ARGS__)) + +#define AT_DISPATCH_CASE_BIT_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Bits1x8, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Bits2x4, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Bits4x2, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Bits8, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Bits16, __VA_ARGS__) + +#define AT_DISPATCH_BIT_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_BIT_TYPES(__VA_ARGS__)) + +#define AT_DISPATCH_INDEX_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_PRIVATE_CASE_TYPE_USING_HINT( \ + at::ScalarType::Int, index_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE_USING_HINT( \ + at::ScalarType::Long, index_t, __VA_ARGS__)) + +// ---------------------------------------------------------------------------- +// DEPRECATED MACROS, DON'T USE THESE +// ---------------------------------------------------------------------------- + +#define AT_DISPATCH_ALL_TYPES_AND_HALF(TYPE, NAME, ...) \ + detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF(); \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE_ALL_TYPES_AND(at::ScalarType::Half, __VA_ARGS__)) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch_v2.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch_v2.h new file mode 100644 index 0000000000000000000000000000000000000000..e0764834c02fdea27dd56cd08a283f772bcd7938 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch_v2.h @@ -0,0 +1,186 @@ +#include + +// This is a new implementation of the AT_DISPATCH macro family from +// ATen/Dispatch.h +// +// The intended usage is: +// +// ScalarType scalar_type; +// +// AT_DISPATCH_V2( +// scalar_type, +// "debug string", +// AT_WRAP([&] { +// ... code to specialize with scalar_t ... +// }), +// kHalf, +// AT_EXPAND(AT_ALL_TYPES), +// ... as many types arguments as needed ... +// ) +// +// For example, given an old style: +// +// AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( +// kComplexHalf, +// kHalf, +// self.scalar_type(), +// "_local_scalar_dense_cpu", +// [&] { +// scalar_t value = *self.data_ptr(); +// r = Scalar(value); +// } +// ) +// +// You now write: +// +// AT_DISPATCH_V2( +// self.scalar_type(), +// "_local_scalar_dense_cpu", +// AT_WRAP([&] { +// scalar_t value = *self.data_ptr(); +// r = Scalar(value); +// }), +// AT_EXPAND(AT_ALL_TYPES), +// AT_EXPAND(AT_COMPLEX_TYPES), +// kComplexHalf, +// kHalf, +// ) +// +// Notably, it sports the following improvements: +// +// - It is not necessary to specify the arity (e.g., +// AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND{2,3,4,...}) +// when using the macro +// +// - It is not necessary to specify each dtype individually; if +// there is a set of related dtypes and you want to dispatch +// over all of them, you can simply say, e.g., AT_EXPAND(AT_INTEGRAL_TYPES) +// in your argument list. +// +// However, you must remember to wrap the payload body in AT_WRAP, or commas +// inside your lambda will be improperly handled. Furthermore, if you more +// entries to ScalarType than can be supported by this macro, it will fail +// with an obscure error (due to attempting to concatenate AT_AP with +// something that is not a number). +// +// The implementation strategy is to use the count arguments trick +// (e.g., as described in https://stackoverflow.com/a/2124385/23845) +// to discover how many dtypes have been passed, and then dispatch to a +// hand-written macro for each arity that applies as many DISPATCH_CASE as +// necessary. The hand-written macros can be regenerated for other arities +// with the script below. +// +// There is some delicacy in the implementation in controlling when +// macro expansion occurs, mediated with AT_EXPAND and AT_GUARD. I mostly +// relied on GPT4 to help me get it right. + +// Public API macros + +// See documentation above +#define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__)) + +// This macro lets you pass an arbitrary expression that may contain internal +// commas to another macro without having the commas causing the expression +// to be interpreted as being multiple arguments +#define AT_WRAP(...) __VA_ARGS__ + +#define AT_FLOAT8_TYPES \ + c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \ + c10::kFloat8_e4m3fnuz + +#define AT_INTEGRAL_TYPES \ + c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort +#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat +#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64 +#define AT_INTEGRAL_TYPES_V2 \ + AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES) +#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat +#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32 +// NB: not *actually* all types +#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES) +#define AT_ALL_TYPES_AND_COMPLEX \ + AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES) + +// Helper macros + +#define AT_AP_VAR(N, T, ...) \ + AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__)) +#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b) +#define AT_CONCAT_AUX(a, b) a##b +#define AT_EXPAND(X) X + +// Ensure we never have too many scalar types for the expansion here to +// support. To bump this, you must regenerate the macros below. +static_assert(static_cast(c10::ScalarType::NumOptions) < 45); + +// Python code to regenerate generate code below: +#if 0 + +num_args = 45 + +nums = ', '.join(str(i) for i in reversed(range(num_args+1))) +args = ', '.join(f'_{i}' for i in range(1, num_args+1)) + +print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))') +print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) N') + +for i in range(1, num_args+1): + args = ', '.join(f'_{i}' for i in range(1, i+1)) + cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)]) + print(f'#define AT_AP{i}(N, {args}) {cases}') + +#endif + +// Begin generated code +// clang-format off + +#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) +#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, N, ...) N +#define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N) +#define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) +#define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) +#define AT_AP4(N, _1, _2, _3, _4) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) +#define AT_AP5(N, _1, _2, _3, _4, _5) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) +#define AT_AP6(N, _1, _2, _3, _4, _5, _6) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) +#define AT_AP7(N, _1, _2, _3, _4, _5, _6, _7) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) +#define AT_AP8(N, _1, _2, _3, _4, _5, _6, _7, _8) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) +#define AT_AP9(N, _1, _2, _3, _4, _5, _6, _7, _8, _9) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) +#define AT_AP10(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) +#define AT_AP11(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) +#define AT_AP12(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) +#define AT_AP13(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) +#define AT_AP14(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) +#define AT_AP15(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) +#define AT_AP16(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) +#define AT_AP17(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) +#define AT_AP18(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) +#define AT_AP19(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) +#define AT_AP20(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) +#define AT_AP21(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) +#define AT_AP22(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) +#define AT_AP23(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) +#define AT_AP24(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) +#define AT_AP25(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) +#define AT_AP26(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) +#define AT_AP27(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) +#define AT_AP28(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) +#define AT_AP29(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) +#define AT_AP30(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) +#define AT_AP31(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) +#define AT_AP32(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) +#define AT_AP33(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) +#define AT_AP34(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) +#define AT_AP35(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) +#define AT_AP36(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) +#define AT_AP37(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) +#define AT_AP38(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) +#define AT_AP39(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) +#define AT_AP40(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) +#define AT_AP41(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) +#define AT_AP42(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) +#define AT_AP43(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) +#define AT_AP44(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) +#define AT_AP45(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) +// End generated code +// clang-format on diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/EmptyTensor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/EmptyTensor.h new file mode 100644 index 0000000000000000000000000000000000000000..5f8681ce37f960b953e6d8dcc50c657c69f1c536 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/EmptyTensor.h @@ -0,0 +1,160 @@ +#pragma once +#include + +namespace at::detail { + +inline void check_size_nonnegative(ArrayRef size) { + for (const auto& x : size) { + TORCH_CHECK( + x >= 0, + "Trying to create tensor with negative dimension ", + x, + ": ", + size); + } +} + +inline void check_size_nonnegative(ArrayRef size) { + for (const auto& x : size) { + TORCH_CHECK( + x.expect_size(__FILE__, __LINE__), + "Trying to create tensor with negative dimension ", + x, + ": ", + size); + } +} + +TORCH_API size_t computeStorageNbytesContiguous( + IntArrayRef sizes, + size_t itemsize, + size_t storage_offset = 0); +TORCH_API SymInt computeStorageNbytesContiguous( + SymIntArrayRef sizes, + const SymInt& itemsize, + const SymInt& storage_offset = 0); +TORCH_API size_t computeStorageNbytes( + IntArrayRef sizes, + IntArrayRef strides, + size_t itemsize, + size_t storage_offset = 0); +TORCH_API SymInt computeStorageNbytes( + SymIntArrayRef sizes, + SymIntArrayRef strides, + const SymInt& itemsize, + const SymInt& storage_offset = 0); + +TORCH_API TensorBase empty_generic( + IntArrayRef size, + c10::Allocator* allocator, + c10::DispatchKeySet ks, + ScalarType scalar_type, + c10::optional memory_format_opt); + +TORCH_API TensorBase empty_strided_generic( + IntArrayRef size, + IntArrayRef stride, + c10::Allocator* allocator, + c10::DispatchKeySet ks, + ScalarType scalar_type); + +TORCH_API TensorBase empty_strided_symint_generic( + SymIntArrayRef size, + SymIntArrayRef stride, + c10::Allocator* allocator, + c10::DispatchKeySet ks, + ScalarType scalar_type); + +TORCH_API TensorBase empty_cpu( + IntArrayRef size, + ScalarType dtype, + bool pin_memory = false, + c10::optional memory_format_opt = c10::nullopt); + +TORCH_API TensorBase empty_cpu( + IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt); + +TORCH_API TensorBase empty_cpu(IntArrayRef size, const TensorOptions& options); + +TORCH_API TensorBase empty_strided_cpu( + IntArrayRef size, + IntArrayRef stride, + ScalarType dtype, + bool pin_memory = false); + +TORCH_API TensorBase empty_strided_cpu( + IntArrayRef size, + IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt); + +TORCH_API TensorBase empty_strided_cpu( + IntArrayRef size, + IntArrayRef stride, + const TensorOptions& options); + +TORCH_API TensorBase empty_meta( + IntArrayRef size, + ScalarType dtype, + c10::optional memory_format_opt = c10::nullopt); + +TORCH_API TensorBase empty_meta( + IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt); + +TORCH_API TensorBase empty_symint_meta( + SymIntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt); + +TORCH_API TensorBase empty_meta(IntArrayRef size, const TensorOptions& options); + +TORCH_API TensorBase +empty_strided_meta(IntArrayRef size, IntArrayRef stride, ScalarType dtype); + +TORCH_API TensorBase empty_strided_meta( + IntArrayRef size, + IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt); + +TORCH_API TensorBase empty_strided_meta( + IntArrayRef size, + IntArrayRef stride, + const TensorOptions& options); + +TORCH_API TensorBase empty_strided_symint_meta( + SymIntArrayRef size, + SymIntArrayRef stride, + ScalarType dtype); + +TORCH_API TensorBase empty_strided_symint_meta( + SymIntArrayRef size, + SymIntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt); + +TORCH_API TensorBase empty_strided_symint_meta( + SymIntArrayRef size, + SymIntArrayRef stride, + const TensorOptions& options); + +} // namespace at::detail diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ExpandBase.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ExpandBase.h new file mode 100644 index 0000000000000000000000000000000000000000..8db6be6a643c8cb60cab8487478f9a2f0c817d8b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ExpandBase.h @@ -0,0 +1,30 @@ +#include + +// Broadcasting utilities for working with TensorBase +namespace at { +namespace internal { +TORCH_API TensorBase expand_slow_path(const TensorBase& self, IntArrayRef size); +} // namespace internal + +inline c10::MaybeOwned expand_size( + const TensorBase& self, + IntArrayRef size) { + if (size.equals(self.sizes())) { + return c10::MaybeOwned::borrowed(self); + } + return c10::MaybeOwned::owned( + at::internal::expand_slow_path(self, size)); +} +c10::MaybeOwned expand_size(TensorBase&& self, IntArrayRef size) = + delete; + +inline c10::MaybeOwned expand_inplace( + const TensorBase& tensor, + const TensorBase& to_expand) { + return expand_size(to_expand, tensor.sizes()); +} +c10::MaybeOwned expand_inplace( + const TensorBase& tensor, + TensorBase&& to_expand) = delete; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/FunctionalTensorWrapper.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/FunctionalTensorWrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..6291b2743459625144e7411b0ebf2c4f0bfaa217 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/FunctionalTensorWrapper.h @@ -0,0 +1,408 @@ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { + +// Note [Functionalization Pass In Core] +// The Functionalization pass is used to remove aliasing from a pytorch program. +// +// This is useful for backends that don't support aliasing, like XLA and Vulkan. +// It's also necessary in order to remove mutation from a program, which is +// needed in Functorch. +// +// Consider this program: +// a = torch.ones(...) +// b = a.view(...) +// b.add_(1) +// +// In this program, b is meant to alias with a due to the use of view(). At the +// end of the program, both a and b are full of 2's. However, backends that +// don't support aliasing aren't able to correctly implement the view() +// operator. Instead, they can opt into the Functionalization pass, which will +// sit between the user and the backend, and provide the necessary aliasing +// logic. +// +// The functionalization pass will turn the above program into a slightly +// different program that has the same semantics, transparently to the user, +// that backends like XLA/Vulkan are able to implement a = torch.ones(...) b = +// a.view_copy(...) # view() replaced with view_copy(). Backends like +// XLA/Vulkan can implement this! b.add_(1) a.add_(1) # Our functionalization +// pass machinery knows that a and b are aliased - it applies b's mutation to a +// too. +// +// So, how does the functionalization pass keep track of which tensors are +// aliased? The pass works by wrapping EVERY tensor in the program inside of a +// FunctionalTensorWrapper, which knows about its alias'd tensors. +// +// See Note [Functionalization: Alias Removal] for details on the aliasing +// machinery. See Note [Functionalization: Mutation Removal] for details on +// mutation removal. +struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { + explicit FunctionalTensorWrapper(const Tensor& value); + // Additional constructor to create a FunctionalTensorWrapper directly from an + // underlying tensor that was created from a view. For example, the code b = + // a.view1() will generate a constructor call to FunctionalTensorWrapper(b, a, + // view1_meta) + explicit FunctionalTensorWrapper( + const Tensor& view_value, + const FunctionalTensorWrapper* base, + const functionalization::ViewMeta& meta); + + // Get the underlying, actual tensor, that doesn't know anything about + // functionalization. + const Tensor& value() const { + return value_; + }; + // The concept of "level" is only ever important to functorch; it's exposed + // here as more of a hook for functorch to use. + int64_t level() const { + return level_; + }; + void set_level(int64_t level) { + level_ = level; + } + bool has_metadata_mutation() const { + return has_metadata_mutation_; + }; + + // Denotes a mutation that's hidden from autograd, + // e.g. for the purposes of passing a tensor to a triton kernel + void mark_mutation_hidden_from_autograd() { + mutation_hidden_from_autograd_counter_++; + } + void mark_mutation_during_no_grad_or_inference_mode() { + mutation_during_no_grad_or_inference_mode_++; + } + // Are all the mutations happening to the tensor hidden from autograd + bool are_all_mutations_hidden_from_autograd() const { + return mutation_hidden_from_autograd_counter_ == mutation_counter_; + } + // Did all mutations happen under no_grad or inference_mode + // (We also need to ignore mutations fully hidden from autograd here) + bool are_all_mutations_under_no_grad_or_inference_mode() const { + return mutation_hidden_from_autograd_counter_ + + mutation_during_no_grad_or_inference_mode_ == + mutation_counter_; + } + + // Sync's the underlying tensor with its alias, if it's out of date. This + // involves two steps: 1) Apply any pending updates/mutations to the alias 2) + // Replay the views (if any) to regenerate the current tensor off of the + // updated alias. + void sync_(); + // Performs step (1) of the sync. This is its own public API because it's + // needed by view_inplace ops like transpose_. See Note [Functionalization + // Pass - Inplace View Ops] + void regenerate_from_base(); + // Performs step (2) of the sync. This is its own public API because it's + // needed by functorch. functorch wants to make sure that all input tensors to + // a functionalized program have been properly synced so it can properly + // propagate mutations to inputs. It can't just call sync_(), because the + // FunctionalTensorWrapper will look like it has no aliases and sync_ will be + // a noop. We use the reference count on storage_ to determine if the wrapper + // is aliased, and by the time functorch is ready to propagate updates to + // inputs, any intermediate views of the input created by the program will + // have been deallocated. This function also returns whether or not the base + // actually had any updates to apply. + bool apply_updates(); + // Takes the current state of value_ and snapshots it, sending it as a pending + // update to the alias. + void commit_update(); + // When any tensor is mutated, the tensor increments its alias's "generation". + // Separately, each tensor maintains its own "generation" counter, which is + // used to determine if it's up-to-date with its alias. The act of syncing a + // tensor will set a tensor's generation equal to its alias's generation. + bool is_up_to_date() const; + // Freezes the storage of this tensor, preventing subsequent mutations + void freeze_storage() const; + // Every FunctionalTensorWrapper contains a vector objects + // describing the series of view ops that ran to generate the current tensor + // from the base tensor. This method is used by inplace-view ops like + // transpose_. It appends a ViewMeta to the existing stack, and refreshes the + // tensor by replaying the views off of the alias. + void mutate_view_meta(const at::functionalization::ViewMeta& meta); + + // Custom implementation of self.set_(src) + void set__impl(const FunctionalTensorWrapper* other); + + // Returns whether the current tensor's data was ever mutated + bool has_data_mutation(); + // + // Returns whether the current FunctionalTensorWrapper + // experienced a set_() call. + bool was_storage_changed() { + return was_storage_changed_; + } + + // The functionalization pass can be used to remove mutations. + // It does so by replacing any mutation op with it's corresponding + // out-of-place op, followed by a call to replace_(). e.g: + // + // a.add_(1) + // + // will turn into: + // + // tmp = a.add(1) + // a.replace_(tmp) + // + // replace_() swaps out the wrapped tensor, value_, with tmp. + void replace_(const Tensor& other); + + bool is_multi_output_view() { + return is_multi_output_view_; + } + + // See Note[resize_() in functionalization pass] + void maybe_replace_storage(const Tensor& other); + + // Replaces the storage with a new functional storage, + // and clears the view_metas_ stack. + // WARNING: Calling this function will sever the aliasing relationship between + // the current FunctionalTensorWrapper and any of its outstanding aliases. + // Please only call if you know what you're doing. + void _unsafe_reset_storage(); + + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override; + + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override; + + ~FunctionalTensorWrapper() override = default; + + // FunctionalTensorWrapper overrides all custom size/stride function, + // so that if the inner tensor has a custom implementation + // we make sure to call that implementation. + at::IntArrayRef sizes_custom() const override; + at::IntArrayRef strides_custom() const override; + int64_t dim_custom() const override; + int64_t numel_custom() const override; + bool is_contiguous_custom(at::MemoryFormat memory_format) const override; + c10::SymIntArrayRef sym_sizes_custom() const override; + c10::SymInt sym_size_custom(int64_t d) const override; + c10::SymIntArrayRef sym_strides_custom() const override; + c10::SymInt sym_storage_offset_custom() const override; + c10::Device device_custom() const override; + + private: + const char* tensorimpl_type_name() const override; + void set_constructor_metadata(); + functionalization::FunctionalStorageImpl* functional_storage_impl() const; + + // This is used to re-implement shallow_copy_and_detach for + // FunctionalTensorWrapper. The implementation is identical, but we just need + // to return a subclass instead of a plain TensorImpl. + // TODO: maybe it's possible to arrange for that to happen automatically + // without an override here? + template + c10::intrusive_ptr shallow_copy_and_detach_core( + VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const; + + void shallow_copy_from(const c10::intrusive_ptr& impl) override; + void copy_tensor_metadata_and_refresh( + const FunctionalTensorWrapper* src_impl, + FunctionalTensorWrapper* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const; + + // Note that value is not taken by reference: internally, the wrapper will + // change the value tensor that it points to over time. + Tensor value_; + int64_t level_{}; + // These two counters are used for identifying + // whether all the mutations on a given tensor are hidden from autograd or + // not. If we have an input mutation that is hidden from autograd, then once + // we convert the input mutation to a copy_() we know it will be safe to hide + // the copy_() from autograd as well. + uint64_t mutation_counter_ = 0; + uint64_t mutation_hidden_from_autograd_counter_ = 0; + uint64_t mutation_during_no_grad_or_inference_mode_ = 0; + bool has_metadata_mutation_ = false; + bool is_multi_output_view_ = false; + // Did the tensor experience a set_() call. + bool was_storage_changed_ = false; + + size_t generation_ = 0; + std::vector view_metas_; + + protected: + static void copy_tensor_metadata( + const FunctionalTensorWrapper* src_impl, + FunctionalTensorWrapper* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change); +}; + +// Utility functions for the functionalization pass. + +namespace functionalization { +namespace impl { + +TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper( + const Tensor& tensor) { + auto functional_impl = + static_cast(tensor.unsafeGetTensorImpl()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_impl != nullptr); + return functional_impl; +} + +TORCH_API bool isFunctionalTensor(const at::Tensor& tensor); +TORCH_API bool isFunctionalTensor(const c10::optional& t); +TORCH_API bool isFunctionalTensor( + const c10::List>& t_list); +TORCH_API bool isFunctionalTensor(ITensorListRef list); + +TORCH_API Tensor to_functional_tensor(const Tensor& tensor); +TORCH_API c10::optional to_functional_tensor( + const c10::optional& tensor); +TORCH_API c10::List> to_functional_tensor( + const c10::List>& t_list); +TORCH_API std::vector to_functional_tensor(ITensorListRef t_list); + +TORCH_API void freeze_functional_tensor(const Tensor& tensor); + +TORCH_API Tensor +from_functional_tensor(const Tensor& tensor, bool assert_functional = true); +TORCH_API c10::optional from_functional_tensor( + const c10::optional& t, + bool assert_functional = true); +TORCH_API c10::List> from_functional_tensor( + const c10::List>& t_list); +TORCH_API std::vector from_functional_tensor(ITensorListRef t_list); + +TORCH_API void sync(const at::Tensor& t); +TORCH_API void sync(const c10::optional& t); +TORCH_API void sync(const c10::List>& t_list); +TORCH_API void sync(ITensorListRef t_list); + +TORCH_API void replace_(const Tensor& functional_tensor, const Tensor& other); +TORCH_API void replace_( + const ITensorListRef functional_tensor, + ITensorListRef other); + +TORCH_API void commit_update(const Tensor& functional_tensor); +TORCH_API void commit_update(ITensorListRef functional_tensor); + +TORCH_API void unsafe_reset_storage(const Tensor& functional_tensor); + +TORCH_API void mark_mutation_hidden_from_autograd( + const Tensor& functional_tensor); + +TORCH_API bool are_all_mutations_hidden_from_autograd( + const Tensor& functional_tensor); + +TORCH_API bool are_all_mutations_under_no_grad_or_inference_mode( + const Tensor& functional_tensor); + +// These two methods are XLA-specific logic and are no-ops +// for the normal functionalization flow. +TORCH_API void propagate_xla_data( + const Tensor& functional_tensor, + const Tensor& other); +TORCH_API void propagate_xla_data( + const ITensorListRef functional_tensor, + ITensorListRef other); + +Tensor create_functional_tensor_with_view_meta( + const Tensor& view_to_wrap, + const Tensor& base, + functionalization::ViewMeta meta, + int64_t out_idx = 0); +std::vector create_functional_tensor_with_view_meta( + ITensorListRef view_to_wrap, + const Tensor& base, + const functionalization::ViewMeta& meta); + +void mutate_view_meta( + const Tensor& self, + const functionalization::ViewMeta& meta); + +void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); +void set_sizes_strides_offset( + const std::vector& outs, + const std::vector& meta_outs); + +// ~~~~~ TLS used in functionalization ~~~~~ + +TORCH_API bool getFunctionalizationReapplyViewsTLS(); +TORCH_API void setFunctionalizationReapplyViewsTLS(bool reapply_views); + +class TORCH_API FunctionalizationReapplyViewsGuard { + public: + FunctionalizationReapplyViewsGuard(bool reapply_views) + : prev_(getFunctionalizationReapplyViewsTLS()) { + setFunctionalizationReapplyViewsTLS(reapply_views); + } + + ~FunctionalizationReapplyViewsGuard() { + setFunctionalizationReapplyViewsTLS(prev_); + } + + FunctionalizationReapplyViewsGuard( + const FunctionalizationReapplyViewsGuard&) = delete; + FunctionalizationReapplyViewsGuard operator=( + const FunctionalizationReapplyViewsGuard&) = delete; + FunctionalizationReapplyViewsGuard(FunctionalizationReapplyViewsGuard&&) = + delete; + FunctionalizationReapplyViewsGuard operator=( + FunctionalizationReapplyViewsGuard&&) = delete; + + private: + bool prev_; +}; + +} // namespace impl + +// Helper function to call an out-of-place composite aten kernel that may use +// mutations / views internally, and functionalize them. +TORCH_API void functionalize_op_helper( + const c10::OperatorHandle& op, + torch::jit::Stack* stack); + +template +struct _functionalize_aten_op final {}; + +template +struct _functionalize_aten_op final { + static ReturnType call( + typename c10::maybe_keep_symint::type... args) { + using FuncType = ReturnType( + typename c10::maybe_keep_symint::type...); + auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow( + (const char*)Op::name, (const char*)Op::overload_name) + .typed(); + + return c10::impl::BoxedKernelWrapper::call( + c10::BoxedKernel::makeFromFunction(), + op, + // BoxedKernelWrapper knows to ignore this keyset argument, + // because functionalize_op_helper doesn't take in a DispatchKeySet + c10::DispatchKeySet(), + args...); + } +}; + +template +using functionalize_aten_op = + _functionalize_aten_op; + +template +using functionalize_aten_op_symint = + _functionalize_aten_op; + +} // namespace functionalization +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Generator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Generator.h new file mode 100644 index 0000000000000000000000000000000000000000..48c25e141dcb8c0264ca9435352889c7a250f74d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Generator.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedFallback.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedFallback.h new file mode 100644 index 0000000000000000000000000000000000000000..beef24a6ed9c5b8373a0db5bcc16f268b8c18726 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedFallback.h @@ -0,0 +1,25 @@ +#pragma once +#include +#include +#include + +namespace at { + +// If an operator doesn't have a batching rule implemented then we fallback +// to this implementation. The fallback only works on out-of-place operators +// that return only tensors with new memory. (e.g., no in-place operators, no +// view operations). +// +// The fallback effectively takes all of the BatchedTensors in `stack`, slices +// them, and runs `op` on all of the corresponding slices to produce slices +// of the outputs. The output slices then get `torch.stack`ed to create the +// final returns. +// +// The performance of the fallback is not very good because it introduces an +// extra copy from stacking the sliced outputs. Because of this, we prefer to +// write batching rules for operators whenever possible. +void batchedTensorForLoopFallback( + const c10::OperatorHandle& op, + torch::jit::Stack* stack); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..098fbf9d6292fdeb0fff5e3786b471b742540723 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h @@ -0,0 +1,160 @@ +#pragma once + +#include + +#include +#include +#include + +namespace at { + +// We assume this in a few other places in the codebase, +// but there isn't a centralized definition. +constexpr int64_t kVmapMaxTensorDims = 64; + +// The valid vmap levels range from [0, 64). This effectively means that we +// support a maximum of 64 nested vmaps. +constexpr int64_t kVmapNumLevels = 64; + +// Store this number of elements of BatchDims on the stack. Most people will +// probably use <= 5 nested vmaps, but adjust this number as necessary. +constexpr int64_t kBatchDimsStackSize = 5; + +// a BatchDim represents a "private" dimension on a Tensor created inside of +// vmap. It is a (level, dim) tuple, with the `dim` indicating which dimension +// is being vmap'ed over and the `level` being an identifier for which vmap +// said dimension was created inside. The `dim` corresponds to a "physical +// dim" - it is a dimension index on the underlying physical tensor that is +// being vmapped over. +struct BatchDim { + BatchDim(int64_t level, int64_t dim) : dim_(dim), level_(level) {} + int64_t dim() const { + return dim_; + } + int64_t level() const { + return level_; + } + + private: + int64_t dim_; + int64_t level_; +}; + +using BatchDims = SmallVector; +using BatchDimsRef = ArrayRef; + +// A BatchedTensorImpl holds an underlying Tensor and a list of BatchDim +// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a +// BatchedTensorImpl. +// +// The batch dimensions are treated as being "private"; they are not +// user-visible. For example, in the following Tensor, +// bt = BatchedTensorImpl(ones(2, 3, 5, 7), [(lvl=1, dim=0), (lvl=2, dim=1)]) +// dimensions 0 and 1 are batch dimensions. +// +// bt.sizes() returns (5, 7); bt.sum(0) performs a reduction over the (public) +// dim 0, which is equivalent to dim 3 in the underlying ones(2, 3, 5, 7) +// tensor. +struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { + explicit BatchedTensorImpl(Tensor value, BatchDims bdims); + + // Returns a reference to BatchDims that represent which dimensions of this + // tensor are private. + BatchDimsRef bdims() const { + return bdims_; + } + + // BatchedTensorImpl wraps a Tensor + const Tensor& value() const { + return value_; + }; + + // Given a public dimension index, return the dimension index in the + // underlying value() tensor. For example, if we have + // bt = BatchedTensorImpl(ones(2, 3, 5, 7), [(lvl=1, dim=0), (lvl=2, + // dim=2)]) + // bt.actualDim(0) -> 1 + // bt.actualDim(1) -> 3 + // bt.actualDim(2) -> Error + int64_t actualDim(int64_t dim, bool wrap_dim = true) const; + + // We have to override this because we opted into CustomStrides + IntArrayRef strides_custom() const override; + // Override a bunch of methods inherited from TensorImpl to return error + // messages. + bool is_contiguous_custom(at::MemoryFormat memory_format) const override; + void set_size(int64_t dim, int64_t new_size) override; + void set_stride(int64_t dim, int64_t new_stride) override; + void set_storage_offset(int64_t storage_offset) override; +#ifdef DEBUG + bool has_storage() const override; +#endif + + private: + // see NOTE: [BatchedTensorImpl levels invariant] + void checkInvariants() const; + const char* tensorimpl_type_name() const override; + + Tensor value_; + + // Note: [BatchedTensorImpl levels invariant] + // There is an invariant that the BatchDims must be stored in increasing + // `level` order. That is, for i < j, bdims_[i].level must be less than + // bdims_[j].level. + BatchDims bdims_; +}; + +// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a +// BatchedTensorImpl. +inline bool isBatchedTensor(const Tensor& tensor) { + return tensor.unsafeGetTensorImpl()->key_set().has(DispatchKey::Batched); +} + +// It is unsafe to call this on a Tensor that is not backed by a +// BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible. +inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) { + return static_cast(tensor.unsafeGetTensorImpl()); +} + +inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) { + if (!isBatchedTensor(tensor)) { + return nullptr; + } + return unsafeGetBatchedImpl(tensor); +} + +// Returns a bitset. If bit i is set, then that means dim i is a batchdim. +inline std::bitset createBatchDimBitset( + BatchDimsRef bdims) { + std::bitset is_bdim; + for (const auto& bdim : bdims) { + is_bdim.set(bdim.dim()); + } + return is_bdim; +} + +// Creates a bitset for all of the levels present in `bdims` +inline std::bitset createVmapLevelsBitset(BatchDimsRef bdims) { + std::bitset result; + for (const auto& bdim : bdims) { + result.set(bdim.level()); + } + return result; +} + +inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) { + out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")"; + return out; +} + +// Use this to construct a BatchedTensor from a regular Tensor +TORCH_API Tensor makeBatched(const Tensor& tensor, BatchDims bdims); + +// Adds a batch dim to `tensor`, returning a BatchedTensor +TORCH_API Tensor addBatchDim(const Tensor& tensor, int64_t level, int64_t dim); + +// Checks if an inplace operation on self and other is "vmap compatible". +// See NOTE: [vmap-incompatible in-place operations] for the definition of this. +TORCH_API bool inplaceIsVmapCompatible(const Tensor& self, const Tensor& other); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MapAllocator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MapAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..f4a30edef623956d5072737336bfca6da5cb2bb4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MapAllocator.h @@ -0,0 +1,139 @@ +#pragma once + +#include +#include + +namespace at { + +enum MappedAllocatorModes { + ALLOCATOR_MAPPED_SHARED = 1, + ALLOCATOR_MAPPED_SHAREDMEM = 2, + ALLOCATOR_MAPPED_EXCLUSIVE = 4, + ALLOCATOR_MAPPED_NOCREATE = 8, + ALLOCATOR_MAPPED_KEEPFD = 16, + ALLOCATOR_MAPPED_FROMFD = 32, + ALLOCATOR_MAPPED_UNLINK = 64 +}; + +// Sentinel value/type to help distinguish the file descriptor constructor from +// the non-file descriptor constructor +enum WithFd { WITH_FD }; + +TORCH_API std::string NewProcessWideShmHandle(); + +class TORCH_API MapAllocator { + public: + MapAllocator(c10::string_view filename, int flags, size_t size); + MapAllocator( + WithFd, + c10::string_view filename, + int fd, + int flags, + size_t size); + MapAllocator(const MapAllocator&) = delete; + MapAllocator& operator=(const MapAllocator&) = delete; + MapAllocator(MapAllocator&&) = delete; + MapAllocator& operator=(MapAllocator&&) = delete; + + const char* filename() const { + return filename_.c_str(); + } + int fd() const { +#ifdef _WIN32 + TORCH_CHECK(false, "MapAllocator::fd() is unsupported on Windows"); +#else + return fd_; +#endif + } + ptrdiff_t size() const { + return size_; + } + // Return a pointer to the actual data for this allocator + // (in the case of the refcounted allocator, this is offset + // from the base pointer.) + virtual void* data() const { + return base_ptr_; + } + + static MapAllocator* fromDataPtr(const at::DataPtr&); + static at::DataPtr makeDataPtr( + c10::string_view filename, + int flags, + size_t size, + size_t* actual_size_out); + static at::DataPtr makeDataPtr( + WithFd, + const char* filename, + int fd, + int flags, + size_t size, + size_t* actual_size_out); + + // Closes the data. Helps us avoid destructor shenanigans + virtual void close(); + + // This is very dangerous. You have to redefine this destructor for each + // subclass + virtual ~MapAllocator(); + + protected: + bool closed_ = false; + std::string filename_; + int flags_ = 0; + ptrdiff_t size_; /* mapped size */ +#ifdef _WIN32 + void* handle_; + void* event_; + std::string eventname_; +#else + int fd_ = -1; +#endif + void* base_ptr_ = nullptr; +}; + +// Base-from-member idiom +struct TORCH_API RefcountedMapAllocatorArgCheck { + RefcountedMapAllocatorArgCheck(int flags); +}; + +class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, + public MapAllocator { + public: + RefcountedMapAllocator(const char* filename, int flags, size_t size); + RefcountedMapAllocator( + WithFd, + const char* filename, + int fd, + int flags, + size_t size); + + static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&); + static at::DataPtr makeDataPtr( + const char* filename, + int flags, + size_t size, + size_t* actual_size_out); + static at::DataPtr makeDataPtr( + WithFd, + const char* filename, + int fd, + int flags, + size_t size, + size_t* actual_size_out); + + void* data() const override; + + void incref(); + int decref(); + void close() override; + + ~RefcountedMapAllocator() override { + RefcountedMapAllocator::close(); + } + + protected: + void checkFlags(); + void initializeAlloc(); +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NamedTensor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NamedTensor.h new file mode 100644 index 0000000000000000000000000000000000000000..a7606b0a668a43800b89755af1371551909b23d5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NamedTensor.h @@ -0,0 +1 @@ +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NestedTensorImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NestedTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..0ad42ae816274117a276a9545e2057b5eb252a6c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NestedTensorImpl.h @@ -0,0 +1,283 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +struct NestedTensorImpl; +inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt); +int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor); + +struct TORCH_API NestedTensorImpl : public c10::TensorImpl { + explicit NestedTensorImpl( + Storage storage, + c10::DispatchKeySet key_set, + const caffe2::TypeMeta data_type, + at::Tensor nested_sizes, + at::Tensor nested_strides, + at::Tensor storage_offsets); + + explicit NestedTensorImpl( + const at::Tensor& buffer, + at::Tensor nested_sizes, + at::Tensor nested_strides, + at::Tensor storage_offsets); + // assume contiguous, `nested_strides` and `offsets` + // can be infered from `nested_sizes` + explicit NestedTensorImpl( + const at::Tensor& buffer, + const at::Tensor& nested_sizes); + + // This constructor is used creating view tensors from nested tensors + explicit NestedTensorImpl( + c10::TensorImpl::ImplType impl_type, + const at::Tensor& base_tensor, + at::Tensor nested_sizes, + at::Tensor nested_strides, + at::Tensor storage_offsets); + + // TODO: don't expose private implementation details like this; in + // particular, resizing this tensor will mess up our dim() and + // callers cannot fix it. + const Tensor& get_nested_sizes() const { + return nested_sizes_; + } + // TODO: don't expose private implementation details like this + const Tensor& get_nested_strides() const { + return nested_strides_; + } + const Tensor& get_storage_offsets() const { + return storage_offsets_; + } + // Returns nullopt if the ith dimension is irregular. The ith dimension + // of a NestedTensor is regular if the unbound tensors match in + // size at the (i-1)th dimension. + c10::optional opt_size(int64_t d) const; + + int64_t size(int64_t d) const { + c10::optional optional_size = this->opt_size(d); + TORCH_CHECK( + optional_size.has_value(), + "Given dimension ", + d, + " is irregular and does not have a size."); + return *optional_size; + } + /** + * Return a view of the nested tensor as a 1 dimensional contiguous tensor. + * + * The buffer tensor created by this function shares the same storage_impl as + * the original nested tensor, and therefore can be seen as a view. + * + * @return A newly constructed view tensor + */ + at::Tensor get_buffer() const { + TORCH_CHECK( + nested_tensor_impl_is_contiguous(this), + "NestedTensor must be contiguous to get buffer."); + return get_unsafe_storage_as_tensor(); + } + /** + * If possible use get_buffer() instead. This function returns the storage + * as a tensor directly, which is not safe to use in general. If using this + * function, The caller must ensure to account for nested_sizes, + * nested_strides and storage_offsets. + * + * @return A newly constructed view tensor + */ + at::Tensor get_unsafe_storage_as_tensor() const { + auto buffer_key_set_ = generate_buffer_key_set(); + const auto buffer_size = get_buffer_size(); + auto buffer_tensor_impl = c10::make_intrusive( + c10::TensorImpl::VIEW, Storage(storage_), buffer_key_set_, data_type_); + buffer_tensor_impl->set_sizes_contiguous( + c10::makeArrayRef(static_cast(buffer_size))); + return Tensor(buffer_tensor_impl); + } + + size_t get_buffer_size() const { + return storage_.nbytes() / data_type_.itemsize(); + } + + protected: + const char* tensorimpl_type_name() const override; + + // TODO: numel_custom and is_contiguous_custom can be profitably overridden + // with real implementations + int64_t numel_custom() const override; + c10::SymInt sym_numel_custom() const override; + bool is_contiguous_custom(MemoryFormat) const override; + int64_t size_custom(int64_t d) const override { + return this->size(d); + } + c10::SymInt sym_size_custom(int64_t d) const override { + return c10::SymInt{this->size(d)}; + } + IntArrayRef sizes_custom() const override; + c10::SymIntArrayRef sym_sizes_custom() const override; + IntArrayRef strides_custom() const override; + c10::SymIntArrayRef sym_strides_custom() const override; + + // this one is real + int64_t dim_custom() const override; + + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override; + + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override; + + void shallow_copy_from(const c10::intrusive_ptr& impl) override { + copy_tensor_metadata( + /*src_impl=*/impl.get(), + /*dest_impl=*/this, + /*version_counter=*/version_counter(), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change()); + } + + private: + // Must be called after any changes to our dim() to sync the state + // to TensorImpl. + void refresh_dim(); + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const at::Tensor nested_sizes_, nested_strides_; + // The starting positions of the underlying tensors in contiguous buffer + // i.e. the buffer memory offsets to get the underlying tensors + // The reason to keep this metadata is that, without strong enough constraint + // it cannot be derived from `nested_sizes_` + // and `nested_strides_`: + // 1. when buffer has blanks, e.g. [tensor1, blank, tensor2] + // this can happen e.g. after slicing a nested tensor + // 2. when multiple tensors share a same memory + // 3. when the nesting ordering is changed, e.g. [tensor1, tensor3, tensor2] + // Some strong enough constraints are: + // 1. every underlying tensor is contiguous in memory + // && nesting in ascending order + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const at::Tensor storage_offsets_; + // NOTE: -1 here means the size is missing + // Optional to allow it to be computed lazily from nested. + // TODO: maybe we can remove this metadata since + // we can compute it from `nested_sizes_` + mutable c10::optional> opt_sizes_; + + template + c10::intrusive_ptr shallow_copy_and_detach_core( + VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const; + + /** + * Generates a non-nested key_set from a nested tensor. + * + * For many nested tensor kernel implementations a buffer tensor + * is generated and redispatched to a non-nested kernel this function + * generates the key set used by that buffer tensor + * + * @return Appropriate key set for non-nested tensor + */ + inline c10::DispatchKeySet generate_buffer_key_set() const { + auto buffer_key_set = this->key_set(); + const bool Autograd = buffer_key_set.has_any(c10::autograd_dispatch_keyset); + // Remove nested tensor specific keys + buffer_key_set = buffer_key_set - + c10::DispatchKeySet{ + c10::DispatchKey::NestedTensor, + c10::DispatchKey::AutogradNestedTensor}; + + // Add dense tensor specific keys + buffer_key_set = + buffer_key_set | c10::DispatchKeySet{c10::DispatchKey::Dense}; + buffer_key_set = Autograd + ? c10::DispatchKeySet{c10::DispatchKey::Autograd} | buffer_key_set + : buffer_key_set; + + return buffer_key_set; + } +}; + +inline NestedTensorImpl* get_nested_tensor_impl_or_null( + const at::Tensor& tensor) { + if (tensor.is_nested()) { + return static_cast(tensor.unsafeGetTensorImpl()); + } + return nullptr; +} + +inline NestedTensorImpl* get_nested_tensor_impl(const at::Tensor& tensor) { + TORCH_CHECK( + tensor.is_nested(), "get_nested_tensor_impl requires a NestedTensor."); + return static_cast(tensor.unsafeGetTensorImpl()); +} + +inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) { + int64_t ntensors = nt->size(0); + if (ntensors == 0) { + return true; + } + const Tensor &sizemat = nt->get_nested_sizes(), + &stridemat = nt->get_nested_strides(); + int64_t* offsets_ptr = nt->get_storage_offsets().data_ptr(); + int64_t orig_dim = sizemat.size(1); + // nesting scalars + if (orig_dim == 0) { + // each scalar must be contiguous + // if there is blank memory between underlying scalars + for (int64_t i = 0; i < ntensors; i++) { + if (offsets_ptr[i] != i) { + return false; + } + } + } + // nesting tensors + else { + // if any underlying tensor is non-contiguous + const int64_t *sizemat_ptr = sizemat.data_ptr(), + *stridemat_ptr = stridemat.data_ptr(); + for (int64_t i = 0; i < ntensors; i++) { + if (stridemat_ptr[orig_dim - 1] != 1) { + return false; + } + int64_t product = sizemat_ptr[orig_dim - 1]; + for (int64_t j = orig_dim - 2; j >= 0; j--) { + if (stridemat_ptr[j] != product) { + return false; + } + product *= sizemat_ptr[j]; + } + sizemat_ptr += orig_dim; + stridemat_ptr += orig_dim; + } + // if there is blank memory between underlying tensors + if (offsets_ptr[0] != 0) { + return false; + } + sizemat_ptr = sizemat.data_ptr(); + stridemat_ptr = stridemat.data_ptr(); + for (int64_t i = 1; i < ntensors; i++) { + if (offsets_ptr[i] != + offsets_ptr[i - 1] + *sizemat_ptr * *stridemat_ptr) { + return false; + } + sizemat_ptr += orig_dim; + stridemat_ptr += orig_dim; + } + } + // everything is fine + return true; +} + +inline const at::Tensor& get_nested_sizes(const at::Tensor& tensor) { + return get_nested_tensor_impl(tensor)->get_nested_sizes(); +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PadNd.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PadNd.h new file mode 100644 index 0000000000000000000000000000000000000000..573d1a7b88ab7367858df90b6adfebfa9b97d5e9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PadNd.h @@ -0,0 +1,28 @@ +#pragma once +#include +#include + +namespace at { + +enum class padding_mode { + reflect, + replicate, + circular, + constant, +}; + +static inline c10::string_view padding_mode_string(padding_mode m) { + switch (m) { + case padding_mode::reflect: + return "reflect"; + case padding_mode::replicate: + return "replicate"; + case padding_mode::circular: + return "circular"; + case padding_mode::constant: + return "constant"; + } + TORCH_CHECK(false, "Invalid padding mode (", static_cast(m), ")"); +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel-inl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..a5e682281abe52215288d53fc5c9552d3dd2d483 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel-inl.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include + +namespace at { + +template +inline void parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const F& f) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grain_size >= 0); + if (begin >= end) { + return; + } + +#ifdef INTRA_OP_PARALLEL + at::internal::lazy_init_num_threads(); + const auto numiter = end - begin; + const bool use_parallel = + (numiter > grain_size && numiter > 1 && !at::in_parallel_region() && + at::get_num_threads() > 1); + if (!use_parallel) { + internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); + f(begin, end); + return; + } + + internal::invoke_parallel( + begin, end, grain_size, [&](int64_t begin, int64_t end) { + c10::ParallelGuard guard(true); + f(begin, end); + }); +#else + internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); + f(begin, end); +#endif +} + +template +inline scalar_t parallel_reduce( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const scalar_t ident, + const F& f, + const SF& sf) { + TORCH_CHECK(grain_size >= 0); + if (begin >= end) { + return ident; + } + +#ifdef INTRA_OP_PARALLEL + at::internal::lazy_init_num_threads(); + const auto max_threads = at::get_num_threads(); + const bool use_parallel = + ((end - begin) > grain_size && !at::in_parallel_region() && + max_threads > 1); + if (!use_parallel) { + internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); + return f(begin, end, ident); + } + + c10::SmallVector results(max_threads, ident); + internal::invoke_parallel( + begin, + end, + grain_size, + [&](const int64_t my_begin, const int64_t my_end) { + const auto tid = at::get_thread_num(); + c10::ParallelGuard guard(true); + results[tid] = f(my_begin, my_end, ident); + }); + + scalar_t result = ident; + for (auto partial_result : results) { + result = sf(result, partial_result); + } + return result; +#else + internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); + return f(begin, end, ident); +#endif +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel.h new file mode 100644 index 0000000000000000000000000000000000000000..ff14f568d22a6e0d319bedb4e68194cd0971259e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel.h @@ -0,0 +1,160 @@ +#pragma once +#include +#include +#include +#include + +namespace at { + +inline int64_t divup(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + +// Called during new thread initialization +TORCH_API void init_num_threads(); + +// Sets the number of threads to be used in parallel region +TORCH_API void set_num_threads(int); + +// Returns the maximum number of threads that may be used in a parallel region +TORCH_API int get_num_threads(); + +// Returns the current thread number (starting from 0) +// in the current parallel region, or 0 in the sequential region +TORCH_API int get_thread_num(); + +// Checks whether the code runs in parallel region +TORCH_API bool in_parallel_region(); + +namespace internal { + +// Initialise num_threads lazily at first parallel call +inline void lazy_init_num_threads() { + thread_local bool init = false; + if (C10_UNLIKELY(!init)) { + at::init_num_threads(); + init = true; + } +} + +TORCH_API void set_thread_num(int); + +class TORCH_API ThreadIdGuard { + public: + ThreadIdGuard(int new_id) : old_id_(at::get_thread_num()) { + set_thread_num(new_id); + } + + ~ThreadIdGuard() { + set_thread_num(old_id_); + } + + private: + int old_id_; +}; + +} // namespace internal + +/* +parallel_for + +begin: index at which to start applying user function + +end: index at which to stop applying user function + +grain_size: number of elements per chunk. impacts the degree of parallelization + +f: user function applied in parallel to the chunks, signature: + void f(int64_t begin, int64_t end) + +Warning: parallel_for does NOT copy thread local +states from the current thread to the worker threads. +This means for example that Tensor operations CANNOT be used in the +body of your function, only data pointers. +*/ +template +inline void parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const F& f); + +/* +parallel_reduce + +begin: index at which to start applying reduction + +end: index at which to stop applying reduction + +grain_size: number of elements per chunk. impacts number of elements in +intermediate results tensor and degree of parallelization. + +ident: identity for binary combination function sf. sf(ident, x) needs to return +x. + +f: function for reduction over a chunk. f needs to be of signature scalar_t +f(int64_t partial_begin, int64_t partial_end, scalar_t identifiy) + +sf: function to combine two partial results. sf needs to be of signature +scalar_t sf(scalar_t x, scalar_t y) + +For example, you might have a tensor of 10000 entires and want to sum together +all the elements. Parallel_reduce with a grain_size of 2500 will then allocate +an intermediate result tensor with 4 elements. Then it will execute the function +"f" you provide and pass the beginning and end index of these chunks, so +0-2499, 2500-4999, etc. and the combination identity. It will then write out +the result from each of these chunks into the intermediate result tensor. After +that it'll reduce the partial results from each chunk into a single number using +the combination function sf and the identity ident. For a total summation this +would be "+" and 0 respectively. This is similar to tbb's approach [1], where +you need to provide a function to accumulate a subrange, a function to combine +two partial results and an identity. + +Warning: parallel_reduce does NOT copy thread local +states from the current thread to the worker threads. +This means for example that Tensor operations CANNOT be used in the +body of your function, only data pointers. + +[1] https://software.intel.com/en-us/node/506154 +*/ +template +inline scalar_t parallel_reduce( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const scalar_t ident, + const F& f, + const SF& sf); + +// Returns a detailed string describing parallelization settings +TORCH_API std::string get_parallel_info(); + +// Sets number of threads used for inter-op parallelism +TORCH_API void set_num_interop_threads(int); + +// Returns the number of threads used for inter-op parallelism +TORCH_API int get_num_interop_threads(); + +// Launches inter-op parallel task +TORCH_API void launch(std::function func); +namespace internal { +void launch_no_thread_state(std::function fn); +} // namespace internal + +// Launches intra-op parallel task +TORCH_API void intraop_launch(std::function func); + +// Returns number of intra-op threads used by default +TORCH_API int intraop_default_num_threads(); + +} // namespace at + +#if AT_PARALLEL_OPENMP +#include // IWYU pragma: keep +#elif AT_PARALLEL_NATIVE +#include // IWYU pragma: keep +#elif AT_PARALLEL_NATIVE_TBB +#include // IWYU pragma: keep +#endif + +#include // IWYU pragma: keep diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelNative.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelNative.h new file mode 100644 index 0000000000000000000000000000000000000000..8df093a99065f3a02490d5ec7747112454b6b44b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelNative.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include +#include + +#include + +#define INTRA_OP_PARALLEL + +namespace at::internal { + +TORCH_API void invoke_parallel( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const std::function& f); + +} // namespace at::internal diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SavedTensorHooks.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SavedTensorHooks.h new file mode 100644 index 0000000000000000000000000000000000000000..af821cb908c6a6675d21c3543809f9668ffccb64 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SavedTensorHooks.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace at { + +namespace impl { + +struct TORCH_API SavedTensorDefaultHooksTLS { + // PyObject is defined in c10/util/python_stub.h + std::stack> stack; + + // See NOTE: [Disabling SavedTensorDefaultHooks] for context + // NOTE: [disabled_error_message invariant] + // disabled_error_message is nullopt IFF Saved Tensor hooks is enabled + // We did this for efficiency (so we didn't have to keep a separate bool + // around) + c10::optional disabled_error_message; +}; + +} // namespace impl + +struct TORCH_API SavedTensorDefaultHooks { + static void push_hooks(PyObject* pack_hook, PyObject* unpack_hook); + static void pop_hooks(); + static std::pair get_hooks(); + static void lazy_initialize(); + static std::stack> get_stack(); + static void set_stack(std::stack>); + + static const impl::SavedTensorDefaultHooksTLS& get_tls_state(); + static void set_tls_state(const impl::SavedTensorDefaultHooksTLS& tls); + + // NOTE: [Disabling SavedTensorDefaultHooks] + // A developer of a PyTorch feature may choose to disable SavedTensorDefault + // hooks, especially if their feature does not work with it. If they are + // disabled, then the following will raise an error: + // - Attempting to push_hooks + // - calling disable(message) with a non-zero stack (from get_stack) size + static void disable(const std::string& error_message); + static void enable(); + static bool is_enabled(); + static const c10::optional& get_disabled_error_message(); +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorAccessor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorAccessor.h new file mode 100644 index 0000000000000000000000000000000000000000..528ed7b8762be5f681c759a5ce8a90aa8d4225d7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorAccessor.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIteratorInternal.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIteratorInternal.h new file mode 100644 index 0000000000000000000000000000000000000000..ec0cb6c8fdfcb2a36139035340d75d96a7930dfc --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIteratorInternal.h @@ -0,0 +1,72 @@ +#pragma once +#include +#include +#include + +namespace at { + +struct DimCounter { + DimCounter(IntArrayRef shape, Range range); + + void increment(const std::array& step); + bool is_done() const; + std::array max_2d_step() const; + + IntArrayRef shape; + Range range; + c10::SmallBuffer values; + int64_t offset; +}; + +namespace internal { + +inline void get_data_ptrs( + char** ptrs, + ArrayRef base, + IntArrayRef strides, + IntArrayRef counter) { + const auto ntensors = base.size(); + const auto ndim = counter.size(); + std::copy(base.begin(), base.end(), ptrs); + for (const auto dim : c10::irange(ndim)) { + int64_t value = counter[dim]; + for (const auto arg : c10::irange(ntensors)) { + ptrs[arg] += value * strides[dim * ntensors + arg]; + } + } +} + +inline void serial_for_each( + IntArrayRef shape, + IntArrayRef strides, + char** base_ptrs, + size_t ntensors, + typename TensorIteratorBase::loop2d_t loop, + Range range) { + const auto ndim = shape.size(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + strides.size() == ntensors * std::max(size_t{2}, ndim)); + + if (ndim <= 1) { + if (range.begin == 0) { + loop(base_ptrs, strides.data(), range.size(), 1); + } else { + c10::SmallBuffer ptrs(ntensors); + get_data_ptrs(ptrs.data(), {base_ptrs, ntensors}, strides, {range.begin}); + loop(ptrs.data(), strides.data(), range.size(), 1); + } + } else { + c10::SmallBuffer ptrs(ntensors); + auto counter = DimCounter(shape, range); + while (!counter.is_done()) { + get_data_ptrs( + ptrs.data(), {base_ptrs, ntensors}, strides, counter.values); + auto step = counter.max_2d_step(); + loop(ptrs.data(), strides.data(), step[0], step[1]); + counter.increment(step); + } + } +} + +} // namespace internal +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorMeta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorMeta.h new file mode 100644 index 0000000000000000000000000000000000000000..8c5003a676d80fea79e7facab42a2818d9e2aa74 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorMeta.h @@ -0,0 +1,137 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { + +class Tensor; + +namespace impl { + +// Use this to define the prototype for a meta function. There are two +// versions; one that takes one argument (just the operator name), or FUNC2 +// variant that takes two arguments (operator name and overload name). +// +// Example usage: +// +// TORCH_META_FUNC2(add, Tensor) ( +// const Tensor& self, const Tensor& other +// ) { +// ... compute sizes and options ... +// set_output(sizes, options); +// } +// +#define TORCH_META_FUNC(name) void structured_##name::meta +#define TORCH_META_FUNC2(name, overload) \ + void structured_##name##_##overload::meta + +// These are versions of TORCH_META_FUNC(2) that include a precompute_out struct +// as a return value. They should be used when the kernel in question has +// precomputed values declared in native_functions.yaml and the corresponding +// implementation should return an instance of the aforementioned struct. +#define TORCH_PRECOMPUTE_META_FUNC(name) \ + structured_##name::meta_return_ty structured_##name::meta +#define TORCH_PRECOMPUTE_META_FUNC2(name, overload) \ + structured_##name##_##overload::meta_return_ty \ + structured_##name##_##overload::meta + +// Use this to create a precompute struct in a meta function. +#define TORCH_PRECOMPUTE_STRUCT(name) structured_##name::precompute_out<> +#define TORCH_PRECOMPUTE_STRUCT2(name, overload) \ + structured_##name##_##overload::precompute_out<> + +// Use this to define the prototype for an implementation. This takes only +// one argument, which is the name of the dispatch key entry you're +// implementing. +// +// Example usage: +// +// TORCH_IMPL_FUNC(add_cpu) ( +// Tensor& result, const Tensor& self, const Tensor& other +// ) { +// ... do the actual implementation ... +// } +// +#define TORCH_IMPL_FUNC(name) void structured_##name::impl + +// Base class for all structured kernel classes. The set_output virtual +// method is varied depending whether or not the operator is +// functional/out/inplace, and could also be specialized for CPU/CUDA/etc +// (although presently it isn't). +// +// A notable subclass of this interface is TensorIteratorBase. +struct TORCH_API MetaBase { + MetaBase() = default; + MetaBase(const MetaBase&) = default; + MetaBase& operator=(const MetaBase&) = default; + MetaBase(MetaBase&&) noexcept = default; + MetaBase& operator=(MetaBase&&) noexcept = default; + virtual const Tensor& maybe_get_output(int64_t output_idx) = 0; + + // Note: [set_output_*] + // See: https://github.com/pytorch/pytorch/issues/69813 + // Whenever defining the output properties in the META function of a + // structured kernel (what was usually done with `set_output`), use one of + // these 3 variants, instead. In order to decide which variant to use, check + // the following decision tree: + // + // - Can the kernel you are going to implement support output tensors + // with arbitrary strides? + // | + // -- YES: `set_output_raw_strided` + // | + // -- NO: Should the output tensor strides be contiguous? + // | + // -- YES: `set_output_contiguous` + // | + // -- NO: `set_output_strided` + // + // Use this function whenever the kernel requires specific strides for the + // output. If `strides` does not match the given output strides, proxy outputs + // will be created and passed to the IMPL function. + virtual void set_output_strided( + int64_t output_idx, + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options, + DimnameList names = {}) { + TORCH_INTERNAL_ASSERT(false, "set_output_strided not implemented."); + } + + // Use this function whenever the kernel knows how to handle arbitrary strided + // outputs. This function has the same behavior as the old `set_output`: it + // will only re-stride if the given output was resized. + virtual void set_output_raw_strided( + int64_t output_idx, + IntArrayRef sizes, + IntArrayRef strides_hint, + TensorOptions options, + DimnameList names = {}) { + TORCH_INTERNAL_ASSERT(false, "set_output_strided not implemented."); + } + + // Use this function if the kernel requires contiguous strides. + // Alias for `set_output_strided`, but with contiguous strides. + void set_output_contiguous( + int64_t output_idx, + IntArrayRef sizes, + TensorOptions options, + DimnameList names = {}) { + auto strides = c10::contiguous_strides(sizes); + set_output_strided(output_idx, sizes, strides, options, names); + } + + // Returns a reference to an undefined tensor if there is no presupplied + // output + const Tensor& maybe_get_output() { + return maybe_get_output(0); + } + virtual ~MetaBase() = default; +}; + +} // namespace impl + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorNames.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorNames.h new file mode 100644 index 0000000000000000000000000000000000000000..616efc14d2599d7f1a9f73f04fdf960e05bfcf2e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorNames.h @@ -0,0 +1,75 @@ +#pragma once + +#include + +namespace at::namedinference { + +// TensorName and TensorNames are wrappers around Dimname and DimnameList +// that contain helper functions to make writing name inference rules easier. +// +// A TensorName represents a Dimname associated with some DimnameList (from a +// Tensor). This encapsulates all the information that is needed to check if +// names *match* and to *unify* names. +// +// Definition: Two names in two tensors *match* if they are equal, or if at +// least one of them is a wildcard that can be *refined* to the other name. +// +// Definition: unify(name, other) fails if the names do not match. Otherwise, +// it returns the most refined of name and other. +// +// Here is an example of checking if two names match. +// tensor: Tensor[A, None] +// other: Tensor[A] +// +// Let's say we wish to check if tensor.names[-1] matches other.names[-1]. +// None (in tensor) cannot match A (in other) because if the None were refined +// to A, `tensor` would have duplicate names [A, A]. Therefore we need to check +// tensor.names [A, None] for the existence of A. +struct TORCH_API TensorName { + explicit TensorName(ArrayRef origin, int origin_idx) + : origin_(origin), + name_(origin[maybe_wrap_dim( + origin_idx, + static_cast(origin.size()))]), + origin_idx_(origin_idx) {} + + // op_name is only used for error reporting. + const TensorName& unify(const TensorName& other, const char* op_name) const; + Dimname toDimname() const; + + private: + ArrayRef origin_; + Dimname name_; + int origin_idx_; // A named tensor can have at most 64 dims. + + TORCH_API friend std::ostream& operator<<( + std::ostream& out, + const TensorName& tensorname); +}; + +using TensorNameVec = SmallVector; + +struct TORCH_API TensorNames { + explicit TensorNames(ArrayRef names); + + // Create TensorNames from names[start:end]. Each individual TensorName stores + // `names`, NOT names[start:end], because the original tensor's names are + // `names`. + explicit TensorNames(ArrayRef names, int64_t start, int64_t end); + + // op_name is only used for error reporting. + TensorNames& unifyFromRightInplace( + const TensorNames& other, + const char* op_name = "unify"); + void checkUnique(const char* op_name) const; + + void append(TensorName name); + std::vector toDimnameVec() const; + + private: + explicit TensorNames(TensorNameVec&& names) : names_(std::move(names)){}; + + TensorNameVec names_; +}; + +} // namespace at::namedinference diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOperators.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOperators.h new file mode 100644 index 0000000000000000000000000000000000000000..7567af4cbfe466843b1d48d78ffd259035cd62dc --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOperators.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at { + +#define AT_FORALL_BINARY_OPS(_) \ + _(+, x.add(y), y.add(x)) \ + _(*, x.mul(y), y.mul(x)) \ + _(-, \ + x.sub(y), \ + ::at::empty_like(y, at::MemoryFormat::Preserve).fill_(x).sub_(y)) \ + _(/, \ + x.div(y), \ + ::at::empty_like(y, at::MemoryFormat::Preserve).fill_(x).div_(y)) \ + _(%, \ + x.remainder(y), \ + ::at::empty_like(y, at::MemoryFormat::Preserve).fill_(x).remainder_(y)) \ + _(&, x.bitwise_and(y), y.bitwise_and(x)) \ + _(|, x.bitwise_or(y), y.bitwise_or(x)) \ + _(^, x.bitwise_xor(y), y.bitwise_xor(x)) \ + _(<, x.lt(y), y.gt(x)) \ + _(<=, x.le(y), y.ge(x)) \ + _(>, x.gt(y), y.lt(x)) \ + _(>=, x.ge(y), y.le(x)) \ + _(==, x.eq(y), y.eq(x)) \ + _(!=, x.ne(y), y.ne(x)) + +#define DEFINE_OPERATOR(op, body, reverse_scalar_body) \ + static inline Tensor operator op(const Tensor& x, const Tensor& y) { \ + return body; \ + } \ + static inline Tensor operator op(const Tensor& x, const Scalar& y) { \ + return body; \ + } \ + static inline Tensor operator op(const Scalar& x, const Tensor& y) { \ + return reverse_scalar_body; \ + } + +AT_FORALL_BINARY_OPS(DEFINE_OPERATOR) +#undef DEFINE_OPERATOR +#undef AT_FORALL_BINARY_OPS + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOptions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..b3edba8efdf726cea92059cb01e34ee25206482c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOptions.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..4615ab50606ee963c48c5673c93196b4188aa9b2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorUtils.h @@ -0,0 +1,190 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +// These functions are NOT in Utils.h, because this file has a dep on Tensor.h + +#define TORCH_CHECK_TENSOR_ALL(cond, ...) \ + TORCH_CHECK((cond)._is_all_true().item(), __VA_ARGS__); + +namespace at { + +// The following are utility functions for checking that arguments +// make sense. These are particularly useful for native functions, +// which do NO argument checking by default. + +struct TORCH_API TensorArg { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const Tensor& tensor; + const char* name; + int pos; // 1-indexed + TensorArg(const Tensor& tensor, const char* name, int pos) + : tensor(tensor), name(name), pos(pos) {} + // Try to mitigate any possibility of dangling reference to temporaries. + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) + TensorArg(Tensor&& tensor, const char* name, int pos) = delete; + const Tensor* operator->() const { + return &tensor; + } + const Tensor& operator*() const { + return tensor; + } +}; + +struct TORCH_API TensorGeometryArg { + TensorGeometry tensor; + const char* name; + int pos; // 1-indexed + /* implicit */ TensorGeometryArg(TensorArg arg) + : tensor(TensorGeometry{arg.tensor}), name(arg.name), pos(arg.pos) {} + TensorGeometryArg(TensorGeometry tensor, const char* name, int pos) + : tensor(std::move(tensor)), name(name), pos(pos) {} + const TensorGeometry* operator->() const { + return &tensor; + } + const TensorGeometry& operator*() const { + return tensor; + } +}; + +// A string describing which function did checks on its input +// arguments. +// TODO: Consider generalizing this into a call stack. +using CheckedFrom = const char*; + +// The undefined convention: singular operators assume their arguments +// are defined, but functions which take multiple tensors will +// implicitly filter out undefined tensors (to make it easier to perform +// tests which should apply if the tensor is defined, and should not +// otherwise.) +// +// NB: This means that the n-ary operators take lists of TensorArg, +// not TensorGeometryArg, because the Tensor to TensorGeometry +// conversion will blow up if you have undefined tensors. + +TORCH_API std::ostream& operator<<( + std::ostream& out, + const TensorGeometryArg& t); +TORCH_API void checkDim( + CheckedFrom c, + const Tensor& tensor, + const char* name, + int pos, // 1-indexed + int64_t dim); +TORCH_API void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim); +// NB: this is an inclusive-exclusive range +TORCH_API void checkDimRange( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t dim_start, + int64_t dim_end); +TORCH_API void checkSameDim( + CheckedFrom c, + const TensorGeometryArg& t1, + const TensorGeometryArg& t2); +TORCH_API void checkContiguous(CheckedFrom c, const TensorGeometryArg& t); +TORCH_API void checkAllContiguous(CheckedFrom c, at::ArrayRef ts); +TORCH_API void checkSize( + CheckedFrom c, + const TensorGeometryArg& t, + IntArrayRef sizes); +TORCH_API void checkSize_symint( + CheckedFrom c, + const TensorGeometryArg& t, + c10::SymIntArrayRef sizes); +TORCH_API void checkSize( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t dim, + int64_t size); +TORCH_API void checkSize_symint( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t dim, + const c10::SymInt& size); +TORCH_API void checkNumel( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t numel); +TORCH_API void checkSameNumel( + CheckedFrom c, + const TensorArg& t1, + const TensorArg& t2); +TORCH_API void checkAllSameNumel(CheckedFrom c, ArrayRef tensors); +TORCH_API void checkScalarType(CheckedFrom c, const TensorArg& t, ScalarType s); +TORCH_API void checkScalarTypes( + CheckedFrom c, + const TensorArg& t, + at::ArrayRef l); +TORCH_API void checkSameGPU( + CheckedFrom c, + const TensorArg& t1, + const TensorArg& t2); +TORCH_API void checkAllSameGPU(CheckedFrom c, ArrayRef tensors); +TORCH_API void checkSameType( + CheckedFrom c, + const TensorArg& t1, + const TensorArg& t2); +TORCH_API void checkAllSameType(CheckedFrom c, ArrayRef tensors); +TORCH_API void checkSameSize( + CheckedFrom c, + const TensorArg& t1, + const TensorArg& t2); +TORCH_API void checkAllSameSize(CheckedFrom c, ArrayRef tensors); +TORCH_API void checkDefined(CheckedFrom c, const TensorArg& t); +TORCH_API void checkAllDefined(CheckedFrom c, at::ArrayRef t); + +// FixMe: does TensorArg slow things down? +TORCH_API void checkBackend( + CheckedFrom c, + at::ArrayRef t, + at::Backend backend); + +TORCH_API void checkDeviceType( + CheckedFrom c, + at::ArrayRef tensors, + at::DeviceType device_type); + +TORCH_API void checkLayout(CheckedFrom c, const Tensor& t, Layout layout); + +TORCH_API void checkLayout( + CheckedFrom c, + at::ArrayRef tensors, + at::Layout layout); + +// Methods for getting data_ptr if tensor is defined +TORCH_API void* maybe_data_ptr(const Tensor& tensor); +TORCH_API void* maybe_data_ptr(const TensorArg& tensor); + +TORCH_API void check_dim_size( + const Tensor& tensor, + int64_t dim, + int64_t dim_size, + int64_t size); + +namespace detail { +TORCH_API std::vector defaultStrides(IntArrayRef sizes); + +TORCH_API c10::optional> computeStride( + IntArrayRef oldshape, + IntArrayRef oldstride, + IntArrayRef newshape); + +TORCH_API c10::optional computeStride( + c10::SymIntArrayRef oldshape, + c10::SymIntArrayRef oldstride, + c10::SymIntArrayRef newshape); + +TORCH_API c10::optional computeStride( + IntArrayRef oldshape, + IntArrayRef oldstride, + const DimVector& newshape); + +} // namespace detail +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Utils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..17826b332afbcf9f2e2a328ed3a938cc6fecce74 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Utils.h @@ -0,0 +1,138 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define AT_DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete + +namespace at { + +TORCH_API int _crash_if_asan(int); + +// Converts a TensorList (i.e. ArrayRef to vector of TensorImpl*) +// NB: This is ONLY used by legacy TH bindings, and ONLY used by cat. +// Once cat is ported entirely to ATen this can be deleted! +static inline std::vector checked_dense_tensor_list_unwrap( + ArrayRef tensors, + const char* name, + int pos, + c10::DeviceType device_type, + ScalarType scalar_type) { + std::vector unwrapped; + unwrapped.reserve(tensors.size()); + for (const auto i : c10::irange(tensors.size())) { + const auto& expr = tensors[i]; + if (expr.layout() != Layout::Strided) { + AT_ERROR( + "Expected dense tensor but got ", + expr.layout(), + " for sequence element ", + i, + " in sequence argument at position #", + pos, + " '", + name, + "'"); + } + if (expr.device().type() != device_type) { + AT_ERROR( + "Expected object of device type ", + device_type, + " but got device type ", + expr.device().type(), + " for sequence element ", + i, + " in sequence argument at position #", + pos, + " '", + name, + "'"); + } + if (expr.scalar_type() != scalar_type) { + AT_ERROR( + "Expected object of scalar type ", + scalar_type, + " but got scalar type ", + expr.scalar_type(), + " for sequence element ", + i, + " in sequence argument at position #", + pos, + " '", + name, + "'"); + } + unwrapped.emplace_back(expr.unsafeGetTensorImpl()); + } + return unwrapped; +} + +template +std::array check_intlist( + ArrayRef list, + const char* name, + int pos) { + if (list.empty()) { + // TODO: is this necessary? We used to treat nullptr-vs-not in IntList + // differently with strides as a way of faking optional. + list = {}; + } + auto res = std::array(); + if (list.size() == 1 && N > 1) { + res.fill(list[0]); + return res; + } + if (list.size() != N) { + AT_ERROR( + "Expected a list of ", + N, + " ints but got ", + list.size(), + " for argument #", + pos, + " '", + name, + "'"); + } + std::copy_n(list.begin(), N, res.begin()); + return res; +} + +using at::detail::check_size_nonnegative; + +namespace detail { + +template +TORCH_API Tensor tensor_cpu(ArrayRef values, const TensorOptions& options); + +template +TORCH_API Tensor +tensor_backend(ArrayRef values, const TensorOptions& options); + +template +TORCH_API Tensor +tensor_complex_cpu(ArrayRef values, const TensorOptions& options); + +template +TORCH_API Tensor +tensor_complex_backend(ArrayRef values, const TensorOptions& options); +} // namespace detail + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/WrapDimUtilsMulti.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/WrapDimUtilsMulti.h new file mode 100644 index 0000000000000000000000000000000000000000..58a23f95c0d2c39d4450e0e70adda091278b9079 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/WrapDimUtilsMulti.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { + +// This is in an extra file to work around strange interaction of +// bitset on Windows with operator overloading + +constexpr size_t dim_bitset_size = 64; + +static inline std::bitset dim_list_to_bitset( + OptionalIntArrayRef opt_dims, + size_t ndims) { + TORCH_CHECK( + ndims <= dim_bitset_size, + "only tensors with up to ", + dim_bitset_size, + " dims are supported"); + std::bitset seen; + if (opt_dims.has_value()) { + auto dims = opt_dims.value(); + for (const auto i : c10::irange(dims.size())) { + size_t dim = maybe_wrap_dim(dims[i], static_cast(ndims)); + TORCH_CHECK( + !seen[dim], + "dim ", + dim, + " appears multiple times in the list of dims"); + seen[dim] = true; + } + } else { + for (size_t dim = 0; dim < ndims; dim++) { + seen[dim] = true; + } + } + return seen; +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ceil_div.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ceil_div.h new file mode 100644 index 0000000000000000000000000000000000000000..37d67b232a22c11fa7dccf638b7897c0854ab8bd --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ceil_div.h @@ -0,0 +1,24 @@ +#pragma once +#include +#include + +namespace at { + +/** + Computes ceil(a / b) +*/ +template >> +C10_ALWAYS_INLINE C10_HOST_DEVICE T ceil_div(T a, T b) { + return (a + b - 1) / b; +} + +/** + Computes ceil(a / b) * b; i.e., rounds up `a` to the next highest + multiple of b +*/ +template +C10_ALWAYS_INLINE C10_HOST_DEVICE T round_up(T a, T b) { + return ceil_div(a, b) * b; +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/dlpack.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/dlpack.h new file mode 100644 index 0000000000000000000000000000000000000000..9601a2478ddde2502581f5b1801557a1b57f3853 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/dlpack.h @@ -0,0 +1,232 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +/** + * \brief Compatibility with C++ + */ +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current version of dlpack */ +#define DLPACK_VERSION 80 + +/*! \brief The current ABI version of dlpack */ +#define DLPACK_ABI_VERSION 1 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*! + * \brief The device type in DLDevice. + */ +#ifdef __cplusplus +typedef enum : int32_t { +#else +typedef enum { +#endif + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int32_t device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. An explicit error message should be raised when attempting to + * export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The data pointer points to the allocated data. This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. + * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + const int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. + */ + const int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + */ +typedef struct DLManagedTensor { + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void * manager_ctx; + /*! \brief Destructor signature void (*)(void*) - this should be called + * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL + * if there is no way for the caller to provide a reasonable destructor. + * The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor * self); +} DLManagedTensor; +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/record_function.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/record_function.h new file mode 100644 index 0000000000000000000000000000000000000000..431ff64cf553f08f92157a0f5f17a8dd8d20acf2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/record_function.h @@ -0,0 +1,740 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { +class TORCH_API OperatorHandle; +} + +namespace at { + +// Function name to record NCCL metadata +extern TORCH_API const std::string kParamCommsCallName; + +// Kind of record function scope; +enum class C10_API_ENUM RecordScope : uint8_t { + // c10/ATen ops, autograd nodes + FUNCTION = 0, + // Functions/nodes called from the autograd + BACKWARD_FUNCTION, + // TorchScript functions, methods + TORCHSCRIPT_FUNCTION, + // Kernel Function dtype Tag + KERNEL_FUNCTION_DTYPE, + // Torchbind custom class, + CUSTOM_CLASS, + // Generic Build Feature + BUILD_FEATURE, + // Kernel Function dtype Tag + LITE_INTERPRETER, + // User defined scope (e.g. with record_function()) + USER_SCOPE, + // Scopes for static runtime, a specialized TorchScript interpreter + STATIC_RUNTIME_OP, + STATIC_RUNTIME_MODEL, + NUM_SCOPES, // must be the last in the list +}; + +} // namespace at + +namespace std { +template <> +struct hash { + size_t operator()(const at::RecordScope& sc) const { + return static_cast(sc); + } +}; +} // namespace std + +namespace at { + +struct TORCH_API StringView { + StringView() : StringView(nullptr) {} + explicit StringView(const char* str_ptr) + : owned_str_ptr_(nullptr), str_ptr_(str_ptr) {} + explicit StringView(std::string str) + : owned_str_ptr_(std::make_shared(std::move(str))), + str_ptr_(owned_str_ptr_->c_str()) {} + + const char* str() const { + return str_ptr_; + } + + friend std::ostream& operator<<(std::ostream& os, const StringView& dt) { + os << dt.str(); + return os; + } + + friend bool operator==(const StringView& lhs, const StringView& rhs) { + return strcmp(lhs.str(), rhs.str()) == 0; + } + + friend bool operator!=(const StringView& lhs, const StringView& rhs) { + return !(lhs == rhs); + } + + private: + std::shared_ptr owned_str_ptr_; + const char* str_ptr_; +}; + +// Soft limit on the number of callbacks to use; +constexpr std::size_t kSoftLimitCallbacks = 4; + +// An abstract base class for various observer contexts that can be attached to +// the RecordFunction. +struct ObserverContext { + virtual ~ObserverContext() = default; + + protected: + ObserverContext() = default; +}; + +typedef c10::SmallVector CallbackHandles; +typedef c10::SmallVector, kSoftLimitCallbacks> + ObserverContextList; +typedef uint64_t RecordFunctionHandle; +struct RecordFunction; + +// +// PyTorch callbacks/observers API: +// + +/** + * RecordFunctionCallback represents a pair of callbacks to be used with + * RecordFunction, members: + * start, end - the callbacks to run when entering and exiting the scope; + * optionally, the start callback may return an ObserverContext which will + * be passed to the end callback, use appropriate constructor accordingly. + * needs_inputs - whether the callbacks need the inputs passed from the + * observed function/range; NOTE: passing the inputs incurs an additional + * overhead; sampling_probability - if not 1.0, then the callback is + * probabilistically sampled to run; NOTE: start and end callbacks always run as + * a pair and are sampled together; scopes - types of scopes to execute the + * callbacks on (see RecordScope); passing empty set means the callbacks will be + * executed for all possible scope types should_run - optional function that + * returns whether this callback should run; overwrites the effect of setting + * sampling_probability + */ +class TORCH_API RecordFunctionCallback { + public: + using StartCallback = + std::unique_ptr (*)(const RecordFunction&); + using EndCallback = void (*)(const RecordFunction&, ObserverContext*); + + // This interface supports observers that require passing an ObserverContext + // between start and end callbacks. + explicit RecordFunctionCallback( + StartCallback start, + EndCallback end = nullptr) + : start_(start), end_(end) { + scopes_.fill(true); + } + + RecordFunctionCallback& needsInputs(bool needs_inputs) { + needs_inputs_ = needs_inputs; + return *this; + } + + RecordFunctionCallback& needsOutputs(bool needs_outputs) { + needs_outputs_ = needs_outputs; + return *this; + } + + RecordFunctionCallback& needsIds(bool needs_ids) { + needs_ids_ = needs_ids; + return *this; + } + + RecordFunctionCallback& samplingProb(double sampling_prob) { + TORCH_CHECK( + sampling_prob >= 0.0 && sampling_prob <= 1.0, + "Invalid sampling probability"); + sampling_prob_ = sampling_prob; + return *this; + } + + RecordFunctionCallback& scopes( + const std::unordered_set>& scopes) { + if (!scopes.empty()) { + scopes_.fill(false); + for (auto sc : scopes) { + scopes_[static_cast(sc)] = true; + } + } else { + scopes_.fill(true); + } + return *this; + } + + bool needsInputs() const { + return needs_inputs_; + } + + bool needsOutputs() const { + return needs_outputs_; + } + + bool needsIds() const { + return needs_ids_; + } + + double samplingProb() const { + return sampling_prob_; + } + + bool checkScope(RecordScope sc) const { + return scopes_[(size_t)sc]; + } + + StartCallback start() const { + return start_; + } + + EndCallback end() const { + return end_; + } + + private: + StartCallback start_; + EndCallback end_; + double sampling_prob_ = 1.0; + std::array(RecordScope::NUM_SCOPES)> scopes_ = {}; + bool needs_inputs_ = false; + bool needs_outputs_ = false; + bool needs_ids_ = false; +}; + +// Notes: +// - two types of callbacks are provided: thread local and global +// - thread local callbacks are added/removed only for the given thread +// and are stored locally for each thread and separately from the list +// of the global callbacks +// - global callbacks are stored in a single per process list and are +// invoked by every RecordFunction, in addition to the thread local +// callbacks specific to the given thread +// - we allow the added callbacks to be sampled, by specifying a sampling +// probability for each callback pair, if the start callback is +// not picked to run, the corresponding end callback won't be called +// - a typical use case for the global callbacks is passive monitoring +// in the background (e.g. fleet-wide monitoring), without focusing on +// the specific piece of code +// - in contrast, thread local callbacks are enabled locally, on demand, +// for the specific piece of code (range) and are not sampled +// - a typical use case for thread local callbacks is profiler and code +// execution tracer +// - note, thread local callbacks are automatically propagated with +// ThreadLocalState across JIT continuations and async tasks (at::launch) + +typedef uint64_t CallbackHandle; + +constexpr CallbackHandle INVALID_CALLBACK_HANDLE{0}; + +// It is unnecessary to use atomic operations for enabling +// thread-local function callbacks. Moreover, it prevents saving to +// ThreadLocalState because std::atomic is non-copyable. +struct RecordFunctionCallbacksEntry { + RecordFunctionCallbacksEntry(RecordFunctionCallback cb, CallbackHandle h) + : callback_(cb), handle_(h) {} + + RecordFunctionCallback callback_; + bool enabled_{true}; + CallbackHandle handle_; +}; + +// Holds pairs (callbacks, unique_id) +using RecordFunctionCallbacks = std::vector; + +// Generated by the callback managers to determine which functions to run. +struct StepCallbacks { + StepCallbacks() = default; + StepCallbacks(uint64_t thread_id, RecordScope scope) + : thread_id_{thread_id}, scope_{scope} {} + + bool empty() const { + return callbacks_.empty(); + } + + struct StartEndPair { + RecordFunctionCallback::StartCallback start_; + RecordFunctionCallback::EndCallback end_; + }; + + using StartEndPairs = c10::SmallVector; + + StartEndPairs callbacks_; + uint64_t thread_id_{0}; + RecordScope scope_{RecordScope::FUNCTION}; + bool needs_inputs_{false}; + bool needs_outputs_{false}; + bool needs_ids_{false}; +}; + +struct TORCH_API RecordFunction { + // Default constructor is used with before function called afterwards: + // scope - record scope that this function tracks + // pre_sampled - whether this RecordFunction was already pre-sampled with + // kLowProb probability + explicit RecordFunction(RecordScope scope = RecordScope::FUNCTION); + explicit RecordFunction(StepCallbacks&& step_callbacks); + + template + void before( + F fn, + c10::ArrayRef args, + int64_t current_sequence_nr = -1) { + if (!isActive()) { + return; + } + inputs_ = args; + before(fn, current_sequence_nr); + } + + template + void before( + F fn, + const std::vector* args, + int64_t current_sequence_nr = -1) { + before( + std::move(fn), + c10::ArrayRef(args->data(), args->size()), + current_sequence_nr); + } + + // Destructor calls end callbacks + virtual ~RecordFunction(); + + RecordFunction(const RecordFunction&) = delete; + RecordFunction& operator=(const RecordFunction&) = delete; + + const char* name() const; + + int64_t seqNr() const { + return sequence_nr_; + } + + c10::ArrayRef inputs() const { +#ifndef NDEBUG + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + inputs_valid_, "Called inputs() outside RecordFunction start callback"); +#endif + return inputs_; + } + + const std::vector& outputs() const { + return outputs_; + } + + void setOutputs(std::vector&& outputs) { + outputs_ = std::move(outputs); + } + + void setOutputs(c10::ArrayRef outputs) { + outputs_ = outputs.vec(); + } + + size_t num_inputs() const; + size_t num_outputs() const; + + // Retrieves the thread_id that this RecordFunction ran start callbacks with. + // Useful for writing thread safe end callbacks that may be potentially + // executed in a different thread (async ops) + uint64_t threadId() const { + return step_callbacks_.thread_id_; + } + + // For backward functions - thread id of the corresponding forward function, + // or zero otherwise; + // used alongside with sequence number to correlate backward functions with + // the forward ones + uint64_t forwardThreadId() const { + return fwd_thread_id_; + } + + void setForwardThreadId(uint64_t thread_id) { + fwd_thread_id_ = thread_id; + } + + RecordScope scope() const { + return step_callbacks_.scope_; + } + + // Returns logical thread_id for the current thread + static uint64_t currentThreadId(); + + // Internal functions, do not use directly; + // used in python's context manager + + // before functions initialize RecordFunction members and call + // start callbacks + using schema_ref_t = std::reference_wrapper; + void before(const char* name, int64_t sequence_nr = -1); + void before(std::string name, int64_t sequence_nr = -1); + void before(schema_ref_t schema, int64_t sequence_nr = -1); + + // Sets node ID for distributed profiling + static void setDefaultNodeId(int64_t defaultNodeId); + // Gets node ID for distributed profiling + static int64_t getDefaultNodeId(); + + // Calls end callbacks. After end(), accessors will no longer provide useful + // results. + void end(); + + // Internal-only, used only force async event for distributed events + // profiling. + void _setAsync(); + + // Returns whether this RecordFunction corresponds to an async event or not. + bool isAsync() const; + + // Returns whether this RecordFunction corresponds to NCCL metadata collection + // or not. + bool isNcclMeta() const { + return is_nccl_meta_; + } + + // Internal-only, used to denote out variant used for Static Runtime execution + void _setStaticRuntimeOutVariant(); + bool isStaticRuntimeOutVariant() const; + + RecordFunctionHandle handle() const { + return handle_; + } + + c10::optional operator_name() const; + + // This method returns a copy of the FunctionSchema and can be expensive. + c10::optional operator_schema() const; + + void setHandle(RecordFunctionHandle handle) { + handle_ = handle; + } + + // Whether this RecordFunction runs any callbacks. + bool isActive() const { + return !step_callbacks_.empty(); + } + + bool needsInputs() const { + return step_callbacks_.needs_inputs_; + } + + bool needsOutputs() const { + return step_callbacks_.needs_outputs_; + } + + int64_t debugHandle() const { + return debug_handle_; + } + + void setDebugHandle(int64_t debug_handle) { + debug_handle_ = debug_handle; + } + + void invalidateInputs() { +#ifndef NDEBUG + inputs_valid_ = false; +#endif + } + + private: + void runStartCallbacks(); + + StepCallbacks step_callbacks_; + + // In cases when RecordFunction might be active but we chose not to + // use the observers (e.g. operator is not observed), this boolean + // flag is used to check whether the start callbacks were called + bool called_start_callbacks_ = false; + +#ifndef NDEBUG + bool inputs_valid_ = false; +#endif + + // Stores various ObserverContext objects with event metadata for callbacks. + ObserverContextList ctx_; + + std::variant fn_; + + int64_t sequence_nr_ = -1; + c10::ArrayRef inputs_; + std::vector outputs_; + + // For backward functions - thread id of the forward function + uint64_t fwd_thread_id_ = 0; + + // Unique id for this RecordFunction, used in callbacks to track start + // and end of ranges + RecordFunctionHandle handle_{0}; + + // Whether this record_function corresponds to an async event or not. Async + // events can complete in different threads or follow a future-like pattern + // of use. + bool is_async_{false}; + + // Debug handles are used for lazy annotation of module hierarchy + // and callstack. + // This is specifically is useful for mobile runtime, where generated + // debug handles can be lazily symbolicated using debug information + int64_t debug_handle_{-1}; + + // Whether this RecordFunction is used for an out variant run with + // Static Runtime + bool is_static_runtime_out_variant_{false}; + + // Whether this RecordFunction is used for NCCL metadata collection + bool is_nccl_meta_{false}; +}; + +TORCH_API StepCallbacks getStepCallbacks(RecordScope scope); + +TORCH_API c10::optional getStepCallbacksUnlessEmpty( + RecordScope scope); + +namespace detail { +template +void record_function_with_scope( + RecordFunction& guard, + F fn, + const Inputs& inputs, + Args&&... args) { + if (guard.needsInputs()) { + guard.before( + fn, + c10::ArrayRef(inputs.data(), inputs.size()), + std::forward(args)...); + } else { + guard.before(fn, std::forward(args)...); + } +} + +template +void record_function_with_scope_and_debug_handle( + RecordFunction& guard, + F fn, + int64_t debug_handle, + const Inputs& inputs, + Args&&... args) { + guard.setDebugHandle(debug_handle); + if (guard.needsInputs()) { + guard.before( + fn, + c10::ArrayRef(inputs.data(), inputs.size()), + std::forward(args)...); + } else { + guard.before(fn, std::forward(args)...); + } +} + +template +void record_function_with_scope( + RecordFunction& guard, + F fn, + c10::ArrayRef inputs, + Args&&... args) { + return record_function_with_scope< + c10::ArrayRef, + F, + Args...>(guard, std::move(fn), inputs, std::forward(args)...); +} + +template +void record_function_with_scope_and_debug_handle( + RecordFunction& guard, + F fn, + int64_t debug_handle, + c10::ArrayRef inputs, + Args&&... args) { + return record_function_with_scope_and_debug_handle< + c10::ArrayRef, + F, + Args...>( + guard, std::move(fn), debug_handle, inputs, std::forward(args)...); +} + +} // namespace detail + +// optional argument - function's seq_no +#define RECORD_FUNCTION_WITH_SCOPE(scope, fn, inputs, ...) \ + at::RecordFunction guard(scope); \ + if (guard.isActive()) { \ + ::at::detail::record_function_with_scope( \ + guard, fn, inputs, ##__VA_ARGS__); \ + } + +#define RECORD_FUNCTION_WITH_SCOPE_INPUTS_OUTPUTS( \ + scope, fn, inputs, outputs, ...) \ + at::RecordFunction guard(scope); \ + if (guard.isActive()) { \ + if (guard.needsInputs()) { \ + guard.before(fn, inputs, ##__VA_ARGS__); \ + } else { \ + guard.before(fn, ##__VA_ARGS__); \ + } \ + if (guard.needsOutputs()) { \ + guard.setOutputs(outputs); \ + } \ + } + +#define RECORD_FUNCTION(fn, inputs, ...) \ + RECORD_FUNCTION_WITH_SCOPE( \ + at::RecordScope::FUNCTION, fn, inputs, ##__VA_ARGS__) + +#define RECORD_TORCHSCRIPT_FUNCTION(mn, inputs) \ + RECORD_FUNCTION_WITH_SCOPE(at::RecordScope::TORCHSCRIPT_FUNCTION, mn, inputs) + +#define RECORD_FUNCTION_WITH_INPUTS_OUTPUTS(fn, inputs, outputs, ...) \ + RECORD_FUNCTION_WITH_SCOPE_INPUTS_OUTPUTS( \ + at::RecordScope::FUNCTION, fn, inputs, outputs, ##__VA_ARGS__) + +// Custom user scopes in C++; similar to Python's 'with record_function("..."):' +#define RECORD_USER_SCOPE(fn) \ + RECORD_FUNCTION_WITH_SCOPE( \ + at::RecordScope::USER_SCOPE, fn, c10::ArrayRef{}) + +// RECORD_USER_SCOPE with inputs +#define RECORD_USER_SCOPE_WITH_INPUTS(fn, inputs) \ + RECORD_FUNCTION_WITH_SCOPE(at::RecordScope::USER_SCOPE, fn, inputs) + +// Helper macro to pass in debug handle that is used to +// post process events +#define RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \ + scope, fn, debug_handle, inputs, ...) \ + at::RecordFunction guard(scope); \ + if (guard.isActive()) { \ + ::at::detail::record_function_with_scope_and_debug_handle( \ + guard, fn, debug_handle, inputs, ##__VA_ARGS__); \ + } + +// Helper macros to record LITE INTERPETER scope events with debug handles +#define RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS( \ + fn, debug_handle, inputs) \ + RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \ + at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs) + +// Bookend to the RECORD_FUNCTION macros. Use this after the kernel +// launch to let the profiler bind the outputs to the op that produced +// them. Note that guard is declared by RECORD_FUNCTION so this macro +// needs to be called from the same scope as RECORD_FUNCTION +#define RECORD_OUTPUTS(outputs) \ + if (guard.needsOutputs()) { \ + guard.setOutputs( \ + std::vector(outputs.begin(), outputs.end())); \ + } + +/** + * addThreadLocalCallback adds a thread local callback to run with + * RecordFunction, returns handle to use with removeThreadLocalCallback + */ +TORCH_API CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb); + +/** + * hasThreadLocalCallbacks returns whether there're callbacks registered + * with addThreadLocalCallback + */ +TORCH_API bool hasThreadLocalCallbacks(); + +/** + * clearThreadLocalCallbacks removes all thread local callbacks + */ +TORCH_API void clearThreadLocalCallbacks(); + +/** + * addGlobalCallback adds a global callback to run with RecordFunction: + * + * only during the program initialization + */ +TORCH_API CallbackHandle addGlobalCallback(RecordFunctionCallback cb); + +/** + * removeCallback removes a callback given the handle returned by + * addThreadLocalCallback or addGlobalCallback; + * + * no other code can run simultaneously + */ +TORCH_API void removeCallback(CallbackHandle handle); + +/** + * Prevent the given callback from executing. If handle is invalid, + * does nothing. + */ +TORCH_API void disableCallback(CallbackHandle handle); + +/** + * Allow the given callback, previously disabled with disableCallback, to + * execute again. If handle is invalid, does nothing. + */ +TORCH_API void reenableCallback(CallbackHandle handle); + +/** + * hasGlobalCallbacks returns whether there're global callbacks + * registered with pushGlobalCallback + */ +TORCH_API bool hasGlobalCallbacks(); + +/** + * clearGlobalCallbacks removes all global callbacks + */ +TORCH_API void clearGlobalCallbacks(); + +// for both thread local and global callbacks +TORCH_API bool hasCallbacks(); +TORCH_API void clearCallbacks(); + +/** + * enableRecordFunction enables RecordFunction thread locally + */ +TORCH_API void enableRecordFunction(bool enable = true); + +/** + * isRecordFunctionEnabled returns whether RecordFunction + * is enabled thread locally + */ +TORCH_API bool isRecordFunctionEnabled(); + +class TORCH_API RecordFunctionGuard { + public: + explicit RecordFunctionGuard(bool is_enabled = true) + : prev_value_(isRecordFunctionEnabled()) { + enableRecordFunction(is_enabled); + } + + virtual ~RecordFunctionGuard() { + enableRecordFunction(prev_value_); + } + + private: + bool prev_value_ = false; +}; + +class TORCH_API DisableRecordFunctionGuard : public RecordFunctionGuard { + public: + DisableRecordFunctionGuard() : RecordFunctionGuard(false) {} + ~DisableRecordFunctionGuard() override = default; +}; + +struct TORCH_API RecordFunctionTLS { + // Thread local vector of callbacks, holds pairs (callbacks, unique_id); + // must be sorted in increasing handles order + RecordFunctionCallbacks sorted_tls_callbacks_; + + bool tls_record_function_enabled_ = true; +}; + +TORCH_API const RecordFunctionTLS& get_record_function_tls_(); + +TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls); + +TORCH_API void set_record_function_seed_for_testing(uint32_t seed); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/_numeric_suite.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/_numeric_suite.py new file mode 100644 index 0000000000000000000000000000000000000000..49ccc8e69523f7dbee2335b788a2cb3a7db618a2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/_numeric_suite.py @@ -0,0 +1,28 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +`torch/ao/ns/_numeric_suite.py`, while adding an import statement +here. +""" + +from torch.ao.ns._numeric_suite import ( + _convert_tuple_to_list, + _dequantize_tensor_list, + _find_match, + _get_logger_dict_helper, + _is_identical_module_type, + compare_model_outputs, + compare_model_stub, + compare_weights, + get_logger_dict, + get_matching_activations, + Logger, + NON_LEAF_MODULE_TO_ADD_OBSERVER_ALLOW_LIST, + OutputLogger, + prepare_model_outputs, + prepare_model_with_stubs, + Shadow, + ShadowLogger, +) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fake_quantize.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fake_quantize.py new file mode 100644 index 0000000000000000000000000000000000000000..69a5d730bfb68e89e24beb04ad13fd3fa5881ae9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fake_quantize.py @@ -0,0 +1,32 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +`torch/ao/quantization/fake_quantize.py`, while adding an import statement +here. +""" + +from torch.ao.quantization.fake_quantize import ( + _is_fake_quant_script_module, + _is_per_channel, + _is_per_tensor, + _is_symmetric_quant, + default_fake_quant, + default_fixed_qparams_range_0to1_fake_quant, + default_fixed_qparams_range_neg1to1_fake_quant, + default_fused_act_fake_quant, + default_fused_per_channel_wt_fake_quant, + default_fused_wt_fake_quant, + default_histogram_fake_quant, + default_per_channel_weight_fake_quant, + default_weight_fake_quant, + disable_fake_quant, + disable_observer, + enable_fake_quant, + enable_observer, + FakeQuantize, + FakeQuantizeBase, + FixedQParamsFakeQuantize, + FusedMovingAvgObsFakeQuantize, +) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fuse_modules.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fuse_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..6b704fa8094e8b367e9eba47102863ba845415b9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fuse_modules.py @@ -0,0 +1,22 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +`torch/ao/quantization/fuse_modules.py`, while adding an import statement +here. +""" + +# TODO: These functions are not used outside the `fuse_modules.py` +# Keeping here for now, need to remove them later. +from torch.ao.quantization.fuse_modules import ( + _fuse_modules, + _get_module, + _set_module, + fuse_known_modules, + fuse_modules, + get_fuser_method, +) + +# for backward compatiblity +from torch.ao.quantization.fuser_method_mappings import fuse_conv_bn, fuse_conv_bn_relu diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/_equalize.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/_equalize.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..281b4c8770a119f4c33bb9130f092d92116905af Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/_equalize.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/fusion_patterns.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/fusion_patterns.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25a1dbd691b06742ef92505fb9f06f787d0c6172 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/fusion_patterns.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/pattern_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/pattern_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e152dd07205d0a20400f67caabba463714acea1d Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/pattern_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/_equalize.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/_equalize.py new file mode 100644 index 0000000000000000000000000000000000000000..7acea4f84a2a0a82f134b6790e573f8f1cb677f2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/_equalize.py @@ -0,0 +1,38 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate files under `torch/ao/quantization/fx/`, while adding an import statement +here. +""" +from torch.ao.quantization.fx._equalize import ( + _convert_equalization_ref, + _InputEqualizationObserver, + _WeightEqualizationObserver, + calculate_equalization_scale, + clear_weight_quant_obs_node, + convert_eq_obs, + CUSTOM_MODULE_SUPP_LIST, + custom_module_supports_equalization, + default_equalization_qconfig, + EqualizationQConfig, + fused_module_supports_equalization, + get_equalization_qconfig_dict, + get_layer_sqnr_dict, + get_op_node_and_weight_eq_obs, + input_equalization_observer, + is_equalization_observer, + maybe_get_next_equalization_scale, + maybe_get_next_input_eq_obs, + maybe_get_weight_eq_obs_node, + nn_module_supports_equalization, + node_supports_equalization, + remove_node, + reshape_scale, + scale_input_observer, + scale_weight_functional, + scale_weight_node, + update_obs_for_equalization, + weight_equalization_observer, +) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fuse.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fuse.py new file mode 100644 index 0000000000000000000000000000000000000000..67527080304fb31ddc54fe254533e2196f77a616 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fuse.py @@ -0,0 +1,9 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate files under `torch/ao/quantization/fx/`, while adding an import statement +here. +""" +from torch.ao.quantization.fx.fuse import fuse diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fusion_patterns.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fusion_patterns.py new file mode 100644 index 0000000000000000000000000000000000000000..e29337b3f861e5b54dc9f37d39d12ad975ad1315 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fusion_patterns.py @@ -0,0 +1,9 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate files under `torch/ao/quantization/fx/`, while adding an import statement +here. +""" +from torch.ao.quantization.fx.fuse_handler import DefaultFuseHandler, FuseHandler diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/quantization_types.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/quantization_types.py new file mode 100644 index 0000000000000000000000000000000000000000..a422cdd3142e04c8d16f495cc6cd65823451810b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/quantization_types.py @@ -0,0 +1,9 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate files under `torch/ao/quantization/fx/`, while adding an import statement +here. +""" +from torch.ao.quantization.utils import Pattern, QuantizerCls diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ef35559884b7c430f1d5c72b21f72979108469a5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/utils.py @@ -0,0 +1,20 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate files under `torch/ao/quantization/fx/`, while adding an import statement +here. +""" +from torch.ao.quantization.fx.utils import ( + all_node_args_have_no_tensors, + assert_and_get_unique_device, + create_getattr_from_value, + get_custom_module_class_keys, + get_linear_prepack_op_for_dtype, + get_new_attr_name_with_prefix, + get_non_observable_arg_indexes_and_types, + get_qconv_prepack_op, + graph_module_from_producer_nodes, + maybe_get_next_module, +) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/qconfig.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/qconfig.py new file mode 100644 index 0000000000000000000000000000000000000000..6bb7e14110cb9cdc4e9c2c418c6776ea6445f0d3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/qconfig.py @@ -0,0 +1,30 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +`torch/ao/quantization/qconfig.py`, while adding an import statement +here. +""" +from torch.ao.quantization.qconfig import ( + _add_module_to_qconfig_obs_ctr, + _assert_valid_qconfig, + default_activation_only_qconfig, + default_debug_qconfig, + default_dynamic_qconfig, + default_per_channel_qconfig, + default_qat_qconfig, + default_qat_qconfig_v2, + default_qconfig, + default_weight_only_qconfig, + float16_dynamic_qconfig, + float16_static_qconfig, + float_qparams_weight_only_qconfig, + get_default_qat_qconfig, + get_default_qconfig, + per_channel_dynamic_qconfig, + QConfig, + qconfig_equals, + QConfigAny, + QConfigDynamic, +) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quant_type.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quant_type.py new file mode 100644 index 0000000000000000000000000000000000000000..8555f03792661f39c85c8facf3f911786cc25d0f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quant_type.py @@ -0,0 +1,10 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +`torch/ao/quantization/quant_type.py`, while adding an import statement +here. +""" + +from torch.ao.quantization.quant_type import _get_quant_type_to_str, QuantType diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantization_mappings.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantization_mappings.py new file mode 100644 index 0000000000000000000000000000000000000000..8b44a980ce82fbfa5a81ad906499806cf99b876f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantization_mappings.py @@ -0,0 +1,29 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +`torch/ao/quantization/quantization_mappings.py`, while adding an import statement +here. +""" +from torch.ao.quantization.quantization_mappings import ( + _get_special_act_post_process, + _has_special_act_post_process, + _INCLUDE_QCONFIG_PROPAGATE_LIST, + DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS, + DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS, + DEFAULT_MODULE_TO_ACT_POST_PROCESS, + DEFAULT_QAT_MODULE_MAPPINGS, + DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS, + DEFAULT_STATIC_QUANT_MODULE_MAPPINGS, + get_default_compare_output_module_list, + get_default_dynamic_quant_module_mappings, + get_default_float_to_quantized_operator_mappings, + get_default_qat_module_mappings, + get_default_qconfig_propagation_list, + get_default_static_quant_module_mappings, + get_dynamic_quant_module_class, + get_quantized_operator, + get_static_quant_module_class, + no_observer_set, +) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantize.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantize.py new file mode 100644 index 0000000000000000000000000000000000000000..600d3a46fed0346e3ae8909872cd5bf3c733860c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantize.py @@ -0,0 +1,30 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +`torch/ao/quantization/quantize.py`, while adding an import statement +here. +""" + +from torch.ao.quantization.quantize import ( + _add_observer_, + _convert, + _get_observer_dict, + _get_unique_devices_, + _is_activation_post_process, + _observer_forward_hook, + _propagate_qconfig_helper, + _register_activation_post_process_hook, + _remove_activation_post_process, + _remove_qconfig, + add_quant_dequant, + convert, + prepare, + prepare_qat, + propagate_qconfig_, + quantize, + quantize_dynamic, + quantize_qat, + swap_module, +) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/stubs.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/stubs.py new file mode 100644 index 0000000000000000000000000000000000000000..d3fd5c63683dc572c35cabc202ee4ddb2b0053c6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/stubs.py @@ -0,0 +1,10 @@ +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +`torch/ao/quantization/stubs.py`, while adding an import statement +here. +""" + +from torch.ao.quantization.stubs import DeQuantStub, QuantStub, QuantWrapper diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7d51d58f38d7462713f84ab62427852c1dd8e52c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/utils.py @@ -0,0 +1,29 @@ +# flake8: noqa: F401 +r""" +Utils shared by different modes of quantization (eager/graph) + +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +`torch/ao/quantization/utils.py`, while adding an import statement +here. +""" + +from torch.ao.quantization.utils import ( + activation_dtype, + activation_is_int8_quantized, + activation_is_statically_quantized, + calculate_qmin_qmax, + check_min_max_valid, + get_combined_dict, + get_qconfig_dtypes, + get_qparam_dict, + get_quant_type, + get_swapped_custom_module_class, + getattr_from_fqn, + is_per_channel, + is_per_tensor, + weight_dtype, + weight_is_quantized, + weight_is_statically_quantized, +)