diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..701035b7b2453910a45df00470c521178b2a6c7b
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_memory_viz.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_memory_viz.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..631f443a5e7b891e8071b7eb086251f25fdf70ba
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_memory_viz.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00a65c53fca2def72ca9c16b610f7dda404eeed3
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..722ada8e038f3905c8130265c1468e8b1f6327d7
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/_utils.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/comm.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/comm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..575912000bef53c7862e759ddf94dfe24e176e1a
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/comm.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/nvtx.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/nvtx.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5517ebbb453d3ccb03eb8b0fa83b1198c56bd469
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/nvtx.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/profiler.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/profiler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c8bb6f72860188fc180630fb42f1c03c298e46e1
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/profiler.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/sparse.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/sparse.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d08365e63a74ea1b351a5635078ee211861fa3f
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/__pycache__/sparse.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/autocast_mode.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/autocast_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..88ff04d86648806a21b180ae79e6a58bf5b22685
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/amp/autocast_mode.py
@@ -0,0 +1,144 @@
+import collections
+import functools
+
+import torch
+
+try:
+    import numpy as np
+except ModuleNotFoundError:
+    # numpy is optional: leave ``np`` as None so the flag below reports its absence.
+    np = None  # type: ignore[assignment]
+HAS_NUMPY = np is not None  # set on both paths; previously undefined without numpy
+from typing import Any
+
+__all__ = ["autocast", "custom_fwd", "custom_bwd"]
+
+
+class autocast(torch.amp.autocast_mode.autocast):
+    r"""CUDA-specific variant of :class:`torch.autocast`; see that class for the full contract.
+
+    ``torch.cuda.amp.autocast(args...)`` is equivalent to ``torch.autocast("cuda", args...)``
+    """
+
+    def __init__(
+        self,
+        enabled: bool = True,
+        dtype: torch.dtype = torch.float16,
+        cache_enabled: bool = True,
+    ):
+        if torch._jit_internal.is_scripting():  # scripting: record the settings, skip base init
+            self._enabled = enabled
+            self.device = "cuda"
+            self.fast_dtype = dtype
+            return
+        super().__init__(
+            "cuda", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled
+        )
+
+    def __enter__(self):
+        if torch._jit_internal.is_scripting():  # scripting: hand back self without delegating
+            return self
+        return super().__enter__()
+
+    # TODO: discuss a unified TorchScript-friendly API for autocast
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any):  # type: ignore[override]
+        if torch._jit_internal.is_scripting():  # scripting: return None without delegating
+            return
+        return super().__exit__(exc_type, exc_val, exc_tb)
+
+    def __call__(self, func):
+        if torch._jit_internal.is_scripting():  # scripting: leave func undecorated
+            return func
+        return super().__call__(func)
+
+
+# Casts Tensors and containers of Tensors.  Special-cases passthroughs for strings and np.ndarrays, which
+# may be falsely detected as "Iterables."
+def _cast(value, dtype):
+    """Cast eligible Tensors in ``value`` to ``dtype``, recursing into containers.
+
+    Strings, bytes and np.ndarrays are returned as-is rather than iterated.
+    """
+    if isinstance(value, torch.Tensor):
+        # Only floating-point CUDA tensors other than float64 are converted.
+        if not (value.is_floating_point() and value.is_cuda):
+            return value
+        return value if value.dtype is torch.float64 else value.to(dtype)
+    # Iterable-looking leaves that must pass through untouched.
+    if isinstance(value, (str, bytes)):
+        return value
+    if HAS_NUMPY and isinstance(value, np.ndarray):
+        return value
+    if isinstance(value, collections.abc.Mapping):
+        return {_cast(key, dtype): _cast(item, dtype) for key, item in value.items()}
+    if not isinstance(value, collections.abc.Iterable):
+        return value
+    casted = (_cast(item, dtype) for item in value)
+    # Lists and tuples are rebuilt as their own type; other iterables stay lazy generators.
+    return type(value)(casted) if isinstance(value, (list, tuple)) else casted
+
+
+# custom_fwd is a decorator that may or may not be used with arguments, following
+# https://github.com/dabeaz/python-cookbook/tree/master/src/9/defining_a_decorator_that_takes_an_optional_argument.
+# this works:
+#     @custom_fwd
+#     def forward(...):
+# this also works:
+#     @custom_fwd(cast_inputs=torch.float)
+#     def forward(...):
+def custom_fwd(fwd=None, *, cast_inputs=None):
+    """
+    Create a helper decorator for ``forward`` methods of custom autograd functions.
+
+    Autograd functions are subclasses of :class:`torch.autograd.Function`.
+    See the :ref:`example page<amp-custom-examples>` for more detail.
+
+    Args:
+        cast_inputs (:class:`torch.dtype` or None, optional, default=None):  When set and
+            ``forward`` runs inside an autocast-enabled region, incoming floating-point
+            CUDA Tensors are cast to this dtype (other Tensors are left alone) and
+            ``forward`` then executes with autocast disabled.
+            When ``None``, ``forward``'s internal ops run under the current autocast state.
+
+    .. note::
+        Outside an autocast-enabled region the decorated ``forward`` runs unchanged:
+        :func:`custom_fwd<custom_fwd>` is a no-op and ``cast_inputs`` has no effect.
+    """
+    # Invoked with arguments, as in ``@custom_fwd(cast_inputs=...)``: hand back a
+    # decorator that is still waiting to receive ``fwd``.
+    if fwd is None:
+        return functools.partial(custom_fwd, cast_inputs=cast_inputs)
+
+    @functools.wraps(fwd)
+    def decorate_fwd(*args, **kwargs):
+        # The values stashed on args[0] are read back by custom_bwd's wrapper.
+        args[0]._dtype = torch.get_autocast_gpu_dtype()
+        autocast_active = torch.is_autocast_enabled()
+        # With cast_inputs set, forward never runs under autocast, so record False.
+        args[0]._fwd_used_autocast = autocast_active and cast_inputs is None
+        if cast_inputs is None or not autocast_active:
+            return fwd(*args, **kwargs)
+        # Cast the inputs once up front, then run forward with autocast switched off.
+        with autocast(enabled=False):
+            return fwd(*_cast(args, cast_inputs), **_cast(kwargs, cast_inputs))
+
+    return decorate_fwd
+
+
+# Autograd ensures incoming gradients are the same type as forward outputs.  Allowing a separate
+# cast_inputs argument on custom_bwd is unnecessary and could cause errors if it doesn't match
+# cast_inputs supplied to custom_fwd.
+def custom_bwd(bwd):
+    """Create a helper decorator for ``backward`` methods of custom autograd functions.
+
+    Runs ``backward`` (of a :class:`torch.autograd.Function` subclass) under the autocast state
+    that ``forward`` used; see the :ref:`example page<amp-custom-examples>` for more detail.
+    """
+
+    @functools.wraps(bwd)
+    def decorate_bwd(*args, **kwargs):
+        ctx = args[0]
+        with autocast(enabled=ctx._fwd_used_autocast, dtype=ctx._dtype):
+            return bwd(*args, **kwargs)
+
+    return decorate_bwd
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/error.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/error.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/graphs.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/graphs.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3bfbab6ad1617386d77127c24450f46302e0ed6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/graphs.py
@@ -0,0 +1,479 @@
+import gc
+from typing import Optional
+
+import torch
+from torch.utils import _pytree
+from .._utils import _dummy_type
+
+if not hasattr(torch._C, "_CudaStreamBase"):
+    # torch._C lacks "_CudaStreamBase": install _dummy_type stand-ins so the torch._C import below resolves
+    torch._C.__dict__["_CUDAGraph"] = _dummy_type("_CUDAGraph")
+    torch._C.__dict__["_graph_pool_handle"] = _dummy_type("_graph_pool_handle")
+    torch._C.__dict__["_cuda_isCurrentStreamCapturing"] = _dummy_type(
+        "_cuda_isCurrentStreamCapturing"
+    )
+
+from torch._C import (  # noqa: F401
+    _cuda_isCurrentStreamCapturing,
+    _CUDAGraph,
+    _graph_pool_handle,
+)
+
+
+def is_current_stream_capturing() -> bool:
+    r"""Return True if CUDA graph capture is underway on the current CUDA stream, False otherwise.
+
+    If a CUDA context does not exist on the current device, returns False without initializing the context.
+    """
+    return _cuda_isCurrentStreamCapturing()  # thin wrapper over the name imported from torch._C
+
+
+# Python shim helps Sphinx process docstrings more reliably.
+def graph_pool_handle():
+    r"""Return an opaque token representing the id of a graph memory pool.
+
+    See :ref:`Graph memory management<graph-memory-management>`.
+
+    .. warning::
+        This API is in beta and may change in future releases.
+    """
+    return _graph_pool_handle()  # delegates to the name imported from torch._C
+
+
+# Python shim helps Sphinx process docstrings more reliably.
+class CUDAGraph(torch._C._CUDAGraph):
+    r"""Wrapper around a CUDA graph; every method forwards to ``torch._C._CUDAGraph``.
+
+    .. warning::
+        This API is in beta and may change in future releases.
+    """
+
+    def __new__(cls):
+        return super().__new__(cls)  # no customization: defers to the base class __new__
+
+    def capture_begin(self, pool=None, capture_error_mode="global"):
+        r"""Begin capturing CUDA work on the current stream.
+
+        Typically, you shouldn't call ``capture_begin`` yourself.
+        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
+        which call ``capture_begin`` internally.
+
+        Arguments:
+            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
+                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
+                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
+            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
+                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
+                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
+                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
+                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
+        """  # noqa: B950
+        super().capture_begin(pool=pool, capture_error_mode=capture_error_mode)
+
+    def capture_end(self):
+        r"""End CUDA graph capture on the current stream.
+
+        After ``capture_end``, ``replay`` may be called on this instance.
+
+        Typically, you shouldn't call ``capture_end`` yourself.
+        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
+        which call ``capture_end`` internally.
+        """
+        super().capture_end()
+
+    def replay(self):
+        r"""Replay the CUDA work captured by this graph."""
+        super().replay()
+
+    def reset(self):
+        r"""Delete the graph currently held by this instance."""
+        super().reset()
+
+    def pool(self):
+        r"""Return an opaque token representing the id of this graph's memory pool.
+
+        This id can optionally be passed to another graph's ``capture_begin``,
+        which hints the other graph may share the same memory pool.
+        """
+        return super().pool()
+
+    def enable_debug_mode(self):
+        r"""Enable debugging mode for CUDAGraph.debug_dump."""
+        return super().enable_debug_mode()
+
+    def debug_dump(self, debug_path):
+        r"""Call a debugging function to dump the graph, if debugging is enabled.
+
+        Debugging is enabled via CUDAGraph.enable_debug_mode().
+
+        Arguments:
+            debug_path (required): Path to dump the graph to.
+        """
+        return super().debug_dump(debug_path)
+
+
+class graph:
+    r"""Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay.
+
+    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
+    detailed use, and constraints.
+
+    Arguments:
+        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
+        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
+            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
+            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
+        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
+            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
+        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
+            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
+            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
+            actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
+            unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
+
+    .. note::
+        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
+        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.
+
+    .. warning::
+        This API is in beta and may change in future releases.
+
+    .. _cudaStreamCaptureMode:
+        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
+    """  # noqa: B950
+
+    default_capture_stream: Optional["torch.cuda.Stream"] = None
+
+    def __init__(
+        self,
+        cuda_graph,
+        pool=None,
+        stream=None,
+        capture_error_mode: str = "global",
+    ):
+        # Lazy-init of default_capture_stream helps avoid circular-import errors. Not thread safe, but graphs
+        # already have the documented restriction that only one capture may be underway at a time in the process.
+        if self.__class__.default_capture_stream is None:
+            self.__class__.default_capture_stream = torch.cuda.Stream()
+
+        self.pool = () if pool is None else (pool,)
+        self.capture_stream = (
+            stream if stream is not None else self.__class__.default_capture_stream
+        )
+        assert self.capture_stream is not None
+        self.stream_ctx = torch.cuda.stream(self.capture_stream)
+        self.cuda_graph = cuda_graph
+        self.capture_error_mode = capture_error_mode
+
+    def __enter__(self):
+        # Free as much memory as we can for the graph
+        torch.cuda.synchronize()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        # Enter the stream context by hand; its matching __exit__ call is made in self.__exit__.
+        self.stream_ctx.__enter__()
+
+        self.cuda_graph.capture_begin(
+            *self.pool, capture_error_mode=self.capture_error_mode
+        )
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        try:
+            self.cuda_graph.capture_end()
+        finally:
+            # Always leave the stream context, even if capture_end raised; returning None propagates errors.
+            self.stream_ctx.__exit__(exc_type, exc_value, traceback)
+
+
+def make_graphed_callables(
+    callables, sample_args, num_warmup_iters=3, allow_unused_input=False, pool=None
+):
+    r"""Accept callables (functions or :class:`nn.Module<torch.nn.Module>`\ s) and return graphed versions.
+
+    Each graphed callable's forward pass runs its source callable's
+    forward CUDA work as a CUDA graph inside a single autograd node.
+
+    The graphed callable's forward pass also appends
+    a backward node to the autograd graph. During backward, this node runs the
+    callable's backward work as a CUDA graph.
+
+    Therefore, each graphed callable should be a drop-in replacement for its source callable
+    in an autograd-enabled training loop.
+
+    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.
+
+    If you pass a tuple of several callables, their captures will use the same memory pool.
+    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.
+
+    Arguments:
+        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
+            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
+            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
+            they'll run in the live workload.
+        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Sample args for each callable.
+            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
+            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
+        num_warmup_iters (int): The number of warmup iterations. Currently, ``DistributedDataParallel`` needs
+            11 iterations for warm up. Default: ``3``.
+        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
+            (and therefore their grad is always zero) is an error. Defaults to False.
+        pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
+            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
+            with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
+    .. note::
+        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
+        that's expected for the corresponding real input in the training loop.
+
+    .. warning::
+        This API is in beta and may change in future releases.
+
+    .. warning::
+        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.
+
+    .. warning::
+        Returned callables do not support higher order differentiation (e.g., double backward).
+
+    .. warning::
+        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
+        may be trainable. Buffers must have ``requires_grad=False``.
+
+    .. warning::
+        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
+        you may not add or remove any of that Module's parameters or buffers.
+
+    .. warning::
+        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
+        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
+        through :func:`~torch.cuda.make_graphed_callables` is allowed.
+
+    .. warning::
+        When running a graphed callable, you must pass its arguments in the same order and format
+        they appeared in that callable's ``sample_args``.
+
+    .. warning::
+        The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled
+        caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
+    """
+    if torch.is_autocast_enabled() and torch.is_autocast_cache_enabled():
+        raise RuntimeError(
+            "make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`."
+        )
+
+    just_one_callable = False
+
+    if not isinstance(callables, tuple):
+        just_one_callable = True
+        callables = (callables,)
+        sample_args = (sample_args,)
+
+    flatten_sample_args = []
+
+    for c, args in zip(callables, sample_args):
+        if isinstance(c, torch.nn.Module):
+            assert (
+                len(c._backward_hooks) == 0
+                and len(c._forward_hooks) == 0
+                and len(c._forward_pre_hooks) == 0
+            ), (
+                "Modules must not have hooks registered at the time they are passed. However, registering hooks "
+                + "on modules after passing them through make_graphed_callables is allowed."
+            )
+            assert all(b.requires_grad is False for b in c.buffers()), (
+                "In any :class:`~torch.nn.Module` passed to "
+                + ":func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have "
+                + "``requires_grad=False``."
+            )
+        flatten_arg = _pytree.arg_tree_leaves(*args)
+        flatten_sample_args.append(tuple(flatten_arg))
+        assert all(isinstance(arg, torch.Tensor) for arg in flatten_arg), (
+            "In the beta API, sample_args "
+            + "for each callable must contain only Tensors. Other types are not allowed."
+        )
+
+    # If a callable is an nn.Module, its graph's full input surface is the args the user explicitly
+    # passes to forward (ie, its sample_args) AND the module's parameter attributes.
+    per_callable_len_user_args = [len(args) for args in flatten_sample_args]
+    per_callable_module_params = [
+        tuple(c.parameters()) if isinstance(c, torch.nn.Module) else ()
+        for c in callables
+    ]
+    per_callable_static_input_surfaces = [
+        flatten_sample_args[i] + per_callable_module_params[i]
+        for i in range(len(callables))
+    ]
+
+    fwd_graphs = [torch.cuda.CUDAGraph() for _ in range(len(callables))]
+    bwd_graphs = [torch.cuda.CUDAGraph() for _ in range(len(callables))]
+
+    mempool = graph_pool_handle() if pool is None else pool
+
+    # Warmup
+    # Hopefully prevents cudnn benchmarking and other lazy-initialization cuda work
+    # from ending up in any captures.
+    torch.cuda.synchronize()
+    with torch.cuda.stream(torch.cuda.Stream()):
+        for func, args, static_input_surface in zip(
+            callables, sample_args, per_callable_static_input_surfaces
+        ):
+            for _ in range(num_warmup_iters):
+                outputs = _pytree.tree_leaves(func(*args))
+                grad_inputs = torch.autograd.grad(
+                    outputs=tuple(o for o in outputs if o.requires_grad),
+                    inputs=tuple(i for i in static_input_surface if i.requires_grad),
+                    grad_outputs=tuple(
+                        torch.empty_like(o) for o in outputs if o.requires_grad
+                    ),
+                    only_inputs=True,
+                    allow_unused=allow_unused_input,
+                )
+            del outputs, grad_inputs  # type: ignore[possibly-undefined]
+    torch.cuda.synchronize()
+
+    # All captures here share a mempool. To avoid replays corrupting each other's memory,
+    # the safest approach is to capture all passes in the same order they'll run:
+    # fwd 1, fwd 2, ... fwd N, then bwd N, bwd N-1, ... bwd 1.
+
+    # Capture forward graphs
+    per_callable_static_outputs = []
+    per_callable_output_unflatten_spec = []
+    for func, args, fwd_graph in zip(callables, sample_args, fwd_graphs):
+        with torch.cuda.graph(fwd_graph, pool=mempool):
+            outputs = func(*args)
+
+        flatten_outputs, spec = _pytree.tree_flatten(outputs)
+        per_callable_static_outputs.append(tuple(flatten_outputs))
+        per_callable_output_unflatten_spec.append(spec)
+
+    # Capture backward graphs in reverse order
+    per_callable_static_grad_outputs = []
+    per_callable_static_grad_inputs = []
+    for static_input_surface, static_outputs, bwd_graph, module_params in zip(
+        reversed(per_callable_static_input_surfaces),
+        reversed(per_callable_static_outputs),
+        reversed(bwd_graphs),
+        reversed(per_callable_module_params),
+    ):
+        # For now, assumes all static_outputs require grad
+        # assert all(o.requires_grad for o in static_outputs), "Outputs of graphed callables must require grad."
+        static_grad_outputs = tuple(
+            torch.empty_like(o) if o.requires_grad else None for o in static_outputs
+        )
+
+        with torch.cuda.graph(bwd_graph, pool=mempool):
+            grad_inputs = torch.autograd.grad(
+                outputs=tuple(o for o in static_outputs if o.requires_grad),
+                inputs=tuple(i for i in static_input_surface if i.requires_grad),
+                grad_outputs=tuple(o for o in static_grad_outputs if o is not None),
+                only_inputs=True,
+                allow_unused=allow_unused_input,
+            )
+
+        # Constructs a tuple suitable for returning from Graphed.backward:
+        # one slot per entry of static_input_surface, holding the captured grad when that input
+        # requires grad and None otherwise (grad_inputs only covers the requires_grad inputs).
+        static_grad_inputs = []
+        grad_idx = 0
+        for arg in static_input_surface:
+            if arg.requires_grad:
+                static_grad_inputs.append(grad_inputs[grad_idx])
+                grad_idx += 1
+            else:
+                static_grad_inputs.append(None)  # type: ignore[arg-type]
+        static_grad_inputs = tuple(static_grad_inputs)  # type: ignore[assignment]
+
+        per_callable_static_grad_outputs.append(static_grad_outputs)
+        per_callable_static_grad_inputs.append(static_grad_inputs)
+
+    # The two lists above were filled in reversed callable order; flip them back.
+    per_callable_static_grad_outputs.reverse()
+    per_callable_static_grad_inputs.reverse()
+    # Now for every per_callable list, per_callable_*[i] holds the stuff for the ith callable.
+
+    def make_graphed_autograd_function(
+        fwd_graph,
+        bwd_graph,
+        module_params,
+        len_user_args,
+        output_unflatten_spec,
+        static_input_surface,
+        static_outputs,
+        static_grad_outputs,
+        static_grad_inputs,
+    ):
+        class Graphed(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, *inputs):
+                # At this stage, only the user args may (potentially) be new tensors.
+                for i in range(len_user_args):
+                    if static_input_surface[i].data_ptr() != inputs[i].data_ptr():
+                        static_input_surface[i].copy_(inputs[i])
+                fwd_graph.replay()
+                assert isinstance(static_outputs, tuple)
+                return tuple(o.detach() for o in static_outputs)
+
+            @staticmethod
+            @torch.autograd.function.once_differentiable
+            def backward(ctx, *grads):
+                assert len(grads) == len(static_grad_outputs)
+                for g, grad in zip(static_grad_outputs, grads):
+                    if g is not None:
+                        # don't copy if autograd gods have been kind and the
+                        # incoming grad is already in the right place
+                        if g.data_ptr() != grad.data_ptr():
+                            g.copy_(grad)
+                bwd_graph.replay()
+
+                # Input args that didn't require grad expect a None gradient.
+                assert isinstance(static_grad_inputs, tuple)
+                return tuple(
+                    b.detach() if b is not None else b for b in static_grad_inputs
+                )
+
+        def functionalized(*user_args):
+            # Runs the autograd function with inputs == all inputs to the graph that might require grad
+            # (explicit user args + module parameters)
+            # Assumes module params didn't change since capture.
+            flatten_user_args = _pytree.arg_tree_leaves(*user_args)
+            out = Graphed.apply(*(tuple(flatten_user_args) + module_params))
+            return _pytree.tree_unflatten(out, output_unflatten_spec)
+
+        return functionalized
+
+    # Put together the final graphed callables
+    ret = []
+    for i, func in enumerate(callables):
+        graphed = make_graphed_autograd_function(
+            fwd_graphs[i],
+            bwd_graphs[i],
+            per_callable_module_params[i],
+            per_callable_len_user_args[i],
+            per_callable_output_unflatten_spec[i],
+            per_callable_static_input_surfaces[i],
+            per_callable_static_outputs[i],
+            per_callable_static_grad_outputs[i],
+            per_callable_static_grad_inputs[i],
+        )
+
+        if isinstance(func, torch.nn.Module):
+
+            def make_graphed_forward(func, graph_training_state, graphed, orig_fwd):
+                def new_fwd(*user_args):
+                    # If the module's training-or-eval state matches what we graphed,
+                    # run the graph, otherwise run the original forward method
+                    if func.training == graph_training_state:
+                        return graphed(*user_args)
+                    else:
+                        return orig_fwd(*user_args)
+
+                return new_fwd
+
+            func.forward = make_graphed_forward(func, func.training, graphed, func.forward)  # type: ignore[assignment]
+            ret.append(func)
+        else:
+            ret.append(graphed)
+
+    if just_one_callable:
+        return ret[0]
+
+    return tuple(ret)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/jiterator.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/jiterator.py
new file mode 100644
index 0000000000000000000000000000000000000000..25d25482419e635612855ed402fd02ef58709417
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/jiterator.py
@@ -0,0 +1,185 @@
+import re
+from typing import Callable, List
+
+import torch
+from torch import Tensor
+
+__all__: List[str] = []
+
+
+class _CodeParser:
+    def __init__(self, code_string: str):
+        optional_ws = r"\s*"
+        required_ws = r"\s+"
+        template_params = r"(?P<template_params>\<.+\>)"
+        return_type = r"(?P<return_type>\w+)"
+        function_name = r"(?P<function_name>\w+)"
+        function_params = r"(?P<function_params>\(.+\))"
+        function_body = r"(?P<function_body>\{.+\})"
+
+        pattern = (
+            optional_ws
+            + "template"
+            + optional_ws
+            + template_params
+            + optional_ws
+            + return_type
+            + required_ws
+            + function_name
+            + optional_ws
+            + function_params
+            + optional_ws
+            + function_body
+            + optional_ws
+        )
+
+        result = re.match(
+            pattern, code_string, re.DOTALL
+        )  # DOTALL for matching multiline
+
+        if result is None:
+            raise Exception(
+                f"Couldn't parse code, please check correctness:\n {code_string}"
+            )
+
+        self.template_params = result["template_params"]
+        self.return_type = result["return_type"]
+        self.function_name = result["function_name"]
+        self.function_params = result["function_params"]
+        self.function_body = result["function_body"]
+
+
+class _JittedFunction:
+    def __init__(
+        self, code_string: str, return_by_ref: bool, num_outputs: int, **kwargs
+    ):
+        self.code_string = code_string
+
+        assert (
+            return_by_ref or num_outputs == 1
+        ), "Return by value only works for single output. "
+        self.return_by_ref = return_by_ref
+        self.num_outputs = num_outputs
+
+        parsed_code = _CodeParser(code_string)
+        self.kernel_name = parsed_code.function_name
+
+        self.kwargs_dict = kwargs
+        self.is_cuda_available = torch.cuda.is_available()
+
+    def __call__(self, *tensors: Tensor, **kwargs):
+        # Jiterator follow torch.cuda's lazy initialization behavior
+        # Defer checking cuda's availability at the function invocation time
+        assert (
+            self.is_cuda_available
+        ), "Jiterator is only supported on CUDA and ROCm GPUs, none are available."
+
+        assert len(tensors) <= 8, "jiterator only supports up to 8 tensor inputs."
+
+        expanded_kwargs = self.kwargs_dict.copy()
+        for key, value in kwargs.items():
+            if key in self.kwargs_dict:
+                expanded_kwargs[key] = value
+            else:
+                raise KeyError(f"{key} is not declared in function definition")
+
+        return torch._C._cuda_jiterator_compile_and_launch_kernel(
+            self.code_string,
+            self.kernel_name,
+            self.return_by_ref,
+            self.num_outputs,
+            tensors,
+            expanded_kwargs,
+        )
+
+
+def _create_jit_fn(code_string: str, **kwargs) -> Callable:
+    """
+    Create a jiterator-generated cuda kernel for an elementwise op.
+
+    The code string has to be a valid CUDA function that describes the computation for a single element. The code
+    string has to follow the c++ template pattern, as shown in the example below. This function will be inlined
+    into elementwise kernel template, and compiled on the fly. Compiled kernel will be cached in memory, as well as
+    local temp dir.
+
+    Jiterator-generated kernels accepts noncontiguous tensors, and supports broadcasting and type promotion.
+
+    Args:
+        code_string (str): CUDA code string to be compiled by jiterator. The entry functor must return by value.
+        kwargs (Dict, optional): Keyword arguments for generated function
+
+    Example::
+
+        code_string = "template <typename T> T my_kernel(T x, T y, T alpha) { return -x + alpha * y; }"
+        jitted_fn = create_jit_fn(code_string, alpha=1.0)
+        a = torch.rand(3, device='cuda')
+        b = torch.rand(3, device='cuda')
+        # invoke jitted function like a regular python function
+        result = jitted_fn(a, b, alpha=3.14)
+
+    code_string also allows multiple function definitions, and the last function will be treated as the entry function.
+
+    Example::
+
+        code_string = "template <typename T> T util_fn(T x, T y) { return ::sin(x) + ::cos(y); }"
+        code_string += "template <typename T> T my_kernel(T x, T y, T val) { return ::min(val, util_fn(x, y)); }"
+        jitted_fn = create_jit_fn(code_string, val=0.0)
+        a = torch.rand(3, device='cuda')
+        b = torch.rand(3, device='cuda')
+        # invoke jitted function like a regular python function
+        result = jitted_fn(a, b)  # using default val=0.0
+
+    Jiterator can be used together with python registration to override an operator's cuda kernel.
+    Following example is overriding gelu's cuda kernel with relu.
+
+    Example::
+
+        code_string = "template <typename T> T my_gelu(T a) { return a > 0 ? a : 0; }"
+        my_gelu = create_jit_fn(code_string)
+        my_lib = torch.library.Library("aten", "IMPL")
+        my_lib.impl('aten::gelu', my_gelu, "CUDA")
+        # torch.nn.GELU and torch.nn.function.gelu are now overridden
+        a = torch.rand(3, device='cuda')
+        torch.allclose(torch.nn.functional.gelu(a), torch.nn.functional.relu(a))
+
+    .. warning::
+        This API is in beta and may change in future releases.
+
+    .. warning::
+        This API only supports up to 8 inputs and 1 output
+
+    .. warning::
+        All input tensors must live in CUDA device
+    """
+    return _JittedFunction(code_string, return_by_ref=False, num_outputs=1, **kwargs)
+
+
+def _create_multi_output_jit_fn(
+    code_string: str, num_outputs: int, **kwargs
+) -> Callable:
+    """
+    Create a jiterator-generated cuda kernel for an elementwise op that supports returning one or more outputs.
+
+    Args:
+        code_string (str): CUDA code string to be compiled by jiterator. The entry functor must return value by reference.
+        num_outputs(int): number of outputs return by the kernel
+        kwargs (Dict, optional): Keyword arguments for generated function
+
+    Example::
+
+        code_string = "template <typename T> void my_kernel(T x, T y, T alpha, T& out) { out = -x + alpha * y; }"
+        jitted_fn = create_jit_fn(code_string, alpha=1.0)
+        a = torch.rand(3, device='cuda')
+        b = torch.rand(3, device='cuda')
+        # invoke jitted function like a regular python function
+        result = jitted_fn(a, b, alpha=3.14)
+
+    .. warning::
+        This API is in beta and may change in future releases.
+
+    .. warning::
+        This API only supports up to 8 inputs and 8 outputs
+    """
+    return _JittedFunction(
+        code_string, return_by_ref=True, num_outputs=num_outputs, **kwargs
+    )
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/nccl.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/nccl.py
new file mode 100644
index 0000000000000000000000000000000000000000..05751ab5f87b7042426454e83541d3bebe1861fc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/nccl.py
@@ -0,0 +1,137 @@
+import collections
+import warnings
+from typing import Optional, Sequence, Union
+
+import torch.cuda
+
+
+__all__ = ["all_reduce", "reduce", "broadcast", "all_gather", "reduce_scatter"]
+
+SUM = 0  # ncclRedOp_t
+
+
+def is_available(tensors):
+    if not hasattr(torch._C, "_nccl_all_reduce"):
+        warnings.warn("PyTorch is not compiled with NCCL support")
+        return False
+
+    devices = set()
+    for tensor in tensors:
+        if tensor.is_sparse:
+            return False
+        if not tensor.is_contiguous():
+            return False
+        if not tensor.is_cuda:
+            return False
+        device = tensor.get_device()
+        if device in devices:
+            return False
+        devices.add(device)
+
+    return True
+
+
+def version():
+    ver = torch._C._nccl_version()
+    major = ver >> 32
+    minor = (ver >> 16) & 65535
+    patch = ver & 65535
+    suffix = torch._C._nccl_version_suffix().decode("utf-8")
+    if suffix == "":
+        return (major, minor, patch)
+    else:
+        return (major, minor, patch, suffix)
+
+
+def unique_id():
+    return torch._C._nccl_unique_id()
+
+
+def init_rank(num_ranks, uid, rank):
+    return torch._C._nccl_init_rank(num_ranks, uid, rank)
+
+
+def _check_sequence_type(inputs: Union[torch.Tensor, Sequence[torch.Tensor]]) -> None:
+    if not isinstance(inputs, collections.abc.Container) or isinstance(
+        inputs, torch.Tensor
+    ):
+        raise TypeError("Inputs should be a collection of tensors")
+
+
+def all_reduce(inputs, outputs=None, op=SUM, streams=None, comms=None):
+    _check_sequence_type(inputs)
+    if outputs is None:
+        outputs = inputs
+    _check_sequence_type(outputs)
+    torch._C._nccl_all_reduce(inputs, outputs, op, streams, comms)
+
+
+# `output` used to be `outputs`, taking in a list of tensors. So we have two
+# arguments for BC reasons.
+def reduce(
+    inputs: Sequence[torch.Tensor],
+    output: Optional[Union[torch.Tensor, Sequence[torch.Tensor]]] = None,
+    root: int = 0,
+    op: int = SUM,
+    streams: Optional[Sequence[torch.cuda.Stream]] = None,
+    comms=None,
+    *,
+    outputs: Optional[Sequence[torch.Tensor]] = None,
+) -> None:
+    _check_sequence_type(inputs)
+    _output: torch.Tensor
+    if outputs is not None:
+        if output is not None:
+            raise ValueError(
+                "'output' and 'outputs' can not be both specified. 'outputs' is deprecated in "
+                "favor of 'output', taking in a single output tensor. The signature of reduce is: "
+                "reduce(inputs, output=None, root=0, op=SUM, streams=None, comms=None)."
+            )
+        else:
+            warnings.warn(
+                "nccl.reduce with an output tensor list is deprecated. "
+                "Please specify a single output tensor with argument 'output' instead instead."
+            )
+            _output = outputs[root]
+    elif not isinstance(output, torch.Tensor) and isinstance(
+        output, collections.abc.Sequence
+    ):
+        # User called old API with positional arguments of list of output tensors.
+        warnings.warn(
+            "nccl.reduce with an output tensor list is deprecated. "
+            "Please specify a single output tensor."
+        )
+        _output = output[root]
+    else:
+        _output = inputs[root] if output is None else output
+    torch._C._nccl_reduce(inputs, _output, root, op, streams, comms)
+
+
+def broadcast(
+    inputs: Sequence[torch.Tensor], root: int = 0, streams=None, comms=None
+) -> None:
+    _check_sequence_type(inputs)
+    torch._C._nccl_broadcast(inputs, root, streams, comms)
+
+
+def all_gather(
+    inputs: Sequence[torch.Tensor],
+    outputs: Sequence[torch.Tensor],
+    streams=None,
+    comms=None,
+) -> None:
+    _check_sequence_type(inputs)
+    _check_sequence_type(outputs)
+    torch._C._nccl_all_gather(inputs, outputs, streams, comms)
+
+
+def reduce_scatter(
+    inputs: Sequence[torch.Tensor],
+    outputs: Sequence[torch.Tensor],
+    op: int = SUM,
+    streams=None,
+    comms=None,
+) -> None:
+    _check_sequence_type(inputs)
+    _check_sequence_type(outputs)
+    torch._C._nccl_reduce_scatter(inputs, outputs, op, streams, comms)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/profiler.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..51c8aa46f714b6a9fd30857c9edb575614d52420
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/profiler.py
@@ -0,0 +1,61 @@
+import contextlib
+import tempfile
+
+import torch
+from . import check_error, cudart
+
+__all__ = ["init", "start", "stop", "profile"]
+
+# Profiler option strings used by ``init`` when the caller passes ``flags=None``.
+# ``init`` writes them, one per line, into the config file handed to
+# ``cudaProfilerInitialize``.
+DEFAULT_FLAGS = [
+    "gpustarttimestamp",
+    "gpuendtimestamp",
+    "gridsize3d",
+    "threadblocksize",
+    "streamid",
+    "enableonstart 0",
+    "conckerneltrace",
+]
+
+
+def init(output_file, flags=None, output_mode="key_value"):
+    rt = cudart()
+    if not hasattr(rt, "cudaOutputMode"):
+        raise AssertionError("HIP does not support profiler initialization!")
+    if (
+        hasattr(torch.version, "cuda")
+        and torch.version.cuda is not None
+        and int(torch.version.cuda.split(".")[0]) >= 12
+    ):
+        # Check https://github.com/pytorch/pytorch/pull/91118
+        # cudaProfilerInitialize is no longer needed after CUDA 12
+        raise AssertionError("CUDA12+ does not need profiler initialization!")
+    flags = DEFAULT_FLAGS if flags is None else flags
+    if output_mode == "key_value":
+        output_mode_enum = rt.cudaOutputMode.KeyValuePair
+    elif output_mode == "csv":
+        output_mode_enum = rt.cudaOutputMode.CSV
+    else:
+        raise RuntimeError(
+            "supported CUDA profiler output modes are: key_value and csv"
+        )
+    with tempfile.NamedTemporaryFile(delete=True) as f:
+        f.write(b"\n".join(f.encode("ascii") for f in flags))
+        f.flush()
+        check_error(rt.cudaProfilerInitialize(f.name, output_file, output_mode_enum))
+
+
+def start():
+    check_error(cudart().cudaProfilerStart())
+
+
+def stop():
+    check_error(cudart().cudaProfilerStop())
+
+
+@contextlib.contextmanager
+def profile():
+    """Context manager that brackets its body with ``start()`` and ``stop()``.
+
+    ``stop()`` runs when the body finishes or raises. Because ``start()`` is
+    inside the ``try``, ``stop()`` is also called if ``start()`` itself raises.
+    """
+    try:
+        start()
+        yield
+    finally:
+        stop()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/random.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/random.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cf33114d17bd1867dfc5e5bb9179670291878a2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/random.py
@@ -0,0 +1,179 @@
+from typing import Iterable, List, Union
+
+import torch
+from .. import Tensor
+from . import _lazy_call, _lazy_init, current_device, device_count
+
+__all__ = [
+    "get_rng_state",
+    "get_rng_state_all",
+    "set_rng_state",
+    "set_rng_state_all",
+    "manual_seed",
+    "manual_seed_all",
+    "seed",
+    "seed_all",
+    "initial_seed",
+]
+
+
+def get_rng_state(device: Union[int, str, torch.device] = "cuda") -> Tensor:
+    r"""Return the random number generator state of the specified GPU as a ByteTensor.
+
+    Args:
+        device (torch.device or int, optional): The device to return the RNG state of.
+            Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device).
+
+    .. warning::
+        This function eagerly initializes CUDA.
+    """
+    _lazy_init()
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device("cuda", device)
+    idx = device.index
+    if idx is None:
+        idx = current_device()
+    default_generator = torch.cuda.default_generators[idx]
+    return default_generator.get_state()
+
+
+def get_rng_state_all() -> List[Tensor]:
+    r"""Return a list of ByteTensor representing the random number states of all devices."""
+    results = []
+    for i in range(device_count()):
+        results.append(get_rng_state(i))
+    return results
+
+
+def set_rng_state(
+    new_state: Tensor, device: Union[int, str, torch.device] = "cuda"
+) -> None:
+    r"""Set the random number generator state of the specified GPU.
+
+    Args:
+        new_state (torch.ByteTensor): The desired state
+        device (torch.device or int, optional): The device to set the RNG state.
+            Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device).
+    """
+    with torch._C._DisableFuncTorch():
+        new_state_copy = new_state.clone(memory_format=torch.contiguous_format)
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device("cuda", device)
+
+    def cb():
+        idx = device.index
+        if idx is None:
+            idx = current_device()
+        default_generator = torch.cuda.default_generators[idx]
+        default_generator.set_state(new_state_copy)
+
+    _lazy_call(cb)
+
+
+def set_rng_state_all(new_states: Iterable[Tensor]) -> None:
+    r"""Set the random number generator state of all devices.
+
+    Args:
+        new_states (Iterable of torch.ByteTensor): The desired state for each device.
+    """
+    for i, state in enumerate(new_states):
+        set_rng_state(state, i)
+
+
+def manual_seed(seed: int) -> None:
+    r"""Set the seed for generating random numbers for the current GPU.
+
+    It's safe to call this function if CUDA is not available; in that
+    case, it is silently ignored.
+
+    Args:
+        seed (int): The desired seed.
+
+    .. warning::
+        If you are working with a multi-GPU model, this function is insufficient
+        to get determinism.  To seed all GPUs, use :func:`manual_seed_all`.
+    """
+    seed = int(seed)
+
+    def cb():
+        idx = current_device()
+        default_generator = torch.cuda.default_generators[idx]
+        default_generator.manual_seed(seed)
+
+    _lazy_call(cb, seed=True)
+
+
+def manual_seed_all(seed: int) -> None:
+    r"""Set the seed for generating random numbers on all GPUs.
+
+    It's safe to call this function if CUDA is not available; in that
+    case, it is silently ignored.
+
+    Args:
+        seed (int): The desired seed.
+    """
+    seed = int(seed)
+
+    def cb():
+        for i in range(device_count()):
+            default_generator = torch.cuda.default_generators[i]
+            default_generator.manual_seed(seed)
+
+    _lazy_call(cb, seed_all=True)
+
+
+def seed() -> None:
+    r"""Set the seed for generating random numbers to a random number for the current GPU.
+
+    It's safe to call this function if CUDA is not available; in that
+    case, it is silently ignored.
+
+    .. warning::
+        If you are working with a multi-GPU model, this function will only initialize
+        the seed on one GPU.  To initialize all GPUs, use :func:`seed_all`.
+    """
+
+    def cb():
+        idx = current_device()
+        default_generator = torch.cuda.default_generators[idx]
+        default_generator.seed()
+
+    _lazy_call(cb)
+
+
+def seed_all() -> None:
+    r"""Set the seed for generating random numbers to a random number on all GPUs.
+
+    It's safe to call this function if CUDA is not available; in that
+    case, it is silently ignored.
+    """
+
+    def cb():
+        random_seed = 0
+        seeded = False
+        for i in range(device_count()):
+            default_generator = torch.cuda.default_generators[i]
+            if not seeded:
+                default_generator.seed()
+                random_seed = default_generator.initial_seed()
+                seeded = True
+            else:
+                default_generator.manual_seed(random_seed)
+
+    _lazy_call(cb)
+
+
+def initial_seed() -> int:
+    r"""Return the current random seed of the current GPU.
+
+    .. warning::
+        This function eagerly initializes CUDA.
+    """
+    _lazy_init()
+    idx = current_device()
+    default_generator = torch.cuda.default_generators[idx]
+    return default_generator.initial_seed()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/sparse.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/sparse.py
new file mode 100644
index 0000000000000000000000000000000000000000..f37a34118d2d8f73437dee54337a666df1b99a09
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/sparse.py
@@ -0,0 +1 @@
+# The Tensor classes are added to this module by python_tensor.cpp
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/streams.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/streams.py
new file mode 100644
index 0000000000000000000000000000000000000000..22d541f4e2879ace0cc78766d8a8c1795b0bc4a1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/cuda/streams.py
@@ -0,0 +1,241 @@
+import ctypes
+
+import torch
+from torch._streambase import _EventBase, _StreamBase
+from .._utils import _dummy_type
+
+
+if not hasattr(torch._C, "_CudaStreamBase"):
+    # torch._C does not provide the stream/event base classes (presumably a
+    # build without CUDA support -- confirm). Install placeholder types so the
+    # class statements below can still name them as bases.
+    torch._C.__dict__["_CudaStreamBase"] = _dummy_type("_CudaStreamBase")
+    torch._C.__dict__["_CudaEventBase"] = _dummy_type("_CudaEventBase")
+
+
+class Stream(torch._C._CudaStreamBase, _StreamBase):
+    r"""Wrapper around a CUDA stream.
+
+    A CUDA stream is a linear sequence of execution that belongs to a specific
+    device, independent from other streams.  See :ref:`cuda-semantics` for
+    details.
+
+    Args:
+        device(torch.device or int, optional): a device on which to allocate
+            the stream. If :attr:`device` is ``None`` (default) or a negative
+            integer, this will use the current device.
+        priority(int, optional): priority of the stream, should be 0 or
+            negative, where negative numbers indicate higher priority. By default,
+            streams have priority 0.
+
+    """
+
+    def __new__(cls, device=None, priority=0, **kwargs):
+        # setting device manager is expensive, so we avoid it unless necessary
+        if device is None or ("stream_id" in kwargs and "device_index" in kwargs):
+            return super().__new__(cls, priority=priority, **kwargs)
+        else:
+            with torch.cuda.device(device):
+                return super().__new__(cls, priority=priority, **kwargs)
+
+    def wait_event(self, event):
+        r"""Make all future work submitted to the stream wait for an event.
+
+        Args:
+            event (torch.cuda.Event): an event to wait for.
+
+        .. note:: This is a wrapper around ``cudaStreamWaitEvent()``: see
+           `CUDA Stream documentation`_ for more info.
+
+           This function returns without waiting for :attr:`event`: only future
+           operations are affected.
+
+        .. _CUDA Stream documentation:
+           https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
+        """
+        event.wait(self)
+
+    def wait_stream(self, stream):
+        r"""Synchronize with another stream.
+
+        All future work submitted to this stream will wait until all kernels
+        submitted to a given stream at the time of call complete.
+
+        Args:
+            stream (Stream): a stream to synchronize.
+
+        .. note:: This function returns without waiting for currently enqueued
+           kernels in :attr:`stream`: only future operations are affected.
+        """
+        self.wait_event(stream.record_event())
+
+    def record_event(self, event=None):
+        r"""Record an event.
+
+        Args:
+            event (torch.cuda.Event, optional): event to record. If not given, a new one
+                will be allocated.
+
+        Returns:
+            Recorded event.
+        """
+        if event is None:
+            event = Event()
+        event.record(self)
+        return event
+
+    def query(self):
+        r"""Check if all the work submitted has been completed.
+
+        Returns:
+            A boolean indicating if all kernels in this stream are completed.
+        """
+        return super().query()
+
+    def synchronize(self):
+        r"""Wait for all the kernels in this stream to complete.
+
+        .. note:: This is a wrapper around ``cudaStreamSynchronize()``: see
+           `CUDA Stream documentation`_ for more info.
+        """
+        super().synchronize()
+
+    @property
+    def _as_parameter_(self):
+        return ctypes.c_void_p(self.cuda_stream)
+
+    def __eq__(self, o):
+        if isinstance(o, Stream):
+            return super().__eq__(o)
+        return False
+
+    def __hash__(self):
+        return hash((self.cuda_stream, self.device))
+
+    def __repr__(self):
+        return f"<torch.cuda.Stream device={self.device} cuda_stream={self.cuda_stream:#x}>"
+
+
+class ExternalStream(Stream):
+    r"""Wrapper around an externally allocated CUDA stream.
+
+    This class is used to wrap streams allocated in other libraries in order
+    to facilitate data exchange and multi-library interactions.
+
+    .. note:: This class doesn't manage the stream life-cycle, it is the user
+       responsibility to keep the referenced stream alive while this class is
+       being used.
+
+    Args:
+        stream_ptr(int): Integer representation of the `cudaStream_t` value.
+            allocated externally.
+        device(torch.device or int, optional): the device where the stream
+            was originally allocated. if device is specified incorrectly,
+            subsequent launches using this stream may fail.
+    """
+
+    def __new__(cls, stream_ptr, device=None, **kwargs):
+        with torch.cuda.device(device):
+            return super().__new__(cls, stream_ptr=stream_ptr, **kwargs)
+
+
+class Event(torch._C._CudaEventBase, _EventBase):
+    r"""Wrapper around a CUDA event.
+
+    CUDA events are synchronization markers that can be used to monitor the
+    device's progress, to accurately measure timing, and to synchronize CUDA
+    streams.
+
+    The underlying CUDA events are lazily initialized when the event is first
+    recorded or exported to another process. After creation, only streams on the
+    same device may record the event. However, streams on any device can wait on
+    the event.
+
+    Args:
+        enable_timing (bool, optional): indicates if the event should measure time
+            (default: ``False``)
+        blocking (bool, optional): if ``True``, :meth:`wait` will be blocking (default: ``False``)
+        interprocess (bool): if ``True``, the event can be shared between processes
+            (default: ``False``)
+
+    .. _CUDA Event Documentation:
+       https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html
+    """
+
+    def __new__(cls, enable_timing=False, blocking=False, interprocess=False):
+        return super().__new__(
+            cls,
+            enable_timing=enable_timing,
+            blocking=blocking,
+            interprocess=interprocess,
+        )
+
+    @classmethod
+    def from_ipc_handle(cls, device, handle):
+        r"""Reconstruct an event from an IPC handle on the given device."""
+        return super().from_ipc_handle(device, handle)
+
+    def record(self, stream=None):
+        r"""Record the event in a given stream.
+
+        Uses ``torch.cuda.current_stream()`` if no stream is specified. The
+        stream's device must match the event's device.
+        """
+        if stream is None:
+            stream = torch.cuda.current_stream()
+        super().record(stream)
+
+    def wait(self, stream=None):
+        r"""Make all future work submitted to the given stream wait for this event.
+
+        Use ``torch.cuda.current_stream()`` if no stream is specified.
+
+        .. note:: This is a wrapper around ``cudaStreamWaitEvent()``: see
+            `CUDA Event documentation`_ for more info.
+        """
+        if stream is None:
+            stream = torch.cuda.current_stream()
+        super().wait(stream)
+
+    def query(self):
+        r"""Check if all work currently captured by event has completed.
+
+        Returns:
+            A boolean indicating if all work currently captured by event has
+            completed.
+        """
+        return super().query()
+
+    def elapsed_time(self, end_event):
+        r"""Return the time elapsed.
+
+        Time reported in milliseconds after the event was recorded and
+        before the end_event was recorded.
+        """
+        return super().elapsed_time(end_event)
+
+    def synchronize(self):
+        r"""Wait for the event to complete.
+
+        Waits until the completion of all work currently captured in this event.
+        This prevents the CPU thread from proceeding until the event completes.
+
+         .. note:: This is a wrapper around ``cudaEventSynchronize()``: see
+            `CUDA Event documentation`_ for more info.
+        """
+        super().synchronize()
+
+    def ipc_handle(self):
+        r"""Return an IPC handle of this event.
+
+        If not recorded yet, the event will use the current device.
+        """
+        return super().ipc_handle()
+
+    @property
+    def _as_parameter_(self):
+        return ctypes.c_void_p(self.cuda_event)
+
+    def __repr__(self):
+        if self.cuda_event:
+            return f"<torch.cuda.Event {self._as_parameter_.value:#x}>"
+        else:
+            return "<torch.cuda.Event uninitialized>"
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d705a9fb249d7cdce9053464a2e60a05465676d
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_lazy_graph_module.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_lazy_graph_module.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ffc0c68d05004bf90d8fc1df6986361e58bbca7
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/_lazy_graph_module.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/graph.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/graph.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53c8e3715798203db22746633ead7109dca0f29e
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/graph.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/interpreter.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/interpreter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..457c770debb164ef3d4f97112c4decb46228cfcd
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/interpreter.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/operator_schemas.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/operator_schemas.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df824888a7ee5470401861115fb55164a2f50431
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/operator_schemas.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/subgraph_rewriter.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/subgraph_rewriter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e481f445b31589a93acf94bc403c2ee41fce58ae
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/fx/__pycache__/subgraph_rewriter.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ATen.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ATen.h
new file mode 100644
index 0000000000000000000000000000000000000000..effdd469d19b91316aa21ae99d43055f49c950eb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ATen.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#if !defined(_MSC_VER) && __cplusplus < 201703L
+#error C++17 or later compatible compiler is required to use ATen.
+#endif
+
+#include <ATen/Context.h>
+#include <ATen/Device.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/DimVector.h>
+#include <ATen/Dispatch.h>
+#include <ATen/Formatting.h>
+#include <ATen/Functions.h>
+#include <ATen/NamedTensor.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/Tensor.h>
+#include <ATen/TensorGeometry.h>
+#include <ATen/TensorIndexing.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/Version.h>
+#include <ATen/core/ATenGeneral.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Scalar.h>
+#include <ATen/core/UnsafeFromTH.h>
+#include <ATen/core/ivalue.h>
+#include <ATen/core/jit_type.h>
+#include <c10/core/Allocator.h>
+#include <c10/core/InferenceMode.h>
+#include <c10/core/Layout.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Exception.h>
+
+// TODO: try to remove this
+// There is some back story, see https://github.com/pytorch/pytorch/issues/48684
+#include <ATen/NativeFunctions.h>
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/AccumulateType.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/AccumulateType.h
new file mode 100644
index 0000000000000000000000000000000000000000..0275ef099b03d714b916b9d0d09c4827724bf58c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/AccumulateType.h
@@ -0,0 +1,153 @@
+#pragma once
+#include <ATen/Config.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/ScalarType.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/Float8_e4m3fn.h>
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e5m2.h>
+#include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/Half.h>
+
+// Defines the accumulation type for a scalar type.
+// Example:
+//   using accscalar_t = acc_type<scalar_t, /*is_cuda*/true>;
+//
+// Accumulation types are an important concept in numeric computing
+// because you frequently want to perform intermediate computations
+// at a higher precision than the input and output precision, to avoid
+// compounding internal rounding errors.  Accumulation is the most
+// well-known intermediate computation (it is of great importance for
+// sum reduction and matrix multiply, for example), but in PyTorch
+// acc_type ends up getting used for all sorts of other intermediate
+// computations, so it perhaps would be more accurately (ahem) called an
+// "accurate" type.  acc_type is especially important for reduced
+// precision operations like float16 and bfloat16, where relatively
+// benign looking inputs can easily end up overflowing/underflowing.
+//
+// acc_type is parametrized by whether or not you are running on CUDA,
+// because on CUDA double precision operations are expensive
+// and so by default, we don't actually want to use double as an
+// acc_type on CUDA.  A lot of things are typed out below, but
+// basically, the table is generated by a few rules:
+//
+//  If bool:
+//      Use 'bool' as acc_type.
+//  If floating point:
+//      If CUDA, use 'float' as acc_type (unless scalar_t is double),
+//      otherwise (CPU) use 'double' (reduced-precision types use 'float')
+//  If integral:
+//      Use 'int64_t' as acc_type
+//
+// You're not forced to use this template; if you happen to know
+// something specific about your use case, you can specify your own
+// desired behavior.  This template, however, will give you a reasonable
+// default that will work for all dtypes supported in PyTorch.
+
+#if defined(__CUDACC__)
+#include <cuda.h>
+#include <cuda_fp16.h>
+#elif defined(__HIPCC__)
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+namespace at {
+
+template <typename T, c10::DeviceType D>
+struct AccumulateTypeDevice {}; // primary template has no ::type; the ACC_TYPE specializations below supply it
+
+template <typename T, bool>
+struct AccumulateType {}; // bool-keyed front end; both bool values are specialized below
+
+template <typename T>
+struct AccumulateType<T, false> { // false: forward to the CPU table
+  using type = typename AccumulateTypeDevice<T, c10::DeviceType::CPU>::type;
+};
+
+template <typename T>
+struct AccumulateType<T, true> { // true: forward to the CUDA table
+  using type = typename AccumulateTypeDevice<T, c10::DeviceType::CUDA>::type;
+};
+
+template <typename T, c10::DeviceType device>
+using acc_type_device = typename AccumulateTypeDevice<T, device>::type; // lookup keyed by device type
+
+template <typename T, bool is_cuda>
+using acc_type = typename AccumulateType<T, is_cuda>::type; // lookup keyed by the is_cuda flag
+
+#define ACC_TYPE(t, acc_t, device_type)         \
+  template <>                                   \
+  struct AccumulateTypeDevice<t, device_type> { \
+    using type = acc_t;                         \
+  };
+#define MPS_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::MPS)
+#define CUDA_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CUDA)
+#define CPU_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CPU)
+
+MPS_ACC_TYPE(BFloat16, float);
+MPS_ACC_TYPE(Half, float);
+MPS_ACC_TYPE(Float8_e5m2, float);
+MPS_ACC_TYPE(Float8_e4m3fn, float);
+MPS_ACC_TYPE(Float8_e5m2fnuz, float);
+MPS_ACC_TYPE(Float8_e4m3fnuz, float);
+MPS_ACC_TYPE(float, float);
+MPS_ACC_TYPE(double, float);
+MPS_ACC_TYPE(int8_t, int64_t);
+MPS_ACC_TYPE(uint8_t, int64_t);
+MPS_ACC_TYPE(char, int64_t);
+MPS_ACC_TYPE(int16_t, int64_t);
+MPS_ACC_TYPE(int32_t, int64_t);
+MPS_ACC_TYPE(int64_t, int64_t);
+MPS_ACC_TYPE(bool, bool);
+MPS_ACC_TYPE(c10::complex<Half>, c10::complex<float>);
+MPS_ACC_TYPE(c10::complex<float>, c10::complex<float>);
+MPS_ACC_TYPE(c10::complex<double>, c10::complex<float>);
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+CUDA_ACC_TYPE(half, float);
+#endif
+CUDA_ACC_TYPE(BFloat16, float);
+CUDA_ACC_TYPE(Half, float);
+CUDA_ACC_TYPE(Float8_e5m2, float);
+CUDA_ACC_TYPE(Float8_e4m3fn, float);
+CUDA_ACC_TYPE(Float8_e5m2fnuz, float);
+CUDA_ACC_TYPE(Float8_e4m3fnuz, float);
+CUDA_ACC_TYPE(float, float);
+CUDA_ACC_TYPE(double, double);
+CUDA_ACC_TYPE(int8_t, int64_t);
+CUDA_ACC_TYPE(uint8_t, int64_t);
+CUDA_ACC_TYPE(char, int64_t);
+CUDA_ACC_TYPE(int16_t, int64_t);
+CUDA_ACC_TYPE(int32_t, int64_t);
+CUDA_ACC_TYPE(int64_t, int64_t);
+CUDA_ACC_TYPE(bool, bool);
+CUDA_ACC_TYPE(c10::complex<Half>, c10::complex<float>);
+CUDA_ACC_TYPE(c10::complex<float>, c10::complex<float>);
+CUDA_ACC_TYPE(c10::complex<double>, c10::complex<double>);
+
+CPU_ACC_TYPE(BFloat16, float);
+CPU_ACC_TYPE(Half, float);
+CPU_ACC_TYPE(Float8_e5m2, float);
+CPU_ACC_TYPE(Float8_e4m3fn, float);
+CPU_ACC_TYPE(Float8_e5m2fnuz, float);
+CPU_ACC_TYPE(Float8_e4m3fnuz, float);
+CPU_ACC_TYPE(float, double);
+CPU_ACC_TYPE(double, double);
+CPU_ACC_TYPE(int8_t, int64_t);
+CPU_ACC_TYPE(uint8_t, int64_t);
+CPU_ACC_TYPE(char, int64_t);
+CPU_ACC_TYPE(int16_t, int64_t);
+CPU_ACC_TYPE(int32_t, int64_t);
+CPU_ACC_TYPE(int64_t, int64_t);
+CPU_ACC_TYPE(bool, bool);
+CPU_ACC_TYPE(c10::complex<Half>, c10::complex<float>);
+CPU_ACC_TYPE(c10::complex<float>, c10::complex<double>);
+CPU_ACC_TYPE(c10::complex<double>, c10::complex<double>);
+
+TORCH_API c10::ScalarType toAccumulateType(
+    c10::ScalarType type,
+    c10::DeviceType device);
+TORCH_API c10::ScalarType toAccumulateType(c10::ScalarType type, bool is_cuda);
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Backend.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Backend.h
new file mode 100644
index 0000000000000000000000000000000000000000..9651469e190085d913ba9b5d1ca02085886fc4e1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Backend.h
@@ -0,0 +1,2 @@
+#pragma once
+#include <c10/core/Backend.h>
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFixedAllocator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFixedAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf621f34cc63735d7f7557f48146bb76467b8afc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CPUFixedAllocator.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/util/Exception.h>
+
+// This file creates a fake allocator that just throws exceptions if
+// it is actually used.
+
+// state passed to the allocator is the std::function<void(void*)> called
+// when the blob is released by ATen
+
+namespace at {
+
+static void* cpu_fixed_malloc(void*, ptrdiff_t) { // return type spelled out (C++ has no implicit int); always raises
+  AT_ERROR("attempting to resize a tensor view of an external blob");
+}
+
+static void* cpu_fixed_realloc(void*, void*, ptrdiff_t) { // same contract as cpu_fixed_malloc: always raises
+  AT_ERROR("attempting to resize a tensor view of an external blob");
+}
+
+static void cpu_fixed_free(void* state, void* allocation) { // state must point to a heap std::function<void(void*)>
+  auto on_release = static_cast<std::function<void(void*)>*>(state); // recover the release callback
+  (*on_release)(allocation); // invoke the callback with the allocation pointer
+  delete on_release; // the callback object is destroyed here, so it must not be reused
+}
+
+static Allocator CPU_fixed_allocator = { // table of the three hooks above, in malloc/realloc/free order
+    cpu_fixed_malloc,
+    cpu_fixed_realloc,
+    cpu_fixed_free};
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CollapseDims.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CollapseDims.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e25112e7d4490096e6340184a3a4813511f93b6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CollapseDims.h
@@ -0,0 +1,94 @@
+#include <c10/util/Exception.h>
+#include <utility>
+
+namespace at {
+
+/*
+[collapse dims] Updates sizes, and strides to reflect a "collapse" of
+the info, possibly excluding the optional excludeDim. A "collapsed" version
+of the info is the fewest dims that order the tensor's elements in the same
+way as the original info. If excludeDim is specified, the collapse is the
+fewest dims that order the tensor's elements as the original and preserve the
+excluded dimension, unless the tensor collapses to a point.
+
+This function returns a pair of values.
+
+1) The (new) index of the preserved dimension if excludeDim is
+specified. 0 if the tensor is collapsed to a point. -1
+otherwise.
+
+2) The new number of dimensions.
+*/
+template <typename T>
+inline std::pair<int64_t, int64_t> collapse_dims(
+    T* sizes, // overwritten in place with the collapsed sizes
+    T* strides, // overwritten in place with the collapsed strides
+    int64_t dims, // number of dimensions described by sizes/strides (passed by value)
+    const int excludeDim = -1) { // -1 means no dimension is excluded
+  TORCH_CHECK(
+      excludeDim >= -1 && excludeDim < dims,
+      "expected excluded dim between -1 and dims - 1");
+
+  int64_t stopDim = (excludeDim == -1) ? dims : excludeDim; // end of the current collapsible run
+  int64_t newIndex = -1; // last slot written in the collapsed layout; -1 = nothing written yet
+  int64_t oldIndex = 0; // next original dimension to read
+  int64_t remappedExcludedDim = -1; // index of excludeDim after collapsing; stays -1 if none was preserved
+
+  while (oldIndex < dims) {
+    // Finds a dimension to collapse into
+    for (; oldIndex < stopDim; ++oldIndex) {
+      if (sizes[oldIndex] == 1) { // size-1 dims are dropped
+        continue;
+      }
+
+      ++newIndex;
+      sizes[newIndex] = sizes[oldIndex];
+      strides[newIndex] = strides[oldIndex];
+      ++oldIndex; // advance manually: the break below skips the for-increment
+      break;
+    }
+
+    // Collapses dims
+    for (; oldIndex < stopDim; ++oldIndex) {
+      if (sizes[oldIndex] == 1) { // size-1 dims are dropped here as well
+        continue;
+      }
+
+      if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) { // outer stride spans the whole inner dim: merge
+        sizes[newIndex] *= sizes[oldIndex];
+        strides[newIndex] = strides[oldIndex];
+      } else { // not mergeable: open a new collapsed dim
+        ++newIndex;
+        sizes[newIndex] = sizes[oldIndex];
+        strides[newIndex] = strides[oldIndex];
+      }
+    }
+
+    // Handles excludeDim being set (oldIndex == excludeDim)
+    if (oldIndex != dims) {
+      // Preserves excluded dimension
+      ++newIndex; // the excluded dim always gets its own slot, even when its size is 1
+      sizes[newIndex] = sizes[oldIndex];
+      strides[newIndex] = strides[oldIndex];
+      remappedExcludedDim = newIndex;
+
+      // Restarts iteration after excludeDim
+      ++oldIndex;
+      stopDim = dims; // the second pass runs to the end of the original dims
+    }
+  }
+
+  // Handles special case of all dims size 1
+  if (newIndex == -1 || (newIndex == 0 && sizes[0] == 1)) { // nothing survived, or only one size-1 dim did
+    dims = 1; // updates the local copy only; the caller receives the count via the return value
+    sizes[0] = 1;
+    strides[0] = 1;
+
+    return std::pair<int64_t, int64_t>(0, 1);
+  }
+
+  dims = newIndex + 1; // collapsed dimension count (newIndex is zero-based)
+  return std::pair<int64_t, int64_t>(remappedExcludedDim, dims);
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..fde0a471ac0135f1dcb55f78e10d0818c5cff2e2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions.h
@@ -0,0 +1,29 @@
+#include <ATen/core/TensorBody.h>
+
+// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
+// Code introduced to avoid cyclic dependency in static dispatch is no longer
+// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
+// to Operators.cpp for supporting multiple backends with multiple kernels.
+//
+// Note [Avoiding Include Cycles In Static Dispatch]
+// In order to avoid #include cycles in the static dispatch build, we've carefully split out
+// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
+//
+// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
+// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods
+//   all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all
+//   directly inlined into TensorBody.h.
+// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API,
+//   which include functions that have defaultable optional<Tensor> arguments.
+//   That requires knowing the full Tensor class definition.
+//
+// We break the cycle by doing the following:
+// - Split out CPUFunctions.h into two files: CPUFunctions.h and CPUFunctions_inl.h
+// - CPUFunctions.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl.h
+// - CPUFunctions_inl.h includes everything else
+// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class,
+//   and then it includes CPUFunctions_inl.h.
+// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly.
+// - This also means that in the static dispatch build, CPUFunctions.h only needs to
+//   #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h.
+#include <ATen/CompositeImplicitAutogradFunctions_inl.h>
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..90ffa6b1eb4a9cc5d64851784113a739e385a77e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from                                  \
+  <ATen/ops/{my_operator}_compositeimplicitautogradnestedtensor_dispatch.h>.                   \
+  See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+#include <ATen/ops/randn_like_compositeimplicitautogradnestedtensor_dispatch.h>
+#include <ATen/ops/reshape_compositeimplicitautogradnestedtensor_dispatch.h>
+#include <ATen/ops/reshape_as_compositeimplicitautogradnestedtensor_dispatch.h>
+#include <ATen/ops/zeros_like_compositeimplicitautogradnestedtensor_dispatch.h>
+
+
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..35ba9f5284376f0d873b33b3f75f7d0608b0dae7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h
@@ -0,0 +1,808 @@
+#pragma once
+
+#include <ATen/core/DeprecatedTypeProperties.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Half.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/complex.h>
+#include <c10/util/string_view.h>
+
+#ifdef __CUDACC__
+#include <cuda.h> // For CUDA_VERSION
+#endif
+
+#ifdef TEMPLATE_SELECTIVE_BUILD
+#include <ATen/selected_mobile_ops.h>
+#else
+namespace at {
+/**
+ * The method should_include_kernel_dtype() returns true/false
+ * based on whether the switching code for a specific dtype should be
+ * included based on build time constants generated from tracing model
+ * execution. This method will be implemented via code-generation and
+ * included in this file when code-gen is ready.
+ */
+inline constexpr bool should_include_kernel_dtype(
+    const char* /*kernel_tag_str*/,
+    at::ScalarType /*scalar_type*/
+) {
+  return true; // fallback used when TEMPLATE_SELECTIVE_BUILD is not defined: both arguments are ignored
+}
+} // namespace at
+#endif
+
+/**
+ * In the Facebook internal build (using BUCK), this macro is enabled by
+ * passing in -c pt.enable_record_kernel_dtype=1 when building the tracer
+ * binary.
+ */
+#if defined ENABLE_RECORD_KERNEL_FUNCTION_DTYPE
+namespace at {
+namespace detail {
+TORCH_API void record_kernel_function_dtype(std::string name);
+}
+} // namespace at
+
+#define RECORD_KERNEL_FUNCTION_DTYPE(NAME, enum_type) \
+  at::detail::record_kernel_function_dtype(           \
+      std::string(NAME) + "$" + toString(enum_type));
+#else
+#define RECORD_KERNEL_FUNCTION_DTYPE(NAME, enum_type)
+#endif
+
+#define AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type)   \
+  do {                                                \
+    if constexpr (!at::should_include_kernel_dtype(   \
+                      at_dispatch_name, enum_type)) { \
+      AT_ERROR(                                       \
+          "dtype '",                                  \
+          toString(enum_type),                        \
+          "' not selected for kernel tag ",           \
+          at_dispatch_name);                          \
+    }                                                 \
+  } while (0)
+
+#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...)           \
+  case enum_type: {                                                     \
+    AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type);                        \
+    using HINT C10_UNUSED = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
+    return __VA_ARGS__();                                               \
+  }
+
+#define AT_DISPATCH_CASE(enum_type, ...) \
+  AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
+
+#define AT_DISPATCH_CASE_QINT(enum_type, scalar_type, ...)            \
+  case enum_type: {                                                   \
+    AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type);                      \
+    using scalar_t = scalar_type;                                     \
+    using underlying_t C10_UNUSED = typename scalar_t::underlying;    \
+    const auto& SCALAR_TYPE C10_UNUSED = enum_type;                   \
+    const auto& UNDERLYING_TYPE C10_UNUSED = toUnderlying(enum_type); \
+    return __VA_ARGS__();                                             \
+  }
+
+#define AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                           \
+    enum_type, scalar_type, bitwidth, qmin, qmax, ...)                \
+  case enum_type: {                                                   \
+    AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type);                      \
+    using scalar_t = scalar_type;                                     \
+    using underlying_t C10_UNUSED = typename scalar_t::underlying;    \
+    const auto& SCALAR_TYPE C10_UNUSED = enum_type;                   \
+    const auto& UNDERLYING_TYPE C10_UNUSED = toUnderlying(enum_type); \
+    C10_UNUSED int bit_width = bitwidth;                              \
+    C10_UNUSED int64_t quant_min = qmin;                              \
+    C10_UNUSED int64_t quant_max = qmax;                              \
+    return __VA_ARGS__();                                             \
+  }
+
+namespace detail { // global-scope namespace; AT_DISPATCH_SWITCH refers to it as ::detail
+
+inline at::ScalarType scalar_type(at::ScalarType s) { // identity overload: argument is already a ScalarType
+  return s;
+}
+
+C10_DEPRECATED_MESSAGE(
+    "passing at::DeprecatedTypeProperties to an AT_DISPATCH macro is deprecated, "
+    "pass an at::ScalarType instead")
+inline at::ScalarType scalar_type(const at::DeprecatedTypeProperties& t) { // deprecated overload: unwraps the ScalarType
+  return t.scalarType();
+}
+
+C10_DEPRECATED_MESSAGE(
+    "AT_DISPATCH_ALL_TYPES_AND_HALF is deprecated, "
+    "use AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, ...) instead")
+inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF() {} // empty body: exists only to carry the deprecation message
+
+C10_DEPRECATED_MESSAGE(
+    "AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX is deprecated, "
+    "use AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, ...) "
+    "instead")
+inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} // empty body: exists only to carry the deprecation message
+
+} // namespace detail
+
+// The AT_DISPATCH_* family of macros provides the ability to
+// conveniently generate specializations of a kernel over all of the
+// dtypes we care about in PyTorch.  We call it "dispatch" because
+// we are "dispatching" to the correct, dtype-specific kernel.
+//
+// A standard usage looks like:
+//
+//      AT_DISPATCH_ALL_TYPES(self.scalar_type(), "op_name", [&] {
+//          // Your code here, with 'scalar_t' now defined to
+//          // be the dtype in question
+//      });
+//
+// There are many variations of this macro, so it's important to
+// understand exactly /which/ dtypes you want to get instantiated, as
+// well as what the "default" set is.
+//
+// The default set of dtypes that are instantiated (e.g., by
+// AT_DISPATCH_ALL_TYPES) are floating point types (float, double),
+// and integral types (int32_t, int64_t, int16_t, int8_t, uint8_t),
+// but NOT booleans (bool), half-precision floats (Half) or
+// complex number (c10::complex<float>, c10::complex<double>).
+// This "cut" is somewhat historical (the default types are the
+// ones that TH historically supported), but it also reflects the
+// fact that the non-default types are "poorly" behaved (booleans
+// are NOT integers mod 2, half precision operations ~essentially
+// don't exist on CPU, complex numbers are an experimental application).
+//
+// Here are the questions you should generally ask to decide which
+// dispatch you want:
+//
+// 1. Is this an integral or floating point specific operation?
+//    (If so, you'll want one of the FLOATING or INTEGRAL macros.)
+//
+// 2. Should half be supported?  (If you're on CPU, the answer is almost
+//    definitely no.  If you do want support, use one of the AND_HALF
+//    macros)
+//
+// Much rarer situations:
+//
+// 3. Should bool be supported?  (You often have to write your kernel
+//    differently if arithmetic operations are involved.)  If so,
+//    Use AT_DISPATCH_ALL_TYPES_AND along with ScalarType::Bool
+//
+// 4. Should complex be supported?  The answer is almost always no,
+//    unless you are working on "generic" code that should work on
+//    all dtypes.
+//
+// Parameters:
+// -----------
+//
+// 1. The NAME argument is a "tag" that is used to trace and then
+//    conditionally compile fragments of the case statements such
+//    that the kernel functions are specialized only for the dtypes
+//    that are needed. The NAME parameter *must* be a build time
+//    const char* (can't be std::string, etc...)
+//
+// Please ensure that the NAME is unique for every implementation
+// or you run the risk of over-including code for the kernel
+// functions. There is no risk of missing out on any code, so
+// it's mostly a risk of a Type-2 error, and not a Type-1 error.
+//
+// Switch-like syntax:
+// -------------------
+// There is also a switch-case like syntax which is useful if a kernel
+// needs to be specialized for particular scalar types
+//
+//      AT_DISPATCH_SWITCH(self.scalar_type(), "op_name",
+//          AT_DISPATCH_CASE_INTEGRAL_TYPES([&] {
+//            op_integral<scalar_t>(iter);
+//          })
+//          AT_DISPATCH_CASE_FLOATING_TYPES([&] {
+//            op_floating<scalar_t>(iter);
+//          })
+//          AT_DISPATCH_CASE(kBool, [&] {
+//            op_bool(iter);
+//          })
+//      );
+//
+// For each AT_DISPATCH_FOO macro, there is a corresponding
+// AT_DISPATCH_CASE_FOO macro which can be used inside of an
+// AT_DISPATCH_SWITCH block.
+
+// NB: the the_type variable is not used, but we have kept it for
+// backwards compatibility.  It's probably not used by anyone though;
+// but we're just being safe (and it doesn't hurt.)  Note we must
+// use it to shut up warnings about unused store.
+
+#define AT_DISPATCH_SWITCH(TYPE, NAME, ...)                                 \
+  [&] {                                                                     \
+    const auto& the_type = TYPE;                                            \
+    constexpr const char* at_dispatch_name = NAME;                          \
+    /* don't use TYPE again in case it is an expensive or side-effect op */ \
+    at::ScalarType _st = ::detail::scalar_type(the_type);                   \
+    RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st);                    \
+    switch (_st) {                                                          \
+      __VA_ARGS__                                                           \
+      default:                                                              \
+        AT_ERROR(                                                           \
+            '"',                                                            \
+            at_dispatch_name,                                               \
+            "\" not implemented for '",                                     \
+            toString(_st),                                                  \
+            "'");                                                           \
+    }                                                                       \
+  }()
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES(...)            \
+  AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND_HALF(...)   \
+  AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                        \
+      TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES_AND_HALF(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_REDUCED_FLOATING_TYPES(...)  \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+
+#define AT_DISPATCH_REDUCED_FLOATING_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE, NAME, AT_DISPATCH_CASE_REDUCED_FLOATING_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                               \
+      TYPE,                                                         \
+      NAME,                                                         \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND(SCALARTYPE, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                                \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND2(       \
+    SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                          \
+      TYPE,                                    \
+      NAME,                                    \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND2(    \
+          SCALARTYPE1, SCALARTYPE2, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND3(   \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)  \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND3(                    \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE,                                                 \
+      NAME,                                                 \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND3(                 \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND4(                \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND4(                                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                    \
+      TYPE,                                                              \
+      NAME,                                                              \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND4(                              \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_COMPLEX_TYPES(...)                    \
+  AT_DISPATCH_CASE(at::ScalarType::ComplexDouble, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::ComplexFloat, __VA_ARGS__)
+
+#define AT_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_COMPLEX_TYPES_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+#define AT_DISPATCH_COMPLEX_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                              \
+      TYPE, NAME, AT_DISPATCH_CASE_COMPLEX_TYPES_AND(SCALARTYPE, __VA_ARGS__))
+
+// Case list made of the AT_DISPATCH_CASE_FLOATING_TYPES cases followed by the
+// AT_DISPATCH_CASE_COMPLEX_TYPES cases. The body is forwarded to both lists.
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)           \
+  AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__)
+
+// Dispatch TYPE over the floating + complex case list.
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                           \
+      TYPE, NAME, AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__))
+
+// Floating + complex case list followed by one extra SCALARTYPE case.
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND1(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__)                \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+// Dispatch TYPE over the floating + complex case list plus SCALARTYPE.
+// Note the argument order: the extra scalar type comes before TYPE and NAME.
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(    \
+    SCALARTYPE, TYPE, NAME, ...)                        \
+  AT_DISPATCH_SWITCH(                                   \
+      TYPE,                                             \
+      NAME,                                             \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND1( \
+          SCALARTYPE, __VA_ARGS__))
+
+// Case list: floating + complex types plus two caller-chosen scalar types.
+// Written as the _AND1 list with one further case appended, which expands to
+// the same cases in the same order as listing each AT_DISPATCH_CASE by hand.
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND2( \
+    SCALARTYPE1, SCALARTYPE2, ...)                        \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND1(       \
+      SCALARTYPE1, __VA_ARGS__)                           \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)
+
+// Dispatch TYPE over the floating + complex cases plus two extra scalar types.
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(    \
+    SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...)          \
+  AT_DISPATCH_SWITCH(                                   \
+      TYPE, NAME,                                       \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND2( \
+          SCALARTYPE1, SCALARTYPE2, __VA_ARGS__))
+
+// Case list: the _AND2 list with a third caller-chosen scalar type appended.
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND3( \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...)           \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND2(       \
+      SCALARTYPE1, SCALARTYPE2, __VA_ARGS__)              \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+// Dispatch TYPE over the floating + complex cases plus three extra scalar
+// types.
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(        \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE, NAME,                                           \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND3(     \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+// Case list: the _AND3 list with a fourth caller-chosen scalar type appended.
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND4(    \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND3(          \
+      SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
+
+// Dispatch TYPE over the floating + complex cases plus four extra scalar
+// types.
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND4(                     \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                    \
+      TYPE, NAME,                                                        \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND4(                  \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
+
+// Case list: floating + complex types plus five caller-chosen scalar types.
+// Written as the _AND4 list with one more case appended; the resulting cases
+// and their order match an explicit one-case-per-line listing.
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND5(                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND4(                       \
+      SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)
+
+// Dispatch TYPE over the floating + complex cases plus five extra scalar
+// types.
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND5(                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, \
+    TYPE, NAME, ...)                                                 \
+  AT_DISPATCH_SWITCH(                                                \
+      TYPE, NAME,                                                    \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND5(              \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4,        \
+          SCALARTYPE5, __VA_ARGS__))
+
+// Case list: the _AND5 list with a sixth caller-chosen scalar type appended.
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND6(            \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, \
+    SCALARTYPE6, ...)                                                \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND5(                  \
+      SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4,            \
+      SCALARTYPE5, __VA_ARGS__)                                      \
+  AT_DISPATCH_CASE(SCALARTYPE6, __VA_ARGS__)
+
+// Dispatch TYPE over the floating + complex cases plus six extra scalar types.
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND6(                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, \
+    SCALARTYPE6, TYPE, NAME, ...)                                    \
+  AT_DISPATCH_SWITCH(                                                \
+      TYPE, NAME,                                                    \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND6(              \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4,        \
+          SCALARTYPE5, SCALARTYPE6, __VA_ARGS__))
+
+// Case list for the five integral dtypes Byte, Char, Int, Long and Short.
+// The variadic arguments (the dispatch body) are forwarded to every case.
+#define AT_DISPATCH_CASE_INTEGRAL_TYPES(...)          \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)
+
+// Dispatch TYPE over the integral case list.
+#define AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
+
+// Integral case list followed by one extra, caller-supplied SCALARTYPE case.
+#define AT_DISPATCH_CASE_INTEGRAL_TYPES_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+// Dispatch TYPE over the integral case list plus SCALARTYPE.
+#define AT_DISPATCH_INTEGRAL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                               \
+      TYPE,                                                         \
+      NAME,                                                         \
+      AT_DISPATCH_CASE_INTEGRAL_TYPES_AND(SCALARTYPE, __VA_ARGS__))
+
+// "All types" in this header means exactly the integral case list followed
+// by the AT_DISPATCH_CASE_FLOATING_TYPES list; nothing else is added here.
+#define AT_DISPATCH_CASE_ALL_TYPES(...)        \
+  AT_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)
+
+// Dispatch TYPE over the integral + floating case list.
+#define AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__))
+
+// Case list for the quantized dtypes kQInt8, kQUInt8 and kQInt32, built with
+// AT_DISPATCH_CASE_QINT rather than AT_DISPATCH_CASE. Each entry passes a
+// dtype constant together with a type name (presumably the matching C++
+// quantized type -- confirm against AT_DISPATCH_CASE_QINT).
+#define AT_DISPATCH_CASE_QINT_TYPES(...)                      \
+  AT_DISPATCH_CASE_QINT(at::kQInt8, at::qint8, __VA_ARGS__)   \
+  AT_DISPATCH_CASE_QINT(at::kQUInt8, at::quint8, __VA_ARGS__) \
+  AT_DISPATCH_CASE_QINT(at::kQInt32, at::qint32, __VA_ARGS__)
+
+// Dispatch TYPE over the three quantized cases above.
+#define AT_DISPATCH_QINT_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_QINT_TYPES(__VA_ARGS__))
+
+// Quantized case list plus one extra SCALARTYPE. The extra type goes through
+// the plain AT_DISPATCH_CASE, not the QINT variant.
+#define AT_DISPATCH_CASE_QINT_TYPES_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_QINT_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+// Dispatch TYPE over the quantized case list plus SCALARTYPE.
+#define AT_DISPATCH_QINT_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                           \
+      TYPE, NAME, AT_DISPATCH_CASE_QINT_TYPES_AND(SCALARTYPE, __VA_ARGS__))
+
+// Case list restricted to the two 8-bit quantized dtypes, kQInt8 and kQUInt8.
+#define AT_DISPATCH_CASE_QINT_BYTE_TYPES(...)               \
+  AT_DISPATCH_CASE_QINT(at::kQInt8, at::qint8, __VA_ARGS__) \
+  AT_DISPATCH_CASE_QINT(at::kQUInt8, at::quint8, __VA_ARGS__)
+
+// Dispatch TYPE over the two 8-bit quantized cases.
+#define AT_DISPATCH_QINT_BYTE_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_QINT_BYTE_TYPES(__VA_ARGS__))
+
+// Case list covering kQInt8, kQUInt8, kQInt32 and the sub-byte dtypes
+// kQUInt4x2 and kQUInt2x4. Every entry passes five values ahead of the body:
+// a dtype constant, a type name and three numbers. From the values used
+// (CHAR_BIT / 4 / 2, then SCHAR_MIN..SCHAR_MAX, 0..UCHAR_MAX, INT_MIN..INT_MAX,
+// 0..15, 0..3) these look like bit width, minimum and maximum representable
+// value -- NOTE(review): confirm against AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE.
+// CHAR_BIT and the *_MIN / *_MAX constants come from <climits>.
+#define AT_DISPATCH_CASE_QINT_AND_SUB_BYTE_TYPES(...)                     \
+  AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                                     \
+      at::kQInt8, at::qint8, CHAR_BIT, SCHAR_MIN, SCHAR_MAX, __VA_ARGS__) \
+  AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                                     \
+      at::kQUInt8, at::quint8, CHAR_BIT, 0, UCHAR_MAX, __VA_ARGS__)       \
+  AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                                     \
+      at::kQInt32,                                                        \
+      at::qint32,                                                         \
+      CHAR_BIT * sizeof(int),                                             \
+      INT_MIN,                                                            \
+      INT_MAX,                                                            \
+      __VA_ARGS__)                                                        \
+  AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                                     \
+      at::kQUInt4x2, at::quint4x2, 4, 0, 15, __VA_ARGS__)                 \
+  AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                                     \
+      at::kQUInt2x4, at::quint2x4, 2, 0, 3, __VA_ARGS__)
+
+// Dispatch TYPE over the quantized and sub-byte case list above.
+#define AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                        \
+      TYPE, NAME, AT_DISPATCH_CASE_QINT_AND_SUB_BYTE_TYPES(__VA_ARGS__))
+
+// Case list: AT_DISPATCH_CASE_ALL_TYPES followed by the complex case list.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(...) \
+  AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__)           \
+  AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__)
+
+// Dispatch TYPE over the all-types + complex case list.
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                      \
+      TYPE, NAME, AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__))
+
+// Case list: AT_DISPATCH_CASE_ALL_TYPES plus one extra SCALARTYPE case.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types case list plus SCALARTYPE. The extra
+// scalar type is the first argument, ahead of TYPE and NAME.
+#define AT_DISPATCH_ALL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                          \
+      TYPE, NAME, AT_DISPATCH_CASE_ALL_TYPES_AND(SCALARTYPE, __VA_ARGS__))
+
+// Case list: all-types + complex cases plus one extra SCALARTYPE case.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types + complex case list plus SCALARTYPE.
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                      \
+      TYPE,                                                                \
+      NAME,                                                                \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, __VA_ARGS__))
+
+// Case list: all-types cases plus two caller-chosen scalar types. Written as
+// the single-extra list AT_DISPATCH_CASE_ALL_TYPES_AND with one further case
+// appended; this expands to the same cases in the same order as an explicit
+// listing.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES_AND(SCALARTYPE1, __VA_ARGS__)             \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types cases plus two extra scalar types.
+#define AT_DISPATCH_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                         \
+      TYPE, NAME,                                                             \
+      AT_DISPATCH_CASE_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, __VA_ARGS__))
+
+// Case list: all-types + complex cases plus two caller-chosen scalar types
+// (the single-extra list with one further case appended).
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND2(                   \
+    SCALARTYPE1, SCALARTYPE2, ...)                                     \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE1, __VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types + complex cases plus two extra scalar
+// types.
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(    \
+    SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...)     \
+  AT_DISPATCH_SWITCH(                              \
+      TYPE, NAME,                                  \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND2( \
+          SCALARTYPE1, SCALARTYPE2, __VA_ARGS__))
+
+// Case list: the all-types _AND2 list with a third scalar type appended.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND3(                                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...)                          \
+  AT_DISPATCH_CASE_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, __VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types cases plus three extra scalar types.
+#define AT_DISPATCH_ALL_TYPES_AND3(                         \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE, NAME,                                           \
+      AT_DISPATCH_CASE_ALL_TYPES_AND3(                      \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+// Case list: the all-types + complex _AND2 list with a third scalar type
+// appended.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND3( \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...)      \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND2(       \
+      SCALARTYPE1, SCALARTYPE2, __VA_ARGS__)         \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types + complex cases plus three extra scalar
+// types.
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(             \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE, NAME,                                           \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND3(          \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+// Case list: all-types + complex cases plus four caller-chosen scalar types.
+// Written as the _AND3 list with one more case appended; the expansion yields
+// the same cases in the same order as an explicit listing.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND4(         \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND3(               \
+      SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types + complex cases plus four extra scalar
+// types.
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(                          \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                    \
+      TYPE, NAME,                                                        \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND4(                       \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
+
+// Case list: the _AND4 list with a fifth caller-chosen scalar type appended.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND5(                      \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND4(                            \
+      SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types + complex cases plus five extra scalar
+// types.
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND5(                      \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, \
+    TYPE, NAME, ...)                                                 \
+  AT_DISPATCH_SWITCH(                                                \
+      TYPE, NAME,                                                    \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND5(                   \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4,        \
+          SCALARTYPE5, __VA_ARGS__))
+
+// Case list: all-types + complex cases plus six caller-chosen scalar types.
+// Written as the _AND5 list with one more case appended; the expansion yields
+// the same cases in the same order as an explicit listing.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND6(                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, \
+    SCALARTYPE6, ...)                                                \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND5(                       \
+      SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4,            \
+      SCALARTYPE5, __VA_ARGS__)                                      \
+  AT_DISPATCH_CASE(SCALARTYPE6, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types + complex cases plus six extra scalar
+// types.
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND6(                      \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, \
+    SCALARTYPE6, TYPE, NAME, ...)                                    \
+  AT_DISPATCH_SWITCH(                                                \
+      TYPE, NAME,                                                    \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND6(                   \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4,        \
+          SCALARTYPE5, SCALARTYPE6, __VA_ARGS__))
+
+// Case list: the _AND6 list with a seventh caller-chosen scalar type
+// appended.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND7(                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, \
+    SCALARTYPE6, SCALARTYPE7, ...)                                   \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND6(                       \
+      SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4,            \
+      SCALARTYPE5, SCALARTYPE6, __VA_ARGS__)                         \
+  AT_DISPATCH_CASE(SCALARTYPE7, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types + complex cases plus seven extra scalar
+// types.
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND7(                      \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, \
+    SCALARTYPE6, SCALARTYPE7, TYPE, NAME, ...)                       \
+  AT_DISPATCH_SWITCH(                                                \
+      TYPE, NAME,                                                    \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND7(                   \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4,        \
+          SCALARTYPE5, SCALARTYPE6, SCALARTYPE7, __VA_ARGS__))
+
+// Case list: the _AND7 list with an eighth caller-chosen scalar type
+// appended.
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND8(                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, \
+    SCALARTYPE6, SCALARTYPE7, SCALARTYPE8, ...)                      \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND7(                       \
+      SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4,            \
+      SCALARTYPE5, SCALARTYPE6, SCALARTYPE7, __VA_ARGS__)            \
+  AT_DISPATCH_CASE(SCALARTYPE8, __VA_ARGS__)
+
+// Dispatch TYPE over the all-types + complex cases plus eight extra scalar
+// types.
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND8(                      \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, \
+    SCALARTYPE6, SCALARTYPE7, SCALARTYPE8, TYPE, NAME, ...)          \
+  AT_DISPATCH_SWITCH(                                                \
+      TYPE, NAME,                                                    \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND8(                   \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4,        \
+          SCALARTYPE5, SCALARTYPE6, SCALARTYPE7, SCALARTYPE8,        \
+          __VA_ARGS__))
+
+// Case list for the five bit dtypes Bits1x8, Bits2x4, Bits4x2, Bits8 and
+// Bits16. The variadic arguments (the dispatch body) go to every case.
+#define AT_DISPATCH_CASE_BIT_TYPES(...)                  \
+  AT_DISPATCH_CASE(at::ScalarType::Bits1x8, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Bits2x4, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Bits4x2, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Bits8, __VA_ARGS__)   \
+  AT_DISPATCH_CASE(at::ScalarType::Bits16, __VA_ARGS__)
+
+// Dispatch TYPE over the bit-type case list.
+#define AT_DISPATCH_BIT_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_BIT_TYPES(__VA_ARGS__))
+
+// Dispatch TYPE over the two index dtypes Int and Long. Both cases are built
+// with AT_PRIVATE_CASE_TYPE_USING_HINT and pass `index_t` as the hint, so the
+// body presumably sees the selected type under the name index_t -- confirm
+// against the definition of AT_PRIVATE_CASE_TYPE_USING_HINT.
+#define AT_DISPATCH_INDEX_TYPES(TYPE, NAME, ...)     \
+  AT_DISPATCH_SWITCH(                                \
+      TYPE,                                          \
+      NAME,                                          \
+      AT_PRIVATE_CASE_TYPE_USING_HINT(               \
+          at::ScalarType::Int, index_t, __VA_ARGS__) \
+          AT_PRIVATE_CASE_TYPE_USING_HINT(           \
+              at::ScalarType::Long, index_t, __VA_ARGS__))
+
+// ----------------------------------------------------------------------------
+// DEPRECATED MACROS, DON'T USE THESE
+// ----------------------------------------------------------------------------
+
+// Deprecated equivalent of
+// AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, TYPE, NAME, ...).
+// It first calls detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF()
+// (presumably to surface a deprecation diagnostic) and then dispatches.
+// NOTE(review): the expansion is two separate statements, so using this macro
+// as the unbraced body of an if/else or loop would only guard the first one.
+#define AT_DISPATCH_ALL_TYPES_AND_HALF(TYPE, NAME, ...) \
+  detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF();  \
+  AT_DISPATCH_SWITCH(                                   \
+      TYPE,                                             \
+      NAME,                                             \
+      AT_DISPATCH_CASE_ALL_TYPES_AND(at::ScalarType::Half, __VA_ARGS__))
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch_v2.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch_v2.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0764834c02fdea27dd56cd08a283f772bcd7938
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dispatch_v2.h
@@ -0,0 +1,186 @@
+#include <ATen/Dispatch.h>
+
+// This is a new implementation of the AT_DISPATCH macro family from
+// ATen/Dispatch.h
+//
+// The intended usage is:
+//
+//  ScalarType scalar_type;
+//
+//  AT_DISPATCH_V2(
+//    scalar_type,
+//    "debug string",
+//    AT_WRAP([&] {
+//      ... code to specialize with scalar_t ...
+//    }),
+//    kHalf,
+//    AT_EXPAND(AT_ALL_TYPES),
+//    ... as many types arguments as needed ...
+//  )
+//
+// For example, given an old style:
+//
+//  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
+//    kComplexHalf,
+//    kHalf,
+//    self.scalar_type(),
+//    "_local_scalar_dense_cpu",
+//    [&] {
+//      scalar_t value = *self.data_ptr<scalar_t>();
+//      r = Scalar(value);
+//    }
+//  )
+//
+// You now write:
+//
+//  AT_DISPATCH_V2(
+//    self.scalar_type(),
+//    "_local_scalar_dense_cpu",
+//    AT_WRAP([&] {
+//      scalar_t value = *self.data_ptr<scalar_t>();
+//      r = Scalar(value);
+//    }),
+//    AT_EXPAND(AT_ALL_TYPES),
+//    AT_EXPAND(AT_COMPLEX_TYPES),
+//    kComplexHalf,
+//    kHalf
+//  )
+//
+// Notably, it sports the following improvements:
+//
+//  - It is not necessary to specify the arity (e.g.,
+//    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND{2,3,4,...})
+//    when using the macro
+//
+//  - It is not necessary to specify each dtype individually; if
+//    there is a set of related dtypes and you want to dispatch
+//    over all of them, you can simply say, e.g., AT_EXPAND(AT_INTEGRAL_TYPES)
+//    in your argument list.
+//
+// However, you must remember to wrap the payload body in AT_WRAP, or commas
+// inside your lambda will be improperly handled.  Furthermore, if you add
+// more entries to ScalarType than can be supported by this macro, it will
+// fail with an obscure error (due to attempting to concatenate AT_AP with
+// something that is not a number).
+//
+// The implementation strategy is to use the count arguments trick
+// (e.g., as described in https://stackoverflow.com/a/2124385/23845)
+// to discover how many dtypes have been passed, and then dispatch to a
+// hand-written macro for each arity that applies as many DISPATCH_CASE as
+// necessary.  The hand-written macros can be regenerated for other arities
+// with the script below.
+//
+// There is some delicacy in the implementation in controlling when
+// macro expansion occurs, mediated with AT_EXPAND and AT_GUARD.  I mostly
+// relied on GPT4 to help me get it right.
+
+// Public API macros
+
+// See documentation above
+// Arguments: TYPE is the scalar type to switch on, NAME the debug string,
+// BODY the payload (wrap it in AT_WRAP), followed by the dtypes to dispatch
+// over. AT_AP_VAR turns the dtype list into one AT_DISPATCH_CASE per dtype;
+// TYPE is also handed to AT_AP_VAR, which does not use it.
+#define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__))
+
+// This macro lets you pass an arbitrary expression that may contain internal
+// commas to another macro without having the commas causing the expression
+// to be interpreted as being multiple arguments
+// (it simply expands to all of its arguments, commas included).
+#define AT_WRAP(...) __VA_ARGS__
+
+// Named dtype lists for use with AT_DISPATCH_V2. Each macro expands to a
+// comma-separated list of dtype constants; splice one into a dispatch
+// argument list as AT_EXPAND(<LIST>).
+
+// The four float8 variants.
+#define AT_FLOAT8_TYPES                                          \
+  c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
+      c10::kFloat8_e4m3fnuz
+
+// Byte, Char, Int, Long, Short.
+#define AT_INTEGRAL_TYPES \
+  c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort
+// Double and Float only.
+#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat
+// The 16/32/64-bit unsigned integer dtypes.
+#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64
+// AT_INTEGRAL_TYPES followed by AT_BAREBONES_UNSIGNED_TYPES.
+#define AT_INTEGRAL_TYPES_V2 \
+  AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)
+// ComplexDouble and ComplexFloat.
+#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat
+// QInt8, QUInt8 and QInt32.
+#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32
+// NB: not *actually* all types
+// (only AT_INTEGRAL_TYPES followed by AT_FLOATING_TYPES).
+#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
+// AT_ALL_TYPES followed by AT_COMPLEX_TYPES.
+#define AT_ALL_TYPES_AND_COMPLEX \
+  AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES)
+
+// Helper macros
+
+// AT_AP_VAR(N, T, dtypes...) picks the fixed-arity helper AT_AP<count>, where
+// <count> is AT_NUM_ARGS(dtypes...), and invokes it with the body N and the
+// dtype list. T is accepted but does not appear in the expansion.
+#define AT_AP_VAR(N, T, ...) \
+  AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__))
+// Two-level token paste: going through AT_CONCAT_AUX lets both arguments be
+// macro-expanded before ## joins them.
+#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b)
+#define AT_CONCAT_AUX(a, b) a##b
+// Identity macro: expands to its argument unchanged.
+#define AT_EXPAND(X) X
+
+// Ensure we never have too many scalar types for the expansion here to
+// support.  To bump this, you must regenerate the macros below.
+static_assert(static_cast<int>(c10::ScalarType::NumOptions) < 45);
+
+// Python code to regenerate the generated code below:
+#if 0
+
+num_args = 45
+
+nums = ', '.join(str(i) for i in reversed(range(num_args+1)))
+args = ', '.join(f'_{i}' for i in range(1, num_args+1))
+
+print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))')
+print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) N')
+
+for i in range(1, num_args+1):
+    args = ', '.join(f'_{i}' for i in range(1, i+1))
+    cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)])
+    print(f'#define AT_AP{i}(N, {args}) {cases}')
+
+#endif
+
+// Begin generated code
+// clang-format off
+
+// AT_NUM_ARGS(...) expands to the number of arguments it was given, for 1 to
+// 45 arguments. It appends the literals 45..0 to the argument list, and
+// AT_NUM_ARGS_AUX returns whatever lands in its 46th position.
+#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
+#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, N, ...) N
+// AT_AP<n>(N, _1, ..., _n) emits AT_DISPATCH_CASE(_k, N) for each of its n
+// dtype arguments, in order; N is the dispatch body.
+#define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N)
+#define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N)
+#define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N)
+#define AT_AP4(N, _1, _2, _3, _4) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N)
+#define AT_AP5(N, _1, _2, _3, _4, _5) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N)
+#define AT_AP6(N, _1, _2, _3, _4, _5, _6) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N)
+#define AT_AP7(N, _1, _2, _3, _4, _5, _6, _7) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N)
+#define AT_AP8(N, _1, _2, _3, _4, _5, _6, _7, _8) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N)
+#define AT_AP9(N, _1, _2, _3, _4, _5, _6, _7, _8, _9) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N)
+#define AT_AP10(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N)
+#define AT_AP11(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N)
+#define AT_AP12(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N)
+#define AT_AP13(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N)
+#define AT_AP14(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N)
+#define AT_AP15(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N)
+#define AT_AP16(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N)
+#define AT_AP17(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N)
+#define AT_AP18(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N)
+#define AT_AP19(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N)
+#define AT_AP20(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N)
+#define AT_AP21(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N)
+#define AT_AP22(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N)
+#define AT_AP23(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N)
+#define AT_AP24(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N)
+#define AT_AP25(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N)
+#define AT_AP26(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N)
+#define AT_AP27(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N)
+#define AT_AP28(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N)
+#define AT_AP29(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N)
+#define AT_AP30(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N)
+#define AT_AP31(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N)
+#define AT_AP32(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N)
+#define AT_AP33(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N)
+#define AT_AP34(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N)
+#define AT_AP35(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N)
+#define AT_AP36(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N)
+#define AT_AP37(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N)
+#define AT_AP38(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N)
+#define AT_AP39(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N)
+#define AT_AP40(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N)
+#define AT_AP41(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N)
+#define AT_AP42(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N)
+#define AT_AP43(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N)
+#define AT_AP44(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N)
+#define AT_AP45(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N)
+// End generated code
+// clang-format on
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/EmptyTensor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/EmptyTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f8681ce37f960b953e6d8dcc50c657c69f1c536
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/EmptyTensor.h
@@ -0,0 +1,160 @@
+#pragma once
+#include <ATen/core/TensorBase.h>
+
+namespace at::detail {
+
+inline void check_size_nonnegative(ArrayRef<int64_t> size) {
+  for (const auto& x : size) { // every entry of `size` must be >= 0
+    TORCH_CHECK(
+        x >= 0,
+        "Trying to create tensor with negative dimension ",
+        x, // the offending entry ...
+        ": ",
+        size); // ... followed by the complete size list
+  }
+}
+
+inline void check_size_nonnegative(ArrayRef<c10::SymInt> size) {
+  for (const auto& x : size) { // SymInt overload: the test is delegated to SymInt
+    TORCH_CHECK(
+        x.expect_size(__FILE__, __LINE__), // NOTE(review): presumably "x >= 0" - confirm
+        "Trying to create tensor with negative dimension ",
+        x,
+        ": ",
+        size);
+  }
+}
+
+TORCH_API size_t computeStorageNbytesContiguous(
+    IntArrayRef sizes,
+    size_t itemsize,
+    size_t storage_offset = 0);
+TORCH_API SymInt computeStorageNbytesContiguous(
+    SymIntArrayRef sizes,
+    const SymInt& itemsize,
+    const SymInt& storage_offset = 0);
+TORCH_API size_t computeStorageNbytes(
+    IntArrayRef sizes,
+    IntArrayRef strides,
+    size_t itemsize,
+    size_t storage_offset = 0);
+TORCH_API SymInt computeStorageNbytes(
+    SymIntArrayRef sizes,
+    SymIntArrayRef strides,
+    const SymInt& itemsize,
+    const SymInt& storage_offset = 0);
+
+TORCH_API TensorBase empty_generic(
+    IntArrayRef size,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type,
+    c10::optional<c10::MemoryFormat> memory_format_opt);
+
+TORCH_API TensorBase empty_strided_generic(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type);
+
+TORCH_API TensorBase empty_strided_symint_generic(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type);
+
+TORCH_API TensorBase empty_cpu(
+    IntArrayRef size,
+    ScalarType dtype,
+    bool pin_memory = false,
+    c10::optional<c10::MemoryFormat> memory_format_opt = c10::nullopt);
+
+TORCH_API TensorBase empty_cpu(
+    IntArrayRef size,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt,
+    c10::optional<c10::MemoryFormat> memory_format_opt);
+
+TORCH_API TensorBase empty_cpu(IntArrayRef size, const TensorOptions& options);
+
+TORCH_API TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    ScalarType dtype,
+    bool pin_memory = false);
+
+TORCH_API TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt);
+
+TORCH_API TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions& options);
+
+TORCH_API TensorBase empty_meta(
+    IntArrayRef size,
+    ScalarType dtype,
+    c10::optional<c10::MemoryFormat> memory_format_opt = c10::nullopt);
+
+TORCH_API TensorBase empty_meta(
+    IntArrayRef size,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt,
+    c10::optional<c10::MemoryFormat> memory_format_opt);
+
+TORCH_API TensorBase empty_symint_meta(
+    SymIntArrayRef size,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt,
+    c10::optional<c10::MemoryFormat> memory_format_opt);
+
+TORCH_API TensorBase empty_meta(IntArrayRef size, const TensorOptions& options);
+
+TORCH_API TensorBase
+empty_strided_meta(IntArrayRef size, IntArrayRef stride, ScalarType dtype);
+
+TORCH_API TensorBase empty_strided_meta(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt);
+
+TORCH_API TensorBase empty_strided_meta(
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions& options);
+
+TORCH_API TensorBase empty_strided_symint_meta(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    ScalarType dtype);
+
+TORCH_API TensorBase empty_strided_symint_meta(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt);
+
+TORCH_API TensorBase empty_strided_symint_meta(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    const TensorOptions& options);
+
+} // namespace at::detail
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ExpandBase.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ExpandBase.h
new file mode 100644
index 0000000000000000000000000000000000000000..8db6be6a643c8cb60cab8487478f9a2f0c817d8b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ExpandBase.h
@@ -0,0 +1,30 @@
+#include <ATen/core/TensorBase.h>
+
+// Broadcasting utilities for working with TensorBase
+namespace at {
+namespace internal {
+TORCH_API TensorBase expand_slow_path(const TensorBase& self, IntArrayRef size);
+} // namespace internal
+
+// Borrows `self` when its sizes already equal `size`; otherwise returns an
+// owned tensor produced by at::internal::expand_slow_path.
+inline c10::MaybeOwned<TensorBase> expand_size(
+    const TensorBase& self,
+    IntArrayRef size) {
+  using Result = c10::MaybeOwned<TensorBase>;
+  const bool needs_expand = !size.equals(self.sizes());
+  return needs_expand ? Result::owned(at::internal::expand_slow_path(self, size))
+                      : Result::borrowed(self);
+}
+c10::MaybeOwned<TensorBase> expand_size(TensorBase&& self, IntArrayRef size) = delete;
+
+inline c10::MaybeOwned<TensorBase> expand_inplace(
+    const TensorBase& tensor, const TensorBase& to_expand) {
+  const IntArrayRef target_sizes = tensor.sizes(); // `tensor` supplies the shape
+  return expand_size(to_expand, target_sizes);
+}
+c10::MaybeOwned<TensorBase> expand_inplace(
+    const TensorBase& tensor,
+    TensorBase&& to_expand) = delete;
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/FunctionalTensorWrapper.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/FunctionalTensorWrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..6291b2743459625144e7411b0ebf2c4f0bfaa217
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/FunctionalTensorWrapper.h
@@ -0,0 +1,408 @@
+
+#pragma once
+
+#include <ATen/ArrayRef.h>
+#include <ATen/FunctionalStorageImpl.h>
+#include <ATen/core/IListRef.h>
+#include <ATen/core/List.h>
+#include <ATen/core/boxing/BoxedKernel.h>
+#include <ATen/core/boxing/impl/boxing.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+
+#include <c10/core/DispatchKey.h>
+
+namespace at {
+
+// Note [Functionalization Pass In Core]
+// The Functionalization pass is used to remove aliasing from a pytorch program.
+//
+// This is useful for backends that don't support aliasing, like XLA and Vulkan.
+// It's also necessary in order to remove mutation from a program, which is
+// needed in Functorch.
+//
+// Consider this program:
+// a = torch.ones(...)
+// b = a.view(...)
+// b.add_(1)
+//
+// In this program, b is meant to alias with a due to the use of view(). At the
+// end of the program, both a and b are full of 2's. However, backends that
+// don't support aliasing aren't able to correctly implement the view()
+// operator. Instead, they can opt into the Functionalization pass, which will
+// sit between the user and the backend, and provide the necessary aliasing
+// logic.
+//
+// The functionalization pass will turn the above program into a slightly
+// different program that has the same semantics, transparently to the user,
+// that backends like XLA/Vulkan are able to implement:
+// a = torch.ones(...)
+// b = a.view_copy(...)  # view() replaced with view_copy(). Backends like XLA/Vulkan can implement this!
+// b.add_(1)
+// a.add_(1)  # Our functionalization pass machinery knows that a and b are aliased - it applies b's mutation to a too.
+//
+// So, how does the functionalization pass keep track of which tensors are
+// aliased? The pass works by wrapping EVERY tensor in the program inside of a
+// FunctionalTensorWrapper, which knows about its alias'd tensors.
+//
+// See Note [Functionalization: Alias Removal] for details on the aliasing
+// machinery. See Note [Functionalization: Mutation Removal] for details on
+// mutation removal.
+struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
+  explicit FunctionalTensorWrapper(const Tensor& value);
+  // Additional constructor to create a FunctionalTensorWrapper directly from an
+  // underlying tensor that was created from a view. For example, the code b =
+  // a.view1() will generate a constructor call to FunctionalTensorWrapper(b, a,
+  // view1_meta)
+  explicit FunctionalTensorWrapper(
+      const Tensor& view_value,
+      const FunctionalTensorWrapper* base,
+      const functionalization::ViewMeta& meta);
+
+  // Get the underlying, actual tensor, that doesn't know anything about
+  // functionalization.
+  const Tensor& value() const {
+    return value_;
+  };
+  // The concept of "level" is only ever important to functorch; it's exposed
+  // here as more of a hook for functorch to use.
+  int64_t level() const {
+    return level_;
+  };
+  void set_level(int64_t level) {
+    level_ = level;
+  }
+  bool has_metadata_mutation() const {
+    return has_metadata_mutation_;
+  };
+
+  // Denotes a mutation that's hidden from autograd,
+  // e.g. for the purposes of passing a tensor to a triton kernel
+  void mark_mutation_hidden_from_autograd() {
+    mutation_hidden_from_autograd_counter_++;
+  }
+  void mark_mutation_during_no_grad_or_inference_mode() {
+    mutation_during_no_grad_or_inference_mode_++;
+  }
+  // Are all the mutations happening to the tensor hidden from autograd
+  bool are_all_mutations_hidden_from_autograd() const {
+    return mutation_hidden_from_autograd_counter_ == mutation_counter_;
+  }
+  // Did all mutations happen under no_grad or inference_mode
+  // (We also need to ignore mutations fully hidden from autograd here)
+  bool are_all_mutations_under_no_grad_or_inference_mode() const {
+    return mutation_hidden_from_autograd_counter_ +
+        mutation_during_no_grad_or_inference_mode_ ==
+        mutation_counter_;
+  }
+
+  // Syncs the underlying tensor with its alias, if it's out of date. This
+  // involves two steps: (1) apply any pending updates/mutations to the alias,
+  // (2) replay the views (if any) to regenerate the current tensor off of the
+  // updated alias.
+  void sync_();
+  // Performs step (2) of the sync. This is its own public API because it's
+  // needed by view_inplace ops like transpose_. See Note [Functionalization
+  // Pass - Inplace View Ops]
+  void regenerate_from_base();
+  // Performs step (1) of the sync. This is its own public API because it's
+  // needed by functorch. functorch wants to make sure that all input tensors to
+  // a functionalized program have been properly synced so it can properly
+  // propagate mutations to inputs. It can't just call sync_(), because the
+  // FunctionalTensorWrapper will look like it has no aliases and sync_ will be
+  // a noop. We use the reference count on storage_ to determine if the wrapper
+  // is aliased, and by the time functorch is ready to propagate updates to
+  // inputs, any intermediate views of the input created by the program will
+  // have been deallocated. This function also returns whether or not the base
+  // actually had any updates to apply.
+  bool apply_updates();
+  // Takes the current state of value_ and snapshots it, sending it as a pending
+  // update to the alias.
+  void commit_update();
+  // When any tensor is mutated, the tensor increments its alias's "generation".
+  // Separately, each tensor maintains its own "generation" counter, which is
+  // used to determine if it's up-to-date with its alias. The act of syncing a
+  // tensor will set a tensor's generation equal to its alias's generation.
+  bool is_up_to_date() const;
+  // Freezes the storage of this tensor, preventing subsequent mutations
+  void freeze_storage() const;
+  // Every FunctionalTensorWrapper contains a vector of ViewMeta objects
+  // describing the series of view ops that ran to generate the current tensor
+  // from the base tensor. This method is used by inplace-view ops like
+  // transpose_. It appends a ViewMeta to the existing stack, and refreshes the
+  // tensor by replaying the views off of the alias.
+  void mutate_view_meta(const at::functionalization::ViewMeta& meta);
+
+  // Custom implementation of self.set_(src)
+  void set__impl(const FunctionalTensorWrapper* other);
+
+  // Returns whether the current tensor's data was ever mutated
+  bool has_data_mutation();
+
+  // Returns whether the current FunctionalTensorWrapper
+  // experienced a set_() call.
+  bool was_storage_changed() {
+    return was_storage_changed_;
+  }
+
+  // The functionalization pass can be used to remove mutations.
+  // It does so by replacing any mutation op with its corresponding
+  // out-of-place op, followed by a call to replace_(). e.g:
+  //
+  // a.add_(1)
+  //
+  // will turn into:
+  //
+  // tmp = a.add(1)
+  // a.replace_(tmp)
+  //
+  // replace_() swaps out the wrapped tensor, value_, with tmp.
+  void replace_(const Tensor& other);
+
+  bool is_multi_output_view() {
+    return is_multi_output_view_;
+  }
+
+  // See Note[resize_() in functionalization pass]
+  void maybe_replace_storage(const Tensor& other);
+
+  // Replaces the storage with a new functional storage,
+  // and clears the view_metas_ stack.
+  // WARNING: Calling this function will sever the aliasing relationship between
+  // the current FunctionalTensorWrapper and any of its outstanding aliases.
+  // Please only call if you know what you're doing.
+  void _unsafe_reset_storage();
+
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const override;
+
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const override;
+
+  ~FunctionalTensorWrapper() override = default;
+
+  // FunctionalTensorWrapper overrides all custom size/stride functions,
+  // so that if the inner tensor has a custom implementation
+  // we make sure to call that implementation.
+  at::IntArrayRef sizes_custom() const override;
+  at::IntArrayRef strides_custom() const override;
+  int64_t dim_custom() const override;
+  int64_t numel_custom() const override;
+  bool is_contiguous_custom(at::MemoryFormat memory_format) const override;
+  c10::SymIntArrayRef sym_sizes_custom() const override;
+  c10::SymInt sym_size_custom(int64_t d) const override;
+  c10::SymIntArrayRef sym_strides_custom() const override;
+  c10::SymInt sym_storage_offset_custom() const override;
+  c10::Device device_custom() const override;
+
+ private:
+  const char* tensorimpl_type_name() const override;
+  void set_constructor_metadata();
+  functionalization::FunctionalStorageImpl* functional_storage_impl() const;
+
+  // This is used to re-implement shallow_copy_and_detach for
+  // FunctionalTensorWrapper. The implementation is identical, but we just need
+  // to return a subclass instead of a plain TensorImpl.
+  // TODO: maybe it's possible to arrange for that to happen automatically
+  // without an override here?
+  template <typename VariableVersion>
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach_core(
+      VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const;
+
+  void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) override;
+  void copy_tensor_metadata_and_refresh(
+      const FunctionalTensorWrapper* src_impl,
+      FunctionalTensorWrapper* dest_impl,
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const;
+
+  // Note that value is not taken by reference: internally, the wrapper will
+  // change the value tensor that it points to over time.
+  Tensor value_;
+  int64_t level_{};
+  // These counters are used for identifying whether all the mutations on a
+  // given tensor are hidden from autograd (or happened under no_grad /
+  // inference_mode). If we have an input mutation that is hidden from autograd,
+  // then once we convert the input mutation to a copy_() we know it will be
+  // safe to hide the copy_() from autograd as well.
+  uint64_t mutation_counter_ = 0;
+  uint64_t mutation_hidden_from_autograd_counter_ = 0;
+  uint64_t mutation_during_no_grad_or_inference_mode_ = 0;
+  bool has_metadata_mutation_ = false;
+  bool is_multi_output_view_ = false;
+  // Did the tensor experience a set_() call.
+  bool was_storage_changed_ = false;
+
+  size_t generation_ = 0;
+  std::vector<at::functionalization::ViewMeta> view_metas_;
+
+ protected:
+  static void copy_tensor_metadata(
+      const FunctionalTensorWrapper* src_impl,
+      FunctionalTensorWrapper* dest_impl,
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change);
+};
+
+// Utility functions for the functionalization pass.
+
+namespace functionalization {
+namespace impl {
+
+TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
+    const Tensor& tensor) {
+  auto functional_impl = // unchecked static_cast: the dynamic type is not verified
+      static_cast<FunctionalTensorWrapper*>(tensor.unsafeGetTensorImpl());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_impl != nullptr); // null check only
+  return functional_impl;
+}
+
+TORCH_API bool isFunctionalTensor(const at::Tensor& tensor);
+TORCH_API bool isFunctionalTensor(const c10::optional<Tensor>& t);
+TORCH_API bool isFunctionalTensor(
+    const c10::List<c10::optional<Tensor>>& t_list);
+TORCH_API bool isFunctionalTensor(ITensorListRef list);
+
+TORCH_API Tensor to_functional_tensor(const Tensor& tensor);
+TORCH_API c10::optional<Tensor> to_functional_tensor(
+    const c10::optional<Tensor>& tensor);
+TORCH_API c10::List<c10::optional<Tensor>> to_functional_tensor(
+    const c10::List<c10::optional<Tensor>>& t_list);
+TORCH_API std::vector<Tensor> to_functional_tensor(ITensorListRef t_list);
+
+TORCH_API void freeze_functional_tensor(const Tensor& tensor);
+
+TORCH_API Tensor
+from_functional_tensor(const Tensor& tensor, bool assert_functional = true);
+TORCH_API c10::optional<Tensor> from_functional_tensor(
+    const c10::optional<Tensor>& t,
+    bool assert_functional = true);
+TORCH_API c10::List<c10::optional<Tensor>> from_functional_tensor(
+    const c10::List<c10::optional<Tensor>>& t_list);
+TORCH_API std::vector<Tensor> from_functional_tensor(ITensorListRef t_list);
+
+TORCH_API void sync(const at::Tensor& t);
+TORCH_API void sync(const c10::optional<Tensor>& t);
+TORCH_API void sync(const c10::List<c10::optional<Tensor>>& t_list);
+TORCH_API void sync(ITensorListRef t_list);
+
+TORCH_API void replace_(const Tensor& functional_tensor, const Tensor& other);
+TORCH_API void replace_(
+    const ITensorListRef functional_tensor,
+    ITensorListRef other);
+
+TORCH_API void commit_update(const Tensor& functional_tensor);
+TORCH_API void commit_update(ITensorListRef functional_tensor);
+
+TORCH_API void unsafe_reset_storage(const Tensor& functional_tensor);
+
+TORCH_API void mark_mutation_hidden_from_autograd(
+    const Tensor& functional_tensor);
+
+TORCH_API bool are_all_mutations_hidden_from_autograd(
+    const Tensor& functional_tensor);
+
+TORCH_API bool are_all_mutations_under_no_grad_or_inference_mode(
+    const Tensor& functional_tensor);
+
+// These two methods are XLA-specific logic and are no-ops
+// for the normal functionalization flow.
+TORCH_API void propagate_xla_data(
+    const Tensor& functional_tensor,
+    const Tensor& other);
+TORCH_API void propagate_xla_data(
+    const ITensorListRef functional_tensor,
+    ITensorListRef other);
+
+Tensor create_functional_tensor_with_view_meta(
+    const Tensor& view_to_wrap,
+    const Tensor& base,
+    functionalization::ViewMeta meta,
+    int64_t out_idx = 0);
+std::vector<Tensor> create_functional_tensor_with_view_meta(
+    ITensorListRef view_to_wrap,
+    const Tensor& base,
+    const functionalization::ViewMeta& meta);
+
+void mutate_view_meta(
+    const Tensor& self,
+    const functionalization::ViewMeta& meta);
+
+void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
+void set_sizes_strides_offset(
+    const std::vector<Tensor>& outs,
+    const std::vector<Tensor>& meta_outs);
+
+//  ~~~~~ TLS used in functionalization ~~~~~
+
+TORCH_API bool getFunctionalizationReapplyViewsTLS();
+TORCH_API void setFunctionalizationReapplyViewsTLS(bool reapply_views);
+
+// Scoped guard for the "reapply views" functionalization TLS flag: the
+// constructor installs `reapply_views`, the destructor restores the old value.
+class TORCH_API FunctionalizationReapplyViewsGuard {
+ public:
+  FunctionalizationReapplyViewsGuard(bool reapply_views)
+      : prev_(getFunctionalizationReapplyViewsTLS()) {
+    setFunctionalizationReapplyViewsTLS(reapply_views);
+  }
+  ~FunctionalizationReapplyViewsGuard() {
+    setFunctionalizationReapplyViewsTLS(prev_);
+  }
+
+  // Neither copyable nor movable, so the saved value is restored exactly once.
+  FunctionalizationReapplyViewsGuard(
+      const FunctionalizationReapplyViewsGuard&) = delete;
+  FunctionalizationReapplyViewsGuard& operator=(
+      const FunctionalizationReapplyViewsGuard&) = delete;
+  FunctionalizationReapplyViewsGuard(FunctionalizationReapplyViewsGuard&&) = delete;
+  FunctionalizationReapplyViewsGuard& operator=(
+      FunctionalizationReapplyViewsGuard&&) = delete;
+ private:
+  bool prev_; // TLS flag value observed when the guard was constructed
+};
+
+} // namespace impl
+
+// Helper function to call an out-of-place composite aten kernel that may use
+// mutations / views internally, and functionalize them.
+TORCH_API void functionalize_op_helper(
+    const c10::OperatorHandle& op,
+    torch::jit::Stack* stack);
+
+template <class Op, bool symint, class ReturnType, class... ParameterTypes>
+struct _functionalize_aten_op final {};
+// Partial specialization chosen when the third argument is a function type.
+template <class Op, bool symint, class ReturnType, class... ParameterTypes>
+struct _functionalize_aten_op<Op, symint, ReturnType(ParameterTypes...)> final {
+  static ReturnType call(
+      typename c10::maybe_keep_symint<symint, ParameterTypes>::type... args) {
+    using FuncType = ReturnType(
+        typename c10::maybe_keep_symint<symint, ParameterTypes>::type...);
+    auto op = c10::Dispatcher::singleton()
+                  .findSchemaOrThrow(
+                      (const char*)Op::name, (const char*)Op::overload_name)
+                  .typed<FuncType>();
+    // Forward the unboxed arguments through the boxed functionalize_op_helper.
+    return c10::impl::BoxedKernelWrapper<FuncType>::call(
+        c10::BoxedKernel::makeFromFunction<functionalize_op_helper>(),
+        op,
+        // BoxedKernelWrapper knows to ignore this keyset argument,
+        // because functionalize_op_helper doesn't take in a DispatchKeySet
+        c10::DispatchKeySet(),
+        args...);
+  }
+};
+// Aliases keyed on Op::schema: symint = false here, symint = true below.
+template <class Op>
+using functionalize_aten_op =
+    _functionalize_aten_op<Op, false, typename Op::schema>;
+
+template <class Op>
+using functionalize_aten_op_symint =
+    _functionalize_aten_op<Op, true, typename Op::schema>;
+
+} // namespace functionalization
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Generator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..48c25e141dcb8c0264ca9435352889c7a250f74d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Generator.h
@@ -0,0 +1,2 @@
+#pragma once
+#include <ATen/core/Generator.h>
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedFallback.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedFallback.h
new file mode 100644
index 0000000000000000000000000000000000000000..beef24a6ed9c5b8373a0db5bcc16f268b8c18726
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedFallback.h
@@ -0,0 +1,25 @@
+#pragma once
+#include <ATen/ATen.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <torch/library.h>
+
+namespace at {
+
+// If an operator doesn't have a batching rule implemented then we fallback
+// to this implementation. The fallback only works on out-of-place operators
+// that return only tensors with new memory. (e.g., no in-place operators, no
+// view operations).
+//
+// The fallback effectively takes all of the BatchedTensors in `stack`, slices
+// them, and runs `op` on all of the corresponding slices to produce slices
+// of the outputs. The output slices then get `torch.stack`ed to create the
+// final returns.
+//
+// The performance of the fallback is not very good because it introduces an
+// extra copy from stacking the sliced outputs. Because of this, we prefer to
+// write batching rules for operators whenever possible.
+void batchedTensorForLoopFallback(
+    const c10::OperatorHandle& op,
+    torch::jit::Stack* stack);
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..098fbf9d6292fdeb0fff5e3786b471b742540723
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h
@@ -0,0 +1,160 @@
+#pragma once
+
+#include <bitset>
+
+#include <ATen/ArrayRef.h>
+#include <ATen/SmallVector.h>
+#include <ATen/Tensor.h>
+
+namespace at {
+
+// Width of the bitset built by createBatchDimBitset. This value is assumed in
+// a few other places in the codebase, but there isn't a centralized definition.
+constexpr int64_t kVmapMaxTensorDims = 64;
+
+// The valid vmap levels range from [0, 64). This effectively means that we
+// support a maximum of 64 nested vmaps.
+constexpr int64_t kVmapNumLevels = 64;
+
+// Inline capacity of the BatchDims SmallVector. Most people will
+// probably use <= 5 nested vmaps, but adjust this number as necessary.
+constexpr int64_t kBatchDimsStackSize = 5;
+
+// a BatchDim represents a "private" dimension on a Tensor created inside of
+// vmap. It is a (level, dim) tuple, with the `dim` indicating which dimension
+// is being vmap'ed over and the `level` being an identifier for which vmap
+// said dimension was created inside. The `dim` corresponds to a "physical
+// dim" - it is a dimension index on the underlying physical tensor that is
+// being vmapped over.
+struct BatchDim {
+  // NB: arguments are (level, dim) while the members are stored as dim_, level_.
+  BatchDim(int64_t level, int64_t dim) : dim_(dim), level_(level) {}
+
+  // The `dim` value passed to the constructor.
+  int64_t dim() const { return dim_; }
+  // The `level` value passed to the constructor.
+  int64_t level() const { return level_; }
+
+ private:
+  int64_t dim_;
+  int64_t level_;
+};
+
+using BatchDims = SmallVector<BatchDim, kBatchDimsStackSize>;
+using BatchDimsRef = ArrayRef<BatchDim>;
+
+// A BatchedTensorImpl holds an underlying Tensor and a list of BatchDim
+// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a
+// BatchedTensorImpl.
+//
+// The batch dimensions are treated as being "private"; they are not
+// user-visible. For example, in the following Tensor,
+//    bt = BatchedTensorImpl(ones(2, 3, 5, 7), [(lvl=1, dim=0), (lvl=2, dim=1)])
+// dimensions 0 and 1 are batch dimensions.
+//
+// bt.sizes() returns (5, 7); bt.sum(0) performs a reduction over the (public)
+// dim 0, which is equivalent to dim 2 in the underlying ones(2, 3, 5, 7)
+// tensor.
+struct TORCH_API BatchedTensorImpl : public c10::TensorImpl {
+  explicit BatchedTensorImpl(Tensor value, BatchDims bdims);
+
+  // Returns a non-owning BatchDimsRef over the BatchDims that represent which
+  // dimensions of this tensor are private.
+  BatchDimsRef bdims() const {
+    return bdims_;
+  }
+
+  // The underlying Tensor wrapped by this BatchedTensorImpl
+  const Tensor& value() const {
+    return value_;
+  };
+
+  // Given a public dimension index, return the dimension index in the
+  // underlying value() tensor. For example, if we have
+  //    bt = BatchedTensorImpl(ones(2, 3, 5, 7), [(lvl=1, dim=0), (lvl=2,
+  //    dim=2)])
+  // bt.actualDim(0) -> 1
+  // bt.actualDim(1) -> 3
+  // bt.actualDim(2) -> Error
+  int64_t actualDim(int64_t dim, bool wrap_dim = true) const;
+
+  // We have to override this because we opted into CustomStrides
+  IntArrayRef strides_custom() const override;
+  // Override a bunch of methods inherited from TensorImpl to return error
+  // messages.
+  bool is_contiguous_custom(at::MemoryFormat memory_format) const override;
+  void set_size(int64_t dim, int64_t new_size) override;
+  void set_stride(int64_t dim, int64_t new_stride) override;
+  void set_storage_offset(int64_t storage_offset) override;
+#ifdef DEBUG
+  bool has_storage() const override; // only declared when DEBUG is defined
+#endif
+
+ private:
+  // see NOTE: [BatchedTensorImpl levels invariant]
+  void checkInvariants() const;
+  const char* tensorimpl_type_name() const override;
+
+  Tensor value_; // returned by value()
+
+  // Note: [BatchedTensorImpl levels invariant]
+  // There is an invariant that the BatchDims must be stored in increasing
+  // `level` order. That is, for i < j, bdims_[i].level must be less than
+  // bdims_[j].level.
+  BatchDims bdims_;
+};
+
+// A "BatchedTensor" is detected via DispatchKey::Batched in the impl's key set.
+inline bool isBatchedTensor(const Tensor& tensor) {
+  const auto keys = tensor.unsafeGetTensorImpl()->key_set();
+  return keys.has(DispatchKey::Batched);
+}
+
+// Unchecked cast: unsafe unless `tensor` is backed by a BatchedTensorImpl (prefer `maybeGetBatchedImpl`).
+inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) {
+  auto* const impl = tensor.unsafeGetTensorImpl();
+  return static_cast<BatchedTensorImpl*>(impl);
+}
+
+inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) {
+  // Checked counterpart of unsafeGetBatchedImpl: the cast is only performed
+  // when isBatchedTensor(tensor) holds; any other tensor yields nullptr
+  // instead of an invalid pointer.
+  return isBatchedTensor(tensor) ? unsafeGetBatchedImpl(tensor) : nullptr;
+}
+
+// Builds a bitset whose bit i is set exactly when some entry of `bdims` has dim() == i.
+inline std::bitset<kVmapMaxTensorDims> createBatchDimBitset(BatchDimsRef bdims) {
+  std::bitset<kVmapMaxTensorDims> mask;
+  for (auto it = bdims.begin(); it != bdims.end(); ++it) {
+    mask.set(it->dim());
+  }
+  return mask;
+}
+
+
+// Builds a bitset with one bit set per level() value found in `bdims`.
+inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(BatchDimsRef bdims) {
+  std::bitset<kVmapNumLevels> levels;
+  for (auto it = bdims.begin(); it != bdims.end(); ++it) {
+    levels.set(it->level());
+  }
+  return levels;
+}
+
+// Streams `bdim` in the form "(lvl=<level>, dim=<dim>)" and returns `out`.
+inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) {
+  return out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")";
+}
+
+// Use this to construct a BatchedTensor from a regular Tensor
+TORCH_API Tensor makeBatched(const Tensor& tensor, BatchDims bdims);
+
+// Adds a batch dim to `tensor`, returning a BatchedTensor
+TORCH_API Tensor addBatchDim(const Tensor& tensor, int64_t level, int64_t dim);
+
+// Checks if an inplace operation on self and other is "vmap compatible".
+// See NOTE: [vmap-incompatible in-place operations] for the definition of this.
+TORCH_API bool inplaceIsVmapCompatible(const Tensor& self, const Tensor& other);
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MapAllocator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MapAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4a30edef623956d5072737336bfca6da5cb2bb4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MapAllocator.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/util/string_view.h>
+
+namespace at {
+
+enum MappedAllocatorModes {
+  ALLOCATOR_MAPPED_SHARED = 1,
+  ALLOCATOR_MAPPED_SHAREDMEM = 2,
+  ALLOCATOR_MAPPED_EXCLUSIVE = 4,
+  ALLOCATOR_MAPPED_NOCREATE = 8,
+  ALLOCATOR_MAPPED_KEEPFD = 16,
+  ALLOCATOR_MAPPED_FROMFD = 32,
+  ALLOCATOR_MAPPED_UNLINK = 64
+};
+
+// Sentinel value/type to help distinguish the file descriptor constructor from
+// the non-file descriptor constructor
+enum WithFd { WITH_FD };
+
+TORCH_API std::string NewProcessWideShmHandle();
+
+class TORCH_API MapAllocator {
+ public:
+  MapAllocator(c10::string_view filename, int flags, size_t size);
+  MapAllocator(
+      WithFd,
+      c10::string_view filename,
+      int fd,
+      int flags,
+      size_t size);
+  MapAllocator(const MapAllocator&) = delete;
+  MapAllocator& operator=(const MapAllocator&) = delete;
+  MapAllocator(MapAllocator&&) = delete;
+  MapAllocator& operator=(MapAllocator&&) = delete;
+
+  const char* filename() const {
+    return filename_.c_str();
+  }
+  int fd() const {
+#ifdef _WIN32
+    TORCH_CHECK(false, "MapAllocator::fd() is unsupported on Windows");
+#else
+    return fd_;
+#endif
+  }
+  ptrdiff_t size() const {
+    return size_;
+  }
+  // Return a pointer to the actual data for this allocator
+  // (in the case of the refcounted allocator, this is offset
+  // from the base pointer.)
+  virtual void* data() const {
+    return base_ptr_;
+  }
+
+  static MapAllocator* fromDataPtr(const at::DataPtr&);
+  static at::DataPtr makeDataPtr(
+      c10::string_view filename,
+      int flags,
+      size_t size,
+      size_t* actual_size_out);
+  static at::DataPtr makeDataPtr(
+      WithFd,
+      const char* filename,
+      int fd,
+      int flags,
+      size_t size,
+      size_t* actual_size_out);
+
+  // Closes the data.  Helps us avoid destructor shenanigans
+  virtual void close();
+
+  // This is very dangerous.  You have to redefine this destructor for each
+  // subclass
+  virtual ~MapAllocator();
+
+ protected:
+  bool closed_ = false;
+  std::string filename_;
+  int flags_ = 0;
+  ptrdiff_t size_; /* mapped size */
+#ifdef _WIN32
+  void* handle_;
+  void* event_;
+  std::string eventname_;
+#else
+  int fd_ = -1;
+#endif
+  void* base_ptr_ = nullptr;
+};
+
+// Base-from-member idiom
+struct TORCH_API RefcountedMapAllocatorArgCheck {
+  RefcountedMapAllocatorArgCheck(int flags);
+};
+
+class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
+                                         public MapAllocator {
+ public:
+  RefcountedMapAllocator(const char* filename, int flags, size_t size);
+  RefcountedMapAllocator(
+      WithFd,
+      const char* filename,
+      int fd,
+      int flags,
+      size_t size);
+
+  static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&);
+  static at::DataPtr makeDataPtr(
+      const char* filename,
+      int flags,
+      size_t size,
+      size_t* actual_size_out);
+  static at::DataPtr makeDataPtr(
+      WithFd,
+      const char* filename,
+      int fd,
+      int flags,
+      size_t size,
+      size_t* actual_size_out);
+
+  void* data() const override;
+
+  void incref();
+  int decref();
+  void close() override;
+
+  ~RefcountedMapAllocator() override {
+    RefcountedMapAllocator::close();
+  }
+
+ protected:
+  void checkFlags();
+  void initializeAlloc();
+};
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NamedTensor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NamedTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7606b0a668a43800b89755af1371551909b23d5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NamedTensor.h
@@ -0,0 +1 @@
+#include <ATen/core/NamedTensor.h>
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NestedTensorImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NestedTensorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ad42ae816274117a276a9545e2057b5eb252a6c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NestedTensorImpl.h
@@ -0,0 +1,283 @@
+#pragma once
+#include <ATen/MemoryOverlap.h>
+#include <ATen/Tensor.h>
+#include <c10/core/DispatchKey.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/TensorImpl.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/irange.h>
+
+namespace at::native {
+struct NestedTensorImpl;
+inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt);
+int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor);
+
+struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
+  explicit NestedTensorImpl(
+      Storage storage,
+      c10::DispatchKeySet key_set,
+      const caffe2::TypeMeta data_type,
+      at::Tensor nested_sizes,
+      at::Tensor nested_strides,
+      at::Tensor storage_offsets);
+
+  explicit NestedTensorImpl(
+      const at::Tensor& buffer,
+      at::Tensor nested_sizes,
+      at::Tensor nested_strides,
+      at::Tensor storage_offsets);
+  // Assumes a contiguous layout, so `nested_strides` and `offsets`
+  // can be inferred from `nested_sizes`
+  explicit NestedTensorImpl(
+      const at::Tensor& buffer,
+      const at::Tensor& nested_sizes);
+
+  // This constructor is used for creating view tensors from nested tensors
+  explicit NestedTensorImpl(
+      c10::TensorImpl::ImplType impl_type,
+      const at::Tensor& base_tensor,
+      at::Tensor nested_sizes,
+      at::Tensor nested_strides,
+      at::Tensor storage_offsets);
+
+  // TODO: don't expose private implementation details like this; in
+  // particular, resizing this tensor will mess up our dim() and
+  // callers cannot fix it.
+  const Tensor& get_nested_sizes() const {
+    return nested_sizes_;
+  }
+  // TODO: don't expose private implementation details like this
+  const Tensor& get_nested_strides() const {
+    return nested_strides_;
+  }
+  const Tensor& get_storage_offsets() const {
+    return storage_offsets_;
+  }
+  // Returns nullopt if the ith dimension is irregular. The ith dimension
+  // of a NestedTensor is regular if the unbound tensors match in
+  // size at the (i-1)th dimension.
+  c10::optional<int64_t> opt_size(int64_t d) const;
+
+  int64_t size(int64_t d) const {
+    c10::optional<int64_t> optional_size = this->opt_size(d);
+    TORCH_CHECK(
+        optional_size.has_value(),
+        "Given dimension ",
+        d,
+        " is irregular and does not have a size.");
+    return *optional_size;
+  }
+  /**
+   * Return a view of the nested tensor as a 1 dimensional contiguous tensor.
+   *
+   * The buffer tensor created by this function shares the same storage_impl as
+   * the original nested tensor, and therefore can be seen as a view.
+   *
+   * @return A newly constructed view tensor
+   */
+  at::Tensor get_buffer() const {
+    TORCH_CHECK(
+        nested_tensor_impl_is_contiguous(this),
+        "NestedTensor must be contiguous to get buffer.");
+    return get_unsafe_storage_as_tensor();
+  }
+  /**
+   * If possible use get_buffer() instead. This function returns the storage
+   * as a tensor directly, which is not safe to use in general. If using this
+   * function, the caller must make sure to account for nested_sizes,
+   * nested_strides and storage_offsets.
+   *
+   * @return A newly constructed view tensor
+   */
+  at::Tensor get_unsafe_storage_as_tensor() const {
+    auto buffer_key_set_ = generate_buffer_key_set();
+    const auto buffer_size = get_buffer_size();
+    auto buffer_tensor_impl = c10::make_intrusive<TensorImpl>(
+        c10::TensorImpl::VIEW, Storage(storage_), buffer_key_set_, data_type_);
+    buffer_tensor_impl->set_sizes_contiguous(
+        c10::makeArrayRef(static_cast<int64_t>(buffer_size)));
+    return Tensor(buffer_tensor_impl);
+  }
+
+  size_t get_buffer_size() const {
+    return storage_.nbytes() / data_type_.itemsize();
+  }
+
+ protected:
+  const char* tensorimpl_type_name() const override;
+
+  // TODO: numel_custom and is_contiguous_custom can be profitably overridden
+  // with real implementations
+  int64_t numel_custom() const override;
+  c10::SymInt sym_numel_custom() const override;
+  bool is_contiguous_custom(MemoryFormat) const override;
+  int64_t size_custom(int64_t d) const override {
+    return this->size(d);
+  }
+  c10::SymInt sym_size_custom(int64_t d) const override {
+    return c10::SymInt{this->size(d)};
+  }
+  IntArrayRef sizes_custom() const override;
+  c10::SymIntArrayRef sym_sizes_custom() const override;
+  IntArrayRef strides_custom() const override;
+  c10::SymIntArrayRef sym_strides_custom() const override;
+
+  // this one is real
+  int64_t dim_custom() const override;
+
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const override;
+
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const override;
+
+  void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) override {
+    copy_tensor_metadata(
+        /*src_impl=*/impl.get(),
+        /*dest_impl=*/this,
+        /*version_counter=*/version_counter(),
+        /*allow_tensor_metadata_change=*/allow_tensor_metadata_change());
+  }
+
+ private:
+  // Must be called after any changes to our dim() to sync the state
+  // to TensorImpl.
+  void refresh_dim();
+
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const at::Tensor nested_sizes_, nested_strides_;
+  // The starting positions of the underlying tensors in contiguous buffer
+  // i.e. the buffer memory offsets to get the underlying tensors
+  // The reason to keep this metadata is that, without strong enough constraint
+  // it cannot be derived from `nested_sizes_`
+  // and `nested_strides_`:
+  // 1. when buffer has blanks, e.g. [tensor1, blank, tensor2]
+  //    this can happen e.g. after slicing a nested tensor
+  // 2. when multiple tensors share the same memory
+  // 3. when the nesting ordering is changed, e.g. [tensor1, tensor3, tensor2]
+  // Some strong enough constraints are:
+  // 1. every underlying tensor is contiguous in memory
+  //    && nesting in ascending order
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const at::Tensor storage_offsets_;
+  // NOTE: -1 here means the size is missing
+  // Optional to allow it to be computed lazily from nested.
+  // TODO: maybe we can remove this metadata since
+  //       we can compute it from `nested_sizes_`
+  mutable c10::optional<std::vector<int64_t>> opt_sizes_;
+
+  template <typename VariableVersion>
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach_core(
+      VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const;
+
+  /**
+   * Generates a non-nested key_set from a nested tensor.
+   *
+   * For many nested tensor kernel implementations, a buffer tensor
+   * is generated and redispatched to a non-nested kernel. This function
+   * generates the key set used by that buffer tensor.
+   *
+   * @return Appropriate key set for non-nested tensor
+   */
+  inline c10::DispatchKeySet generate_buffer_key_set() const {
+    auto buffer_key_set = this->key_set();
+    const bool Autograd = buffer_key_set.has_any(c10::autograd_dispatch_keyset);
+    // Remove nested tensor specific keys
+    buffer_key_set = buffer_key_set -
+        c10::DispatchKeySet{
+            c10::DispatchKey::NestedTensor,
+            c10::DispatchKey::AutogradNestedTensor};
+
+    // Add dense tensor specific keys
+    buffer_key_set =
+        buffer_key_set | c10::DispatchKeySet{c10::DispatchKey::Dense};
+    buffer_key_set = Autograd
+        ? c10::DispatchKeySet{c10::DispatchKey::Autograd} | buffer_key_set
+        : buffer_key_set;
+
+    return buffer_key_set;
+  }
+};
+
+inline NestedTensorImpl* get_nested_tensor_impl_or_null(
+    const at::Tensor& tensor) {
+  if (tensor.is_nested()) {
+    return static_cast<NestedTensorImpl*>(tensor.unsafeGetTensorImpl());
+  }
+  return nullptr;
+}
+
+inline NestedTensorImpl* get_nested_tensor_impl(const at::Tensor& tensor) {
+  TORCH_CHECK(
+      tensor.is_nested(), "get_nested_tensor_impl requires a NestedTensor.");
+  return static_cast<NestedTensorImpl*>(tensor.unsafeGetTensorImpl());
+}
+
+inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
+  int64_t ntensors = nt->size(0);
+  if (ntensors == 0) {
+    return true;
+  }
+  const Tensor &sizemat = nt->get_nested_sizes(),
+               &stridemat = nt->get_nested_strides();
+  int64_t* offsets_ptr = nt->get_storage_offsets().data_ptr<int64_t>();
+  int64_t orig_dim = sizemat.size(1);
+  // nesting scalars
+  if (orig_dim == 0) {
+    // the scalars must sit back to back: offset i must equal i, otherwise
+    // there is blank memory (or reordering) between underlying scalars
+    for (int64_t i = 0; i < ntensors; i++) {
+      if (offsets_ptr[i] != i) {
+        return false;
+      }
+    }
+  }
+  // nesting tensors
+  else {
+    // if any underlying tensor is non-contiguous
+    const int64_t *sizemat_ptr = sizemat.data_ptr<int64_t>(),
+                  *stridemat_ptr = stridemat.data_ptr<int64_t>();
+    for (int64_t i = 0; i < ntensors; i++) {
+      if (stridemat_ptr[orig_dim - 1] != 1) {
+        return false;
+      }
+      int64_t product = sizemat_ptr[orig_dim - 1];
+      for (int64_t j = orig_dim - 2; j >= 0; j--) {
+        if (stridemat_ptr[j] != product) {
+          return false;
+        }
+        product *= sizemat_ptr[j];
+      }
+      sizemat_ptr += orig_dim;
+      stridemat_ptr += orig_dim;
+    }
+    // if there is blank memory between underlying tensors
+    if (offsets_ptr[0] != 0) {
+      return false;
+    }
+    sizemat_ptr = sizemat.data_ptr<int64_t>();
+    stridemat_ptr = stridemat.data_ptr<int64_t>();
+    for (int64_t i = 1; i < ntensors; i++) {
+      if (offsets_ptr[i] !=
+          offsets_ptr[i - 1] + *sizemat_ptr * *stridemat_ptr) {
+        return false;
+      }
+      sizemat_ptr += orig_dim;
+      stridemat_ptr += orig_dim;
+    }
+  }
+  // everything is fine
+  return true;
+}
+
+inline const at::Tensor& get_nested_sizes(const at::Tensor& tensor) {
+  return get_nested_tensor_impl(tensor)->get_nested_sizes();
+}
+
+} // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PadNd.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PadNd.h
new file mode 100644
index 0000000000000000000000000000000000000000..573d1a7b88ab7367858df90b6adfebfa9b97d5e9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PadNd.h
@@ -0,0 +1,28 @@
+#pragma once
+#include <c10/util/Exception.h>
+#include <c10/util/string_view.h>
+
+namespace at {
+
+enum class padding_mode {
+  reflect,
+  replicate,
+  circular,
+  constant,
+};
+
+static inline c10::string_view padding_mode_string(padding_mode m) { // maps each enumerator to its lowercase name
+  switch (m) {
+    case padding_mode::reflect:
+      return "reflect";
+    case padding_mode::replicate:
+      return "replicate";
+    case padding_mode::circular:
+      return "circular";
+    case padding_mode::constant:
+      return "constant";
+  }
+  TORCH_CHECK(false, "Invalid padding mode (", static_cast<int64_t>(m), ")"); // reached only when m matches no enumerator above
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel-inl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel-inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5e682281abe52215288d53fc5c9552d3dd2d483
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel-inl.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <c10/util/Exception.h>
+#include <c10/util/ParallelGuard.h>
+#include <c10/util/SmallVector.h>
+
+namespace at {
+
+template <class F>
+inline void parallel_for(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const F& f) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grain_size >= 0);
+  if (begin >= end) {
+    return; // empty range: f is never invoked
+  }
+
+#ifdef INTRA_OP_PARALLEL
+  at::internal::lazy_init_num_threads();
+  const auto numiter = end - begin;
+  const bool use_parallel = // parallel only if every condition below holds
+      (numiter > grain_size && numiter > 1 && !at::in_parallel_region() &&
+       at::get_num_threads() > 1);
+  if (!use_parallel) {
+    internal::ThreadIdGuard tid_guard(0); // thread number is 0 while f runs
+    c10::ParallelGuard guard(true);
+    f(begin, end); // whole range in one call on the calling thread
+    return;
+  }
+
+  internal::invoke_parallel(
+      begin, end, grain_size, [&](int64_t begin, int64_t end) { // these parameters shadow the outer begin/end
+        c10::ParallelGuard guard(true);
+        f(begin, end);
+      });
+#else
+  internal::ThreadIdGuard tid_guard(0);
+  c10::ParallelGuard guard(true);
+  f(begin, end); // INTRA_OP_PARALLEL not defined: always a single serial call
+#endif
+}
+
+template <class scalar_t, class F, class SF>
+inline scalar_t parallel_reduce(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const scalar_t ident,
+    const F& f,
+    const SF& sf) {
+  TORCH_CHECK(grain_size >= 0);
+  if (begin >= end) {
+    return ident; // empty range: neither f nor sf is called
+  }
+
+#ifdef INTRA_OP_PARALLEL
+  at::internal::lazy_init_num_threads();
+  const auto max_threads = at::get_num_threads();
+  const bool use_parallel = // parallel only if every condition below holds
+      ((end - begin) > grain_size && !at::in_parallel_region() &&
+       max_threads > 1);
+  if (!use_parallel) {
+    internal::ThreadIdGuard tid_guard(0);
+    c10::ParallelGuard guard(true);
+    return f(begin, end, ident); // serial path: sf is not needed
+  }
+
+  c10::SmallVector<scalar_t, 64> results(max_threads, ident); // one slot per thread, each starting at ident
+  internal::invoke_parallel(
+      begin,
+      end,
+      grain_size,
+      [&](const int64_t my_begin, const int64_t my_end) {
+        const auto tid = at::get_thread_num();
+        c10::ParallelGuard guard(true);
+        results[tid] = f(my_begin, my_end, ident); // NOTE(review): overwrites the slot; presumably one chunk per thread -- confirm
+      });
+
+  scalar_t result = ident;
+  for (auto partial_result : results) {
+    result = sf(result, partial_result); // slots never written still hold ident
+  }
+  return result;
+#else
+  internal::ThreadIdGuard tid_guard(0);
+  c10::ParallelGuard guard(true);
+  return f(begin, end, ident); // INTRA_OP_PARALLEL not defined: always a single serial call
+#endif
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff14f568d22a6e0d319bedb4e68194cd0971259e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Parallel.h
@@ -0,0 +1,160 @@
+#pragma once
+#include <ATen/Config.h>
+#include <c10/macros/Macros.h>
+#include <functional>
+#include <string>
+
+namespace at {
+
+inline int64_t divup(int64_t x, int64_t y) {
+  return (x + y - 1) / y; // x / y rounded up, for x >= 0 and y > 0 (x + y - 1 must not overflow)
+}
+
+// Called during new thread initialization
+TORCH_API void init_num_threads();
+
+// Sets the number of threads to be used in parallel region
+TORCH_API void set_num_threads(int);
+
+// Returns the maximum number of threads that may be used in a parallel region
+TORCH_API int get_num_threads();
+
+// Returns the current thread number (starting from 0)
+// in the current parallel region, or 0 in the sequential region
+TORCH_API int get_thread_num();
+
+// Checks whether the code runs in parallel region
+TORCH_API bool in_parallel_region();
+
+namespace internal {
+
+// Initialise num_threads lazily at first parallel call (tracked per thread)
+inline void lazy_init_num_threads() {
+  thread_local bool init = false; // thread_local: each thread has its own flag
+  if (C10_UNLIKELY(!init)) {
+    at::init_num_threads();
+    init = true; // later calls on this thread skip the initialisation
+  }
+}
+
+TORCH_API void set_thread_num(int);
+
+class TORCH_API ThreadIdGuard { // scoped override of the current thread number
+ public:
+  ThreadIdGuard(int new_id) : old_id_(at::get_thread_num()) {
+    set_thread_num(new_id); // the previous number was captured in old_id_
+  }
+
+  ~ThreadIdGuard() {
+    set_thread_num(old_id_); // restore the number seen at construction
+  }
+
+ private:
+  int old_id_; // thread number in effect before this guard was created
+};
+
+} // namespace internal
+
+/*
+parallel_for
+
+begin: index at which to start applying user function
+
+end: index at which to stop applying user function
+
+grain_size: number of elements per chunk. impacts the degree of parallelization
+
+f: user function applied in parallel to the chunks, signature:
+  void f(int64_t begin, int64_t end)
+
+Warning: parallel_for does NOT copy thread local
+states from the current thread to the worker threads.
+This means for example that Tensor operations CANNOT be used in the
+body of your function, only data pointers.
+*/
+template <class F>
+inline void parallel_for(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const F& f);
+
+/*
+parallel_reduce
+
+begin: index at which to start applying reduction
+
+end: index at which to stop applying reduction
+
+grain_size: number of elements per chunk. impacts number of elements in
+intermediate results tensor and degree of parallelization.
+
+ident: identity for binary combination function sf. sf(ident, x) needs to return
+x.
+
+f: function for reduction over a chunk. f needs to be of signature scalar_t
+f(int64_t partial_begin, int64_t partial_end, scalar_t identity)
+
+sf: function to combine two partial results. sf needs to be of signature
+scalar_t sf(scalar_t x, scalar_t y)
+
+For example, you might have a tensor of 10000 entries and want to sum together
+all the elements. Parallel_reduce with a grain_size of 2500 will then allocate
+an intermediate result tensor with 4 elements. Then it will execute the function
+"f" you provide and pass the beginning and end index of these chunks, so
+0-2499, 2500-4999, etc. and the combination identity. It will then write out
+the result from each of these chunks into the intermediate result tensor. After
+that it'll reduce the partial results from each chunk into a single number using
+the combination function sf and the identity ident. For a total summation this
+would be "+" and 0 respectively. This is similar to tbb's approach [1], where
+you need to provide a function to accumulate a subrange, a function to combine
+two partial results and an identity.
+
+Warning: parallel_reduce does NOT copy thread local
+states from the current thread to the worker threads.
+This means for example that Tensor operations CANNOT be used in the
+body of your function, only data pointers.
+
+[1] https://software.intel.com/en-us/node/506154
+*/
+template <class scalar_t, class F, class SF>
+inline scalar_t parallel_reduce(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const scalar_t ident,
+    const F& f,
+    const SF& sf);
+
+// Returns a detailed string describing parallelization settings
+TORCH_API std::string get_parallel_info();
+
+// Sets number of threads used for inter-op parallelism
+TORCH_API void set_num_interop_threads(int);
+
+// Returns the number of threads used for inter-op parallelism
+TORCH_API int get_num_interop_threads();
+
+// Launches inter-op parallel task
+TORCH_API void launch(std::function<void()> func);
+namespace internal {
+void launch_no_thread_state(std::function<void()> fn);
+} // namespace internal
+
+// Launches intra-op parallel task
+TORCH_API void intraop_launch(std::function<void()> func);
+
+// Returns number of intra-op threads used by default
+TORCH_API int intraop_default_num_threads();
+
+} // namespace at
+
+#if AT_PARALLEL_OPENMP
+#include <ATen/ParallelOpenMP.h> // IWYU pragma: keep
+#elif AT_PARALLEL_NATIVE
+#include <ATen/ParallelNative.h> // IWYU pragma: keep
+#elif AT_PARALLEL_NATIVE_TBB
+#include <ATen/ParallelNativeTBB.h> // IWYU pragma: keep
+#endif
+
+#include <ATen/Parallel-inl.h> // IWYU pragma: keep
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelNative.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelNative.h
new file mode 100644
index 0000000000000000000000000000000000000000..8df093a99065f3a02490d5ec7747112454b6b44b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelNative.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <exception>
+
+#include <c10/util/Exception.h>
+
+#define INTRA_OP_PARALLEL
+
+namespace at::internal {
+
+TORCH_API void invoke_parallel(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const std::function<void(int64_t, int64_t)>& f);
+
+} // namespace at::internal
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SavedTensorHooks.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SavedTensorHooks.h
new file mode 100644
index 0000000000000000000000000000000000000000..af821cb908c6a6675d21c3543809f9668ffccb64
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SavedTensorHooks.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <c10/macros/Export.h>
+#include <c10/util/Optional.h>
+#include <c10/util/python_stub.h>
+#include <stack>
+#include <string>
+
+#include <utility>
+
+namespace at {
+
+namespace impl {
+
+struct TORCH_API SavedTensorDefaultHooksTLS {
+  // PyObject is defined in c10/util/python_stub.h
+  std::stack<std::pair<PyObject*, PyObject*>> stack;
+
+  // See NOTE: [Disabling SavedTensorDefaultHooks] for context
+  // NOTE: [disabled_error_message invariant]
+  // disabled_error_message is nullopt IFF Saved Tensor hooks are enabled
+  // We did this for efficiency (so we didn't have to keep a separate bool
+  // around)
+  c10::optional<std::string> disabled_error_message;
+};
+
+} // namespace impl
+
+struct TORCH_API SavedTensorDefaultHooks {
+  static void push_hooks(PyObject* pack_hook, PyObject* unpack_hook);
+  static void pop_hooks();
+  static std::pair<PyObject*, PyObject*> get_hooks();
+  static void lazy_initialize();
+  static std::stack<std::pair<PyObject*, PyObject*>> get_stack();
+  static void set_stack(std::stack<std::pair<PyObject*, PyObject*>>);
+
+  static const impl::SavedTensorDefaultHooksTLS& get_tls_state();
+  static void set_tls_state(const impl::SavedTensorDefaultHooksTLS& tls);
+
+  // NOTE: [Disabling SavedTensorDefaultHooks]
+  // A developer of a PyTorch feature may choose to disable SavedTensorDefault
+  // hooks, especially if their feature does not work with it. If they are
+  // disabled, then the following will raise an error:
+  // - Attempting to push_hooks
+  // - calling disable(message) with a non-zero stack (from get_stack) size
+  static void disable(const std::string& error_message);
+  static void enable();
+  static bool is_enabled();
+  static const c10::optional<std::string>& get_disabled_error_message();
+};
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorAccessor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorAccessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..528ed7b8762be5f681c759a5ce8a90aa8d4225d7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorAccessor.h
@@ -0,0 +1,2 @@
+#pragma once
+#include <ATen/core/TensorAccessor.h>
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIteratorInternal.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIteratorInternal.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec0cb6c8fdfcb2a36139035340d75d96a7930dfc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIteratorInternal.h
@@ -0,0 +1,72 @@
+#pragma once
+#include <ATen/native/TensorIterator.h>
+#include <c10/util/SmallBuffer.h>
+#include <c10/util/irange.h>
+
+namespace at {
+
+struct DimCounter {
+  DimCounter(IntArrayRef shape, Range range);
+
+  void increment(const std::array<int64_t, 2>& step);
+  bool is_done() const;
+  std::array<int64_t, 2> max_2d_step() const;
+
+  IntArrayRef shape;
+  Range range;
+  c10::SmallBuffer<int64_t, 4> values;
+  int64_t offset;
+};
+
+namespace internal {
+
+inline void get_data_ptrs(
+    char** ptrs,
+    ArrayRef<char*> base,
+    IntArrayRef strides,
+    IntArrayRef counter) {
+  const auto ntensors = base.size();
+  const auto ndim = counter.size();
+  std::copy(base.begin(), base.end(), ptrs); // every output pointer starts at its base pointer
+  for (const auto dim : c10::irange(ndim)) {
+    int64_t value = counter[dim];
+    for (const auto arg : c10::irange(ntensors)) {
+      ptrs[arg] += value * strides[dim * ntensors + arg]; // strides is read as a flattened [dim][arg] table
+    }
+  }
+}
+
+inline void serial_for_each(
+    IntArrayRef shape,
+    IntArrayRef strides,
+    char** base_ptrs,
+    size_t ntensors,
+    typename TensorIteratorBase::loop2d_t loop,
+    Range range) {
+  const auto ndim = shape.size();
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      strides.size() == ntensors * std::max(size_t{2}, ndim)); // strides holds at least two dims' worth of entries
+
+  if (ndim <= 1) {
+    if (range.begin == 0) {
+      loop(base_ptrs, strides.data(), range.size(), 1); // range starts at 0: base pointers are used as is
+    } else {
+      c10::SmallBuffer<char*, 4> ptrs(ntensors);
+      get_data_ptrs(ptrs.data(), {base_ptrs, ntensors}, strides, {range.begin}); // offset the pointers to range.begin
+      loop(ptrs.data(), strides.data(), range.size(), 1);
+    }
+  } else {
+    c10::SmallBuffer<char*, 4> ptrs(ntensors);
+    auto counter = DimCounter(shape, range);
+    while (!counter.is_done()) { // one loop() call per 2d step until the range is consumed
+      get_data_ptrs(
+          ptrs.data(), {base_ptrs, ntensors}, strides, counter.values);
+      auto step = counter.max_2d_step();
+      loop(ptrs.data(), strides.data(), step[0], step[1]);
+      counter.increment(step);
+    }
+  }
+}
+
+} // namespace internal
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorMeta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorMeta.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c5003a676d80fea79e7facab42a2818d9e2aa74
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorMeta.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include <ATen/DimVector.h>
+#include <ATen/core/Dimname.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/strides.h>
+
+namespace at {
+
+class Tensor;
+
+namespace impl {
+
+// Use this to define the prototype for a meta function.  There are two
+// versions; one that takes one argument (just the operator name), or FUNC2
+// variant that takes two arguments (operator name and overload name).
+//
+// Example usage:
+//
+//    TORCH_META_FUNC2(add, Tensor) (
+//      const Tensor& self, const Tensor& other
+//    ) {
+//      ... compute sizes and options ...
+//      set_output(sizes, options);
+//    }
+//
+#define TORCH_META_FUNC(name) void structured_##name::meta
+#define TORCH_META_FUNC2(name, overload) \
+  void structured_##name##_##overload::meta
+
+// These are versions of TORCH_META_FUNC(2) that include a precompute_out struct
+// as a return value. They should be used when the kernel in question has
+// precomputed values declared in native_functions.yaml and the corresponding
+// implementation should return an instance of the aforementioned struct.
+#define TORCH_PRECOMPUTE_META_FUNC(name) \
+  structured_##name::meta_return_ty structured_##name::meta
+#define TORCH_PRECOMPUTE_META_FUNC2(name, overload) \
+  structured_##name##_##overload::meta_return_ty    \
+      structured_##name##_##overload::meta
+
+// Use this to create a precompute struct in a meta function.
+#define TORCH_PRECOMPUTE_STRUCT(name) structured_##name::precompute_out<>
+#define TORCH_PRECOMPUTE_STRUCT2(name, overload) \
+  structured_##name##_##overload::precompute_out<>
+
+// Use this to define the prototype for an implementation.  This takes only
+// one argument, which is the name of the dispatch key entry you're
+// implementing.
+//
+// Example usage:
+//
+//    TORCH_IMPL_FUNC(add_cpu) (
+//      Tensor& result, const Tensor& self, const Tensor& other
+//    ) {
+//      ... do the actual implementation ...
+//    }
+//
+#define TORCH_IMPL_FUNC(name) void structured_##name::impl
+
+// Base class for all structured kernel classes.  The set_output virtual
+// method is varied depending whether or not the operator is
+// functional/out/inplace, and could also be specialized for CPU/CUDA/etc
+// (although presently it isn't).
+//
+// A notable subclass of this interface is TensorIteratorBase.
+struct TORCH_API MetaBase {
+  MetaBase() = default;
+  MetaBase(const MetaBase&) = default;
+  MetaBase& operator=(const MetaBase&) = default;
+  MetaBase(MetaBase&&) noexcept = default;
+  MetaBase& operator=(MetaBase&&) noexcept = default;
+  virtual const Tensor& maybe_get_output(int64_t output_idx) = 0;
+
+  // Note: [set_output_*]
+  // See: https://github.com/pytorch/pytorch/issues/69813
+  // Whenever defining the output properties in the META function of a
+  // structured kernel (what was usually done with `set_output`), use one of
+  // these 3 variants, instead. In order to decide which variant to use, check
+  // the following decision tree:
+  //
+  // - Can the kernel you are going to implement support output tensors
+  //   with arbitrary strides?
+  //     |
+  //     -- YES: `set_output_raw_strided`
+  //     |
+  //     -- NO: Should the output tensor strides be contiguous?
+  //         |
+  //         -- YES: `set_output_contiguous`
+  //         |
+  //         -- NO: `set_output_strided`
+  //
+  // Use this function whenever the kernel requires specific strides for the
+  // output. If `strides` does not match the given output strides, proxy outputs
+  // will be created and passed to the IMPL function. (This default asserts.)
+  virtual void set_output_strided(
+      int64_t output_idx,
+      IntArrayRef sizes,
+      IntArrayRef strides,
+      TensorOptions options,
+      DimnameList names = {}) {
+    TORCH_INTERNAL_ASSERT(false, "set_output_strided not implemented.");
+  }
+
+  // Use this function whenever the kernel knows how to handle arbitrary strided
+  // outputs. This function has the same behavior as the old `set_output`: it
+  // will only re-stride if the given output was resized. (This default asserts.)
+  virtual void set_output_raw_strided(
+      int64_t output_idx,
+      IntArrayRef sizes,
+      IntArrayRef strides_hint,
+      TensorOptions options,
+      DimnameList names = {}) {
+    TORCH_INTERNAL_ASSERT(false, "set_output_raw_strided not implemented.");
+  }
+
+  // Use this function if the kernel requires contiguous strides.
+  // Alias for `set_output_strided`, but with contiguous strides.
+  void set_output_contiguous(
+      int64_t output_idx,
+      IntArrayRef sizes,
+      TensorOptions options,
+      DimnameList names = {}) {
+    auto strides = c10::contiguous_strides(sizes);
+    set_output_strided(output_idx, sizes, strides, options, names);
+  }
+
+  // Returns a reference to an undefined tensor if there is no presupplied
+  // output. Convenience overload: forwards to maybe_get_output(0).
+  const Tensor& maybe_get_output() {
+    return maybe_get_output(0);
+  }
+  virtual ~MetaBase() = default;
+};
+
+} // namespace impl
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorNames.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorNames.h
new file mode 100644
index 0000000000000000000000000000000000000000..616efc14d2599d7f1a9f73f04fdf960e05bfcf2e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorNames.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <ATen/WrapDimUtils.h>
+
+namespace at::namedinference {
+
+// TensorName and TensorNames are wrappers around Dimname and DimnameList
+// that contain helper functions to make writing name inference rules easier.
+//
+// A TensorName represents a Dimname associated with some DimnameList (from a
+// Tensor). This encapsulates all the information that is needed to check if
+// names *match* and to *unify* names.
+//
+// Definition: Two names in two tensors *match* if they are equal, or if at
+// least one of them is a wildcard that can be *refined* to the other name.
+//
+// Definition: unify(name, other) fails if the names do not match. Otherwise,
+// it returns the most refined of name and other.
+//
+// Here is an example of checking if two names match.
+// tensor: Tensor[A, None]
+// other: Tensor[A]
+//
+// Let's say we wish to check if tensor.names[-1] matches other.names[-1].
+// None (in tensor) cannot match A (in other) because if the None were refined
+// to A, `tensor` would have duplicate names [A, A]. Therefore we need to check
+// tensor.names [A, None] for the existence of A.
+struct TORCH_API TensorName {
+  explicit TensorName(ArrayRef<Dimname> origin, int origin_idx)
+      : origin_(origin),
+        name_(origin[maybe_wrap_dim(
+            origin_idx,
+            static_cast<int64_t>(origin.size()))]),
+        origin_idx_(origin_idx) {} // kept as passed in; name_ uses the wrapped index
+
+  // op_name is only used for error reporting.
+  const TensorName& unify(const TensorName& other, const char* op_name) const;
+  Dimname toDimname() const;
+
+ private:
+  ArrayRef<Dimname> origin_; // the full name list this name was taken from
+  Dimname name_; // origin_[origin_idx after maybe_wrap_dim]
+  int origin_idx_; // A named tensor can have at most 64 dims.
+
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const TensorName& tensorname);
+};
+
+using TensorNameVec = SmallVector<TensorName, 10>;
+
+struct TORCH_API TensorNames {
+  explicit TensorNames(ArrayRef<Dimname> names);
+
+  // Create TensorNames from names[start:end]. Each individual TensorName stores
+  // `names`, NOT names[start:end], because the original tensor's names are
+  // `names`.
+  explicit TensorNames(ArrayRef<Dimname> names, int64_t start, int64_t end);
+
+  // op_name is only used for error reporting.
+  TensorNames& unifyFromRightInplace(
+      const TensorNames& other,
+      const char* op_name = "unify");
+  void checkUnique(const char* op_name) const;
+
+  void append(TensorName name);
+  std::vector<Dimname> toDimnameVec() const;
+
+ private:
+  explicit TensorNames(TensorNameVec&& names) : names_(std::move(names)) {}
+
+  TensorNameVec names_; // the wrapped TensorName entries
+};
+
+} // namespace at::namedinference
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOperators.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOperators.h
new file mode 100644
index 0000000000000000000000000000000000000000..7567af4cbfe466843b1d48d78ffd259035cd62dc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOperators.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty_like.h>
+#endif
+
+namespace at {
+
+#define AT_FORALL_BINARY_OPS(_)                                             \
+  _(+, x.add(y), y.add(x))                                                  \
+  _(*, x.mul(y), y.mul(x))                                                  \
+  _(-,                                                                      \
+    x.sub(y),                                                               \
+    ::at::empty_like(y, at::MemoryFormat::Preserve).fill_(x).sub_(y))       \
+  _(/,                                                                      \
+    x.div(y),                                                               \
+    ::at::empty_like(y, at::MemoryFormat::Preserve).fill_(x).div_(y))       \
+  _(%,                                                                      \
+    x.remainder(y),                                                         \
+    ::at::empty_like(y, at::MemoryFormat::Preserve).fill_(x).remainder_(y)) \
+  _(&, x.bitwise_and(y), y.bitwise_and(x))                                  \
+  _(|, x.bitwise_or(y), y.bitwise_or(x))                                    \
+  _(^, x.bitwise_xor(y), y.bitwise_xor(x))                                  \
+  _(<, x.lt(y), y.gt(x))                                                    \
+  _(<=, x.le(y), y.ge(x))                                                   \
+  _(>, x.gt(y), y.lt(x))                                                    \
+  _(>=, x.ge(y), y.le(x))                                                   \
+  _(==, x.eq(y), y.eq(x))                                                   \
+  _(!=, x.ne(y), y.ne(x))
+
+#define DEFINE_OPERATOR(op, body, reverse_scalar_body)                 \
+  static inline Tensor operator op(const Tensor& x, const Tensor& y) { \
+    return body;                                                       \
+  }                                                                    \
+  static inline Tensor operator op(const Tensor& x, const Scalar& y) { \
+    return body;                                                       \
+  }                                                                    \
+  static inline Tensor operator op(const Scalar& x, const Tensor& y) { \
+    return reverse_scalar_body;                                        \
+  }
+
+AT_FORALL_BINARY_OPS(DEFINE_OPERATOR)
+#undef DEFINE_OPERATOR
+#undef AT_FORALL_BINARY_OPS
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOptions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3edba8efdf726cea92059cb01e34ee25206482c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorOptions.h
@@ -0,0 +1,2 @@
+#pragma once
+#include <c10/core/TensorOptions.h>
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..4615ab50606ee963c48c5673c93196b4188aa9b2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorUtils.h
@@ -0,0 +1,190 @@
+#pragma once
+
+#include <ATen/DimVector.h>
+#include <ATen/EmptyTensor.h>
+#include <ATen/Tensor.h>
+#include <ATen/TensorGeometry.h>
+#include <ATen/Utils.h>
+
+#include <utility>
+
+// These functions are NOT in Utils.h, because this file has a dep on Tensor.h
+
+#define TORCH_CHECK_TENSOR_ALL(cond, ...) \
+  TORCH_CHECK((cond)._is_all_true().item<bool>(), __VA_ARGS__);
+
+namespace at {
+
+// The following are utility functions for checking that arguments
+// make sense.  These are particularly useful for native functions,
+// which do NO argument checking by default.
+
+struct TORCH_API TensorArg {
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const Tensor& tensor; // borrowed: the Tensor must outlive this TensorArg
+  const char* name; // stored as a raw pointer; the string is not copied
+  int pos; // 1-indexed
+  TensorArg(const Tensor& tensor, const char* name, int pos)
+      : tensor(tensor), name(name), pos(pos) {}
+  // Try to mitigate any possibility of dangling reference to temporaries.
+  // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
+  TensorArg(Tensor&& tensor, const char* name, int pos) = delete;
+  const Tensor* operator->() const { // member access on the referenced tensor
+    return &tensor;
+  }
+  const Tensor& operator*() const { // the referenced tensor itself
+    return tensor;
+  }
+};
+
+struct TORCH_API TensorGeometryArg {
+  TensorGeometry tensor; // held by value (TensorArg holds a reference instead)
+  const char* name; // stored as a raw pointer; the string is not copied
+  int pos; // 1-indexed
+  /* implicit */ TensorGeometryArg(TensorArg arg) // builds geometry from arg.tensor
+      : tensor(TensorGeometry{arg.tensor}), name(arg.name), pos(arg.pos) {}
+  TensorGeometryArg(TensorGeometry tensor, const char* name, int pos)
+      : tensor(std::move(tensor)), name(name), pos(pos) {}
+  const TensorGeometry* operator->() const { // member access on the geometry
+    return &tensor;
+  }
+  const TensorGeometry& operator*() const { // the stored geometry itself
+    return tensor;
+  }
+};
+
+// A string describing which function did checks on its input
+// arguments.
+// TODO: Consider generalizing this into a call stack.
+using CheckedFrom = const char*;
+
+// The undefined convention: singular operators assume their arguments
+// are defined, but functions which take multiple tensors will
+// implicitly filter out undefined tensors (to make it easier to perform
+// tests which should apply if the tensor is defined, and should not
+// otherwise.)
+//
+// NB: This means that the n-ary operators take lists of TensorArg,
+// not TensorGeometryArg, because the Tensor to TensorGeometry
+// conversion will blow up if you have undefined tensors.
+
+TORCH_API std::ostream& operator<<(
+    std::ostream& out,
+    const TensorGeometryArg& t);
+TORCH_API void checkDim(
+    CheckedFrom c,
+    const Tensor& tensor,
+    const char* name,
+    int pos, // 1-indexed
+    int64_t dim);
+TORCH_API void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim);
+// NB: this is an inclusive-exclusive range
+TORCH_API void checkDimRange(
+    CheckedFrom c,
+    const TensorGeometryArg& t,
+    int64_t dim_start,
+    int64_t dim_end);
+TORCH_API void checkSameDim(
+    CheckedFrom c,
+    const TensorGeometryArg& t1,
+    const TensorGeometryArg& t2);
+TORCH_API void checkContiguous(CheckedFrom c, const TensorGeometryArg& t);
+TORCH_API void checkAllContiguous(CheckedFrom c, at::ArrayRef<TensorArg> ts);
+TORCH_API void checkSize(
+    CheckedFrom c,
+    const TensorGeometryArg& t,
+    IntArrayRef sizes);
+TORCH_API void checkSize_symint(
+    CheckedFrom c,
+    const TensorGeometryArg& t,
+    c10::SymIntArrayRef sizes);
+TORCH_API void checkSize(
+    CheckedFrom c,
+    const TensorGeometryArg& t,
+    int64_t dim,
+    int64_t size);
+TORCH_API void checkSize_symint(
+    CheckedFrom c,
+    const TensorGeometryArg& t,
+    int64_t dim,
+    const c10::SymInt& size);
+TORCH_API void checkNumel(
+    CheckedFrom c,
+    const TensorGeometryArg& t,
+    int64_t numel);
+TORCH_API void checkSameNumel(
+    CheckedFrom c,
+    const TensorArg& t1,
+    const TensorArg& t2);
+TORCH_API void checkAllSameNumel(CheckedFrom c, ArrayRef<TensorArg> tensors);
+TORCH_API void checkScalarType(CheckedFrom c, const TensorArg& t, ScalarType s);
+TORCH_API void checkScalarTypes(
+    CheckedFrom c,
+    const TensorArg& t,
+    at::ArrayRef<ScalarType> l);
+TORCH_API void checkSameGPU(
+    CheckedFrom c,
+    const TensorArg& t1,
+    const TensorArg& t2);
+TORCH_API void checkAllSameGPU(CheckedFrom c, ArrayRef<TensorArg> tensors);
+TORCH_API void checkSameType(
+    CheckedFrom c,
+    const TensorArg& t1,
+    const TensorArg& t2);
+TORCH_API void checkAllSameType(CheckedFrom c, ArrayRef<TensorArg> tensors);
+TORCH_API void checkSameSize(
+    CheckedFrom c,
+    const TensorArg& t1,
+    const TensorArg& t2);
+TORCH_API void checkAllSameSize(CheckedFrom c, ArrayRef<TensorArg> tensors);
+TORCH_API void checkDefined(CheckedFrom c, const TensorArg& t);
+TORCH_API void checkAllDefined(CheckedFrom c, at::ArrayRef<TensorArg> t);
+
+// FixMe: does TensorArg slow things down?
+TORCH_API void checkBackend(
+    CheckedFrom c,
+    at::ArrayRef<Tensor> t,
+    at::Backend backend);
+
+TORCH_API void checkDeviceType(
+    CheckedFrom c,
+    at::ArrayRef<Tensor> tensors,
+    at::DeviceType device_type);
+
+TORCH_API void checkLayout(CheckedFrom c, const Tensor& t, Layout layout);
+
+TORCH_API void checkLayout(
+    CheckedFrom c,
+    at::ArrayRef<Tensor> tensors,
+    at::Layout layout);
+
+// Methods for getting data_ptr if tensor is defined
+TORCH_API void* maybe_data_ptr(const Tensor& tensor);
+TORCH_API void* maybe_data_ptr(const TensorArg& tensor);
+
+TORCH_API void check_dim_size(
+    const Tensor& tensor,
+    int64_t dim,
+    int64_t dim_size,
+    int64_t size);
+
+namespace detail {
+TORCH_API std::vector<int64_t> defaultStrides(IntArrayRef sizes);
+
+TORCH_API c10::optional<std::vector<int64_t>> computeStride(
+    IntArrayRef oldshape,
+    IntArrayRef oldstride,
+    IntArrayRef newshape);
+
+TORCH_API c10::optional<SymDimVector> computeStride(
+    c10::SymIntArrayRef oldshape,
+    c10::SymIntArrayRef oldstride,
+    c10::SymIntArrayRef newshape);
+
+TORCH_API c10::optional<DimVector> computeStride(
+    IntArrayRef oldshape,
+    IntArrayRef oldstride,
+    const DimVector& newshape);
+
+} // namespace detail
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Utils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..17826b332afbcf9f2e2a328ed3a938cc6fecce74
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Utils.h
@@ -0,0 +1,138 @@
+#pragma once
+
+#include <ATen/EmptyTensor.h>
+#include <ATen/Formatting.h>
+#include <ATen/core/ATenGeneral.h>
+#include <ATen/core/Generator.h>
+#include <c10/core/ScalarType.h>
+#include <c10/core/StorageImpl.h>
+#include <c10/core/UndefinedTensorImpl.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Exception.h>
+#include <c10/util/accumulate.h>
+#include <c10/util/irange.h>
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <sstream>
+#include <typeinfo>
+
+#define AT_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&) = delete;         \
+  void operator=(const TypeName&) = delete
+
+namespace at {
+
+TORCH_API int _crash_if_asan(int);
+
+// Converts a TensorList (i.e. ArrayRef<Tensor>) to a vector of TensorImpl*.
+// NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.
+// Once cat is ported entirely to ATen this can be deleted!
+static inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
+    ArrayRef<Tensor> tensors,
+    const char* name,
+    int pos,
+    c10::DeviceType device_type,
+    ScalarType scalar_type) {
+  std::vector<TensorImpl*> impls;
+  impls.reserve(tensors.size());
+  for (const auto idx : c10::irange(tensors.size())) {
+    const Tensor& elem = tensors[idx];
+    if (elem.layout() != Layout::Strided) { // must be a strided (dense) tensor
+      AT_ERROR(
+          "Expected dense tensor but got ",
+          elem.layout(),
+          " for sequence element ",
+          idx,
+          " in sequence argument at position #",
+          pos,
+          " '",
+          name,
+          "'");
+    }
+    if (elem.device().type() != device_type) { // must be on the expected device
+      AT_ERROR(
+          "Expected object of device type ",
+          device_type,
+          " but got device type ",
+          elem.device().type(),
+          " for sequence element ",
+          idx,
+          " in sequence argument at position #",
+          pos,
+          " '",
+          name,
+          "'");
+    }
+    if (elem.scalar_type() != scalar_type) { // must have the expected dtype
+      AT_ERROR(
+          "Expected object of scalar type ",
+          scalar_type,
+          " but got scalar type ",
+          elem.scalar_type(),
+          " for sequence element ",
+          idx,
+          " in sequence argument at position #",
+          pos,
+          " '",
+          name,
+          "'");
+    }
+    impls.push_back(elem.unsafeGetTensorImpl());
+  }
+  return impls;
+}
+
+template <size_t N>
+std::array<int64_t, N> check_intlist(
+    ArrayRef<int64_t> list,
+    const char* name,
+    int pos) {
+  if (list.empty()) {
+    // TODO: is this necessary?  We used to treat nullptr-vs-not in IntList
+    // differently with strides as a way of faking optional.
+    list = {};
+  }
+  std::array<int64_t, N> out{};
+  if (N > 1 && list.size() == 1) { // a lone value is copied into all N slots
+    out.fill(list[0]);
+    return out;
+  }
+  if (list.size() != N) {
+    AT_ERROR(
+        "Expected a list of ",
+        N,
+        " ints but got ",
+        list.size(),
+        " for argument #",
+        pos,
+        " '",
+        name,
+        "'");
+  }
+  std::copy_n(list.begin(), N, out.begin());
+  return out;
+}
+
+using at::detail::check_size_nonnegative;
+
+namespace detail {
+
+template <typename T>
+TORCH_API Tensor tensor_cpu(ArrayRef<T> values, const TensorOptions& options);
+
+template <typename T>
+TORCH_API Tensor
+tensor_backend(ArrayRef<T> values, const TensorOptions& options);
+
+template <typename T>
+TORCH_API Tensor
+tensor_complex_cpu(ArrayRef<T> values, const TensorOptions& options);
+
+template <typename T>
+TORCH_API Tensor
+tensor_complex_backend(ArrayRef<T> values, const TensorOptions& options);
+} // namespace detail
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/WrapDimUtilsMulti.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/WrapDimUtilsMulti.h
new file mode 100644
index 0000000000000000000000000000000000000000..58a23f95c0d2c39d4450e0e70adda091278b9079
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/WrapDimUtilsMulti.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <ATen/WrapDimUtils.h>
+#include <c10/core/TensorImpl.h>
+#include <c10/util/irange.h>
+#include <bitset>
+#include <sstream>
+
+namespace at {
+
+// This is in an extra file to work around strange interaction of
+// bitset on Windows with operator overloading
+
+constexpr size_t dim_bitset_size = 64;
+
+static inline std::bitset<dim_bitset_size> dim_list_to_bitset(
+    OptionalIntArrayRef opt_dims,
+    size_t ndims) {
+  TORCH_CHECK(
+      ndims <= dim_bitset_size,
+      "only tensors with up to ",
+      dim_bitset_size,
+      " dims are supported");
+  std::bitset<dim_bitset_size> mask;
+  if (!opt_dims.has_value()) {
+    for (size_t d = 0; d != ndims; ++d) { // no list given: mark every dim
+      mask[d] = true;
+    }
+  } else {
+    const auto dims = opt_dims.value();
+    for (const auto raw_dim : dims) {
+      const size_t dim = maybe_wrap_dim(raw_dim, static_cast<int64_t>(ndims));
+      TORCH_CHECK(
+          !mask[dim],
+          "dim ",
+          dim,
+          " appears multiple times in the list of dims");
+      mask[dim] = true;
+    }
+  }
+  return mask;
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ceil_div.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ceil_div.h
new file mode 100644
index 0000000000000000000000000000000000000000..37d67b232a22c11fa7dccf638b7897c0854ab8bd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ceil_div.h
@@ -0,0 +1,24 @@
+#pragma once
+#include <c10/macros/Macros.h>
+#include <type_traits>
+
+namespace at {
+
+/**
+   Computes ceil(a / b) as (a + b - 1) / b; exact for a >= 0, b > 0 (no overflow check)
+*/
+template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
+C10_ALWAYS_INLINE C10_HOST_DEVICE T ceil_div(T a, T b) {
+  return (a + b - 1) / b;
+}
+
+/**
+   Computes ceil(a / b) * b; i.e., rounds `a` up to the smallest multiple of b
+   that is >= a (via ceil_div, so T must satisfy ceil_div's integral constraint)
+*/
+template <typename T>
+C10_ALWAYS_INLINE C10_HOST_DEVICE T round_up(T a, T b) {
+  return ceil_div(a, b) * b;
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/dlpack.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/dlpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..9601a2478ddde2502581f5b1801557a1b57f3853
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/dlpack.h
@@ -0,0 +1,232 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+/**
+ * \brief Compatibility with C++
+ */
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current version of dlpack */
+#define DLPACK_VERSION 80
+
+/*! \brief The current ABI version of dlpack */
+#define DLPACK_ABI_VERSION 1
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!
+ * \brief The device type in DLDevice.
+ */
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
+typedef enum {
+#endif
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+   */
+  kDLROCMHost = 11,
+  /*!
+   * \brief Reserved extension device type,
+   * used to quickly test extension devices.
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+  /*!
+   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+   */
+  kDLCUDAManaged = 13,
+  /*!
+   * \brief Unified shared memory allocated on a oneAPI non-partitioned
+   * device. Call to oneAPI runtime is required to determine the device
+   * type, the USM allocation type and the sycl context it is bound to.
+   *
+   */
+  kDLOneAPI = 14,
+  /*! \brief GPU support for next generation WebGPU standard. */
+  kDLWebGPU = 15,
+  /*! \brief Qualcomm Hexagon DSP */
+  kDLHexagon = 16,
+} DLDeviceType;
+
+/*!
+ * \brief A Device for Tensor and operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*!
+   * \brief The device index.
+   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
+   */
+  int32_t device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options DLDataType.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+  /*! \brief boolean */
+  kDLBool = 6U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endian-ness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness
+ *
+ *  Examples
+ *   - float: type_code = 2, bits = 32, lanes = 1
+ *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ *   - int8: type_code = 0, bits = 8, lanes = 1
+ *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   * */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The data pointer points to the allocated data. This will be CUDA
+   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+   * types. This pointer is always aligned to 256 bytes as in CUDA. The
+   * `byte_offset` field should be used to point to the beginning of the data.
+   *
+   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+   * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
+   * (after which this note will be updated); at the moment it is recommended
+   * to not rely on the data pointer being correctly aligned.
+   *
+   * For given DLTensor, the size of memory required to store the contents of
+   * data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int32_t ndim;
+  /*! \brief The data type of the pointer*/
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  const int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   *  can be NULL, indicating tensor is compact and row-majored.
+   */
+  const int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+/*!
+ * \brief C Tensor object, manage memory of DLTensor. This data structure is
+ *  intended to facilitate the borrowing of DLTensor by another framework. It is
+ *  not meant to transfer the tensor. When the borrowing framework doesn't need
+ *  the tensor, it should call the deleter to notify the host that the resource
+ *  is no longer needed.
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework of DLManagedTensor in
+   *   which DLManagedTensor is used in the framework. It can also be NULL.
+   */
+  void * manager_ctx;
+  /*! \brief Destructor signature void (*)(void*) - this should be called
+   *   to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
+   *   if there is no way for the caller to provide a reasonable destructor.
+   *   The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor * self);
+} DLManagedTensor;
+#ifdef __cplusplus
+}  // DLPACK_EXTERN_C
+#endif
+#endif  // DLPACK_DLPACK_H_
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/record_function.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/record_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..431ff64cf553f08f92157a0f5f17a8dd8d20acf2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/record_function.h
@@ -0,0 +1,740 @@
+#pragma once
+
+#include <ATen/core/ivalue.h>
+#include <ATen/core/operator_name.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Optional.h>
+#include <c10/util/SmallVector.h>
+
+#include <array>
+#include <functional>
+#include <memory>
+#include <variant>
+
+namespace c10 {
+class TORCH_API OperatorHandle;
+}
+
+namespace at {
+
+// Function name to record NCCL metadata
+extern TORCH_API const std::string kParamCommsCallName;
+
+// Kind of record function scope;
+enum class C10_API_ENUM RecordScope : uint8_t {
+  // c10/ATen ops, autograd nodes
+  FUNCTION = 0,
+  // Functions/nodes called from the autograd
+  BACKWARD_FUNCTION,
+  // TorchScript functions, methods
+  TORCHSCRIPT_FUNCTION,
+  // Kernel Function dtype Tag
+  KERNEL_FUNCTION_DTYPE,
+  // Torchbind custom class
+  CUSTOM_CLASS,
+  // Generic Build Feature
+  BUILD_FEATURE,
+  // Lite Interpreter scope (see RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS)
+  LITE_INTERPRETER,
+  // User defined scope (e.g. with record_function())
+  USER_SCOPE,
+  // Scopes for static runtime, a specialized TorchScript interpreter
+  STATIC_RUNTIME_OP,
+  STATIC_RUNTIME_MODEL,
+  NUM_SCOPES, // must be the last in the list
+};
+
+} // namespace at
+
+namespace std {
+template <>
+struct hash<at::RecordScope> {
+  size_t operator()(const at::RecordScope& sc) const {
+    return static_cast<std::size_t>(sc);
+  }
+};
+} // namespace std
+
+namespace at {
+
+struct TORCH_API StringView {
+  StringView() : StringView(nullptr) {}
+  explicit StringView(const char* str_ptr)
+      : owned_str_ptr_(nullptr), str_ptr_(str_ptr) {}
+  explicit StringView(std::string str)
+      : owned_str_ptr_(std::make_shared<std::string>(std::move(str))),
+        str_ptr_(owned_str_ptr_->c_str()) {}
+
+  const char* str() const {
+    return str_ptr_;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const StringView& dt) {
+    os << dt.str();
+    return os;
+  }
+
+  friend bool operator==(const StringView& lhs, const StringView& rhs) {
+    return strcmp(lhs.str(), rhs.str()) == 0;
+  }
+
+  friend bool operator!=(const StringView& lhs, const StringView& rhs) {
+    return !(lhs == rhs);
+  }
+
+ private:
+  std::shared_ptr<std::string> owned_str_ptr_;
+  const char* str_ptr_;
+};
+
+// Soft limit on the number of callbacks to use;
+constexpr std::size_t kSoftLimitCallbacks = 4;
+
+// An abstract base class for various observer contexts that can be attached to
+// the RecordFunction.
+struct ObserverContext {
+  virtual ~ObserverContext() = default;
+
+ protected:
+  ObserverContext() = default;
+};
+
+typedef c10::SmallVector<uint64_t, kSoftLimitCallbacks> CallbackHandles;
+typedef c10::SmallVector<std::unique_ptr<ObserverContext>, kSoftLimitCallbacks>
+    ObserverContextList;
+typedef uint64_t RecordFunctionHandle;
+struct RecordFunction;
+
+//
+// PyTorch callbacks/observers API:
+//
+
+/**
+ * RecordFunctionCallback represents a pair of callbacks to be used with
+ * RecordFunction, members:
+ *   start, end - the callbacks to run when entering and exiting the scope;
+ *     optionally, the start callback may return an ObserverContext which will
+ *     be passed to the end callback, use appropriate constructor accordingly.
+ *   needs_inputs - whether the callbacks need the inputs passed from the
+ *     observed function/range; NOTE: passing the inputs incurs extra overhead;
+ *   sampling_probability - if not 1.0, then the callback is probabilistically
+ *     sampled to run; NOTE: start and end callbacks always run as a pair and
+ *     are sampled together;
+ *   scopes - types of scopes to execute the callbacks on (see RecordScope);
+ *     an empty set means the callbacks run for all possible scope types;
+ *   should_run - optional function that returns whether this callback should
+ *     run; overwrites the effect of setting sampling_probability
+ */
+class TORCH_API RecordFunctionCallback {
+ public:
+  using StartCallback =
+      std::unique_ptr<ObserverContext> (*)(const RecordFunction&);
+  using EndCallback = void (*)(const RecordFunction&, ObserverContext*);
+
+  // This interface supports observers that require passing an ObserverContext
+  // between start and end callbacks.
+  explicit RecordFunctionCallback(
+      StartCallback start,
+      EndCallback end = nullptr)
+      : start_(start), end_(end) {
+    scopes_.fill(true);
+  }
+
+  RecordFunctionCallback& needsInputs(bool needs_inputs) {
+    needs_inputs_ = needs_inputs;
+    return *this;
+  }
+
+  RecordFunctionCallback& needsOutputs(bool needs_outputs) {
+    needs_outputs_ = needs_outputs;
+    return *this;
+  }
+
+  RecordFunctionCallback& needsIds(bool needs_ids) {
+    needs_ids_ = needs_ids;
+    return *this;
+  }
+
+  RecordFunctionCallback& samplingProb(double sampling_prob) {
+    TORCH_CHECK(
+        sampling_prob >= 0.0 && sampling_prob <= 1.0,
+        "Invalid sampling probability");
+    sampling_prob_ = sampling_prob;
+    return *this;
+  }
+
+  RecordFunctionCallback& scopes(
+      const std::unordered_set<RecordScope, std::hash<RecordScope>>& scopes) {
+    if (!scopes.empty()) {
+      scopes_.fill(false);
+      for (auto sc : scopes) {
+        scopes_[static_cast<size_t>(sc)] = true;
+      }
+    } else {
+      scopes_.fill(true);
+    }
+    return *this;
+  }
+
+  bool needsInputs() const {
+    return needs_inputs_;
+  }
+
+  bool needsOutputs() const {
+    return needs_outputs_;
+  }
+
+  bool needsIds() const {
+    return needs_ids_;
+  }
+
+  double samplingProb() const {
+    return sampling_prob_;
+  }
+
+  bool checkScope(RecordScope sc) const {
+    return scopes_[(size_t)sc];
+  }
+
+  StartCallback start() const {
+    return start_;
+  }
+
+  EndCallback end() const {
+    return end_;
+  }
+
+ private:
+  StartCallback start_;
+  EndCallback end_;
+  double sampling_prob_ = 1.0;
+  std::array<bool, static_cast<size_t>(RecordScope::NUM_SCOPES)> scopes_ = {};
+  bool needs_inputs_ = false;
+  bool needs_outputs_ = false;
+  bool needs_ids_ = false;
+};
+
+// Notes:
+//  - two types of callbacks are provided: thread local and global
+//     - thread local callbacks are added/removed only for the given thread
+//       and are stored locally for each thread and separately from the list
+//       of the global callbacks
+//     - global callbacks are stored in a single per process list and are
+//       invoked by every RecordFunction, in addition to the thread local
+//       callbacks specific to the given thread
+//  - we allow the added callbacks to be sampled, by specifying a sampling
+//    probability for each callback pair, if the start callback is
+//    not picked to run, the corresponding end callback won't be called
+//  - a typical use case for the global callbacks is passive monitoring
+//    in the background (e.g. fleet-wide monitoring), without focusing on
+//    the specific piece of code
+//  - in contrast, thread local callbacks are enabled locally, on demand,
+//    for the specific piece of code (range) and are not sampled
+//  - a typical use case for thread local callbacks is profiler and code
+//    execution tracer
+//  - note, thread local callbacks are automatically propagated with
+//    ThreadLocalState across JIT continuations and async tasks (at::launch)
+
+typedef uint64_t CallbackHandle;
+
+constexpr CallbackHandle INVALID_CALLBACK_HANDLE{0};
+
+// It is unnecessary to use atomic operations for enabling
+// thread-local function callbacks. Moreover, it prevents saving to
+// ThreadLocalState because std::atomic is non-copyable.
+struct RecordFunctionCallbacksEntry {
+  RecordFunctionCallbacksEntry(RecordFunctionCallback cb, CallbackHandle h)
+      : callback_(cb), handle_(h) {}
+
+  RecordFunctionCallback callback_;
+  bool enabled_{true};
+  CallbackHandle handle_;
+};
+
+// Holds pairs (callbacks, unique_id)
+using RecordFunctionCallbacks = std::vector<RecordFunctionCallbacksEntry>;
+
+// Generated by the callback managers to determine which functions to run.
+struct StepCallbacks {
+  StepCallbacks() = default;
+  StepCallbacks(uint64_t thread_id, RecordScope scope)
+      : thread_id_{thread_id}, scope_{scope} {}
+
+  bool empty() const {
+    return callbacks_.empty();
+  }
+
+  struct StartEndPair {
+    RecordFunctionCallback::StartCallback start_;
+    RecordFunctionCallback::EndCallback end_;
+  };
+
+  using StartEndPairs = c10::SmallVector<StartEndPair, kSoftLimitCallbacks>;
+
+  StartEndPairs callbacks_;
+  uint64_t thread_id_{0};
+  RecordScope scope_{RecordScope::FUNCTION};
+  bool needs_inputs_{false};
+  bool needs_outputs_{false};
+  bool needs_ids_{false};
+};
+
+struct TORCH_API RecordFunction {
+  // Constructors are used with a before() function called afterwards:
+  //  scope - record scope that this function tracks
+  //  step_callbacks - already generated set of callbacks to run
+  //    (see StepCallbacks)
+  explicit RecordFunction(RecordScope scope = RecordScope::FUNCTION);
+  explicit RecordFunction(StepCallbacks&& step_callbacks);
+
+  template <typename F>
+  void before(
+      F fn,
+      c10::ArrayRef<const c10::IValue> args,
+      int64_t current_sequence_nr = -1) {
+    if (!isActive()) {
+      return;
+    }
+    inputs_ = args;
+    before(fn, current_sequence_nr);
+  }
+
+  template <typename F>
+  void before(
+      F fn,
+      const std::vector<IValue>* args,
+      int64_t current_sequence_nr = -1) {
+    before(
+        std::move(fn),
+        c10::ArrayRef<const c10::IValue>(args->data(), args->size()),
+        current_sequence_nr);
+  }
+
+  // Destructor calls end callbacks
+  virtual ~RecordFunction();
+
+  RecordFunction(const RecordFunction&) = delete;
+  RecordFunction& operator=(const RecordFunction&) = delete;
+
+  const char* name() const;
+
+  int64_t seqNr() const {
+    return sequence_nr_;
+  }
+
+  c10::ArrayRef<const IValue> inputs() const {
+#ifndef NDEBUG
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        inputs_valid_, "Called inputs() outside RecordFunction start callback");
+#endif
+    return inputs_;
+  }
+
+  const std::vector<c10::IValue>& outputs() const {
+    return outputs_;
+  }
+
+  void setOutputs(std::vector<c10::IValue>&& outputs) {
+    outputs_ = std::move(outputs);
+  }
+
+  void setOutputs(c10::ArrayRef<c10::IValue> outputs) {
+    outputs_ = outputs.vec();
+  }
+
+  size_t num_inputs() const;
+  size_t num_outputs() const;
+
+  // Retrieves the thread_id that this RecordFunction ran start callbacks with.
+  // Useful for writing thread safe end callbacks that may be potentially
+  // executed in a different thread (async ops)
+  uint64_t threadId() const {
+    return step_callbacks_.thread_id_;
+  }
+
+  // For backward functions - thread id of the corresponding forward function,
+  // or zero otherwise;
+  // used alongside the sequence number to correlate backward functions with
+  // the forward ones
+  uint64_t forwardThreadId() const {
+    return fwd_thread_id_;
+  }
+
+  void setForwardThreadId(uint64_t thread_id) {
+    fwd_thread_id_ = thread_id;
+  }
+
+  RecordScope scope() const {
+    return step_callbacks_.scope_;
+  }
+
+  // Returns logical thread_id for the current thread
+  static uint64_t currentThreadId();
+
+  // Internal functions, do not use directly;
+  // used in python's context manager
+
+  // before functions initialize RecordFunction members and call
+  // start callbacks
+  using schema_ref_t = std::reference_wrapper<const c10::FunctionSchema>;
+  void before(const char* name, int64_t sequence_nr = -1);
+  void before(std::string name, int64_t sequence_nr = -1);
+  void before(schema_ref_t schema, int64_t sequence_nr = -1);
+
+  // Sets node ID for distributed profiling
+  static void setDefaultNodeId(int64_t defaultNodeId);
+  // Gets node ID for distributed profiling
+  static int64_t getDefaultNodeId();
+
+  // Calls end callbacks. After end(), accessors will no longer provide useful
+  // results.
+  void end();
+
+  // Internal-only, used only to force an async event for distributed events
+  // profiling.
+  void _setAsync();
+
+  // Returns whether this RecordFunction corresponds to an async event or not.
+  bool isAsync() const;
+
+  // Returns whether this RecordFunction corresponds to NCCL metadata collection
+  // or not.
+  bool isNcclMeta() const {
+    return is_nccl_meta_;
+  }
+
+  // Internal-only, used to denote out variant used for Static Runtime execution
+  void _setStaticRuntimeOutVariant();
+  bool isStaticRuntimeOutVariant() const;
+
+  RecordFunctionHandle handle() const {
+    return handle_;
+  }
+
+  c10::optional<OperatorName> operator_name() const;
+
+  // This method returns a copy of the FunctionSchema and can be expensive.
+  c10::optional<FunctionSchema> operator_schema() const;
+
+  void setHandle(RecordFunctionHandle handle) {
+    handle_ = handle;
+  }
+
+  // Whether this RecordFunction runs any callbacks.
+  bool isActive() const {
+    return !step_callbacks_.empty();
+  }
+
+  bool needsInputs() const {
+    return step_callbacks_.needs_inputs_;
+  }
+
+  bool needsOutputs() const {
+    return step_callbacks_.needs_outputs_;
+  }
+
+  int64_t debugHandle() const {
+    return debug_handle_;
+  }
+
+  void setDebugHandle(int64_t debug_handle) {
+    debug_handle_ = debug_handle;
+  }
+
+  void invalidateInputs() {
+#ifndef NDEBUG
+    inputs_valid_ = false;
+#endif
+  }
+
+ private:
+  void runStartCallbacks();
+
+  StepCallbacks step_callbacks_;
+
+  // In cases when RecordFunction might be active but we chose not to
+  // use the observers (e.g. operator is not observed), this boolean
+  // flag is used to check whether the start callbacks were called
+  bool called_start_callbacks_ = false;
+
+#ifndef NDEBUG
+  bool inputs_valid_ = false;
+#endif
+
+  // Stores various ObserverContext objects with event metadata for callbacks.
+  ObserverContextList ctx_;
+
+  std::variant<std::string, schema_ref_t> fn_;
+
+  int64_t sequence_nr_ = -1;
+  c10::ArrayRef<const IValue> inputs_;
+  std::vector<c10::IValue> outputs_;
+
+  // For backward functions - thread id of the forward function
+  uint64_t fwd_thread_id_ = 0;
+
+  // Unique id for this RecordFunction, used in callbacks to track start
+  // and end of ranges
+  RecordFunctionHandle handle_{0};
+
+  // Whether this record_function corresponds to an async event or not. Async
+  // events can complete in different threads or follow a future-like pattern
+  // of use.
+  bool is_async_{false};
+
+  // Debug handles are used for lazy annotation of module hierarchy
+  // and callstack.
+  // This is specifically useful for the mobile runtime, where generated
+  // debug handles can be lazily symbolicated using debug information
+  int64_t debug_handle_{-1};
+
+  // Whether this RecordFunction is used for an out variant run with
+  // Static Runtime
+  bool is_static_runtime_out_variant_{false};
+
+  // Whether this RecordFunction is used for NCCL metadata collection
+  bool is_nccl_meta_{false};
+};
+
+TORCH_API StepCallbacks getStepCallbacks(RecordScope scope);
+
+TORCH_API c10::optional<StepCallbacks> getStepCallbacksUnlessEmpty(
+    RecordScope scope);
+
+namespace detail {
+template <typename Inputs, typename F, typename... Args>
+void record_function_with_scope(
+    RecordFunction& guard,
+    F fn,
+    const Inputs& inputs,
+    Args&&... args) {
+  if (guard.needsInputs()) {
+    guard.before(
+        fn,
+        c10::ArrayRef<const c10::IValue>(inputs.data(), inputs.size()),
+        std::forward<Args>(args)...);
+  } else {
+    guard.before(fn, std::forward<Args>(args)...);
+  }
+}
+
+template <typename Inputs, typename F, typename... Args>
+void record_function_with_scope_and_debug_handle(
+    RecordFunction& guard,
+    F fn,
+    int64_t debug_handle,
+    const Inputs& inputs,
+    Args&&... args) {
+  guard.setDebugHandle(debug_handle);
+  if (guard.needsInputs()) {
+    guard.before(
+        fn,
+        c10::ArrayRef<const c10::IValue>(inputs.data(), inputs.size()),
+        std::forward<Args>(args)...);
+  } else {
+    guard.before(fn, std::forward<Args>(args)...);
+  }
+}
+
+template <typename F, typename... Args>
+void record_function_with_scope(
+    RecordFunction& guard,
+    F fn,
+    c10::ArrayRef<const c10::IValue> inputs,
+    Args&&... args) {
+  return record_function_with_scope<
+      c10::ArrayRef<const c10::IValue>,
+      F,
+      Args...>(guard, std::move(fn), inputs, std::forward<Args>(args)...);
+}
+
+template <typename F, typename... Args>
+void record_function_with_scope_and_debug_handle(
+    RecordFunction& guard,
+    F fn,
+    int64_t debug_handle,
+    c10::ArrayRef<const c10::IValue> inputs,
+    Args&&... args) {
+  return record_function_with_scope_and_debug_handle<
+      c10::ArrayRef<const c10::IValue>,
+      F,
+      Args...>(
+      guard, std::move(fn), debug_handle, inputs, std::forward<Args>(args)...);
+}
+
+} // namespace detail
+
+// optional argument - function's seq_no
+#define RECORD_FUNCTION_WITH_SCOPE(scope, fn, inputs, ...) \
+  at::RecordFunction guard(scope);                         \
+  if (guard.isActive()) {                                  \
+    ::at::detail::record_function_with_scope(              \
+        guard, fn, inputs, ##__VA_ARGS__);                 \
+  }
+
+#define RECORD_FUNCTION_WITH_SCOPE_INPUTS_OUTPUTS( \
+    scope, fn, inputs, outputs, ...)               \
+  at::RecordFunction guard(scope);                 \
+  if (guard.isActive()) {                          \
+    if (guard.needsInputs()) {                     \
+      guard.before(fn, inputs, ##__VA_ARGS__);     \
+    } else {                                       \
+      guard.before(fn, ##__VA_ARGS__);             \
+    }                                              \
+    if (guard.needsOutputs()) {                    \
+      guard.setOutputs(outputs);                   \
+    }                                              \
+  }
+
+#define RECORD_FUNCTION(fn, inputs, ...) \
+  RECORD_FUNCTION_WITH_SCOPE(            \
+      at::RecordScope::FUNCTION, fn, inputs, ##__VA_ARGS__)
+
+#define RECORD_TORCHSCRIPT_FUNCTION(mn, inputs) \
+  RECORD_FUNCTION_WITH_SCOPE(at::RecordScope::TORCHSCRIPT_FUNCTION, mn, inputs)
+
+#define RECORD_FUNCTION_WITH_INPUTS_OUTPUTS(fn, inputs, outputs, ...) \
+  RECORD_FUNCTION_WITH_SCOPE_INPUTS_OUTPUTS(                          \
+      at::RecordScope::FUNCTION, fn, inputs, outputs, ##__VA_ARGS__)
+
+// Custom user scopes in C++; similar to Python's 'with record_function("..."):'
+#define RECORD_USER_SCOPE(fn) \
+  RECORD_FUNCTION_WITH_SCOPE( \
+      at::RecordScope::USER_SCOPE, fn, c10::ArrayRef<const c10::IValue>{})
+
+// RECORD_USER_SCOPE with inputs
+#define RECORD_USER_SCOPE_WITH_INPUTS(fn, inputs) \
+  RECORD_FUNCTION_WITH_SCOPE(at::RecordScope::USER_SCOPE, fn, inputs)
+
+// Helper macro to pass in debug handle that is used to
+// post process events
+#define RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS(             \
+    scope, fn, debug_handle, inputs, ...)                      \
+  at::RecordFunction guard(scope);                             \
+  if (guard.isActive()) {                                      \
+    ::at::detail::record_function_with_scope_and_debug_handle( \
+        guard, fn, debug_handle, inputs, ##__VA_ARGS__);       \
+  }
+
+// Helper macros to record LITE INTERPRETER scope events with debug handles
+#define RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS( \
+    fn, debug_handle, inputs)                           \
+  RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS(            \
+      at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs)
+
+// Bookend to the RECORD_FUNCTION macros.  Use this after the kernel
+// launch to let the profiler bind the outputs to the op that produced
+// them.  Note that guard is declared by RECORD_FUNCTION so this macro
+// needs to be called from the same scope as RECORD_FUNCTION
+#define RECORD_OUTPUTS(outputs)                                    \
+  if (guard.needsOutputs()) {                                      \
+    guard.setOutputs(                                              \
+        std::vector<c10::IValue>(outputs.begin(), outputs.end())); \
+  }
+
+/**
+ * addThreadLocalCallback adds a thread local callback to run with
+ * RecordFunction, returns handle to use with removeCallback
+ */
+TORCH_API CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb);
+
+/**
+ * hasThreadLocalCallbacks returns whether there're callbacks registered
+ * with addThreadLocalCallback
+ */
+TORCH_API bool hasThreadLocalCallbacks();
+
+/**
+ * clearThreadLocalCallbacks removes all thread local callbacks
+ */
+TORCH_API void clearThreadLocalCallbacks();
+
+/**
+ * addGlobalCallback adds a global callback to run with RecordFunction,
+ * returns handle to use with removeCallback;
+ * intended to be called only during the program initialization
+ */
+TORCH_API CallbackHandle addGlobalCallback(RecordFunctionCallback cb);
+
+/**
+ * removeCallback removes a callback given the handle returned by
+ * addThreadLocalCallback or addGlobalCallback;
+ *
+ * NOTE: no other code may be running simultaneously when this is called
+ */
+TORCH_API void removeCallback(CallbackHandle handle);
+
+/**
+ * Prevent the given callback from executing. If handle is invalid,
+ * does nothing.
+ */
+TORCH_API void disableCallback(CallbackHandle handle);
+
+/**
+ * Allow the given callback, previously disabled with disableCallback, to
+ * execute again. If handle is invalid, does nothing.
+ */
+TORCH_API void reenableCallback(CallbackHandle handle);
+
+/**
+ * hasGlobalCallbacks returns whether there're global callbacks
+ * registered with addGlobalCallback
+ */
+TORCH_API bool hasGlobalCallbacks();
+
+/**
+ * clearGlobalCallbacks removes all global callbacks
+ */
+TORCH_API void clearGlobalCallbacks();
+
+// for both thread local and global callbacks
+TORCH_API bool hasCallbacks();
+TORCH_API void clearCallbacks();
+
+/**
+ * enableRecordFunction enables RecordFunction thread locally
+ */
+TORCH_API void enableRecordFunction(bool enable = true);
+
+/**
+ * isRecordFunctionEnabled returns whether RecordFunction
+ * is enabled thread locally
+ */
+TORCH_API bool isRecordFunctionEnabled();
+
+class TORCH_API RecordFunctionGuard {
+ public:
+  explicit RecordFunctionGuard(bool is_enabled = true)
+      : prev_value_(isRecordFunctionEnabled()) {
+    enableRecordFunction(is_enabled);
+  }
+
+  virtual ~RecordFunctionGuard() {
+    enableRecordFunction(prev_value_);
+  }
+
+ private:
+  bool prev_value_ = false;
+};
+
+class TORCH_API DisableRecordFunctionGuard : public RecordFunctionGuard {
+ public:
+  DisableRecordFunctionGuard() : RecordFunctionGuard(false) {}
+  ~DisableRecordFunctionGuard() override = default;
+};
+
+struct TORCH_API RecordFunctionTLS {
+  // Thread local vector of callbacks, holds pairs (callbacks, unique_id);
+  // must be sorted in increasing handles order
+  RecordFunctionCallbacks sorted_tls_callbacks_;
+
+  bool tls_record_function_enabled_ = true;
+};
+
+TORCH_API const RecordFunctionTLS& get_record_function_tls_();
+
+TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls);
+
+TORCH_API void set_record_function_seed_for_testing(uint32_t seed);
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/_numeric_suite.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/_numeric_suite.py
new file mode 100644
index 0000000000000000000000000000000000000000..49ccc8e69523f7dbee2335b788a2cb3a7db618a2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/_numeric_suite.py
@@ -0,0 +1,28 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry/functionality, please, add it to the
+`torch/ao/ns/_numeric_suite.py`, while adding an import statement
+here.
+"""
+
+from torch.ao.ns._numeric_suite import (
+    _convert_tuple_to_list,
+    _dequantize_tensor_list,
+    _find_match,
+    _get_logger_dict_helper,
+    _is_identical_module_type,
+    compare_model_outputs,
+    compare_model_stub,
+    compare_weights,
+    get_logger_dict,
+    get_matching_activations,
+    Logger,
+    NON_LEAF_MODULE_TO_ADD_OBSERVER_ALLOW_LIST,
+    OutputLogger,
+    prepare_model_outputs,
+    prepare_model_with_stubs,
+    Shadow,
+    ShadowLogger,
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fake_quantize.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fake_quantize.py
new file mode 100644
index 0000000000000000000000000000000000000000..69a5d730bfb68e89e24beb04ad13fd3fa5881ae9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fake_quantize.py
@@ -0,0 +1,32 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry/functionality, please, add it to the
+`torch/ao/quantization/fake_quantize.py`, while adding an import statement
+here.
+"""
+
+from torch.ao.quantization.fake_quantize import (
+    _is_fake_quant_script_module,
+    _is_per_channel,
+    _is_per_tensor,
+    _is_symmetric_quant,
+    default_fake_quant,
+    default_fixed_qparams_range_0to1_fake_quant,
+    default_fixed_qparams_range_neg1to1_fake_quant,
+    default_fused_act_fake_quant,
+    default_fused_per_channel_wt_fake_quant,
+    default_fused_wt_fake_quant,
+    default_histogram_fake_quant,
+    default_per_channel_weight_fake_quant,
+    default_weight_fake_quant,
+    disable_fake_quant,
+    disable_observer,
+    enable_fake_quant,
+    enable_observer,
+    FakeQuantize,
+    FakeQuantizeBase,
+    FixedQParamsFakeQuantize,
+    FusedMovingAvgObsFakeQuantize,
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fuse_modules.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fuse_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b704fa8094e8b367e9eba47102863ba845415b9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fuse_modules.py
@@ -0,0 +1,22 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry/functionality, please, add it to the
+`torch/ao/quantization/fuse_modules.py`, while adding an import statement
+here.
+"""
+
+# TODO: These functions are not used outside the `fuse_modules.py`
+#       Keeping here for now, need to remove them later.
+from torch.ao.quantization.fuse_modules import (
+    _fuse_modules,
+    _get_module,
+    _set_module,
+    fuse_known_modules,
+    fuse_modules,
+    get_fuser_method,
+)
+
+# for backward compatibility
+from torch.ao.quantization.fuser_method_mappings import fuse_conv_bn, fuse_conv_bn_relu
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/_equalize.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/_equalize.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..281b4c8770a119f4c33bb9130f092d92116905af
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/_equalize.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/fusion_patterns.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/fusion_patterns.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25a1dbd691b06742ef92505fb9f06f787d0c6172
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/fusion_patterns.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/pattern_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/pattern_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e152dd07205d0a20400f67caabba463714acea1d
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/__pycache__/pattern_utils.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/_equalize.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/_equalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..7acea4f84a2a0a82f134b6790e573f8f1cb677f2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/_equalize.py
@@ -0,0 +1,38 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry/functionality, please, add it to the
+appropriate files under `torch/ao/quantization/fx/`, while adding an import statement
+here.
+"""
+from torch.ao.quantization.fx._equalize import (
+    _convert_equalization_ref,
+    _InputEqualizationObserver,
+    _WeightEqualizationObserver,
+    calculate_equalization_scale,
+    clear_weight_quant_obs_node,
+    convert_eq_obs,
+    CUSTOM_MODULE_SUPP_LIST,
+    custom_module_supports_equalization,
+    default_equalization_qconfig,
+    EqualizationQConfig,
+    fused_module_supports_equalization,
+    get_equalization_qconfig_dict,
+    get_layer_sqnr_dict,
+    get_op_node_and_weight_eq_obs,
+    input_equalization_observer,
+    is_equalization_observer,
+    maybe_get_next_equalization_scale,
+    maybe_get_next_input_eq_obs,
+    maybe_get_weight_eq_obs_node,
+    nn_module_supports_equalization,
+    node_supports_equalization,
+    remove_node,
+    reshape_scale,
+    scale_input_observer,
+    scale_weight_functional,
+    scale_weight_node,
+    update_obs_for_equalization,
+    weight_equalization_observer,
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fuse.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fuse.py
new file mode 100644
index 0000000000000000000000000000000000000000..67527080304fb31ddc54fe254533e2196f77a616
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fuse.py
@@ -0,0 +1,9 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry/functionality, please, add it to the
+appropriate files under `torch/ao/quantization/fx/`, while adding an import statement
+here.
+"""
+from torch.ao.quantization.fx.fuse import fuse
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fusion_patterns.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fusion_patterns.py
new file mode 100644
index 0000000000000000000000000000000000000000..e29337b3f861e5b54dc9f37d39d12ad975ad1315
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/fusion_patterns.py
@@ -0,0 +1,9 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry or functionality, please add it to the
+appropriate file under `torch/ao/quantization/fx/` and add a matching import
+statement here.
+"""
+from torch.ao.quantization.fx.fuse_handler import DefaultFuseHandler, FuseHandler
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/quantization_types.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/quantization_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..a422cdd3142e04c8d16f495cc6cd65823451810b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/quantization_types.py
@@ -0,0 +1,9 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry or functionality, please add it to the
+appropriate file under `torch/ao/quantization/fx/` and add a matching import
+statement here.
+"""
+from torch.ao.quantization.utils import Pattern, QuantizerCls
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef35559884b7c430f1d5c72b21f72979108469a5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/fx/utils.py
@@ -0,0 +1,20 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry or functionality, please add it to the
+appropriate file under `torch/ao/quantization/fx/` and add a matching import
+statement here.
+"""
+from torch.ao.quantization.fx.utils import (
+    all_node_args_have_no_tensors,
+    assert_and_get_unique_device,
+    create_getattr_from_value,
+    get_custom_module_class_keys,
+    get_linear_prepack_op_for_dtype,
+    get_new_attr_name_with_prefix,
+    get_non_observable_arg_indexes_and_types,
+    get_qconv_prepack_op,
+    graph_module_from_producer_nodes,
+    maybe_get_next_module,
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/qconfig.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/qconfig.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bb7e14110cb9cdc4e9c2c418c6776ea6445f0d3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/qconfig.py
@@ -0,0 +1,30 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry or functionality, please add it to
+`torch/ao/quantization/qconfig.py` and add a matching import statement
+here.
+"""
+from torch.ao.quantization.qconfig import (
+    _add_module_to_qconfig_obs_ctr,
+    _assert_valid_qconfig,
+    default_activation_only_qconfig,
+    default_debug_qconfig,
+    default_dynamic_qconfig,
+    default_per_channel_qconfig,
+    default_qat_qconfig,
+    default_qat_qconfig_v2,
+    default_qconfig,
+    default_weight_only_qconfig,
+    float16_dynamic_qconfig,
+    float16_static_qconfig,
+    float_qparams_weight_only_qconfig,
+    get_default_qat_qconfig,
+    get_default_qconfig,
+    per_channel_dynamic_qconfig,
+    QConfig,
+    qconfig_equals,
+    QConfigAny,
+    QConfigDynamic,
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quant_type.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quant_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..8555f03792661f39c85c8facf3f911786cc25d0f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quant_type.py
@@ -0,0 +1,10 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry or functionality, please add it to
+`torch/ao/quantization/quant_type.py` and add a matching import statement
+here.
+"""
+
+from torch.ao.quantization.quant_type import _get_quant_type_to_str, QuantType
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantization_mappings.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantization_mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b44a980ce82fbfa5a81ad906499806cf99b876f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantization_mappings.py
@@ -0,0 +1,29 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry or functionality, please add it to
+`torch/ao/quantization/quantization_mappings.py` and add a matching import statement
+here.
+"""
+from torch.ao.quantization.quantization_mappings import (
+    _get_special_act_post_process,
+    _has_special_act_post_process,
+    _INCLUDE_QCONFIG_PROPAGATE_LIST,
+    DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS,
+    DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS,
+    DEFAULT_MODULE_TO_ACT_POST_PROCESS,
+    DEFAULT_QAT_MODULE_MAPPINGS,
+    DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS,
+    DEFAULT_STATIC_QUANT_MODULE_MAPPINGS,
+    get_default_compare_output_module_list,
+    get_default_dynamic_quant_module_mappings,
+    get_default_float_to_quantized_operator_mappings,
+    get_default_qat_module_mappings,
+    get_default_qconfig_propagation_list,
+    get_default_static_quant_module_mappings,
+    get_dynamic_quant_module_class,
+    get_quantized_operator,
+    get_static_quant_module_class,
+    no_observer_set,
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantize.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantize.py
new file mode 100644
index 0000000000000000000000000000000000000000..600d3a46fed0346e3ae8909872cd5bf3c733860c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/quantize.py
@@ -0,0 +1,30 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry or functionality, please add it to
+`torch/ao/quantization/quantize.py` and add a matching import statement
+here.
+"""
+
+from torch.ao.quantization.quantize import (
+    _add_observer_,
+    _convert,
+    _get_observer_dict,
+    _get_unique_devices_,
+    _is_activation_post_process,
+    _observer_forward_hook,
+    _propagate_qconfig_helper,
+    _register_activation_post_process_hook,
+    _remove_activation_post_process,
+    _remove_qconfig,
+    add_quant_dequant,
+    convert,
+    prepare,
+    prepare_qat,
+    propagate_qconfig_,
+    quantize,
+    quantize_dynamic,
+    quantize_qat,
+    swap_module,
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/stubs.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/stubs.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3fd5c63683dc572c35cabc202ee4ddb2b0053c6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/stubs.py
@@ -0,0 +1,10 @@
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry or functionality, please add it to
+`torch/ao/quantization/stubs.py` and add a matching import statement
+here.
+"""
+
+from torch.ao.quantization.stubs import DeQuantStub, QuantStub, QuantWrapper
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d51d58f38d7462713f84ab62427852c1dd8e52c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/utils.py
@@ -0,0 +1,29 @@
+# flake8: noqa: F401
+r"""
+Utils shared by different modes of quantization (eager/graph)
+
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry or functionality, please add it to
+`torch/ao/quantization/utils.py` and add a matching import statement
+here.
+"""
+
+from torch.ao.quantization.utils import (
+    activation_dtype,
+    activation_is_int8_quantized,
+    activation_is_statically_quantized,
+    calculate_qmin_qmax,
+    check_min_max_valid,
+    get_combined_dict,
+    get_qconfig_dtypes,
+    get_qparam_dict,
+    get_quant_type,
+    get_swapped_custom_module_class,
+    getattr_from_fqn,
+    is_per_channel,
+    is_per_tensor,
+    weight_dtype,
+    weight_is_quantized,
+    weight_is_statically_quantized,
+)