diff --git a/.gitattributes b/.gitattributes
index 8c9edf9b3920f496bfd13070c9efa60592961d6d..9fd5cb3f4b31ed059a55aed8199713901de48e97 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -77,3 +77,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/_
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.11 filter=lfs diff=lfs merge=lfs -text
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn.so.8 filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe
new file mode 100644
index 0000000000000000000000000000000000000000..4c236ed0ab8253ceee9276ddc3cf5ed9a7ea6a4c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebc4c06b7d95e74e315419ee7e88e1d0f71e9e9477538c00a93a9ff8c66a6cfc
+size 182784
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e594f596067b98072f4c85548d35b11d97e050aa
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codecache.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codecache.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e5ec6ab82a30ae8b6b7bf88ff8dddc33d3e53a2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codecache.py
@@ -0,0 +1,2727 @@
+from __future__ import annotations
+
+import base64
+import copyreg
+import dataclasses
+import functools
+import hashlib
+import importlib
+import io
+import json
+import logging
+import multiprocessing
+import os
+import pathlib
+import pickle
+import pkgutil
+import platform
+import re
+import shlex
+import shutil
+import signal
+import subprocess
+import sys
+import sysconfig
+import tempfile
+import textwrap
+import threading
+import warnings
+import weakref
+from bisect import bisect_right
+from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor
+from copy import copy
+from ctypes import c_void_p, cdll, CDLL
+from functools import partial
+from pathlib import Path
+from threading import Thread
+from time import sleep, time
+from types import ModuleType
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union
+
+import torch
+
+from torch._dynamo.device_interface import (
+    get_interface_for_device,
+    get_registered_device_interfaces,
+)
+from torch._dynamo.utils import counters, dynamo_timed
+from torch._inductor import config, exc, metrics
+from torch._inductor.codegen.cuda import cuda_env
+from torch._inductor.utils import cache_dir, developer_warning, is_linux
+from torch._subclasses.fake_tensor import (
+    extract_tensor_metadata,
+    FakeTensor,
+    TensorMetadata,
+)
+from torch.fx.experimental.symbolic_shapes import has_hint, hint_int, ShapeEnv
+
+if TYPE_CHECKING:
+    from torch._inductor.graph import GraphLowering
+    from torch._inductor.select_algorithm import ChoiceCaller
+
+from torch.hub import _Faketqdm, tqdm
+
+_HERE = os.path.abspath(__file__)
+_TORCH_PATH = os.path.dirname(os.path.dirname(_HERE))
+_LINKER_SCRIPT = os.path.join(_TORCH_PATH, "_inductor/script.ld")
+
+if config.is_fbcode():
+    from triton.fb import build_paths
+    from triton.fb.build import _run_build_command
+
+    from torch._inductor.fb.utils import (
+        log_global_cache_errors,
+        log_global_cache_stats,
+        log_global_cache_vals,
+        use_global_cache,
+    )
+else:
+
+    def log_global_cache_errors(*args, **kwargs):
+        pass
+
+    def log_global_cache_stats(*args, **kwargs):
+        pass
+
+    def log_global_cache_vals(*args, **kwargs):
+        pass
+
+    def use_global_cache() -> bool:
+        return False
+
+
+LOCK_TIMEOUT = 600
+
+# timing metrics for time spent in the compilation
+_cumulative_compile_time = 0.0
+_t0: Optional[float] = None
+
+
+def _compile_start() -> None:
+    global _t0
+    if _t0 is None:
+        _t0 = time()
+
+
+def _compile_end() -> None:
+    global _cumulative_compile_time, _t0
+    if _t0 is not None:  # no-op unless a start time is pending in _t0
+        t1 = time()
+        _cumulative_compile_time += t1 - _t0
+        _t0 = None  # clear so the next _compile_start() records a fresh start time
+        # print("CUMULATIVE COMPILE TIME", _cumulative_compile_time)
+
+
+log = logging.getLogger(__name__)
+
+
+def cpp_wrapper_cache_dir(name: str) -> str:
+    cu_str = (
+        "cpu"
+        if torch.version.cuda is None
+        else f'cu{torch.version.cuda.replace(".", "")}'
+    )
+    python_version = f"py{sys.version_info.major}{sys.version_info.minor}"
+    build_folder = f"{python_version}_{cu_str}"
+
+    cpp_wrapper_dir = os.path.join(cache_dir(), build_folder)
+    cpp_wrapper_build_directory = os.path.join(cpp_wrapper_dir, name)
+    os.makedirs(cpp_wrapper_build_directory, exist_ok=True)
+    return cpp_wrapper_build_directory
+
+
+def get_cpp_wrapper_cubin_path_name():
+    return "cubin_path" if torch.version.hip is None else "hsaco_path"
+
+
+class CacheBase:
+    @staticmethod
+    @functools.lru_cache(None)
+    def get_system() -> Dict[str, Any]:
+        try:
+            import triton
+
+            triton_version = triton.__version__
+        except ModuleNotFoundError:
+            triton_version = None
+
+        try:
+            system: Dict[str, Any] = {
+                "device": {
+                    "name": torch.cuda.get_device_properties(
+                        torch.cuda.current_device()
+                    ).name,
+                },
+                "version": {
+                    "cuda": torch.version.cuda,
+                    "triton": triton_version,
+                },
+            }
+        except (AssertionError, RuntimeError):
+            # If cuda is not installed, none of the above config is relevant.
+            system = {}
+
+        system["hash"] = hashlib.sha256(
+            json.dumps(system, sort_keys=True).encode("utf-8")
+        ).hexdigest()
+
+        return system
+
+    @staticmethod
+    @functools.lru_cache(None)
+    def get_local_cache_path() -> Path:
+        return Path(os.path.join(cache_dir(), "cache", CacheBase.get_system()["hash"]))
+
+    @staticmethod
+    @functools.lru_cache(None)
+    def get_global_cache_path() -> Optional[Path]:
+        return (
+            Path(os.path.join(config.global_cache_dir, CacheBase.get_system()["hash"]))
+            if config.global_cache_dir is not None
+            else None
+        )
+
+    def __init__(self) -> None:
+        if not torch.cuda.is_available():
+            return
+
+        self.system = CacheBase.get_system()
+
+        self.local_cache_path = CacheBase.get_local_cache_path()
+        self.global_cache_path = CacheBase.get_global_cache_path()
+
+    def get_local_cache(self) -> Dict[str, Any]:
+        if not self.local_cache_path.is_file():
+            return {}
+        with open(self.local_cache_path) as local_cache_fp:
+            local_cache = json.load(local_cache_fp)
+        return local_cache["cache"]
+
+    def update_local_cache(self, local_cache: Dict[str, Any]) -> None:
+        if not os.path.exists(self.local_cache_path.parent):
+            os.makedirs(self.local_cache_path.parent, exist_ok=True)
+
+        write_atomic(
+            str(self.local_cache_path),
+            json.dumps({"system": self.system, "cache": local_cache}, indent=4),
+        )
+
+
+class LocalCache(CacheBase):
+    def lookup(self, *keys: str) -> Optional[Dict[str, Any]]:
+        cache = self.get_local_cache()
+
+        sub_cache = cache
+        for key in keys:
+            if key in cache:
+                sub_cache = cache[key]
+            else:
+                return None
+
+        return sub_cache
+
+    def set_value(self, *keys: str, value: Any) -> None:
+        cache = self.get_local_cache()
+
+        sub_cache = cache
+        for key in keys[0:-1]:
+            sub_cache.setdefault(key, {})
+            sub_cache = sub_cache[key]
+        sub_cache[keys[-1]] = value
+
+        self.update_local_cache(cache)
+
+
+class PersistentCache(CacheBase):
+    @functools.lru_cache(None)
+    def get_global_cache(self):
+        if self.global_cache_path is None or not self.global_cache_path.is_file():
+            return {}
+        with open(self.global_cache_path) as global_cache_fp:
+            global_cache = json.load(global_cache_fp)
+        return global_cache["cache"]
+
+    def lookup(
+        self,
+        choices: List[ChoiceCaller],
+        op: str,
+        inputs: str,
+        benchmark: Callable[[Any], Dict[ChoiceCaller, float]],
+    ) -> Dict[ChoiceCaller, float]:
+        """
+        Check to see if we have benchmarked the given choice callers. For each
+        choice caller:
+
+            1. Check global_cache[op][inputs][choice][precision], return benchmark if cached.
+            2. Check local_cache[op][inputs][choice][precision], return benchmark if cached.
+            3.
+                a. `max_autotune_gemm=True`: benchmark the choice, update
+                    local_cache[op][inputs][choice], and return the benchmark.
+                b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing.
+        """
+        precision = torch.get_float32_matmul_precision()
+
+        log_stats = partial(log_global_cache_stats, self.system, op, inputs, precision)
+        log_vals = partial(log_global_cache_vals, self.system, op, inputs, precision)
+        log_errors = partial(
+            log_global_cache_errors, self.system, op, inputs, precision
+        )
+        timings = {}
+
+        def check_cache(cache, callback=None) -> bool:
+            """Check if `cache` contains data for all the choices"""
+            hit = True
+            for choice in choices:
+                choice_hash = choice.hash_key()
+                if choice_hash in cache.get(op, {}).get(inputs, {}).get(precision, {}):
+                    # cache hit
+                    timings[choice] = cache[op][inputs][precision][choice_hash]
+                else:
+                    # cache miss
+                    hit = False
+                    break
+            if callback:
+                callback(cached=hit)
+            return hit
+
+        if config.max_autotune or config.max_autotune_gemm:
+            local_cache = self.get_local_cache()
+            # check local cache first since it is data specific to the current machine
+            if not check_cache(local_cache) and not (
+                use_global_cache()
+                and check_cache(self.get_global_cache(), callback=log_stats)
+            ):
+                try:
+                    # re-benchmark everything to try to get consistent numbers from the same machine
+                    timings = benchmark(choices)
+                    assert all(choice in timings for choice in choices)
+                    local_cache.setdefault(op, {})
+                    local_cache[op].setdefault(inputs, {}).setdefault(precision, {})
+                    for choice, timing in timings.items():
+                        local_cache[op][inputs][precision][choice.hash_key()] = timing
+                except RuntimeError as e:
+                    # catch and log autotuning failures
+                    log_errors(e)
+                    raise e
+
+                self.update_local_cache(local_cache)
+
+                timings_to_log = {
+                    choice.hash_key(): timings[choice] for choice in choices
+                }
+                log_vals(timings_to_log)
+        elif use_global_cache():
+            # only check global cache, not local one
+            check_cache(self.get_global_cache(), callback=log_stats)
+            # may have a partial cache hit, where not everything is benchmarked
+
+        return timings
+
+
+def get_lock_dir() -> str:
+    lock_dir = os.path.join(cache_dir(), "locks")  # <cache_dir()>/locks
+    if not os.path.exists(lock_dir):
+        os.makedirs(lock_dir, exist_ok=True)  # exist_ok: no error if it appeared meanwhile
+    return lock_dir
+
+
+def sha256_hash(data: bytes) -> str:
+    # [:51] drops the base32 "====" padding and the 52nd symbol (only 1 hash bit): 255 bits kept.
+    return base64.b32encode(hashlib.sha256(data).digest())[:51].decode("utf-8").lower()
+
+
+def code_hash(code: Union[str, bytes], extra: str = "") -> str:
+    hashing_str = code if isinstance(code, bytes) else code.encode("utf-8")  # hash over bytes
+    if extra != "":
+        hashing_str = hashing_str + b"||" + extra.encode("utf-8")  # "||" separates code from extra
+    return "c" + sha256_hash(hashing_str)  # "c" prefix + the sha256_hash() string
+
+
+def get_path(
+    basename: str, extension: str, specified_dir: str = ""
+) -> Tuple[str, str, str]:
+    if specified_dir:
+        if os.path.isabs(specified_dir):
+            subdir = specified_dir
+        else:
+            subdir = os.path.join(cache_dir(), specified_dir)  # relative dirs go under cache_dir()
+    else:
+        subdir = os.path.join(cache_dir(), basename[1:3])  # default: shard by basename[1:3]
+    path = os.path.join(subdir, f"{basename}.{extension}")
+    return basename, subdir, path  # (basename, containing directory, full file path)
+
+
+def get_hash(content: Union[str, bytes], extra: str = "", hash_type: str = "code") -> str:
+    if hash_type == "code":
+        return code_hash(content, extra)
+    if hash_type in ["cubin", "hsaco"]:
+        return code_hash(repr(content))  # NOTE: `extra` is not mixed in for these hash types
+    raise AssertionError(f"Unknown hash type {hash_type}")
+
+
+def write(
+    content: Union[str, bytes],
+    extension: str,
+    extra: str = "",
+    hash_type: str = "code",
+    specified_dir: str = "",
+) -> Tuple[str, str]:
+    # use stripped content to compute hash so we don't end up with different
+    # hashes just because the content begins/ends with a different number of
+    # spaces.
+    key: str = get_hash(content.strip(), extra, hash_type)
+    basename, subdir, path = get_path(key, extension, specified_dir)
+    if not os.path.exists(subdir):
+        os.makedirs(subdir, exist_ok=True)
+    if not os.path.exists(path):  # an existing file at this path is left untouched
+        write_atomic(path, content)
+    return basename, path
+
+
+def write_atomic(path: str, content: Union[str, bytes]) -> None:
+    # Write into temporary file first to avoid conflicts between threads
+    # Avoid using a named temporary file, as those have restricted permissions
+    assert isinstance(
+        content, (str, bytes)
+    ), "Only strings and byte arrays can be saved in the cache"
+    path = pathlib.Path(path)
+    tmp_path = path.parent / f".{os.getpid()}.{threading.get_ident()}.tmp"
+    write_mode = "w" if isinstance(content, str) else "wb"
+    with tmp_path.open(write_mode) as f:
+        f.write(content)
+    tmp_path.rename(path)
+
+
+@dataclasses.dataclass
+class TensorMetadataAndValues:
+    """
+    TensorMetadata plus the elements as a list of raw values.
+    Used for hashing inlined constants.
+    """
+
+    tensor_metadata: TensorMetadata
+    values: List[Any]
+
+
+def _ident(x: Any) -> Any:
+    return x
+
+
+def _reduce_fake_tensor(t):
+    """
+    See FxGraphCachePickler. Custom reducer to pickle FakeTensors.
+    """
+    metadata = extract_tensor_metadata(t)
+    return (_ident, (metadata,))
+
+
+def _reduce_tensor(t):
+    """
+    See FxGraphCachePickler. Custom reducer to pickle Tensors.
+    """
+    if t.is_mkldnn:
+        # TODO: These tensors don't currently pickle, so we can't cache a
+        # compiled graph containing them. Just fail now. If mkldnn tensors
+        # get pickling support, we can remove this.
+        raise BypassFxGraphCache()
+
+    # If we see tensors, we know they're constants stored as attributes on
+    # the GraphModule. See tensor lowering; small constants are inlined. If
+    # we see a small tensor, therefore, no reference will ultimately remain
+    # in the generated code. So we need to include its value in the cache key.
+    # Large constants are effectively treated as inputs and we consider only
+    # their metadata.
+    metadata = extract_tensor_metadata(t)
+    if len(t.shape) == 0 or torch._inductor.graph.GraphLowering.can_inline_constant(t):
+        return (_ident, (TensorMetadataAndValues(metadata, t.tolist()),))
+    else:
+        return (_ident, (metadata,))
+
+
+def _reduce_symint(s):
+    """
+    See FxGraphCachePickler. Custom reducer to pickle SymInts.
+    """
+    # For hashing purposes, we only care about the name of the symbol and
+    # not the backed value. We evaluate guards stored with a cached graph
+    # to ensure a cached entity with SymInt args is safe to reuse.
+    return (_ident, (str(s),))
+
+
+class FxGraphCachePickler(pickle.Pickler):
+    """
+    Custom pickler to customize the pickling of some objects (Tensors), only for the
+    purpose of computing a hash for keying into the FxGraphCache. Tensors contain
+    objects that don't pickle and/or vary between runs, and we want to capture the
+    data that allow us to compute a stable, but safe hash.
+    """
+
+    dispatch_table = copyreg.dispatch_table.copy()
+    dispatch_table[FakeTensor] = _reduce_fake_tensor
+    dispatch_table[torch.Tensor] = _reduce_tensor
+    dispatch_table[torch.SymInt] = _reduce_symint
+
+    @staticmethod
+    def dumps(obj) -> bytes:
+        """
+        Pickle an object using the FxGraphCachePickler.
+        """
+        with io.BytesIO() as stream:
+            pickler = FxGraphCachePickler(stream)
+            pickler.dump(obj)
+            return stream.getvalue()
+
+    @staticmethod
+    def get_hash(obj: Any) -> str:
+        """
+        Serialize an object using the FxGraphCachePickler and return a hash
+        of the pickled object.
+        """
+        serialized_data = FxGraphCachePickler.dumps(obj)
+        return sha256_hash(serialized_data)
+
+
+@functools.lru_cache(None)
+def get_inductor_code_hash() -> bytes:
+    """
+    Compute a hash of all inductor code modules. Used by the FxGraph cache
+    so any inductor code changes would result in new cache keys.
+    """
+    inductor_root = os.path.dirname(__file__)
+
+    contents: Dict[str, bytes] = {}
+    for lib in pkgutil.iter_modules([inductor_root]):
+        spec = lib.module_finder.find_spec(lib.name, None)
+        assert spec is not None
+        module = spec.origin
+        assert module is not None
+        with open(module, "rb") as f:
+            contents[module] = f.read()
+
+    return hashlib.sha256(pickle.dumps(contents)).digest()
+
+
+@dataclasses.dataclass
+class OrderedSetHolder:
+    """
+    See FxGraphHashDetails. Holds a sorted list to support stable hashing
+    of set kwargs.
+    """
+
+    items: List[Any]
+
+
+class BypassFxGraphCache(Exception):
+    """
+    Exception to indicate that the FxGraphCache should be bypassed.
+    """
+
+    pass
+
+
+class FxGraphHashDetails:
+    """
+    Object to capture all the details for a compiled FX graph relevant to computing
+    a safe and stable cache key.
+    """
+
+    # Excluded kwargs param that are not stable between runs
+    EXCLUDED_KWARGS = ["graph_id"]
+
+    def __init__(
+        self,
+        gm: torch.fx.GraphModule,
+        example_inputs: List[torch.Tensor],
+        fx_kwargs: Dict[str, Any],
+    ):
+        self.gm = gm
+        self.example_inputs = example_inputs
+
+        # Order kwargs so hashing is stable to changes in kwarg order.
+        self.fx_kwargs = {}
+        for k in sorted(fx_kwargs):
+            if k not in self.EXCLUDED_KWARGS:
+                if type(fx_kwargs[k]) is set:
+                    # Special case to handle set params. Python sets can't be
+                    # ordered, so sort the elements and store them in a proxy.
+                    self.fx_kwargs[k] = OrderedSetHolder(sorted(fx_kwargs[k]))
+                else:
+                    self.fx_kwargs[k] = fx_kwargs[k]
+
+        # 'Deterministic algorithms' can affect codegen via lowering to cuda kernels.
+        self.deterministic_algorithms_settings = (
+            torch.are_deterministic_algorithms_enabled(),
+            torch.is_deterministic_algorithms_warn_only_enabled(),
+            torch.utils.deterministic.fill_uninitialized_memory,  # type: ignore[attr-defined]
+        )
+
+        # Global settings affecting matmul codegen.
+        self.cuda_matmul_settings = (
+            torch.backends.cuda.matmul.allow_tf32,
+            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction,
+            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction,
+        )
+
+        # Also hash on various system info (including the triton compiler version).
+        self.torch_version = torch.__version__
+        self.system_info = CacheBase.get_system()
+
+        # And the inductor configuration and code.
+        self.inductor_code_hash = get_inductor_code_hash()
+        try:
+            self.inductor_config = config.save_config()
+        except TypeError as e:
+            # Some configs options are callables, e.g., post_grad_custom_pre_pass,
+            # and may not pickle.
+            log.debug("Can't pickle inductor config: %s", e)
+            raise BypassFxGraphCache() from e
+
+    def debug_str(self) -> str:
+        """
+        Get a printable string describing in more detail all the attributes
+        comprising this object. Useful for debugging when one graph hashes
+        to a different value than another.
+        """
+
+        def get_str(obj) -> str:
+            if isinstance(obj, torch.Tensor):
+                return str(extract_tensor_metadata(obj))
+            elif isinstance(obj, bytes):
+                return "<bytes>"
+            else:
+                return str(obj)
+
+        lines = []
+        for attr, obj in vars(self).items():
+            if isinstance(obj, list):
+                for ii in range(len(obj)):
+                    h = FxGraphCachePickler.get_hash(obj[ii])
+                    lines.append(f"[{h}] {attr}[{ii}]: {get_str(obj[ii])}")
+            elif isinstance(obj, dict):
+                for k, v in obj.items():
+                    h = FxGraphCachePickler.get_hash(v)
+                    lines.append(f"[{h}] {attr}[{k}]: {get_str(v)}")
+            else:
+                h = FxGraphCachePickler.get_hash(obj)
+                lines.append(f"[{h}] {attr}: {get_str(obj)}")
+        return "\n".join(lines)
+
+
+def compiled_fx_graph_hash(
+    gm: torch.fx.GraphModule,
+    example_inputs: List[torch.Tensor],
+    fx_kwargs: Dict[str, Any],
+) -> str:
+    """
+    Generate a unique hash of the FX graph for caching.
+    """
+    details = FxGraphHashDetails(gm, example_inputs, fx_kwargs)
+    # The prefix distinguishes among the other kinds of objects we
+    # cache in this module.
+    key = "f" + FxGraphCachePickler.get_hash(details)
+    log.debug("FX graph cache hash details for key %s:\n%s", key, details.debug_str())
+    return key
+
+
+class FxGraphCache:
+    """
+    Supports caching and reusing compiled Fx graphs.
+
+    The overall strategy is as follows:
+    - This cache stores entries on disk. When saving an entry, we can't
+      serialize callables (that could be C++, Triton, etc.), so we serialize
+      their own disk cache location. We then recreate the compiled artifact
+      after fetching from disk.
+    - For indexing the cache, we gather the fields relevant to identifying an
+      FxGraph (the graph module, graph inputs, system settings etc.) into an
+      FxGraphHashDetails object, pickle it, and compute a hash for the key.
+      See FxGraphCachePickler.
+    - Among the metadata we store, we also include a guards expression that's
+      appropriate for validating any symbols for Tensor arguments that have
+      symbolic bounds. On cache lookup then, we evaluate those guards in the
+      current context to validate that a cached entry can be served.
+    - A given graph could have multiple compiled versions, corresponding to
+      different sets of guards. Therefore, we store cache entries in the form:
+          <temp dir>/<fx graph hash>/<serialized metadata>
+    - On lookup, we compute the key from the graph details, iterate over all
+      leaf files in the corresponding subdirectory, deserialize the entry, and
+      evaluate its guards expression. If the evaluation succeeds, we have a
+      cache hit. If it fails, we compile the graph and store a new entry.
+    - Finally, on a cache hit, we need to make sure any guards that would
+      have been created during compilation are added to the current context.
+    """
+
+    # TODO(masnesral): Investigate whether it's beneficial to store compiled graphs
+    # in an in-memory cache after loading from disk.
+    @staticmethod
+    def _get_tmp_dir() -> str:
+        """
+        Get the toplevel temporary directory for storing compiled graphs.
+        """
+        return os.path.join(cache_dir(), "fxgraph")
+
+    @staticmethod
+    def _get_tmp_dir_for_key(key: str) -> str:
+        """
+        Return the disk location for a given cache key.
+        """
+        return os.path.join(FxGraphCache._get_tmp_dir(), key[1:3], key)
+
+    @staticmethod
+    def _filter_symints(inputs: List[Any]) -> List[torch.SymInt]:
+        """
+        Get the SymInt objects from the input list.
+        """
+        return [s for s in inputs if isinstance(s, torch.SymInt)]
+
+    @staticmethod
+    def _get_shape_env() -> Optional[ShapeEnv]:
+        """
+        Helper to get the shape env from the tracing context.
+        """
+        ctx = torch._guards.TracingContext.try_get()
+        if not ctx:
+            return None
+        return ctx.fake_mode.shape_env
+
+    @staticmethod
+    def _lookup_graph(
+        key: str,
+        example_inputs: List[torch.Tensor],
+    ) -> Optional[CompiledFxGraph]:
+        """
+        Lookup a compiled graph in the cache by key. On a hit, return the
+        deserialized CompiledFxGraph object. On a miss, return None.
+        """
+        subdir = FxGraphCache._get_tmp_dir_for_key(key)
+        if not os.path.exists(subdir):
+            return None
+
+        shape_env = FxGraphCache._get_shape_env()
+        assert shape_env is not None
+
+        # Iterate over any entries in the subdir for this key and evaluate
+        # their guards to determine whether there's a hit.
+        graph = None
+
+        for path in sorted(os.listdir(subdir)):
+            with open(os.path.join(subdir, path), "rb") as f:
+                candidate: CompiledFxGraph = pickle.load(f)
+
+            guards_expr = candidate.guards_expr
+            if not guards_expr:
+                # No guards to evaluate, so this is a hit.
+                graph = candidate
+                break
+
+            # Evaluate the guard expression in the current context.
+            symints = FxGraphCache._filter_symints(example_inputs)
+
+            # If there's not a cache hit, we don't want the evaluation to
+            # affect the current env, e.g., cause the creation of new guards,
+            # so we evaluate with the hints instead of the symbols.
+            assert all(has_hint(s) for s in symints)
+            hints = [hint_int(s) for s in symints]
+            hit = bool(shape_env.evaluate_guards_expression(guards_expr, hints))
+            log.debug(
+                "fx graph cache key %s evaluating guards for %s with values %s => %s",
+                key,
+                guards_expr,
+                hints,
+                hit,
+            )
+            if hit:
+                # Now re-evaluate with the symints to add any guards to the current env.
+                check = bool(shape_env.evaluate_guards_expression(guards_expr, symints))
+                assert check is True
+                log.debug(
+                    "fx graph cache key %s post-load guards: %s", key, shape_env.guards
+                )
+                graph = candidate
+                break
+
+        # Increment the cached metrics by the amounts recorded when the FX
+        # graph was compiled for this cache entry. Pretending these counters
+        # were incremented normally is useful for testing with the cache enabled.
+        if graph is not None:
+            metrics.CachedMetricsHelper.apply_deltas(graph.metrics_deltas)
+
+        return graph
+
+    @staticmethod
+    def _save_graph(
+        key: str, compiled_graph: CompiledFxGraph, example_inputs: List[torch.Tensor]
+    ):
+        """
+        Store a serialized CompiledFxGraph on disk.
+        """
+        disk_compiled_graph = copy(compiled_graph)
+        # Important as compiled models are not pickleable:
+        disk_compiled_graph.compiled_artifact = None
+
+        # Before serializing, compute the guard expression that will be used to
+        # ensure that a CompiledFxGraph is valid when loaded from the cache. It's
+        # sufficient to consider only the SymInt args to the fx graph since the
+        # Tensor shapes are already captured in the hash for the cache key. Any
+        # Tensor arg with a symbolic shape will have a SymInt arg for the graph.
+        shape_env = FxGraphCache._get_shape_env()
+        assert shape_env is not None
+        symints = FxGraphCache._filter_symints(example_inputs)
+        disk_compiled_graph.guards_expr = shape_env.produce_guards_expression(symints)
+
+        try:
+            content = pickle.dumps(disk_compiled_graph)
+        except Exception as e:
+            log.debug("fx graph cache unable to serialize compiled graph: %s", e)
+            counters["inductor"]["fxgraph_cache_pickle_error"] += 1
+            return
+
+        subdir = FxGraphCache._get_tmp_dir_for_key(key)
+        if not os.path.exists(subdir):
+            os.makedirs(subdir, exist_ok=True)
+
+        # Use a hash of the serialized CompiledFxGraph to get a unique file
+        # name. The specific name doesn't matter since a lookup involves
+        # iterating over all entries in the parent subdir.
+        path = os.path.join(subdir, sha256_hash(content))
+        write_atomic(path, content)
+
+    @staticmethod
+    def _check_can_cache():
+        """
+        Check some conditions that would preclude caching and raise BypassFxGraphCache
+        to bypass in case caching is not possible.
+        """
+        if config.freezing or config.aot_inductor.use_runtime_constant_folding:
+            # Freezing can embed constants that wouldn't be static across runs.
+            raise BypassFxGraphCache()
+
+        if FxGraphCache._get_shape_env() is None:
+            # The treatment of guards in the caching implementation requires that
+            # we have a shape env.
+            log.debug("fx graph cache no shape env")
+            raise BypassFxGraphCache()
+
+    @staticmethod
+    def load(
+        compile_fx_fn: Callable[..., Any],
+        gm: torch.fx.GraphModule,
+        example_inputs: List[torch.Tensor],
+        fx_kwargs: Dict[str, Any],
+    ):
+        """
+        Load a compiled graph from the cache. If a cached entry does not exist,
+        compile the graph and save it to the cache.
+
+        Args:
+            compile_fx_fn: Invoked as ``compile_fx_fn(gm, example_inputs, **fx_kwargs)``
+                whenever no cached graph can be served.
+            gm: Graph module to compile; also an input to the cache key.
+            example_inputs: Example inputs; used for the cache key and the lookup.
+            fx_kwargs: Keyword arguments forwarded to ``compile_fx_fn``; also an
+                input to the cache key.
+
+        Returns:
+            The cached graph on a hit, otherwise the result of ``compile_fx_fn``.
+        """
+        from filelock import FileLock
+
+        compiled_graph = None
+        try:
+            FxGraphCache._check_can_cache()
+            key = compiled_fx_graph_hash(gm, example_inputs, fx_kwargs)
+
+            # Lookup, compile-on-miss and save all happen under the per-key lock.
+            lock_path = os.path.join(get_lock_dir(), key + ".lock")
+            with FileLock(lock_path, timeout=LOCK_TIMEOUT):
+                compiled_graph = FxGraphCache._lookup_graph(key, example_inputs)
+                if compiled_graph is None:
+                    log.debug("fx graph cache miss for key %s", key)
+                    counters["inductor"]["fxgraph_cache_miss"] += 1
+                    compiled_graph = compile_fx_fn(gm, example_inputs, **fx_kwargs)
+                    FxGraphCache._save_graph(key, compiled_graph, example_inputs)
+                else:
+                    log.debug("fx graph cache hit for key %s", key)
+                    counters["inductor"]["fxgraph_cache_hit"] += 1
+        except BypassFxGraphCache:
+            counters["inductor"]["fxgraph_cache_bypass"] += 1
+
+        # Compare against None explicitly: a truthiness test would compile a
+        # second time if the graph object ever evaluated as falsy.
+        if compiled_graph is None:
+            compiled_graph = compile_fx_fn(gm, example_inputs, **fx_kwargs)
+
+        return compiled_graph
+
+    @staticmethod
+    def clear():
+        """
+        Delete the cache's temporary directory tree; a missing directory is
+        silently ignored.
+        """
+        try:
+            cache_root = FxGraphCache._get_tmp_dir()
+            shutil.rmtree(cache_root)
+        except FileNotFoundError:
+            pass
+
+
+@dataclasses.dataclass
+class CompiledFxGraph:
+    """
+    Container for a compiled FX graph and the metadata copied from its
+    GraphLowering. Instances of this class are what FxGraph caching
+    serializes to disk.
+    """
+
+    compiled_artifact: Optional[Callable[..., Any]]
+    current_callable: Optional[Callable[..., Any]]
+    cache_key: Optional[str]
+    artifact_path: Optional[str]
+    cache_linemap: Optional[List[Tuple[int, str]]]
+    device_types: Set[str]
+    device_idxs: Set[int]
+    mutated_inputs: Set[str]
+    mutated_input_idxs: Set[int]
+    constants: Dict[str, torch.Tensor]
+    output_strides: Optional[List[Optional[Tuple[int, ...]]]]
+    disabled_cudagraphs_reason: Optional[str]
+    metrics_deltas: metrics.CachedMetricsDeltas
+    # String form of the guards expression, stored alongside the graph so
+    # the guards can be re-evaluated in another context before a cached
+    # graph is served. It must come from:
+    # ShapeEnv.produce_guards_expression()
+    guards_expr: Optional[str]
+
+    _boxed_call: Optional[bool] = None
+
+    def __init__(
+        self,
+        compiled_artifact: Optional[Callable[..., Any]],
+        graph: GraphLowering,
+        output_strides: List[Optional[Tuple[int, ...]]],
+        disabled_cudagraphs_reason: Optional[str],
+        metrics_deltas: metrics.CachedMetricsDeltas,
+    ):
+        # Values supplied directly by the caller.
+        self.compiled_artifact = compiled_artifact
+        self.current_callable = None
+        # Values copied off the lowered graph.
+        self.cache_key = graph.cache_key
+        self.artifact_path = graph.cache_path
+        self.cache_linemap = graph.cache_linemap
+        self.device_types = graph.device_types
+        self.device_idxs = graph.device_idxs
+        self.mutated_inputs = graph.mutated_inputs
+        self.mutated_input_idxs = set(graph.mutated_input_idxs)
+        self.constants = graph.constants
+        self.output_strides = output_strides
+        self.disabled_cudagraphs_reason = disabled_cudagraphs_reason
+        self.metrics_deltas = metrics_deltas
+        # Starts empty; nothing in this class fills it in.
+        self.guards_expr = None
+
+    def __call__(self, inputs: List[Any]) -> Any:
+        runner = self.get_current_callable()
+        return runner(inputs)
+
+    def get_current_callable(self) -> Callable[..., Any]:
+        """Return current_callable if set, else a runner that goes through _run_from_cache."""
+        if self.current_callable is not None:
+            return self.current_callable
+        # Bind a weak proxy rather than self: a strong reference here would
+        # form a cycle that keeps CompiledFxGraph from being garbage collected.
+        return functools.partial(_run_from_cache, weakref.proxy(self))
+
+
+def _run_from_cache(compiled_graph: CompiledFxGraph, inputs: List[Any]) -> Any:
+    """
+    Run ``compiled_graph`` on ``inputs``, first loading its compiled artifact
+    via PyCodeCache if it has not been loaded yet.
+    """
+    if compiled_graph.compiled_artifact is not None:
+        return compiled_graph.compiled_artifact(inputs)
+
+    # Callables that may be C++/Triton/etc. can't really be serialized, so
+    # their disk cache location is serialized instead and reloaded here.
+    # TODO: When making an API that can save compiled models e2e to disk
+    # this will need to be better
+    from .codecache import PyCodeCache
+
+    assert compiled_graph.cache_key
+    assert compiled_graph.artifact_path
+    loaded_module = PyCodeCache.load_by_key_path(
+        compiled_graph.cache_key,
+        compiled_graph.artifact_path,
+        compiled_graph.cache_linemap,
+        compiled_graph.constants,
+    )
+    compiled_graph.compiled_artifact = loaded_module.call
+
+    return compiled_graph.compiled_artifact(inputs)
+
+
+def cpp_compiler() -> str:
+    """
+    Return the C++ compiler to invoke: the fbcode build path when running in
+    fbcode, otherwise the result of searching ``config.cpp.cxx``.
+    """
+    if config.is_fbcode():
+        return build_paths.cc()
+    configured = config.cpp.cxx
+    # Normalize a single entry to a one-element tuple; the search helper is
+    # memoized, so its argument must be hashable.
+    candidates = (
+        tuple(configured) if isinstance(configured, (list, tuple)) else (configured,)
+    )
+    return cpp_compiler_search(candidates)
+
+
+@functools.lru_cache(1)
+def cpp_compiler_search(search: Tuple[Optional[str], ...]) -> str:
+    """
+    Return the first usable compiler from ``search``.
+
+    Args:
+        search: Candidate compiler commands, tried in order. A ``None`` entry
+            means "install g++ through conda", which is only attempted on
+            Linux and only when TORCH_INDUCTOR_INSTALL_GXX is set.
+
+    Returns:
+        The first candidate for which ``<cxx> --version`` runs successfully.
+
+    Raises:
+        exc.InvalidCxxCompiler: If no candidate is usable.
+    """
+    for cxx in search:
+        try:
+            if cxx is None:
+                # gxx package is only available for Linux
+                # according to https://anaconda.org/conda-forge/gxx/
+                if sys.platform != "linux":
+                    continue
+                # Do not install GXX by default
+                if not os.getenv("TORCH_INDUCTOR_INSTALL_GXX"):
+                    continue
+                from filelock import FileLock
+
+                lock_dir = get_lock_dir()
+                lock = FileLock(
+                    os.path.join(lock_dir, "g++.lock"), timeout=LOCK_TIMEOUT
+                )
+                # Run the conda install under the shared g++ file lock.
+                with lock:
+                    cxx = install_gcc_via_conda()
+            # A candidate counts as usable only if it can report its version.
+            subprocess.check_output([cxx, "--version"])
+            return cxx
+        except (subprocess.SubprocessError, FileNotFoundError, ImportError):
+            continue
+    raise exc.InvalidCxxCompiler()
+
+
+def install_gcc_via_conda() -> str:
+    """
+    On older systems, this is a quick way to get a modern compiler.
+
+    Creates a conda environment containing ``gxx`` under ``<cache_dir>/gcc``
+    unless its ``bin/g++`` already exists.
+
+    Returns:
+        The expected path of the installed ``g++``. The path is returned even
+        when no conda executable could be found, so callers must still check
+        that it is runnable.
+
+    Raises:
+        subprocess.CalledProcessError: If ``conda create`` exits non-zero.
+    """
+    prefix = os.path.join(cache_dir(), "gcc")
+    cxx_path = os.path.join(prefix, "bin", "g++")
+    if not os.path.exists(cxx_path):
+        log.info("Downloading GCC via conda")
+        # Prefer CONDA_EXE; fall back to a PATH lookup when it is unset or
+        # empty. (A "conda" default here would make the fallback unreachable.)
+        conda = os.environ.get("CONDA_EXE") or shutil.which("conda")
+        if conda is not None:
+            subprocess.check_call(
+                [
+                    conda,
+                    "create",
+                    f"--prefix={prefix}",
+                    "--channel=conda-forge",
+                    "--quiet",
+                    "-y",
+                    "python=3.8",
+                    "gxx",
+                ],
+                # Discard stdout instead of piping it: check_call never reads
+                # the pipe, so a chatty child could block once it fills up.
+                stdout=subprocess.DEVNULL,
+            )
+    return cxx_path
+
+
+def is_gcc() -> bool:
+    """Return True when the selected C++ compiler command looks like gcc/g++."""
+    compiler = cpp_compiler()
+    # "clang++" ends with "g++", so the pattern below would also match it;
+    # rule clang out first.
+    if "clang" in compiler:
+        return False
+    return bool(re.search(r"(gcc|g\+\+)", compiler))
+
+
+def is_clang() -> bool:
+    """Return True when the selected C++ compiler command mentions clang."""
+    compiler = cpp_compiler()
+    return re.search(r"(clang|clang\+\+)", compiler) is not None
+
+
+@functools.lru_cache(None)
+def is_apple_clang() -> bool:
+    """Return True when the first line of ``<compiler> --version`` contains "Apple"."""
+    raw_output = subprocess.check_output([cpp_compiler(), "--version"])
+    first_line = raw_output.decode("utf8").splitlines()[0]
+    return "Apple" in first_line
+
+
+class VecISA:
+    """
+    Base description of a CPU vector instruction set for generated C++ code.
+
+    Subclasses supply the vector bit width, the preprocessor macro, the
+    compiler architecture flags and the per-dtype element counts. Truthiness
+    (``bool(isa)``) reports whether the ISA is usable, see ``__bool__``.
+    """
+
+    _bit_width: int
+    _macro: str
+    _arch_flags: str
+    _dtype_nelements: Dict[torch.dtype, int]
+
+    # Note [Checking for Vectorized Support in Inductor]
+    # TorchInductor CPU vectorization reuses PyTorch vectorization utility functions
+    # Hence, TorchInductor would depend on Sleef* to accelerate mathematical functions
+    # like exp, pow, sin, cos and etc.
+    # But PyTorch and TorchInductor might use different compilers to build code. If
+    # PyTorch uses gcc-7/g++-7 to build the release package, the libtorch_cpu.so
+    # will not expose the Sleef* AVX512 symbols since gcc-7/g++-7 cannot pass
+    # avx512 check in CMake - FindAVX.cmake. But TorchInductor install the latest
+    # gcc/g++ compiler by default while it could support the AVX512 compilation.
+    # Therefore, there would be a conflict sleef version between PyTorch and
+    # TorchInductor. Hence, we dry-compile the following code to check whether current
+    # HW platform and PyTorch both could support AVX512 or AVX2. And suppose ARM
+    # also needs the logic
+    # In fbcode however, we are using the same compiler for pytorch and for inductor codegen,
+    # making the runtime check unnecessary.
+    _avx_code = """
+#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR)
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/cpu/vec/vec.h>
+#endif
+
+__attribute__((aligned(64))) float in_out_ptr0[16] = {0.0};
+
+extern "C" void __avx_chk_kernel() {
+    auto tmp0 = at::vec::Vectorized<float>(1);
+    auto tmp1 = tmp0.exp();
+    tmp1.store(in_out_ptr0);
+}
+"""  # noqa: B950
+
+    # Python snippet run in a subprocess to check that the probe library
+    # loads; "__lib_path__" is substituted with the built .so path.
+    _avx_py_load = """
+import torch
+from ctypes import cdll
+cdll.LoadLibrary("__lib_path__")
+"""
+
+    def bit_width(self) -> int:
+        """Return the vector width in bits."""
+        return self._bit_width
+
+    def nelements(self, dtype: torch.dtype = torch.float) -> int:
+        """Return the element count recorded for ``dtype``; KeyError if absent."""
+        return self._dtype_nelements[dtype]
+
+    def build_macro(self) -> str:
+        """Return the preprocessor macro flag(s) for this ISA."""
+        return self._macro
+
+    def build_arch_flags(self) -> str:
+        """Return the compiler architecture flags for this ISA."""
+        return self._arch_flags
+
+    def __hash__(self) -> int:
+        # Hash by the ISA's string name.
+        return hash(str(self))
+
+    @functools.lru_cache(None)
+    def __bool__(self) -> bool:
+        """
+        Report whether this ISA is usable.
+
+        Honors ``config.cpp.vec_isa_ok`` when it is set and returns True in
+        fbcode. Otherwise the probe source is compiled with this ISA's flags
+        and loaded in a fresh Python process; any failure yields False.
+        """
+        if config.cpp.vec_isa_ok is not None:
+            return config.cpp.vec_isa_ok
+
+        if config.is_fbcode():
+            return True
+
+        key, input_path = write(VecISA._avx_code, "cpp")
+        from filelock import FileLock
+
+        lock_dir = get_lock_dir()
+        lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
+        with lock:
+            # Swap the "cpp" extension for "so".
+            output_path = input_path[:-3] + "so"
+            build_cmd = shlex.split(
+                cpp_compile_command(
+                    input_path, output_path, warning_all=False, vec_isa=self
+                )
+            )
+            try:
+                # Check build result
+                compile_file(input_path, output_path, build_cmd)
+                subprocess.check_call(
+                    [
+                        sys.executable,
+                        "-c",
+                        VecISA._avx_py_load.replace("__lib_path__", output_path),
+                    ],
+                    stderr=subprocess.DEVNULL,
+                    # Give the child the same import path as this process,
+                    # joined with the platform's path-list separator.
+                    env={**os.environ, "PYTHONPATH": os.pathsep.join(sys.path)},
+                )
+            except Exception:
+                # Best-effort probe: any build or load failure means "unsupported".
+                return False
+
+            return True
+
+
+@dataclasses.dataclass
+class VecAVX512(VecISA):
+    """VecISA settings for AVX512: 512-bit vectors, named "avx512"."""
+
+    _bit_width = 512
+    _macro = "-DCPU_CAPABILITY_AVX512"
+    _arch_flags = "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma"
+    # Element counts returned by VecISA.nelements() for each dtype.
+    _dtype_nelements = {torch.float: 16, torch.bfloat16: 32, torch.float16: 32}
+
+    def __str__(self) -> str:
+        return "avx512"
+
+    # Reuse VecISA.__hash__, which hashes str(self).
+    # NOTE(review): presumably restated because @dataclass would otherwise
+    # replace the inherited __hash__ when it generates __eq__ - confirm.
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+
+
+@dataclasses.dataclass
+class VecAVX2(VecISA):
+    """VecISA settings for AVX2: 256-bit vectors, named "avx2"."""
+
+    _bit_width = 256
+    _macro = "-DCPU_CAPABILITY_AVX2"
+    _arch_flags = "-mavx2 -mfma"
+    # Element counts returned by VecISA.nelements() for each dtype.
+    _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16}
+
+    def __str__(self) -> str:
+        return "avx2"
+
+    # Reuse VecISA.__hash__, which hashes str(self).
+    # NOTE(review): presumably restated because @dataclass would otherwise
+    # replace the inherited __hash__ when it generates __eq__ - confirm.
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+
+
+@dataclasses.dataclass
+class VecZVECTOR(VecISA):
+    """VecISA settings for ZVECTOR: 256-bit vectors, named "zvector"."""
+
+    _bit_width = 256
+    _macro = "-DCPU_CAPABILITY_ZVECTOR -DCPU_CAPABILITY=ZVECTOR -DHAVE_ZVECTOR_CPU_DEFINITION"
+    _arch_flags = "-mvx -mzvector"
+    # Element counts returned by VecISA.nelements() for each dtype.
+    _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16}
+
+    def __str__(self) -> str:
+        return "zvector"
+
+    # Reuse VecISA.__hash__, which hashes str(self).
+    # NOTE(review): presumably restated because @dataclass would otherwise
+    # replace the inherited __hash__ when it generates __eq__ - confirm.
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+
+
+class InvalidVecISA(VecISA):
+    """Placeholder VecISA meaning "no vector ISA": empty flags, zero width, always falsy."""
+
+    _bit_width = 0
+    _macro = ""
+    _arch_flags = ""
+    # No dtypes registered, so nelements() raises KeyError for any dtype.
+    _dtype_nelements = {}
+
+    def __str__(self) -> str:
+        return "INVALID_VEC_ISA"
+
+    # Always falsy; skips the compile-and-load probe in VecISA.__bool__.
+    def __bool__(self) -> bool:  # type: ignore[override]
+        return False
+
+    # Reuse VecISA.__hash__, which hashes str(self).
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+
+
+# Shared "no vector ISA" instance; used as the default `vec_isa` argument and
+# as the fallback result of pick_vec_isa().
+invalid_vec_isa = InvalidVecISA()
+# ISAs probed against /proc/cpuinfo by valid_vec_isa_list(). Order matters:
+# pick_vec_isa() returns the first valid entry when no simdlen is configured.
+supported_vec_isa_list = [VecAVX512(), VecAVX2()]
+
+
+# The result is memoized so /proc/cpuinfo is read only once. The cpuinfo
+# text carries a lot of content that is irrelevant to the ISA check, so only
+# the resulting list of usable ISAs is kept.
+@functools.lru_cache(None)
+def valid_vec_isa_list() -> List[VecISA]:
+    """
+    Return the supported vector ISAs that are usable on this machine.
+
+    Non-Linux platforms get an empty list and s390x gets ``[VecZVECTOR()]``.
+    Otherwise an ISA is kept when its name appears in /proc/cpuinfo and the
+    ISA object itself evaluates as truthy.
+    """
+    if sys.platform != "linux":
+        return []
+
+    if platform.machine() == "s390x":
+        return [VecZVECTOR()]
+
+    with open("/proc/cpuinfo") as cpuinfo_file:
+        cpuinfo_text = cpuinfo_file.read()
+    return [
+        isa for isa in supported_vec_isa_list if str(isa) in cpuinfo_text and isa
+    ]
+
+
+def pick_vec_isa() -> VecISA:
+    """
+    Choose the vector ISA for code generation.
+
+    fbcode always gets AVX2. Elsewhere, the first valid ISA is used when
+    ``config.cpp.simdlen`` is None; otherwise the valid ISA whose bit width
+    equals ``simdlen`` is used. ``invalid_vec_isa`` is returned when nothing
+    qualifies.
+    """
+    if config.is_fbcode():
+        return VecAVX2()
+
+    candidates: List[VecISA] = valid_vec_isa_list()
+    if not candidates:
+        return invalid_vec_isa
+
+    # A simdlen of None means the vectorization length is determined
+    # automatically: take the first valid ISA.
+    if config.cpp.simdlen is None:
+        return candidates[0]
+
+    matching = (isa for isa in candidates if config.cpp.simdlen == isa.bit_width())
+    return next(matching, invalid_vec_isa)
+
+
+def get_compile_only(compile_only: bool = True) -> str:
+    """Return the "-c" compiler flag when ``compile_only`` is set, else an empty string."""
+    if compile_only:
+        return "-c"
+    return ""
+
+
+def get_shared(shared: bool = True, compile_only: bool = False) -> str:
+    """
+    Return the flags for building a shared library.
+
+    Empty when ``shared`` is false, position-independent code only when
+    ``compile_only`` is set, and the full shared-library flags otherwise.
+    """
+    if not shared:
+        return ""
+    if compile_only:
+        return "-fPIC"
+    on_darwin_clang = platform.system() == "Darwin" and "clang" in cpp_compiler()
+    # On Darwin with clang, make undefined symbols behave the same as on linux.
+    return (
+        "-shared -fPIC -undefined dynamic_lookup"
+        if on_darwin_clang
+        else "-shared -fPIC"
+    )
+
+
+def get_warning_all_flag(warning_all: bool = True) -> str:
+    """Return "-Wall" when ``warning_all`` is set, else an empty string."""
+    if warning_all:
+        return "-Wall"
+    return ""
+
+
+def get_glibcxx_abi_build_flags() -> str:
+    """Return the -D_GLIBCXX_USE_CXX11_ABI define, set to torch's own ABI value as an int."""
+    abi_value = int(torch._C._GLIBCXX_USE_CXX11_ABI)
+    return f"-D_GLIBCXX_USE_CXX11_ABI={abi_value}"
+
+
+def cpp_flags() -> str:
+    """Return the space-separated base C++ flags, plus one extra -Werror flag under clang."""
+    common = ("-std=c++17", "-Wno-unused-variable", "-Wno-unknown-pragmas")
+    clang_only = ("-Werror=ignored-optimization-argument",) if is_clang() else ()
+    return " ".join(common + clang_only)
+
+
+def cpp_wrapper_flags() -> str:
+    """Return the define that marks a build as using the Inductor C++ wrapper."""
+    wrapper_define = "-DTORCH_INDUCTOR_CPP_WRAPPER"
+    return wrapper_define
+
+
+def optimization_flags() -> str:
+    """
+    Return the space-separated optimization flags for the C++ compiler.
+
+    Starts from debug (``-O0 -g``) or release (``-O3 -DNDEBUG``) flags
+    depending on ``config.aot_inductor.debug_compile``, then adds the
+    floating-point flags. Outside fbcode it also adds the platform's
+    native-architecture flag (or ``-Xclang`` on macOS) and ``-fopenmp``.
+    """
+    base_flags = "-O0 -g" if config.aot_inductor.debug_compile else "-O3 -DNDEBUG"
+    base_flags += " -ffast-math -fno-finite-math-only"
+    if not config.cpp.enable_unsafe_math_opt_flag:
+        base_flags += " -fno-unsafe-math-optimizations"
+    if not config.cpp.enable_floating_point_contract_flag:
+        base_flags += " -ffp-contract=off"
+
+    if config.is_fbcode():
+        # FIXME: passing `-fopenmp` adds libgomp.so to the generated shared library's dependencies.
+        # This causes `dlopen` to fail in fbcode, because libgomp does not exist in the default paths.
+        # We will fix it later by exposing the lib path.
+        return base_flags
+
+    if sys.platform == "darwin":
+        # Per https://mac.r-project.org/openmp/ right way to pass `openmp` flags to MacOS is via `-Xclang`
+        # Also, `-march=native` is unrecognized option on M1
+        base_flags += " -Xclang"
+    elif platform.machine() == "ppc64le":
+        base_flags += " -mcpu=native"
+    else:
+        base_flags += " -march=native"
+
+    # The fbcode case already returned above, so OpenMP is always enabled here.
+    return base_flags + " -fopenmp"
+
+
+def use_custom_generated_macros() -> str:
+    """Return the define for C10_USING_CUSTOM_GENERATED_MACROS."""
+    macro_define = "-D C10_USING_CUSTOM_GENERATED_MACROS"
+    return macro_define
+
+
+def use_fb_internal_macros() -> str:
+    """Return the fbcode-only OpenMP and C10 defines; an empty string outside fbcode."""
+    if not config.is_fbcode():
+        return ""
+
+    openmp_lib = build_paths.openmp_lib()
+    c10_defines = (
+        "-D C10_USE_GLOG",
+        "-D C10_USE_MINIMAL_GLOG",
+        "-D C10_DISABLE_TENSORIMPL_EXTENSIBILITY",
+    )
+    preprocessor_flags = " ".join(c10_defines)
+    return f"-Wp,-fopenmp {openmp_lib} {preprocessor_flags}"
+
+
+def use_standard_sys_dir_headers() -> str:
+    """Return "-nostdinc" in fbcode, else an empty string."""
+    return "-nostdinc" if config.is_fbcode() else ""
+
+
+@functools.lru_cache(None)
+def is_conda_llvm_openmp_installed() -> bool:
+    """
+    Return True when ``conda list llvm-openmp --json`` reports at least one entry.
+
+    Returns False if conda cannot be run, exits with an error, or prints
+    output that is not valid JSON.
+    """
+    try:
+        command = "conda list llvm-openmp --json"
+        output = subprocess.check_output(command.split()).decode("utf8")
+        return len(json.loads(output)) > 0
+    except (subprocess.SubprocessError, OSError, ValueError):
+        # OSError: the conda executable is missing or cannot be started.
+        # ValueError: the output could not be decoded or parsed as JSON.
+        return False
+
+
+@functools.lru_cache(None)
+def homebrew_libomp() -> Tuple[bool, str]:
+    """
+    Locate Homebrew's ``libomp``.
+
+    Returns:
+        ``(available, path)``. ``path`` is the prefix reported by
+        ``brew --prefix libomp`` and ``available`` says whether that path
+        exists. ``(False, "")`` is returned when ``brew`` is not on PATH or
+        cannot be run.
+    """
+    # check if `brew` is installed
+    if shutil.which("brew") is None:
+        return False, ""
+    try:
+        # get the location of `libomp` if it is installed
+        # this is the location that `libomp` **would** be installed
+        # see https://github.com/Homebrew/brew/issues/10261#issuecomment-756563567 for details
+        libomp_path = (
+            subprocess.check_output(["brew", "--prefix", "libomp"])
+            .decode("utf8")
+            .strip()
+        )
+    except (subprocess.SubprocessError, OSError):
+        return False, ""
+    # check if `libomp` is installed
+    omp_available = os.path.exists(libomp_path)
+    return omp_available, libomp_path
+
+
+def get_include_and_linking_paths(
+    include_pytorch: bool = False,
+    vec_isa: VecISA = invalid_vec_isa,
+    cuda: bool = False,
+    aot_mode: bool = False,
+) -> Tuple[List[str], str, str, str, str]:
+    """
+    Collect the include paths, library paths, libraries, macros and
+    architecture flags for a C++ compile command.
+
+    Args:
+        include_pytorch: On Linux, forces the branch that links against the
+            PyTorch libraries.
+        vec_isa: Vector ISA whose macro and architecture flags are used in
+            that same branch.
+        cuda: Whether CUDA/ROCm include paths, libraries and macros are added.
+        aot_mode: Whether the build is in AOT mode; this changes which
+            libraries and include directories are added.
+
+    Returns:
+        ``(ipaths, lpaths_str, libs_str, macros, build_arch_flags)``: the
+        include directories as a list, the ``-L`` options and ``-l`` options
+        as space-joined strings, and the macro and architecture flag strings.
+
+    Side effects:
+        In fbcode, sets ``os.environ["CUDA_HOME"]`` when neither CUDA_HOME
+        nor CUDA_PATH is already set.
+    """
+    if (
+        config.is_fbcode()
+        and "CUDA_HOME" not in os.environ
+        and "CUDA_PATH" not in os.environ
+    ):
+        os.environ["CUDA_HOME"] = os.path.dirname(build_paths.cuda())
+    from torch.utils import cpp_extension
+
+    macros = ""
+    build_arch_flags = ""
+    if sys.platform == "linux" and (
+        include_pytorch
+        or vec_isa != invalid_vec_isa
+        or cuda
+        or config.cpp.enable_kernel_profile
+    ):
+        # Note - We include pytorch only on linux right now. There is more work
+        # to do to enable OMP build on darwin where PyTorch is built with IOMP
+        # and we need a way to link to what PyTorch links.
+        ipaths = cpp_extension.include_paths(cuda) + [sysconfig.get_path("include")]
+        lpaths = cpp_extension.library_paths(cuda) + [
+            sysconfig.get_config_var("LIBDIR")
+        ]
+
+        libs = []
+
+        # No need to manually specify libraries in fbcode.
+        if not config.is_fbcode():
+            libs += ["torch", "torch_cpu"]
+            libs += ["gomp"]
+            if not aot_mode:
+                libs += ["torch_python"]
+        else:
+            # internal remote execution is able to find omp, but not gomp
+            libs += ["omp"]
+            if aot_mode:
+                ipaths += [os.path.dirname(cpp_prefix_path())]
+                if cuda:
+                    # This is a special treatment for Meta internal cuda-12 where all libs
+                    # are in lib/cuda-12 and lib/cuda-12/stubs
+                    for i, path in enumerate(lpaths):
+                        if path.startswith(
+                            os.environ["CUDA_HOME"]
+                        ) and not os.path.exists(f"{path}/libcudart_static.a"):
+                            # Point this entry at the first subdirectory that
+                            # contains libcudart_static.a, and also add its
+                            # "stubs" subdirectory.
+                            for root, dirs, files in os.walk(path):
+                                if "libcudart_static.a" in files:
+                                    lpaths[i] = os.path.join(path, root)
+                                    lpaths.append(os.path.join(lpaths[i], "stubs"))
+                                    break
+        macros = vec_isa.build_macro()
+        if macros:
+            # In fbcode the ISA's own macro is replaced by its architecture
+            # flags plus defines derived from the upper-cased ISA name.
+            if config.is_fbcode() and vec_isa != invalid_vec_isa:
+                cap = str(vec_isa).upper()
+                macros = " ".join(
+                    [
+                        vec_isa.build_arch_flags(),
+                        f"-D CPU_CAPABILITY={cap}",
+                        f"-D CPU_CAPABILITY_{cap}",
+                        f"-D HAVE_{cap}_CPU_DEFINITION",
+                    ]
+                )
+
+        if cuda:
+            if macros is None:
+                macros = ""
+            macros += " -D USE_ROCM" if torch.version.hip else " -D USE_CUDA"
+
+        if cuda:
+            if torch.version.hip is not None:
+                libs += ["c10_hip", "torch_hip"]
+                macros += " -D __HIP_PLATFORM_AMD__"
+            else:
+                if config.is_fbcode():
+                    libs += ["cuda"]
+                else:
+                    libs += ["c10_cuda", "cuda", "torch_cuda"]
+        build_arch_flags = vec_isa.build_arch_flags()
+    else:
+        # Note - this is effectively a header only inclusion. Usage of some header files may result in
+        # symbol not found, if those header files require a library.
+        # For those cases, include the lpath and libs command as we do for pytorch above.
+        # This approach allows us to only pay for what we use.
+        ipaths = cpp_extension.include_paths(cuda) + [sysconfig.get_path("include")]
+        if aot_mode:
+            ipaths += [os.path.dirname(cpp_prefix_path())]
+        lpaths = []
+        if sys.platform == "darwin":
+            # only Apple builtin compilers (Apple Clang++) require openmp
+            omp_available = not is_apple_clang()
+
+            # check the `OMP_PREFIX` environment first
+            if os.getenv("OMP_PREFIX") is not None:
+                header_path = os.path.join(os.getenv("OMP_PREFIX"), "include", "omp.h")  # type: ignore[arg-type]
+                valid_env = os.path.exists(header_path)
+                if valid_env:
+                    ipaths.append(os.path.join(os.getenv("OMP_PREFIX"), "include"))  # type: ignore[arg-type]
+                    lpaths.append(os.path.join(os.getenv("OMP_PREFIX"), "lib"))  # type: ignore[arg-type]
+                else:
+                    warnings.warn("environment variable `OMP_PREFIX` is invalid.")
+                omp_available = omp_available or valid_env
+
+            # Link "omp" explicitly only when OpenMP has not been found so far.
+            libs = [] if omp_available else ["omp"]
+
+            # prefer to use openmp from `conda install llvm-openmp`
+            if not omp_available and os.getenv("CONDA_PREFIX") is not None:
+                omp_available = is_conda_llvm_openmp_installed()
+                if omp_available:
+                    conda_lib_path = os.path.join(os.getenv("CONDA_PREFIX"), "lib")  # type: ignore[arg-type]
+                    ipaths.append(os.path.join(os.getenv("CONDA_PREFIX"), "include"))  # type: ignore[arg-type]
+                    lpaths.append(conda_lib_path)
+                    # Prefer Intel OpenMP on x86 machine
+                    if os.uname().machine == "x86_64" and os.path.exists(
+                        os.path.join(conda_lib_path, "libiomp5.dylib")
+                    ):
+                        libs = ["iomp5"]
+
+            # next, try to use openmp from `brew install libomp`
+            if not omp_available:
+                omp_available, libomp_path = homebrew_libomp()
+                if omp_available:
+                    ipaths.append(os.path.join(libomp_path, "include"))
+                    lpaths.append(os.path.join(libomp_path, "lib"))
+
+            # if openmp is still not available, we let the compiler to have a try,
+            # and raise error together with instructions at compilation error later
+        else:
+            libs = ["omp"] if config.is_fbcode() else ["gomp"]
+
+    # Unconditionally import c10 for non-abi-compatible mode to use TORCH_CHECK - See PyTorch #108690
+    if not config.abi_compatible:
+        libs += ["c10"]
+        lpaths += [cpp_extension.TORCH_LIB_PATH]
+
+    # third party libs
+    if config.is_fbcode():
+        ipaths.append(build_paths.sleef())
+        ipaths.append(build_paths.openmp())
+        ipaths.append(build_paths.cc_include())
+        ipaths.append(build_paths.libgcc())
+        ipaths.append(build_paths.libgcc_arch())
+        ipaths.append(build_paths.libgcc_backward())
+        ipaths.append(build_paths.glibc())
+        ipaths.append(build_paths.linux_kernel())
+        ipaths.append(build_paths.cuda())
+        # We also need to bundle includes with absolute paths into a remote directory
+        # (later on, we copy the include paths from cpp_extensions into our remote dir)
+        ipaths.append("include")
+
+    static_link_libs = []
+    if aot_mode and cuda and config.is_fbcode():
+        # For Meta internal cuda-12, it is recommended to static link cudart
+        static_link_libs = ["-Wl,-Bstatic", "-lcudart_static", "-Wl,-Bdynamic"]
+
+    # Render the library paths and libraries as -L / -l options; any static
+    # link options come before the -l options.
+    lpaths_str = " ".join(["-L" + p for p in lpaths])
+    libs_str = " ".join(static_link_libs + ["-l" + p for p in libs])
+    return ipaths, lpaths_str, libs_str, macros, build_arch_flags
+
+
+def cpp_compile_command(
+    input: Union[str, List[str]],
+    output: str,
+    warning_all: bool = True,
+    shared: bool = True,
+    include_pytorch: bool = False,
+    vec_isa: VecISA = invalid_vec_isa,
+    cuda: bool = False,
+    aot_mode: bool = False,
+    compile_only: bool = False,
+    use_absolute_path: bool = False,
+) -> str:
+    """
+    Build the C++ compiler command line as a single string.
+
+    Args:
+        input: One source path or a list of source paths.
+        output: Path passed to ``-o``.
+        warning_all: Whether ``-Wall`` is added.
+        shared: Whether shared-library flags are added.
+        include_pytorch, vec_isa, cuda, aot_mode: Forwarded to
+            ``get_include_and_linking_paths``.
+        compile_only: Adds ``-c`` and drops the library paths and libraries.
+        use_absolute_path: In fbcode AOT mode, selects basenames instead of
+            the given paths for the inputs, output and linker script.
+
+    Returns:
+        The command with all runs of spaces and newlines collapsed to single
+        spaces and surrounding whitespace stripped.
+    """
+    ipaths, lpaths, libs, macros, build_arch_flags = get_include_and_linking_paths(
+        include_pytorch, vec_isa, cuda, aot_mode
+    )
+    # Normalize a single source path to a one-element list.
+    if isinstance(input, str):
+        input = [input]
+    ipaths_str = " ".join(["-I" + p for p in ipaths])
+    clang_flags = ""
+    if config.is_fbcode():
+        if aot_mode and not use_absolute_path:
+            inp_name = input
+            out_name = output
+            linker_script = _LINKER_SCRIPT
+        else:
+            # We need to copy any absolute-path torch includes
+            inp_name = [os.path.basename(i) for i in input]
+            out_name = os.path.basename(output)
+            linker_script = os.path.basename(_LINKER_SCRIPT)
+        # The clang-specific flags below require that the compiler is clang.
+        assert is_clang()
+        # Use clang runtime instead of libgcc
+        clang_flags += " --rtlib=compiler-rt"
+        clang_flags += " -fuse-ld=lld"
+        clang_flags += f" -Wl,--script={linker_script}"
+        linker_paths = "-B" + build_paths.glibc_lib()
+        linker_paths += " -L" + build_paths.glibc_lib()
+    else:
+        inp_name = input
+        out_name = output
+        linker_paths = ""  # let the compiler pick
+    # No linking happens in compile-only mode, so drop libraries and -L paths.
+    if compile_only:
+        libs, lpaths = "", ""
+    inp_name_str = " ".join(inp_name)
+    # Empty pieces leave extra whitespace in the template; the regex collapses it.
+    return re.sub(
+        r"[ \n]+",
+        " ",
+        f"""
+            {cpp_compiler()} {inp_name_str} {get_shared(shared, compile_only)}
+            {get_warning_all_flag(warning_all)} {cpp_flags()}
+            {get_glibcxx_abi_build_flags()}
+            {ipaths_str} {lpaths} {libs} {build_arch_flags}
+            {macros} {linker_paths} {clang_flags}
+            {optimization_flags()}
+            {use_custom_generated_macros()}
+            {use_fb_internal_macros()}
+            {use_standard_sys_dir_headers()}
+            {get_compile_only(compile_only)}
+            -o {out_name}
+        """,
+    ).strip()
+
+
+def run_command_and_check(cmd: str):
+    """
+    Run ``cmd`` (split with shell-like rules, no shell involved) and raise on failure.
+
+    Args:
+        cmd: Command line as a single string.
+
+    Raises:
+        exc.CppCompileError: If the command exits non-zero. The error carries
+            the argument list and the command's combined stdout/stderr.
+    """
+    args = shlex.split(cmd)
+    try:
+        # Capture the output so it can be attached to the error; check_call
+        # captures nothing, which left `e.output` as None.
+        subprocess.check_output(args, stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as e:
+        output = e.output
+        if isinstance(output, bytes):
+            output = output.decode("utf-8", errors="replace")
+        raise exc.CppCompileError(args, output) from e
+
+
+@functools.lru_cache(None)
+def split_aot_inductor_output_path(path: str) -> Tuple[str, str]:
+    """
+    Split an AOT Inductor output path into ``(directory, so_name)``.
+
+    A path ending in ".so" is split into its directory and file name; any
+    other path is returned whole as the directory, with an empty file name.
+    """
+    if not path.endswith(".so"):
+        return path, ""
+    return os.path.split(path)
+
+
+class CudaKernelParamCache:
+    """Class-level, in-memory mapping from a kernel key to its parameter dict."""
+
+    cache: Dict[str, Dict[str, str]] = {}
+    # Expose the dict's own clear() as CudaKernelParamCache.clear().
+    clear = staticmethod(cache.clear)
+
+    @classmethod
+    def set(cls, key: str, params: Dict[str, str], cubin: str) -> None:
+        """
+        Write ``cubin`` to disk, record the written path in ``params`` and
+        store ``params`` under ``key``.
+        """
+        # The binary kind doubles as file extension and hash type.
+        bin_type = "hsaco" if torch.version.hip is not None else "cubin"
+        output_dir, _ = split_aot_inductor_output_path(config.aot_inductor.output_path)
+        _, path = write(
+            cubin,
+            bin_type,
+            hash_type=bin_type,
+            specified_dir=output_dir,
+        )
+
+        params[get_cpp_wrapper_cubin_path_name()] = path
+
+        cls.cache[key] = params
+
+    @classmethod
+    def get(cls, key: str) -> Optional[Dict[str, str]]:
+        """Return the params stored under ``key``, or None if there are none."""
+        return cls.cache.get(key)
+
+    @classmethod
+    def get_keys(cls):
+        """Return a view of all stored keys."""
+        return cls.cache.keys()
+
+
+class AotCodeCompiler:
+    @classmethod
+    def compile(
+        cls,
+        graph: GraphLowering,
+        source_code: str,
+        serialized_extern_kernel_nodes: Optional[str],
+        cuda: bool,
+    ) -> str:
+        picked_vec_isa = pick_vec_isa()
+        cpp_command = repr(
+            cpp_compile_command(
+                "i", "o", vec_isa=picked_vec_isa, cuda=cuda, aot_mode=graph.aot_mode
+            )
+        )
+        fbcode_aot_cpu_re = False
+        use_absolute_path = False
+        if config.is_fbcode():
+            ld_command = build_paths.ld()
+            if not cuda and graph.aot_mode:  # Meta internal AOTInductor CPU
+                objcopy_command = build_paths.objcopy_fallback()
+                fbcode_aot_cpu_re = True
+                use_absolute_path = True
+            else:
+                objcopy_command = build_paths.objcopy()
+        else:
+            ld_command = "ld"
+            objcopy_command = "objcopy"
+
+        (
+            specified_output_path,
+            specified_so_name,
+        ) = split_aot_inductor_output_path(config.aot_inductor.output_path)
+        key, input_path = write(
+            source_code,
+            "cpp",
+            extra=cpp_command,
+            specified_dir=specified_output_path,
+        )
+
+        def _compile_consts_linux(consts: bytes) -> str:
+            _, consts_path = write(
+                consts,
+                "bin",
+                specified_dir=specified_output_path,
+            )
+
+            consts_o = os.path.splitext(consts_path)[0] + ".o"
+            if fbcode_aot_cpu_re:
+                cmd = f"{ld_command} -r -b binary -o {os.path.basename(consts_o)} {os.path.basename(consts_path)}"
+                compile_file(consts_path, consts_o, cmd.split())
+                os.chmod(consts_o, 0o644)
+            else:
+                cmd = f"{ld_command} -r -b binary -o {consts_o} {consts_path}"
+                run_command_and_check(cmd)
+            log.debug("aot constant binary command: %s", cmd)
+
+            cmd = (
+                f"{objcopy_command} --rename-section"
+                " .data=.lrodata,alloc,load,readonly,data,contents"
+                f" {consts_o} {consts_o}"
+            )
+            log.debug("aot constant obj command: %s", cmd)
+            run_command_and_check(cmd)
+
+            cmd = f"rm {consts_path}"
+            log.debug("aot constant bin removal command: %s", cmd)
+            run_command_and_check(cmd)
+
+            if fbcode_aot_cpu_re:
+                body = re.sub(r"[\W]", "_", os.path.basename(consts_path))
+            else:
+                body = re.sub(r"[\W]", "_", consts_path)
+
+            symbol_list = []
+            symbol_list.append(
+                f"{objcopy_command} --redefine-sym _binary_{body}_start=_binary_constants_bin_start {consts_o}"
+            )
+            symbol_list.append(
+                f"{objcopy_command} --redefine-sym _binary_{body}_size=_binary_constants_bin_size {consts_o}"
+            )
+            symbol_list.append(
+                f"{objcopy_command} --redefine-sym _binary_{body}_end=_binary_constants_bin_end {consts_o}"
+            )
+            log.debug("aot constant binary redefine symbol: %s", " ".join(symbol_list))
+            for cmd in symbol_list:
+                run_command_and_check(cmd)
+            return consts_o
+
+        def _compile_consts_darwin(consts: bytes) -> str:
+            """Assemble ``consts`` into an object file and return the object's path.
+
+            Payloads of at most 1024 bytes are spelled out byte by byte in the
+            assembly source.  Larger payloads are emitted as a magic quad word
+            followed by reserved space; after the object is built, the real
+            bytes are written over that placeholder inside the object file.
+            """
+            patch_after_build = len(consts) > 1024
+            asm_parts = [
+                "\t.section\t__TEXT,__const\n",
+                "\t.globl\t__binary_constants_bin_start\n",
+                "__binary_constants_bin_start:\n",
+            ]
+            if patch_after_build:
+                asm_parts.append("\t.quad 0x1234567899abcdef\n")
+                asm_parts.append(f"\t.space {len(consts) - 8}\n")
+            elif consts:
+                asm_parts.extend(f"\t.byte {value}\n" for value in consts)
+            else:
+                # Add one element even if constants are empty
+                # Otherwise assembler will not put them in data section
+                asm_parts.append("\t.space 1\n")
+            asm_parts.append(".globl\t__binary_constants_bin_end\n")
+            asm_parts.append("__binary_constants_bin_end:\n")
+            _, asm_path = write(
+                "".join(asm_parts),
+                "S",
+                specified_dir=specified_output_path,
+            )
+            obj_path = os.path.splitext(asm_path)[0] + ".o"
+            run_command_and_check(f"{cpp_compiler()} -c -o {obj_path} {asm_path}")
+            if patch_after_build:
+                with open(obj_path, "r+b") as obj_file:
+                    # Search for magic number and write the actual data over it.
+                    # Only the first 1024 bytes of the object are searched.
+                    header = obj_file.read(1024)
+                    magic_offset = header.find(b"\xef\xcd\xab\x99\x78\x56\x34\x12")
+                    assert magic_offset != -1
+                    obj_file.seek(magic_offset)
+                    written = 0
+                    # write() may accept fewer bytes than offered; loop until done.
+                    while written < len(consts):
+                        written += obj_file.write(consts[written:])
+            return obj_path
+
+        from filelock import FileLock
+
+        lock_dir = get_lock_dir()
+        lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
+        with lock:
+            # Currently, this only support serializing extern nodes in fbcode
+            # Eventually, we should also have a serializer for OSS.
+            if config.is_fbcode() and serialized_extern_kernel_nodes:
+                output_json = os.path.splitext(input_path)[0] + ".json"
+                with open(output_json, "w") as f:
+                    f.write(serialized_extern_kernel_nodes)
+
+            output_so = (
+                config.aot_inductor.output_path
+                if specified_so_name
+                else os.path.splitext(input_path)[0] + ".so"
+            )
+
+            output_o = os.path.splitext(input_path)[0] + ".o"
+            cmd = cpp_compile_command(
+                input=input_path,
+                output=output_o,
+                vec_isa=picked_vec_isa,
+                cuda=cuda,
+                aot_mode=graph.aot_mode,
+                compile_only=True,
+                use_absolute_path=use_absolute_path,
+            )
+            log.debug("aot compilation command: %s", cmd)
+            if fbcode_aot_cpu_re:
+                compile_file(input_path, output_o, cmd.split())
+                os.chmod(output_o, 0o644)
+            else:
+                run_command_and_check(cmd)
+
+            def _to_bytes(t: torch.Tensor) -> bytes:
+                """Return a copy of the raw bytes of ``t``'s untyped storage.
+
+                A tensor with no elements yields ``b""``.  Otherwise the whole
+                underlying storage (moved to CPU) is copied out.
+                """
+                import ctypes
+
+                if t.numel() == 0:
+                    return b""
+
+                # Hold on to the CPU storage while its memory is read through ctypes.
+                cpu_storage = t.untyped_storage().cpu()
+                address = cpu_storage.data_ptr()
+                byte_array_type = ctypes.c_ubyte * cpu_storage.nbytes()
+                storage_view = ctypes.cast(address, ctypes.POINTER(byte_array_type))
+
+                return bytes(storage_view.contents)
+
+            aot_constants = b"".join(
+                _to_bytes(tensor)
+                for name, tensor in graph.constants.items()
+                if name not in graph.folded_constants
+            )
+            consts_o = {
+                "linux": _compile_consts_linux,
+                "darwin": _compile_consts_darwin,
+            }[sys.platform](aot_constants)
+
+            cmd = cpp_compile_command(
+                input=[output_o, consts_o],
+                output=output_so,
+                vec_isa=picked_vec_isa,
+                cuda=cuda,
+                aot_mode=graph.aot_mode,
+                use_absolute_path=use_absolute_path,
+            )
+            log.debug("aot linkage command: %s", cmd)
+            if fbcode_aot_cpu_re:
+                compile_file([output_o, consts_o], output_so, cmd.split())
+                os.chmod(output_so, 0o755)
+            else:
+                run_command_and_check(cmd)
+
+        return output_so
+
+
+# Putting this fn in cpp.py (unfortunately) causes a deadlock, which is why it's in codecache.py.
+# Why? importing from cpp.py invokes codecache.pick_vec_isa(), which takes out a lock.
+# Cycle goes:
+# - CppCodeCache.load()
+# - pick_vec_isa()
+# - valid_vec_isa_list()
+# - VecISA.__bool__() <-- takes out a lock
+# - compile_file() <-- imports cpp_prefix_path from cpp, which causes us to try to take out the same lock.
+@functools.lru_cache
+def cpp_prefix_path() -> str:
+    """Store the text of ``codegen/cpp_prefix.h`` via ``write`` and return the stored file's path.
+
+    The result is memoized, so the header is read and written once per process.
+    """
+    header_source = Path(__file__).parent / "codegen/cpp_prefix.h"
+    _, stored_header = write(header_source.read_text(), "h")
+    return stored_header
+
+
+def cpp_prefix() -> str:
+    """Return the ``#include`` directive that pulls in the stored C++ prefix header."""
+    header = cpp_prefix_path()
+    # In fbcode we need relative paths, since we bundle up
+    # everything that we compile into a folder for remote compilation.
+    include_target = os.path.basename(header) if config.is_fbcode() else header
+    return f'#include "{include_target}"'
+
+
+# Given a path to an input cpp file and an output path,
+# Attempts to compile the file, storing the output in "output_path"
+@dynamo_timed
+def compile_file(
+    input_path: Union[str, List[str]], output_path: str, cmd: List[str]
+) -> None:
+    """Run the build command ``cmd`` for ``input_path``.
+
+    Outside fbcode, ``cmd`` is executed as given (this function does not touch
+    ``output_path`` itself in that case).  In fbcode, the prefix header, the
+    linker script, the inputs and the torch include tree are copied into a
+    temporary directory, the build runs there through ``_run_build_command``,
+    and the produced file is copied to ``output_path``.
+
+    Args:
+        input_path: A single input path or a list of input paths.
+        output_path: Destination of the build product.
+        cmd: The command line, already split into arguments.
+
+    Raises:
+        exc.CppCompileError: If the command exits with an error.  The captured
+            output is attached; on macOS an OpenMP installation hint is
+            appended when the output points at a missing OpenMP.
+    """
+    input_paths = [input_path] if isinstance(input_path, str) else input_path
+    input_files = [
+        os.path.basename(ip) if config.is_fbcode() else ip for ip in input_paths
+    ]
+    try:
+        if config.is_fbcode():
+            # Need to copy our header into the same folder as the sourcecode.
+            header_path = cpp_prefix_path()
+            header_name = os.path.basename(header_path)
+            output_name = os.path.basename(output_path)
+            # When we build remotely, we need to make sure to carefully copy any files
+            # that are required during the compilation process into our build directly.
+            # This is where all of the ATen/c10/Torch includes come from.
+            torch_includes_path = os.path.join(_TORCH_PATH, "include")
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                # Copy everything to tmp compilation folder
+                shutil.copy(header_path, os.path.join(tmp_dir, header_name))
+                shutil.copy(_LINKER_SCRIPT, os.path.join(tmp_dir, "script.ld"))
+                for p, f in zip(input_paths, input_files):
+                    shutil.copy(p, os.path.join(tmp_dir, f))
+                dest_include_path = os.path.join(tmp_dir, "include")
+                shutil.copytree(torch_includes_path, dest_include_path)
+                # Run the build
+                output_file_path = _run_build_command(cmd, tmp_dir, output_name)
+                # Copy output from the build
+                if os.path.exists(output_path):
+                    os.remove(output_path)
+                shutil.copy(output_file_path, output_path)
+        else:
+            subprocess.check_output(cmd, stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as e:
+        # Compiler output is not guaranteed to be valid UTF-8 (locale-dependent
+        # messages, odd file names).  Replace undecodable bytes so that a
+        # UnicodeDecodeError cannot mask the CppCompileError raised below.
+        output = e.output.decode("utf-8", errors="replace")
+        openmp_problem = "'omp.h' file not found" in output or "libomp" in output
+        if openmp_problem and sys.platform == "darwin":
+            instruction = (
+                "\n\nOpenMP support not found. Please try one of the following solutions:\n"
+                "(1) Set the `CXX` environment variable to a compiler other than Apple clang++/g++ "
+                "that has builtin OpenMP support;\n"
+                "(2) install OpenMP via conda: `conda install llvm-openmp`;\n"
+                "(3) install libomp via brew: `brew install libomp`;\n"
+                "(4) manually setup OpenMP and set the `OMP_PREFIX` environment variable to point to a path"
+                " with `include/omp.h` under it."
+            )
+            output += instruction
+        raise exc.CppCompileError(cmd, output) from e
+
+
+# Handle to libgomp.  Stays None unless CppCodeCache._load_library hits a
+# "gomp" load error and explicitly loads /usr/lib64/libgomp.so.1, in which
+# case the loaded library is stored here.
+_libgomp: Optional[CDLL] = None
+
+
+class CppCodeCache:
+    """Builds C++ source into a shared library and caches the loaded result by key."""
+
+    cache: Dict[str, Union[CDLL, ModuleType]] = {}
+    clear = staticmethod(cache.clear)
+    cpp_compile_command_flags: Dict[str, Any] = {}
+
+    @staticmethod
+    def _load_library_inner(path: str, key: str) -> Union[CDLL, ModuleType]:
+        """Load the shared library at ``path``; ``key`` is not used here."""
+        return cdll.LoadLibrary(path)
+
+    @classmethod
+    def _load_library(cls, path: str, key: str) -> Union[CDLL, ModuleType]:
+        """Load ``path``, retrying after loading libgomp and explaining noexec failures."""
+        try:
+            return cls._load_library_inner(path, key)
+        except (ImportError, OSError) as load_error:
+            message = str(load_error)
+            if "gomp" in message and os.path.exists("/usr/lib64/libgomp.so.1"):
+                # hacky workaround for fbcode/buck
+                global _libgomp
+                _libgomp = cdll.LoadLibrary("/usr/lib64/libgomp.so.1")
+                return cls._load_library_inner(path, key)
+            if "failed to map segment from shared object" not in message:
+                raise
+            tmp_root = tempfile.gettempdir()
+            raise OSError(
+                f"{load_error}.  The most common reason this may occur is if the {tmp_root} folder "
+                "is mounted with noexec (e.g., by default Docker mounts tmp file systems "
+                f"as noexec).  Please remount {tmp_root} with exec enabled, or set another "
+                "temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable."
+            ) from load_error
+
+    @classmethod
+    def load(cls, source_code: str, cuda: bool = False) -> Union[CDLL, ModuleType]:
+        """Write, compile (if the ``.so`` is missing) and load ``source_code``.
+
+        The cache key comes from ``write`` and covers the source plus the repr
+        of a placeholder compile command, so differing flags give differing keys.
+        """
+        cls.cpp_compile_command_flags["cuda"] = cuda
+        picked_vec_isa = pick_vec_isa()
+        placeholder_command = cpp_compile_command(
+            "i", "o", vec_isa=picked_vec_isa, **cls.cpp_compile_command_flags
+        )
+        key, input_path = write(source_code, "cpp", extra=repr(placeholder_command))
+        if key in cls.cache:
+            return cls.cache[key]
+
+        from filelock import FileLock
+
+        lock_path = os.path.join(get_lock_dir(), key + ".lock")
+        with FileLock(lock_path, timeout=LOCK_TIMEOUT):
+            # input_path ends in "cpp"; swap that suffix for "so".
+            output_path = input_path[:-3] + "so"
+            if not os.path.exists(output_path):
+                compile_command = cpp_compile_command(
+                    input=input_path,
+                    output=output_path,
+                    vec_isa=picked_vec_isa,
+                    **cls.cpp_compile_command_flags,
+                )
+                compile_file(input_path, output_path, shlex.split(compile_command))
+            library = cls._load_library(output_path, key)
+            cls.cache[key] = library
+            library.key = key  # type: ignore[union-attr]
+
+        return cls.cache[key]
+
+
+# Customized Python binding for cpp kernels
+class CppPythonBindingsCodeCache(CppCodeCache):
+    """CppCodeCache variant that appends CPython extension-module glue to the
+    source and loads the compiled library as a Python module.
+    """
+
+    # Separate cache from the parent class, with its own ``clear``.
+    cache: Dict[str, Union[CDLL, ModuleType]] = {}
+    clear = staticmethod(cache.clear)
+    cpp_compile_command_flags = {
+        # kernels have no dependency on libtorch
+        "include_pytorch": False,
+        "shared": True,
+    }
+    # Name of the C++ function being wrapped; also used as the Python method
+    # name, the module name and the PyInit_ suffix in the template below.
+    entry_function = "kernel"
+    # Body of the generated wrapper; its %s receives the parse_arg<...> calls.
+    call_entry_function = "kernel(%s);Py_RETURN_NONE;"
+    # Optional extra C++ inserted before the wrapper; when non-empty it is
+    # %-formatted with ``num_outputs`` in load_pybinding.
+    extra_parse_arg = ""
+    # C++ glue appended to the kernel source.  It has ten positional %s slots,
+    # filled in order by load_pybinding.
+    suffix_template = textwrap.dedent(
+        """
+        // Python bindings to call %s():
+        #define PY_SSIZE_T_CLEAN
+        #include <Python.h>
+        #include <sstream>
+        #include <cstdlib>
+
+        // This is defined in guards.cpp so we don't need to import PyTorch headers that are slooow.
+        // We manually link it below to workaround issues with fbcode build.
+        static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj);
+
+        template <typename T> static inline T parse_arg(PyObject* args, size_t n) {
+            static_assert(std::is_pointer<T>::value, "arg type must be pointer or long");
+            return static_cast<T>(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n)));
+        }
+        template <> inline long parse_arg<long>(PyObject* args, size_t n) {
+            auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(args, n));
+            if(result == -1 && PyErr_Occurred())
+                [[unlikely]] throw std::runtime_error("expected int arg");
+            return result;
+        }
+
+        %s
+
+        static PyObject* %s_py(PyObject* self, PyObject* args) {
+            try {
+                if(!PyTuple_CheckExact(args))
+                    [[unlikely]] throw std::runtime_error("tuple args required");
+                if(PyTuple_GET_SIZE(args) != %s)
+                    [[unlikely]] throw std::runtime_error("requires %s args");
+                %s
+            } catch(std::exception const& e) {
+                PyErr_SetString(PyExc_RuntimeError, e.what());
+                return nullptr;
+            } catch(...) {
+                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
+                return nullptr;
+            }
+        }
+
+        static PyMethodDef py_methods[] = {
+            {"%s", %s_py, METH_VARARGS, ""},
+            {NULL, NULL, 0, NULL}};
+
+        static struct PyModuleDef py_module =
+            {PyModuleDef_HEAD_INIT, "%s", NULL, -1, py_methods};
+
+        PyMODINIT_FUNC PyInit_%s(void) {
+            const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR");
+            if(!str_addr) {
+                PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set");
+                return nullptr;
+            }
+            std::istringstream iss(str_addr);
+            uintptr_t addr = 0;
+            iss >> addr;
+            _torchinductor_pyobject_tensor_data_ptr =
+                reinterpret_cast<decltype(_torchinductor_pyobject_tensor_data_ptr)>(addr);
+            return PyModule_Create(&py_module);
+        }
+        """
+    )
+
+    @classmethod
+    def _load_library_inner(cls, path: str, key: str) -> ModuleType:
+        """Import the extension module at ``path`` under the name ``<key>.<entry_function>``.
+
+        Before importing, the value of
+        ``torch._C._dynamo.guards._torchinductor_pyobject_tensor_data_ptr`` is
+        published as a string in the ``_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR``
+        environment variable, which the generated ``PyInit_`` function reads.
+        """
+        os.environ["_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR"] = str(
+            torch._C._dynamo.guards._torchinductor_pyobject_tensor_data_ptr  # type: ignore[attr-defined]
+        )
+        return importlib.machinery.ExtensionFileLoader(
+            f"{key}.{cls.entry_function}", path
+        ).load_module()  # type: ignore[call-arg]
+
+    @classmethod
+    def load_pybinding(
+        cls,
+        argtypes: List[str],
+        source_code: str,
+        cuda: bool = False,
+        num_outputs: int = -1,
+    ) -> Any:
+        """
+        Wrap a C++ function in fast Python bindings.
+
+        Args:
+            argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"].
+                Any "const " in a type is dropped when generating parse_arg calls.
+            source_code: C++ source code containing a ENTRY_FUNCTION() function
+            cuda: Forwarded to ``load``.
+            num_outputs: Substituted into ``extra_parse_arg`` when that
+                template is non-empty; otherwise unused.
+
+        Returns:
+            A python version of ENTRY_FUNCTION()
+        """
+        # One parse_arg<T>(args, n) expression per positional argument.
+        parseargs = ", ".join(
+            f"parse_arg<{argtype.replace('const ', '')}>(args, {n})"
+            for n, argtype in enumerate(argtypes)
+        )
+        # The tuple order must match the order of the %s slots in suffix_template.
+        suffix = cls.suffix_template % (
+            cls.entry_function,
+            cls.extra_parse_arg % num_outputs if cls.extra_parse_arg else "",
+            cls.entry_function,
+            len(argtypes),
+            len(argtypes),
+            cls.call_entry_function % parseargs,
+            cls.entry_function,
+            cls.entry_function,
+            cls.entry_function,
+            cls.entry_function,
+        )
+        result = cls.load(source_code + suffix, cuda)
+        assert isinstance(result, ModuleType)
+        return getattr(result, cls.entry_function)
+
+
+class CppWrapperCodeCache(CppPythonBindingsCodeCache):
+    """Python-binding cache whose entry point is ``inductor_entry_cpp``.
+
+    Unlike the parent class, the compile flags request PyTorch includes, and
+    extra C++ is injected that adapts a list of tensors to a call to
+    ``inductor_entry_impl``.
+    """
+
+    # Separate cache from the parent classes, with its own ``clear``.
+    cache: Dict[str, Union[CDLL, ModuleType]] = {}
+    clear = staticmethod(cache.clear)
+    cpp_compile_command_flags = {
+        "include_pytorch": True,
+        "shared": True,
+    }
+    entry_function = "inductor_entry_cpp"
+    # Wrapper body: the entry point's returned tensors are wrapped into a Python list.
+    call_entry_function = "return THPVariable_WrapList(inductor_entry_cpp(%s));"
+    # Extra C++ placed before the wrapper.  Its single %s is the size of the
+    # output-handle vector (filled with ``num_outputs`` by load_pybinding).
+    extra_parse_arg = textwrap.dedent(
+        """
+        #include <torch/csrc/autograd/python_variable.h>
+        #include <torch/csrc/inductor/aoti_torch/tensor_converter.h>
+
+        template <> inline std::vector<at::Tensor> parse_arg<std::vector<at::Tensor>>(PyObject* args, size_t n) {
+            return THPVariable_UnpackList(PyTuple_GET_ITEM(args, n));
+        }
+
+        std::vector<at::Tensor> inductor_entry_cpp(std::vector<at::Tensor>&& inputs) {
+            auto input_handles = unsafe_alloc_new_handles_from_tensors(inputs);
+            // For outputs, we only allocate a vector to hold returned tensor handles,
+            // not allocating the actual output tensor storage here
+            std::vector<AtenTensorHandle> output_handles(%s);
+
+            try {
+                inductor_entry_impl(input_handles.data(), output_handles.data());
+            } catch(std::exception const& e) {
+                PyErr_SetString(PyExc_RuntimeError, e.what());
+                return {};
+            } catch(...) {
+                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
+                return {};
+            }
+
+            return alloc_tensors_by_stealing_from_handles(output_handles.data(), output_handles.size());
+        }
+        """
+    )
+
+
+class PyCodeCache:
+    """In-process cache of Python modules executed from generated source files."""
+
+    # key -> executed module
+    cache: Dict[str, ModuleType] = dict()
+    # path -> unzipped linemap: [line_numbers, entries], or [] when the module
+    # was loaded without a linemap
+    linemaps: Dict[str, List[Tuple[Any, ...]]] = dict()
+    clear = staticmethod(cache.clear)
+
+    @classmethod
+    def write(cls, source_code: str, extra: str = "") -> Tuple[str, str]:
+        """Store ``source_code`` as a ``py`` file and return ``(key, path)``."""
+        return write(source_code, "py", extra=extra)
+
+    @classmethod
+    def load(
+        cls,
+        source_code: str,
+        extra: str = "",
+        linemap: Optional[List[Tuple[int, str]]] = None,
+        attrs: Optional[Dict[str, Any]] = None,
+    ) -> ModuleType:
+        """Store ``source_code`` and return the module loaded from it."""
+        key, path = write(source_code, "py", extra=extra)
+        return cls.load_by_key_path(key, path, linemap, attrs)
+
+    @classmethod
+    def load_by_key_path(
+        cls,
+        key: str,
+        path: str,
+        linemap: Optional[List[Tuple[int, str]]] = None,
+        attrs: Optional[Dict[str, Any]] = None,
+    ) -> ModuleType:
+        """Return the module cached under ``key``, executing ``path`` on a miss.
+
+        Args:
+            key: Cache key; also the last component of the module name.
+            path: Source file to compile and execute.
+            linemap: ``(starting_line, entry)`` pairs recorded for ``path``.
+            attrs: Attributes to set on the freshly executed module.
+
+        Raises:
+            RuntimeError: If reading or compiling the file fails.
+        """
+        if linemap is None:
+            linemap = []
+        if key not in cls.cache:
+            with open(path) as f:
+                try:
+                    code = compile(f.read(), path, "exec")
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Failed to import {path}\n{type(e).__name__}: {e}"
+                    ) from None
+                mod = ModuleType(f"{__name__}.{key}")
+                mod.__file__ = path
+                mod.key = key  # type: ignore[attr-defined]
+                exec(code, mod.__dict__, mod.__dict__)
+                sys.modules[mod.__name__] = mod
+                # another thread might set this first
+                cls.cache.setdefault(key, mod)
+                # unzip into separate lines/nodes lists
+                cls.linemaps[path] = list(zip(*linemap))
+
+                if attrs is not None:
+                    for k, v in attrs.items():
+                        setattr(mod, k, v)
+
+        return cls.cache[key]
+
+    @classmethod
+    @functools.lru_cache(None)
+    def stack_frames_for_code(
+        cls, path: str, lineno: int
+    ) -> Optional[List[Dict[str, Any]]]:
+        """Return the parsed stack frames recorded for ``lineno`` of ``path``.
+
+        Returns None when ``path`` has no linemap (never loaded, or loaded with
+        an empty one), when ``lineno`` precedes the first mapped line, or when
+        the matching entry is empty.  Results are memoized per
+        ``(path, lineno)``.
+        """
+        linemap = cls.linemaps.get(path)
+        # A module loaded without a linemap is stored as an empty list, which
+        # cannot be unpacked into (lines, nodes); treat it like a missing entry.
+        if not linemap:
+            return None
+        # [(starting_line, <fx node>), ...]
+        lines, nodes = linemap
+        p = bisect_right(lines, lineno)
+        if p == 0:
+            return None
+        entry = nodes[p - 1]
+        if not entry:
+            return None
+
+        def parse_stack_trace(stack_trace: str) -> List[Dict[str, Any]]:
+            # ideally fx stores stack traces as data rather than a string
+            # but this is not along a performance critical path
+            regex = r'File "(.+)", line (\d+), in (.+)\n'
+            matches = re.findall(regex, stack_trace)
+            return [
+                {"filename": filename, "line": int(line), "name": name}
+                for filename, line, name in reversed(matches)
+            ]
+
+        return parse_stack_trace(entry)
+
+
+class TritonCodeCache:
+    """Loads kernel source through PyCodeCache and hands back the named kernel."""
+
+    @classmethod
+    def load(cls, kernel_name: str, source_code: str) -> ModuleType:
+        """Load ``source_code`` as a module and return its ``kernel_name`` attribute."""
+        return getattr(PyCodeCache.load(source_code), kernel_name)
+
+
+def _cuda_compiler() -> Optional[str]:
+    """Return the CUDA compiler to invoke.
+
+    Candidates are checked with ``cuda_env.nvcc_exist`` in this order:
+    ``config.cuda.cuda_cxx``, ``$CUDACXX``, ``$CUDA_HOME`` (returned as
+    ``$CUDA_HOME/bin/nvcc``).  If none is accepted, plain ``"nvcc"`` is returned.
+    """
+    configured = config.cuda.cuda_cxx
+    if cuda_env.nvcc_exist(configured):
+        return configured
+
+    env_compiler = os.getenv("CUDACXX")
+    if cuda_env.nvcc_exist(env_compiler):
+        return env_compiler or ""
+
+    cuda_home = os.getenv("CUDA_HOME")
+    if cuda_env.nvcc_exist(cuda_home):
+        return os.path.join(cuda_home or "", "bin/nvcc")
+
+    return "nvcc"
+
+
+def _cutlass_include_paths() -> List[str]:
+    """Return the include directories under ``config.cuda.cutlass_dir``."""
+    cutlass_root = config.cuda.cutlass_dir
+    relative_dirs = (
+        "include",
+        "tools/library/include",
+        "tools/library/src",
+        "tools/util/include",
+    )
+    return [os.path.join(cutlass_root, rel) for rel in relative_dirs]
+
+
+def _cuda_lib_options() -> List[str]:
+    """Return the linker flags for the CUDA libraries.
+
+    Raises:
+        NotImplementedError: On anything other than Linux.
+    """
+    from torch.utils import cpp_extension
+
+    if not is_linux():
+        raise NotImplementedError(
+            "Unsupported env, failed to find cuda libs! Currently only Linux is supported."
+        )
+
+    lib_dir = "lib64"
+    lib64_missing = not os.path.exists(cpp_extension._join_cuda_home(lib_dir))
+    if lib64_missing and os.path.exists(cpp_extension._join_cuda_home("lib")):
+        # 64-bit CUDA may be installed in "lib"
+        # Note that it's also possible both don't exist (see _find_cuda_home) - in that case we stay with "lib64"
+        lib_dir = "lib"
+    return [
+        f"-L{cpp_extension._join_cuda_home(lib_dir)}",
+        f'-L{cpp_extension._join_cuda_home(lib_dir, "stubs")}',
+        "-lcuda",
+        "-lcudart",
+    ]
+
+
+def _nvcc_host_compiler_options() -> List[str]:
+    """Return a fresh list of the flags meant for nvcc's host compiler."""
+    host_flags = (
+        "-fPIC",
+        "-fno-strict-aliasing",
+        "-fvisibility=hidden",
+        "-Wconversion",
+    )
+    return list(host_flags)
+
+
+def _nvcc_compiler_options() -> List[str]:
+    """Return the nvcc flags derived from the detected arch and ``config.cuda``."""
+    arch = cuda_env.get_cuda_arch()
+    # Required by cutlass compilation.
+    if arch == "90":
+        arch = "90a"
+
+    code_targets = [f"sm_{arch}", f"compute_{arch}"]
+    if config.cuda.enable_cuda_lto:
+        code_targets.append(f"lto_{arch}")
+
+    options = [
+        "-t=0",
+        "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
+        "-w",
+        f"-gencode=arch=compute_{arch},code=[{','.join(code_targets)}]",
+        config.cuda.compile_opt_level,
+        "-std=c++17",
+        "--expt-relaxed-constexpr",
+        "-DNDEBUG",
+    ]
+    if config.cuda.enable_debug_info:
+        options += ["-lineinfo", "-g", "-DCUTLASS_DEBUG_TRACE_LEVEL=1"]
+    if config.cuda.enable_ptxas_info:
+        options += [
+            "--keep",  # Keep the intermediate files for debugging (including ptx, sass, cubin etc.)
+            "--ptxas-options=--warn-on-local-memory-usage",  # warn us if local memory is used in CUDA Kernels
+            "--ptxas-options=--warn-on-spills",  # warn us if register spilling happens in CUDA Kernels
+            "--resource-usage",  # Report on CUDA resource usage (shared mem, registers etc.)
+            "--source-in-ptx",  # Annotate the ptx file with source information
+        ]
+    if config.cuda.use_fast_math:
+        options += [
+            "--use_fast_math",
+            "-DCUTLASS_USE_TANH_FOR_SIGMOID=1",
+        ]
+    return options
+
+
+def cuda_compile_command(
+    src_files: List[str],
+    dst_file: str,
+    dst_file_ext: str,
+) -> str:
+    """Build the CUDA compiler command line as a single string.
+
+    Args:
+        src_files: Input files, joined with spaces at the end of the command.
+        dst_file: Output path passed after ``-o``.
+        dst_file_ext: ``"o"`` adds ``-c``; ``"so"`` adds ``-shared``.
+
+    Raises:
+        NotImplementedError: For any other ``dst_file_ext``.
+    """
+    include_paths = _cutlass_include_paths()
+    cuda_lib_options = _cuda_lib_options()
+    nvcc_host_compiler_options = _nvcc_host_compiler_options()
+    nvcc_compiler_options = _nvcc_compiler_options()
+
+    # Host-compiler flags are forwarded through -Xcompiler.
+    forwarded_host_flags = [
+        f"-Xcompiler {opt}" if "=" in opt else f"-Xcompiler={opt}"
+        for opt in nvcc_host_compiler_options
+    ]
+    include_flags = [f"-I{path}" for path in include_paths]
+    options = [
+        *nvcc_compiler_options,
+        *forwarded_host_flags,
+        *include_flags,
+        *cuda_lib_options,
+    ]
+    sources = " ".join(src_files)
+
+    if dst_file_ext == "o":
+        command = f"{_cuda_compiler()} {' '.join(options)} -c -o {dst_file} {sources}"
+    elif dst_file_ext == "so":
+        shared_options = options + ["-shared"]
+        command = f"{_cuda_compiler()} {' '.join(shared_options)} -o {dst_file} {sources}"
+    else:
+        raise NotImplementedError(f"Unsupported output file suffix {dst_file_ext}!")
+    log.debug("CUDA command: %s", command)
+    return command
+
+
+class DLLWrapper:
+    """A wrapper for a dynamic library.
+
+    Attribute access is forwarded to the loaded library and wrapped so that a
+    truthy return value from the library function raises ``RuntimeError``.
+    The wrapper can be used as a context manager; the library is closed on
+    exit, on ``close()`` and on garbage collection.
+    """
+
+    def __init__(
+        self,
+        lib_path: str,
+    ):
+        self.lib_path = lib_path
+        # Mark the wrapper as closed before loading.  If LoadLibrary raises,
+        # __del__ still runs close(), which must be able to read is_open
+        # without falling into __getattr__.
+        self.is_open = False
+        self.DLL = cdll.LoadLibrary(lib_path)
+        self.is_open = True
+
+    def close(self):
+        """Unload the library if it is still open; safe to call repeatedly."""
+        if self.is_open:
+            self._dlclose()
+            self.is_open = False
+
+    def _dlclose(self):
+        """Call ``dlclose`` on the library handle.
+
+        Raises:
+            NotImplementedError: On anything other than Linux.
+        """
+        f_dlclose = None
+
+        if is_linux():
+            syms = CDLL(None)
+            if not hasattr(syms, "dlclose"):
+                # Alpine Linux
+                syms = CDLL("libc.so")
+
+            if hasattr(syms, "dlclose"):
+                f_dlclose = syms.dlclose
+        else:
+            raise NotImplementedError("Unsupported env, failed to do dlclose!")
+
+        if f_dlclose is not None:
+            f_dlclose.argtypes = [c_void_p]
+            f_dlclose(self.DLL._handle)
+        else:
+            log.warning(
+                "dll unloading function was not found, library may not be unloaded properly!"
+            )
+
+    def __getattr__(self, name):
+        # __getattr__ only runs when normal lookup fails.  The wrapper's own
+        # attributes must never be resolved through the library: looking up
+        # is_open below while it is missing would recurse without end.
+        if name in ("lib_path", "DLL", "is_open"):
+            raise AttributeError(name)
+
+        if not self.is_open:
+            raise RuntimeError(f"Cannot use closed DLL library: {self.lib_path}")
+
+        method = getattr(self.DLL, name)
+
+        def _wrapped_func(*args):
+            # A truthy return value is treated as an error code.
+            err = method(*args)
+            if err:
+                raise RuntimeError(f"Error in function: {method.__name__}")
+
+        return _wrapped_func
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        self.close()
+
+    def __del__(self):
+        self.close()
+
+
+class CUDACodeCache:
+    """Compiles CUDA source files and remembers input/output paths per hash key."""
+
+    @dataclasses.dataclass
+    class CacheEntry:
+        input_path: str
+        output_path: str
+
+    cache: Dict[str, CacheEntry] = dict()
+    clear = staticmethod(cache.clear)
+    _SOURCE_CODE_SUFFIX = "cu"
+
+    @classmethod
+    def write(cls, source_code, dst_file_ext) -> Tuple[str, str]:
+        """
+        Stores source code in a file; the key also depends on the compile
+        command that would be used for dst_file_ext.
+        Returns the hash key of source code, and the path to the file.
+        """
+
+        placeholder_command = cuda_compile_command(
+            ["dummy_input"], "dummy_output", dst_file_ext
+        )
+        hash_key, source_path = write(
+            source_code, cls._SOURCE_CODE_SUFFIX, extra=repr(placeholder_command)
+        )
+        return hash_key, source_path
+
+    @classmethod
+    def compile(cls, source_code, dst_file_ext) -> Tuple[str, str, str]:
+        """
+        Compiles CUDA source_code into a file with dst_file_ext extension,
+        unless that output file already exists.
+        Returns a tuple of dst_file_path, hash_key, source_code_path
+        """
+
+        key, input_path = cls.write(source_code, dst_file_ext)
+        if key in cls.cache:
+            return (cls.cache[key].output_path, key, input_path)
+
+        from filelock import FileLock
+
+        lock_file = os.path.join(get_lock_dir(), key + ".lock")
+        with FileLock(lock_file, timeout=LOCK_TIMEOUT):
+            # Swap the source suffix for the requested output extension.
+            stem = input_path[: -len(cls._SOURCE_CODE_SUFFIX)]
+            output_path = stem + dst_file_ext
+            if not os.path.exists(output_path):
+                command_line = cuda_compile_command(
+                    [input_path], output_path, dst_file_ext
+                )
+                cmd = command_line.split(" ")
+                try:
+                    subprocess.check_output(
+                        cmd, stderr=subprocess.STDOUT, env=os.environ
+                    )
+                except subprocess.CalledProcessError as error:
+                    raise exc.CUDACompileError(cmd, error.output) from error
+            cls.cache[key] = CUDACodeCache.CacheEntry(input_path, output_path)
+
+        return (cls.cache[key].output_path, key, input_path)
+
+    @classmethod
+    def load(cls, source_code, dst_file_ext) -> Tuple[DLLWrapper, str, str]:
+        """
+        Compiles source code and loads the generated .so file.
+        Raises RuntimeError for any dst_file_ext other than "so".
+        Returns a tuple of DLLWrapper, hash_key, source_code_path
+        """
+
+        if dst_file_ext != "so":
+            raise RuntimeError(
+                f"Only support loading a .so file for now. "
+                f"Requested file extension: {dst_file_ext}. Source code: {source_code}"
+            )
+        compiled_path, hash_key, source_code_path = cls.compile(
+            source_code, dst_file_ext
+        )
+        wrapper = DLLWrapper(compiled_path)
+        return (wrapper, hash_key, source_code_path)
+
+
+def caching_device_properties():
+    """Call ``Worker.get_device_properties()`` on every registered, available device interface."""
+    for _name, interface in get_registered_device_interfaces():
+        if not interface.is_available():
+            continue
+        interface.Worker.get_device_properties()
+
+
+def _set_triton_ptxas_path() -> None:
+    """Point ``TRITON_PTXAS_PATH`` at ``../bin/ptxas`` (relative to this file) if usable.
+
+    Nothing happens when the variable is already set or the file does not
+    exist.  A path that exists but is not an executable file only triggers a
+    warning.
+    """
+    if os.environ.get("TRITON_PTXAS_PATH") is not None:
+        return
+
+    here = os.path.dirname(__file__)
+    candidate = os.path.abspath(os.path.join(here, "..", "bin", "ptxas"))
+    if not os.path.exists(candidate):
+        return
+
+    usable = os.path.isfile(candidate) and os.access(candidate, os.X_OK)
+    if not usable:
+        warnings.warn(f"{candidate} exists but is not an executable")
+        return
+    os.environ["TRITON_PTXAS_PATH"] = candidate
+
+
+def _worker_compile(
+    kernel_name: str, source_code: str, cc: int, device: torch.device
+) -> None:
+    """Select ``device`` on its interface's Worker, then precompile the kernel with ``warm_cache_only_with_cc=cc``."""
+    interface = get_interface_for_device(device.type)
+    interface.Worker.set_device(device.index)
+    loaded_kernel = TritonCodeCache.load(kernel_name, source_code)
+    loaded_kernel.precompile(warm_cache_only_with_cc=cc)
+
+
+def _load_kernel(kernel_name: str, source_code: str) -> ModuleType:
+    """Load the named kernel from ``source_code``, precompile it and return it."""
+    # Must run before loading so the ptxas location is in the environment.
+    _set_triton_ptxas_path()
+    loaded_kernel = TritonCodeCache.load(kernel_name, source_code)
+    loaded_kernel.precompile()
+    return loaded_kernel
+
+
+class TritonFuture:
+    """Pairs a kernel's name and source with the future of its background compile."""
+
+    kernel: ModuleType
+
+    def __init__(
+        self,
+        kernel_name: str,
+        source_code: str,
+        future: Future[Any],
+    ) -> None:
+        self.kernel_name = kernel_name
+        self.source_code = source_code
+        self.future = future
+
+    # @dynamo_utils.dynamo_timed
+    def result(self) -> ModuleType:
+        """Wait for the future, load the kernel in this process and return it.
+
+        The loaded kernel is kept on the instance, so later calls return it
+        directly; the name, source and future are dropped after the first load.
+        """
+        started = time()
+        if hasattr(self, "kernel"):
+            return self.kernel
+        # If the worker failed this will throw an exception.
+        self.future.result()
+        loaded = _load_kernel(self.kernel_name, self.source_code)
+        self.kernel = loaded
+        latency = time() - started
+        if latency > 50:
+            developer_warning(
+                f"Detected long compilation time of {latency} seconds for kernel name {self.kernel_name}"
+            )
+            developer_warning(self.source_code)
+        del self.kernel_name, self.source_code, self.future
+        return loaded
+
+
+# If this process dies abnormally (e.g. segfault)
+# it will not shut down the workers. Instead
+# the workers will have their parent reassigned to the
+# init process. This launches a separate thread to
+# watch for the worker getting reassigned,
+# and cleans it up in this case.
+#
+# This function cannot be an inner function since otherwise mp_context="spawn" would
+# not work for ProcessPoolExecutor since inner functions cannot be pickled.
+def _async_compile_initializer(orig_ppid) -> None:
+    """Initializer run in each pool worker.
+
+    Starts a daemon thread that kills this process with SIGKILL once its
+    parent pid differs from ``orig_ppid``, and makes the process ignore
+    SIGINT.
+    """
+
+    def run() -> None:
+        # Poll once per second for a change of parent pid.
+        while True:
+            sleep(1)
+            if os.getppid() == orig_ppid:
+                continue
+            os.kill(os.getpid(), signal.SIGKILL)
+
+    global _watchdog_thread
+    watchdog = Thread(target=run, daemon=True)
+    _watchdog_thread = watchdog
+    watchdog.start()
+    # Ignore Ctrl-C (i.e. SIGINT) sent to pool workers to avoid meaningless log spam.
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+
+# Watchdog thread created by _async_compile_initializer; remains None in a
+# process where that initializer has not run.
+_watchdog_thread: Optional[Thread] = None
+
+# Every process pool created by AsyncCompile.process_pool, kept so that
+# shutdown_compile_workers can shut them all down.
+_pool_set: Set[ProcessPoolExecutor] = set()
+
+
+def shutdown_compile_workers() -> None:
+    """Shut down all outstanding compile-worker pools.
+
+    Every pool recorded in ``_pool_set`` is shut down and the set is
+    emptied. The memoized ``AsyncCompile.process_pool`` is reset too, so the
+    next call builds a fresh executor.
+    """
+    for pool in _pool_set:
+        pool.shutdown()
+    _pool_set.clear()
+    # process_pool() is wrapped in lru_cache(1); without clearing it, the
+    # next call would hand back the executor that was just shut down and any
+    # submit() on it would raise RuntimeError.
+    AsyncCompile.process_pool.cache_clear()
+
+
+class AsyncCompile:
+    """Entry points for compiling kernels, optionally in parallel.
+
+    When ``config.compile_threads`` is greater than 1, Triton kernels are
+    compiled in a process pool and the other tasks run on a thread pool;
+    otherwise every task runs synchronously in the caller.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    @staticmethod
+    @functools.lru_cache(1)
+    def pool() -> ThreadPoolExecutor:
+        """Return the memoized thread pool of ``config.compile_threads`` threads."""
+        assert config.compile_threads > 1
+        return ThreadPoolExecutor(config.compile_threads)
+
+    @staticmethod
+    @functools.lru_cache(1)
+    def process_pool() -> ProcessPoolExecutor:
+        """Return the memoized process pool used for Triton compilation.
+
+        The pool is recorded in ``_pool_set`` and its ``shutdown`` is
+        registered as a multiprocessing finalizer.
+        """
+        # ensure properties have been calculated before processes
+        # are forked
+        caching_device_properties()
+        assert config.compile_threads > 1
+        # Pid of this (parent) process; workers compare it with their own
+        # getppid() in _async_compile_initializer.
+        orig_ppid = os.getpid()
+
+        ctx = multiprocessing.get_context(config.worker_start_method)
+        pool = ProcessPoolExecutor(
+            config.compile_threads,
+            mp_context=ctx,
+            initializer=partial(_async_compile_initializer, orig_ppid),
+        )
+
+        global _pool_set
+        _pool_set.add(pool)
+
+        # when this pool is created in a subprocess object, the normal exit handler
+        # doesn't run, and we need to register our own handler.
+        # exitpriority has to be high, because another one of the finalizers will
+        # kill the worker thread that sends the shutdown message to the workers...
+        multiprocessing.util.Finalize(None, pool.shutdown, exitpriority=sys.maxsize)
+        return pool
+
+    @classmethod
+    def warm_pool(cls) -> None:
+        """Create the process pool and force its workers to start now.
+
+        Does nothing when ``config.compile_threads`` is 1 or less.
+        """
+        if config.compile_threads <= 1:
+            return
+        _compile_start()
+        pool = cls.process_pool()
+
+        # We have to fork processes for compiler workers, but the more memory and other resources that are loaded, the
+        # slower the os.fork time is, quite drastically. It also holds the GIL so we can't put it on another thread.
+
+        # Examples:
+        # A simple x + x + x script: 10ms in the middle of the program, 2ms at startup
+        # tf_efficientnet_b0 benchmark: 50ms! in the middle of the program, 3ms at startup
+
+        # So we want to start the workers early when it is still cheap, and also to allow the workers to get
+        # ready before we have work for them.
+
+        # ProcessPoolExecutor also does not launch the workers until it finds a point when all the workers are idle.
+        # But if we waited until then fork time will be long and we will be waiting for the processes to initialize.
+
+        # We force them to start here with some YOLOing of the internal methods.
+        # The private method names differ between executor implementations,
+        # hence the hasattr probing.
+        if hasattr(pool, "_start_queue_management_thread"):
+            pool._start_queue_management_thread()
+        else:
+            for _ in range(config.compile_threads):
+                pool._adjust_process_count()
+            if hasattr(pool, "_start_executor_manager_thread"):
+                pool._start_executor_manager_thread()
+        _compile_end()
+
+    @classmethod
+    def submit(cls, task: Callable[..., Any]) -> Any:
+        """Run ``task``: inline when single-threaded, else on the thread pool.
+
+        Returns the task's result in the inline case and a ``Future`` otherwise.
+        """
+        if config.compile_threads <= 1:
+            return task()
+        return cls.pool().submit(task)
+
+    @classmethod
+    def map(cls, fn: Callable[..., Any], seq: List[Any]) -> List[Any]:
+        """Apply ``fn`` to each item of ``seq`` and return the results in order.
+
+        The thread pool is used only when ``config.compile_threads`` is
+        greater than 1 and ``seq`` has more than one item.
+        """
+        if config.compile_threads <= 1 or len(seq) <= 1:
+            return list(map(fn, seq))
+        # Submit everything first, then collect, so the tasks can overlap.
+        return [t.result() for t in [cls.pool().submit(fn, x) for x in seq]]
+
+    def triton(
+        self, kernel_name: str, source_code: str, device_str: str = "cuda"
+    ) -> Union[TritonFuture, ModuleType]:
+        """Compile a Triton kernel.
+
+        With more than one compile thread the compile is submitted to the
+        process pool for the current device and a ``TritonFuture`` is
+        returned; otherwise the kernel is loaded synchronously and returned.
+        """
+        _compile_start()
+
+        if config.compile_threads > 1:
+            device_interface = get_interface_for_device(device_str)
+            device = torch.device(device_str, device_interface.current_device())
+            cc = device_interface.get_compute_capability(device)
+            future = self.process_pool().submit(
+                _worker_compile, kernel_name, source_code, cc, device
+            )
+            return TritonFuture(kernel_name, source_code, future)
+        else:
+            return _load_kernel(kernel_name, source_code)
+
+    def multi_kernel(self, *args, **kwargs) -> ModuleType:
+        """
+        Async compile the python shim for multi-kernel.
+        """
+
+        def task():
+            # Imported inside the task rather than at module import time.
+            from torch._inductor.codegen.multi_kernel import MultiKernelCall
+
+            return MultiKernelCall(*args, **kwargs)
+
+        return self.submit(task)
+
+    def cpp(self, source_code: str) -> ModuleType:
+        """Submit a task that loads ``source_code`` via ``CppCodeCache`` and returns its ``kernel``."""
+
+        def task():
+            return CppCodeCache.load(source_code).kernel
+
+        return self.submit(task)
+
+    def cpp_pybinding(self, argtypes: List[str], source_code: str) -> ModuleType:
+        """Submit ``CppPythonBindingsCodeCache.load_pybinding(argtypes, source_code)``."""
+        return self.submit(
+            functools.partial(
+                CppPythonBindingsCodeCache.load_pybinding, argtypes, source_code
+            )
+        )
+
+    def cuda(self, source_code, dst_file_ext):
+        """Submit a task returning the first element of ``CUDACodeCache.load(source_code, dst_file_ext)``."""
+
+        def task():
+            return CUDACodeCache.load(source_code, dst_file_ext)[0]
+
+        return self.submit(task)
+
+    def wait(self, scope: Dict[str, Any]) -> None:
+        """Replace every ``Future``/``TritonFuture`` in ``scope`` with its result.
+
+        ``scope`` is modified in place. Futures are only resolved when
+        ``config.compile_threads`` is greater than 1. A progress bar tracks
+        the resolved futures, and ``_compile_end()`` is called at the end.
+        """
+        num_kernels = len(
+            [
+                value
+                for key, value in scope.items()
+                if isinstance(value, (Future, TritonFuture))
+            ]
+        )
+        pbar = tqdm(
+            total=num_kernels,
+            desc="Inductor Compilation",
+            disable=config.disable_progress,
+            delay=0,
+        )
+        if config.compile_threads > 1:
+            for key, result in scope.items():
+                if config.verbose_progress and not isinstance(pbar, _Faketqdm):
+                    pbar.set_postfix_str(key)
+                if isinstance(result, (Future, TritonFuture)):
+                    # Existing keys are reassigned, never added or removed,
+                    # while iterating over the dict.
+                    scope[key] = result.result()
+                    pbar.update(1)
+
+        _compile_end()
+
+
+# Start the compile workers at import time unless TorchTNT is in use.
+if os.environ.get("TORCH_TNT_IN_USE", "0") == "1":
+    # When TorchTNT is used, compile workers created by calling warm_pool()
+    # here cannot be shut down by shutdown_compile_workers(). This may cause
+    # a significant QPS drop, so the eager warm-up is skipped.
+    log.info("Do not call AsyncCompile.warm_pool() because TorchTNT is in use.")
+else:
+    AsyncCompile.warm_pool()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py
new file mode 100644
index 0000000000000000000000000000000000000000..acc544bd168b6ff6333e1b64d9d69fd4834921d6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py
@@ -0,0 +1,1451 @@
+import contextlib
+import functools
+import logging
+import os
+import sys
+import time
+import warnings
+from itertools import count
+
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    FrozenSet,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)
+from unittest import mock
+
+from functorch.compile import min_cut_rematerialization_partition
+
+import torch.fx
+import torch.utils._pytree as pytree
+from torch._dynamo import (
+    compiled_autograd,
+    config as dynamo_config,
+    logging as dynamo_logging,
+    utils as dynamo_utils,
+)
+from torch._dynamo.utils import (
+    counters,
+    detect_fake_mode,
+    lazy_format_graph_code,
+    optimus_scuba_log,
+)
+from torch._functorch.aot_autograd import aot_export_module, make_boxed_func
+from torch._inductor.codecache import code_hash, CompiledFxGraph, FxGraphCache
+from torch._inductor.cudagraph_utils import BoxedDeviceIndex
+
+from torch._inductor.debug import save_args_for_compile_fx_inner
+from torch._inductor.utils import BoxedBool, count_tangents
+from torch._logging import trace_structured
+from torch._ops import OpOverload
+from torch._subclasses.fake_tensor import FakeTensor
+from torch._utils_internal import signpost_event
+from torch.fx.passes.fake_tensor_prop import FakeTensorProp
+
+from .._dynamo.backends.common import aot_autograd
+from ..fx._lazy_graph_module import _use_lazy_graph_module  # type: ignore[attr-defined]
+from ..fx.graph import _PyTreeCodeGen
+from . import config, metrics
+from .debug import DebugContext
+from .decomposition import select_decomp_table
+from .fx_passes.joint_graph import joint_graph_passes
+from .fx_passes.post_grad import post_grad_passes, view_to_reshape
+from .fx_passes.pre_grad import pre_grad_passes
+from .graph import GraphLowering
+from .ir import ExternKernelNode
+from .utils import get_dtype_size, has_incompatible_cudagraph_ops, output_node
+from .virtualized import V
+
+# In fbcode the real time_and_log decorator factory is imported; everywhere
+# else a stand-in with the same call signature is defined.
+if config.is_fbcode():
+    from torch._inductor.fb.utils import time_and_log
+else:
+    # no-op decorator
+    def time_and_log(attr: str, extra_loggings: Optional[Dict[str, str]] = None):
+        """Ignore both arguments and return ``dynamo_utils.identity`` as the decorator."""
+        return dynamo_utils.identity
+
+
+# Logger for this module.
+log = logging.getLogger(__name__)
+# Artifact logger for performance hints, e.g. why cudagraphs were skipped.
+perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints")
+# Artifact logger that receives the graph after the post-grad passes.
+post_grad_graphs_log = torch._logging.getArtifactLogger(__name__, "post_grad_graphs")
+# NOTE(review): presumably a byte alignment for tensor storage - confirm at
+# the use sites.
+ALIGNMENT = 16
+
+
+# copy_ fails when trying to write to tensors with memory overlap,
+# for expanded dimensions (a dimension which used to have size 1 -> ?)
+# we can select one element from that dimension and write to it
+# to achieve writing to all values of that dimension of the input tensor
+def get_expanded_dims(t):
+    """Return the dims of ``t`` that have stride 0 and a size other than 1.
+
+    Returns ``None`` when ``t`` is not a ``torch.Tensor``.
+    """
+    if isinstance(t, torch.Tensor):
+        return [
+            dim
+            for dim in range(t.ndim)
+            if t.stride(dim) == 0 and t.size(dim) != 1
+        ]
+    return None
+
+
+def index_expanded_dims(t: torch.Tensor, expanded_dims: List[int]) -> torch.Tensor:
+    """Slice ``t`` down to its first element along each dim in ``expanded_dims``.
+
+    The slices are applied one after another in the given order; with an
+    empty list ``t`` itself is returned.
+    """
+    return functools.reduce(
+        lambda sliced, dim: torch.ops.aten.slice(sliced, dim, 0, 1),
+        expanded_dims,
+        t,
+    )
+
+
+def complex_memory_overlap(t: torch.Tensor) -> bool:
+    """Report whether ``t`` still overlaps itself once expanded dims are collapsed.
+
+    Expanded dims are first sliced down to a single element. If
+    ``torch._debug_has_internal_overlap`` then returns non-zero, the dims are
+    visited in order of increasing stride and the result is ``True`` as soon
+    as a stride is smaller than the previous dim's ``stride * size``.
+    """
+    t = index_expanded_dims(t, get_expanded_dims(t))
+    if torch._debug_has_internal_overlap(t) == 0:
+        return False
+
+    strides = t.stride()
+    sizes = t.shape
+    # Dims ordered by (stride, dim index).
+    order = [dim for _, dim in sorted(zip(strides, range(len(strides))))]
+
+    prev_stride = 1
+    prev_size = 1
+    for dim in order:
+        if strides[dim] < prev_stride * prev_size:
+            return True
+        prev_stride = strides[dim]
+        prev_size = sizes[dim]
+    return False
+
+
+@functools.lru_cache(None)
+def _step_logger():
+    """Return the dynamo step logger bound to this module's ``log`` (memoized)."""
+    step_logger = dynamo_logging.get_step_logger(log)
+    return step_logger
+
+
+@functools.lru_cache(None)
+def _warn_tf32_disabled():
+    """Warn that TF32 matmul is available but not enabled.
+
+    The warning is issued only when CUDA is available, TF32 matmul is not
+    allowed, and the device capability is at least (8, 0). Because of the
+    ``lru_cache`` wrapper, the checks run only on the first call.
+    """
+    if not torch.cuda.is_available():
+        return
+    if torch.backends.cuda.matmul.allow_tf32:
+        return
+    if not torch.cuda.get_device_capability() >= (8, 0):
+        return
+    warnings.warn(
+        "TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. "
+        "Consider setting `torch.set_float32_matmul_precision('high')` for better performance."
+    )
+
+
+def _unlift_graph(mod, gm, graph_signature):
+    """Attach ``mod``'s parameters and buffers to ``gm`` and unlift its graph.
+
+    Args:
+        mod: Module whose named parameters and buffers provide the state.
+        gm: Graph module whose placeholders include lifted parameters/buffers.
+        graph_signature: Signature providing ``inputs_to_parameters``,
+            ``inputs_to_buffers``, ``user_inputs`` and ``buffers_to_mutate``.
+
+    Returns:
+        The graph module returned by ``torch.export._unlift._unlift``.
+    """
+    from torch.export._unlift import _unlift
+    from torch.export.unflatten import _assign_attr, _AttrKind
+
+    state_dict = {}
+    for name, param in mod.named_parameters(remove_duplicate=False):
+        state_dict[name] = param
+        _assign_attr(
+            param,
+            gm,
+            name,
+            attr_kind=_AttrKind.PARAMETER,
+        )
+    for name, buffer in mod.named_buffers(remove_duplicate=False):
+        state_dict[name] = buffer
+        _assign_attr(
+            buffer,
+            gm,
+            name,
+            attr_kind=_AttrKind.BUFFER,
+        )
+
+    # For each placeholder: the parameter/buffer it was lifted from, or None
+    # for a genuine user input.
+    placeholder_nodes = [node for node in gm.graph.nodes if node.op == "placeholder"]
+    lifted_inputs = []
+    for node in placeholder_nodes:
+        node_name = node.name
+        if node_name in graph_signature.inputs_to_parameters:
+            lifted_inputs.append(graph_signature.inputs_to_parameters[node_name])
+        elif node_name in graph_signature.inputs_to_buffers:
+            lifted_inputs.append(graph_signature.inputs_to_buffers[node_name])
+        else:
+            assert node_name in graph_signature.user_inputs
+            lifted_inputs.append(None)
+
+    outputs = list(gm.graph.nodes)[-1].args[0]
+    buffer_mutations = graph_signature.buffers_to_mutate
+    # Mutation outputs are assumed to come before the user outputs.
+    # NOTE(review): confirm against the aot_export_module output convention.
+    # Without this bound, a user output that is the same node as a mutated
+    # buffer value would also be treated as a mutation.
+    num_mutation_outputs = (
+        len(buffer_mutations)
+        + len(getattr(graph_signature, "user_inputs_to_mutate", ()))
+        + len(getattr(graph_signature, "output_tokens", ()))
+    )
+    mutated_outputs = []
+    for idx, out in enumerate(outputs):
+        # buffers_to_mutate is indexed by output-node name; testing the node
+        # object itself never matches and silently dropped every mutation.
+        out_name = getattr(out, "name", None)
+        if idx < num_mutation_outputs and out_name in buffer_mutations:
+            mutated_outputs.append(buffer_mutations[out_name])
+        else:
+            mutated_outputs.append(None)
+
+    unlifted_gm = _unlift(
+        gm,
+        lifted_inputs,
+        mutated_outputs,
+        pytree.LeafSpec(),
+        None,
+        state_dict,
+        {},
+    )
+    return unlifted_gm
+
+
+def _get_subgraph_names(gm):
+    """Yield, for each ``cond`` node in ``gm``, the names of its true and false subgraphs.
+
+    The names come from ``args[1]`` and ``args[2]`` of the node, in that order.
+    """
+    cond_nodes = (
+        node
+        for node in gm.graph.nodes
+        if node.target == torch.ops.higher_order.cond
+    )
+    for node in cond_nodes:
+        true_name, false_name = node.args[1].name, node.args[2].name
+        yield true_name
+        yield false_name
+
+
+def _recursive_pre_grad_passes(gm, example_inputs):
+    """Apply ``pre_grad_passes`` to each ``cond`` subgraph of ``gm``, then to ``gm``.
+
+    Each subgraph attribute is replaced by its processed version. Subgraphs
+    are processed with ``example_inputs=None`` because no per-subgraph
+    example inputs are available.
+    """
+    for name in _get_subgraph_names(gm):
+        processed = _recursive_pre_grad_passes(getattr(gm, name), example_inputs=None)
+        setattr(gm, name, processed)
+    return pre_grad_passes(gm, example_inputs)
+
+
+def _recursive_joint_graph_passes(gm):
+    """Run ``joint_graph_passes`` on each ``cond`` subgraph of ``gm``, then on ``gm``."""
+    for name in _get_subgraph_names(gm):
+        _recursive_joint_graph_passes(getattr(gm, name))
+    joint_graph_passes(gm)
+
+
+def _recursive_post_grad_passes(gm, is_inference: bool = False):
+    """Run ``post_grad_passes`` on each ``cond`` subgraph of ``gm``, then on ``gm``.
+
+    ``is_inference`` is passed unchanged to every call.
+    """
+    for name in _get_subgraph_names(gm):
+        _recursive_post_grad_passes(getattr(gm, name), is_inference)
+    post_grad_passes(gm, is_inference)
+
+
+def split_const_gm(
+    gm: torch.fx.GraphModule,
+) -> Tuple[torch.fx.GraphModule, Dict[str, int]]:
+    """
+    This function takes a GraphModule input "gm".
+    The gm will be split into 2 components,
+      1) const_gm, which consists of the subgraph of gm that can be constant folded.
+      2) gm (modified in place), which holds the graph after constant folding.
+
+    const_output_index maps the name of each folded constant added to gm
+    ("_FOLDED_CONST_" + original node name) to the output index of const_gm.
+    Returns (const_gm, const_output_index)
+    """
+    from torch._inductor.constant_folding import (
+        CONST_MODULE_TAG,
+        META_TAG,
+        MODULE_TAG,
+        replace_node_with_constant,
+        run_and_get_constant_graph,
+    )
+
+    const_gm = run_and_get_constant_graph(gm)
+    # Evaluate the constant subgraph once; the results become the folded constants.
+    const_result = const_gm()
+
+    # Output node name of const_gm -> position in const_gm's output tuple.
+    const_outputs = {
+        x.name: idx for idx, x in enumerate(tuple(const_gm.graph.nodes)[-1].args[0])
+    }
+
+    to_erase_node = []
+    to_replace_node = []
+    const_output_index = {}
+    # Classify first and mutate afterwards, so the graph is not modified
+    # while it is being iterated.
+    for node in gm.graph.nodes:
+        if node.name in const_outputs:
+            to_replace_node.append(node)
+        elif node.meta[META_TAG] == CONST_MODULE_TAG:
+            to_erase_node.append(node)
+
+    for node in to_replace_node:
+        new_const_name = "_FOLDED_CONST_" + node.name
+        replace_node_with_constant(
+            gm,
+            node,
+            const_result[const_outputs[node.name]],
+            new_const_name,
+        )
+        const_output_index[new_const_name] = const_outputs[node.name]
+    # Walk the collected nodes in reverse graph order. A node that still has
+    # users is kept; its users are only checked to carry MODULE_TAG.
+    for node in to_erase_node[::-1]:
+        if node.users:
+            for n in node.users:
+                assert n.meta[META_TAG] == MODULE_TAG, f"node: {node} user not empty."
+        else:
+            gm.graph.erase_node(node)
+    gm.recompile()
+
+    return const_gm, const_output_index
+
+
+def is_tf32_warning_applicable(gm: torch.fx.GraphModule):
+    """Return True if ``gm`` contains a float32 CUDA matmul-family call.
+
+    A node qualifies when it is a ``call_function`` targeting aten ``mm``,
+    ``addmm``, ``bmm`` or ``baddbmm`` (default overloads) and its
+    ``meta["val"]`` is a float32 tensor on a ``cuda`` device.
+    """
+    aten = torch.ops.aten
+    tf32_ops = {
+        aten.mm.default,
+        aten.addmm.default,
+        aten.bmm.default,
+        aten.baddbmm.default,
+    }
+
+    def _qualifies(node) -> bool:
+        if node.op != "call_function" or node.target not in tf32_ops:
+            return False
+        val = node.meta.get("val", None)
+        return (
+            isinstance(val, torch.Tensor)
+            and val.dtype == torch.float32
+            and val.device.type == "cuda"
+        )
+
+    return any(_qualifies(node) for node in gm.graph.nodes)
+
+
+@DebugContext.wrap
+def count_bytes_inner(
+    gm: torch.fx.GraphModule,
+    example_inputs: List[torch.Tensor],
+    num_fixed: int = 0,
+    **kwargs,
+):
+    """Lower ``gm`` only to accumulate byte/element/runtime metrics.
+
+    The post-grad passes are run, the graph is lowered and run on
+    ``example_inputs``, and the figures from ``count_bytes()`` are added to
+    the counters in ``metrics``. Extra keyword arguments are accepted and
+    ignored.
+
+    Returns:
+        ``gm.forward`` wrapped by ``make_boxed_func``.
+    """
+    shape_env = _shape_env_from_inputs(example_inputs)
+    fake_mode = fake_tensor_prop(gm, example_inputs)
+
+    with V.set_fake_mode(fake_mode):
+        _recursive_post_grad_passes(gm, False)
+
+    lowering = GraphLowering(gm, shape_env=shape_env, num_static_inputs=num_fixed)
+    with V.set_graph_handler(lowering):
+        with V.set_real_inputs(example_inputs):
+            lowering.run(*example_inputs)
+            byte_count, elem_counts, runtimes = lowering.count_bytes()
+            metrics.num_bytes_accessed += byte_count
+            metrics.nodes_num_elem += elem_counts
+            metrics.node_runtimes += runtimes
+    return make_boxed_func(gm.forward)
+
+
+def fake_tensor_prop(
+    gm: torch.fx.GraphModule,
+    example_inputs: List[torch.Tensor],
+    force_allow_non_fake_inputs: bool = False,
+):
+    """
+    Propagate fake tensors through ``gm`` and return the fake mode used.
+
+    If a fake mode is detected from ``example_inputs``, it is reused and the
+    inputs are propagated without conversion; with
+    ``force_allow_non_fake_inputs`` its ``allow_non_fake_inputs`` attribute is
+    patched to True for the duration of the propagation. Otherwise a new
+    fake mode that allows non-fake inputs is created.
+    """
+    fake_mode = detect_fake_mode(example_inputs)
+    if fake_mode:
+        if force_allow_non_fake_inputs:
+            ctx = mock.patch.object(fake_mode, "allow_non_fake_inputs", True)
+        else:
+            ctx = contextlib.nullcontext()
+        with ctx:  # type: ignore[attr-defined]
+            propagator = FakeTensorProp(gm, mode=fake_mode)
+            propagator.propagate_dont_convert_inputs(*example_inputs)
+        return fake_mode
+
+    fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
+    FakeTensorProp(gm, mode=fake_mode).propagate(*example_inputs)
+    return fake_mode
+
+
+# pass config dict back to user
+def get_patched_config_dict(config_patches=None) -> Dict[str, Any]:
+    """Return a copy of the inductor config taken while ``config_patches`` is applied."""
+    with config.patch(config_patches):
+        patched_copy = config.get_config_copy()
+        return patched_copy
+
+
+@DebugContext.wrap
+@torch.utils._python_dispatch._disable_current_modes()
+@time_and_log(
+    attr="compilation time (in seconds)",
+    extra_loggings={"config_dict": str(get_patched_config_dict())},
+)
+# Need this decorator for compile_fx_inner even if we already have one for
+# compile_fx. The reason is the compilation for backward graph may happen after
+# compile_fx return and we may want to use the _LazyGraphModule for compiling
+# the backward graph as well.
+@_use_lazy_graph_module(dynamo_config.use_lazy_graph_module)
+@dynamo_utils.dynamo_timed(phase_name="inductor_compile")
+def compile_fx_inner(
+    gm: torch.fx.GraphModule,
+    example_inputs: List[torch.Tensor],
+    cudagraphs: Optional[BoxedBool] = None,
+    num_fixed: int = 0,
+    is_backward: bool = False,
+    graph_id: Optional[int] = None,
+    cpp_wrapper: bool = False,
+    aot_mode: bool = False,
+    is_inference: bool = False,
+    boxed_forward_device_index: Optional[BoxedDeviceIndex] = None,
+    user_visible_outputs: FrozenSet[str] = frozenset(),
+    layout_opt: Optional[bool] = None,
+    extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None,
+) -> Union[CompiledFxGraph, str]:
+    """
+    Inductor API that compiles a single graph.
+
+    If you change the argument list for this function, make sure you
+    also update the call to save_args_for_compile_fx_inner below accordingly.
+
+    Outline of what the body does:
+      1. When dynamo_utils.count_calls reports zero calls and aot_mode is
+         off, nothing is compiled and a boxed gm.forward is returned.
+      2. The graph is compiled by fx_codegen_and_compile, going through
+         FxGraphCache.load when config.fx_graph_cache is set and aot_mode
+         is off.
+      3. In aot_mode the compiled result is returned immediately.
+      4. Otherwise the callable is wrapped by cudagraphify when cudagraphs
+         are requested and every cudagraph check passes; when cudagraphs
+         end up disabled it is passed through align_inputs instead.
+
+    In the non-AOT path the returned compiled graph has _boxed_call set to
+    True.
+    """
+    if dynamo_utils.count_calls(gm.graph) == 0 and not aot_mode:
+        # trigger the real recompilation for _LazyGraphModule before returning
+        # the forward method.
+        from torch.fx._lazy_graph_module import _LazyGraphModule
+
+        _LazyGraphModule.force_recompile(gm)
+        return make_boxed_func(gm.forward)
+
+    # The last node's first argument holds the graph outputs.
+    assert isinstance(
+        next(iter(reversed(gm.graph.nodes))).args[0], (tuple, list)
+    ), f"inductor can only compile FX graphs which return a tuple/list, but got {gm.graph}"
+
+    if config.save_args:
+        save_args_for_compile_fx_inner(
+            gm,
+            example_inputs,
+            cudagraphs=cudagraphs,
+            num_fixed=num_fixed,
+            is_backward=is_backward,
+            graph_id=graph_id,
+            cpp_wrapper=cpp_wrapper,
+            aot_mode=aot_mode,
+            is_inference=is_inference,
+            boxed_forward_device_index=boxed_forward_device_index,
+            user_visible_outputs=user_visible_outputs,
+            layout_opt=layout_opt,
+        )
+
+    if cudagraphs is None:
+        cudagraphs = BoxedBool(config.triton.cudagraphs)
+
+    # Inputs to fx_codegen_and_compile
+    # Anything that affects codegen should go here, so if the signature
+    # of fx_codegen_and_compile changes, the dict should be updated accordingly
+    graph_kwargs = {
+        "cudagraphs": cudagraphs,
+        "num_fixed": num_fixed,
+        "is_backward": is_backward,
+        "graph_id": graph_id,
+        "cpp_wrapper": cpp_wrapper,
+        "aot_mode": aot_mode,
+        "is_inference": is_inference,
+        "user_visible_outputs": user_visible_outputs,
+        "layout_opt": layout_opt,
+        "extern_node_serializer": extern_node_serializer,
+    }
+
+    start = time.time()
+
+    if config.fx_graph_cache and not aot_mode:
+        compiled_graph = FxGraphCache.load(
+            fx_codegen_and_compile, gm, example_inputs, graph_kwargs
+        )
+    else:
+        compiled_graph = fx_codegen_and_compile(
+            gm, example_inputs, **graph_kwargs  # type: ignore[arg-type]
+        )
+
+    log.debug("FX codegen and compilation took %.3fs", time.time() - start)
+
+    # check cudagraph disabling reasons from inductor lowering
+    if cudagraphs and compiled_graph.disabled_cudagraphs_reason:
+        perf_hint_log.warning(
+            "skipping cudagraphs due to %s", compiled_graph.disabled_cudagraphs_reason
+        )
+        BoxedBool.disable(cudagraphs)
+
+    # Return the output strides to the caller via TracingContext
+    context = torch._guards.TracingContext.try_get()
+    if context is not None and context.output_strides is not None:
+        assert len(context.output_strides) == 0
+        context.output_strides.extend(compiled_graph.output_strides)
+
+    if aot_mode:
+        return compiled_graph
+
+    if cudagraphs:
+        # output args are tuple of first argument
+        output = output_node(gm)
+        assert len(output.args) == 1
+        stack_traces = [
+            (arg.stack_trace if isinstance(arg, torch.fx.node.Node) else None)
+            for arg in output.args[0]
+        ]
+
+        complex_memory_overlap_inputs = any(
+            complex_memory_overlap(t)
+            for t in example_inputs
+            if isinstance(t, torch.Tensor)
+        )
+
+        from torch._inductor.cudagraph_utils import check_for_mutation
+
+        has_mutation_str = check_for_mutation(gm, compiled_graph, num_fixed)
+        has_mutation = has_mutation_str is not None
+
+        if has_mutation:
+            compiled_graph.disabled_cudagraphs_reason = has_mutation_str
+
+        # Each entry is (check passed?, reason reported when it did not).
+        cudagraph_tests = [
+            (not has_mutation, "mutated inputs"),
+            (not has_incompatible_cudagraph_ops(gm), "incompatible ops"),
+            (not complex_memory_overlap_inputs, "complex memory overlap"),
+            (
+                all(
+                    isinstance(t, (torch.Tensor, torch.SymInt)) for t in example_inputs
+                ),
+                "non-Tensor inputs",
+            ),
+        ]
+        cudagraph_fail_reasons = [s for b, s in cudagraph_tests if not b]
+
+        if not cudagraph_fail_reasons:
+            if not config.triton.cudagraph_trees:
+                # Force specialize all inputs so that CUDA graphs will work
+                for t in example_inputs:
+                    if isinstance(t, torch.SymInt):
+                        int(t)  # guard
+
+            # Record the device index for the forward graph of a training
+            # compile (neither inference nor backward).
+            if (
+                boxed_forward_device_index is not None
+                and not is_inference
+                and not is_backward
+            ):
+                boxed_forward_device_index.set(next(iter(compiled_graph.device_idxs)))
+
+            compiled_graph.current_callable = cudagraphify(
+                compiled_graph.get_current_callable(),
+                example_inputs,
+                static_input_idxs=range(num_fixed),
+                device_index=next(iter(compiled_graph.device_idxs)),
+                stack_traces=stack_traces,
+                is_backward=is_backward,
+                is_inference=is_inference,
+                constants=tuple(compiled_graph.constants.values()),
+            )
+        else:
+            BoxedBool.disable(cudagraphs)
+
+            # See [Backward Generation Handling]
+            # if cudagraph'd the forward and set the device, we need to let the cudagraph manager
+            # know we are running the backward even if we will not run it in cudagraphs
+            if is_backward and config.triton.cudagraph_trees:
+                assert boxed_forward_device_index is not None
+                assert boxed_forward_device_index.value is not None
+                compiled_graph_callable = compiled_graph.get_current_callable()
+
+                manager = torch._inductor.cudagraph_trees.get_manager(
+                    boxed_forward_device_index.value, create_if_none_exists=False
+                )
+                # should already exist from forward
+                assert manager is not None
+
+                def compiled_artifact(new_inputs):
+                    manager.set_to_running_backward()
+                    return compiled_graph_callable(new_inputs)
+
+                compiled_graph.current_callable = compiled_artifact
+
+            if "cuda" in compiled_graph.device_types:
+                # prefer better disable_cudagraphs_reason bc stack trace
+                # TODO: migrate all disable reasons to stack trace, refactor
+                if compiled_graph.disabled_cudagraphs_reason:
+                    perf_hint_log.warning(compiled_graph.disabled_cudagraphs_reason)
+                else:
+                    perf_hint_log.warning(
+                        "skipping cudagraphs due to %s", cudagraph_fail_reasons
+                    )
+
+    # cudagraphs does its own aligning of inputs
+    if not cudagraphs:
+        new_callable = align_inputs(
+            compiled_graph.get_current_callable(), example_inputs, range(num_fixed)
+        )
+        # Only replace the callable when align_inputs returned a different one.
+        if new_callable is not compiled_graph.get_current_callable():
+            compiled_graph.current_callable = new_callable
+
+    _step_logger()(
+        logging.INFO,
+        "torchinductor done compiling "
+        f"{'BACKWARDS' if is_backward else 'FORWARDS'} "
+        f"graph {graph_id}",
+    )
+
+    # aot autograd needs to know to pass in inputs as a list
+    compiled_graph._boxed_call = True
+    return compiled_graph
+
+
+def fx_codegen_and_compile(
+    gm: torch.fx.GraphModule,
+    example_inputs: List[torch.Tensor],
+    cudagraphs: Optional[BoxedBool] = None,
+    num_fixed: int = 0,
+    is_backward: bool = False,
+    graph_id: Optional[int] = None,
+    cpp_wrapper: bool = False,
+    aot_mode: bool = False,
+    is_inference: bool = False,
+    user_visible_outputs: FrozenSet[str] = frozenset(),
+    layout_opt: Optional[bool] = None,
+    extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None,
+) -> Union[CompiledFxGraph, str]:
+    if is_tf32_warning_applicable(gm):
+        _warn_tf32_disabled()
+
+    # lift the maximum depth of the Python interpreter stack
+    # to adapt large/deep models
+    sys.setrecursionlimit(max(sys.getrecursionlimit(), 2000))
+
+    _step_logger()(
+        logging.INFO,
+        "torchinductor compiling "
+        f"{'BACKWARDS' if is_backward else 'FORWARDS'} "
+        f"graph {graph_id}",
+    )
+    V.debug.fx_graph(gm, example_inputs)
+    # TODO: Should we actually dump this?  It should be redundant with the aot
+    # structured logs...
+    # trace_structured("inductor_input_graph", payload_fn=lambda: gm.print_readable(print_output=False))
+
+    shape_env = _shape_env_from_inputs(example_inputs)
+
+    # Convert view to reshape in the graph. This is necessary primarily for
+    # layout optimization. Do it unconditionally for uniformity.
+    #
+    # It's needed because when we do layout optimization, an contiguous tensor
+    # in eager mode may becomes a channels last tensor. A view op previously
+    # can be applied to the contiguous tensor may not be able to be applied
+    # on the channels tensor any more. An error like
+    #   RuntimeError: view size is not compatible with input tensor's size and stride
+    #   (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
+    # will be printed.
+    #
+    # Replace view op to reshape op in this case.
+    # As an example, timm_resnest/botnet26t_256/convnext_base etc. will fail if we don't do this.
+    #
+    # Also this has to be done before FakeTensorProp below to avoid the failed
+    # .view() call.
+    view_to_reshape(gm)
+
+    # It is safe to run FakeTensorProp under no_grad because by the time
+    # we're in inductor, we assume that AOTAutograd has already "taken care"
+    # of autograd, so there should be no more autograd-related API's in the
+    # graph.
+    with torch.no_grad():
+        fake_mode = fake_tensor_prop(gm, example_inputs)
+
+    # pattern matcher passes might not preserve striding information
+    # on node.meta["val"]. if in the future we rely on these being
+    # correct we will need to fix.
+
+    with V.set_fake_mode(fake_mode):
+        # has some issues with memory in training
+        _recursive_post_grad_passes(gm, is_inference=is_inference)
+        V.debug.fx_graph_transformed(gm, example_inputs)
+        post_grad_graphs_log.debug("%s", lazy_format_graph_code("AFTER POST GRAD", gm))
+        trace_structured(
+            "inductor_post_grad_graph",
+            payload_fn=lambda: gm.print_readable(print_output=False),
+        )
+        optimus_scuba_log["inductor_post_grad"] = counters["inductor"]
+        signpost_event(
+            "optimus",
+            "compile_fx.post_grad_passes",
+            optimus_scuba_log,
+        )
+
+    with V.set_fake_mode(fake_mode):
+        const_output_index = None
+        const_graph = None
+        const_code = None
+
+        if aot_mode and config.aot_inductor.use_runtime_constant_folding:
+            const_gm, const_output_index = split_const_gm(gm)
+
+            const_graph = GraphLowering(
+                const_gm,
+                example_inputs=[],
+                shape_env=shape_env,
+                num_static_inputs=num_fixed,
+                graph_id=graph_id,
+                cpp_wrapper=cpp_wrapper,
+                aot_mode=aot_mode,
+                user_visible_outputs=user_visible_outputs,
+                extern_node_serializer=extern_node_serializer,
+                is_inference=is_inference,
+                is_const_graph=True,
+            )
+            with V.set_graph_handler(const_graph):
+                assert cpp_wrapper, "AOT mode only supports C++ wrapper"
+                const_graph.run()
+
+                const_code, _ = const_graph.codegen_with_cpp_wrapper()
+
+        graph = GraphLowering(
+            gm,
+            # example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.
+            # For the forward pass, we have the real inputs to be used as example_inputs. For the backward pass,
+            # we currently use fake tensors and defake them later.
+            example_inputs=example_inputs,
+            shape_env=shape_env,
+            num_static_inputs=num_fixed,
+            graph_id=graph_id,
+            cpp_wrapper=cpp_wrapper,
+            aot_mode=aot_mode,
+            user_visible_outputs=user_visible_outputs,
+            extern_node_serializer=extern_node_serializer,
+            is_inference=is_inference,
+            const_output_index=const_output_index,
+            const_code=const_code,
+            const_module=const_graph,
+        )
+        with V.set_graph_handler(graph):
+            graph.run(*example_inputs)
+            output_strides: List[Optional[Tuple[int, ...]]] = []
+            if graph.graph_outputs is not None:
+                # We'll put the output strides in the compiled graph so we
+                # can later return them to the caller via TracingContext
+                for out in graph.graph_outputs:
+                    if hasattr(out, "layout"):
+                        output_strides.append(
+                            tuple(
+                                V.graph.sizevars.size_hint(s) for s in out.layout.stride
+                            )
+                        )
+                    else:
+                        output_strides.append(None)
+
+            metrics_helper = metrics.CachedMetricsHelper()
+            compiled_fn = graph.compile_to_fn()
+
+            if V.aot_compilation is True:
+                return compiled_fn
+
+            if cudagraphs and not V.graph.disable_cudagraphs_reason:
+                from torch._inductor.cudagraph_utils import (
+                    check_lowering_disable_cudagraph,
+                )
+
+                V.graph.disable_cudagraphs_reason = check_lowering_disable_cudagraph(
+                    V.graph.device_node_mapping
+                )
+
+            compiled_graph = CompiledFxGraph(
+                compiled_fn,
+                graph,
+                output_strides,
+                V.graph.disable_cudagraphs_reason,
+                metrics_helper.get_deltas(),
+            )
+
+    return compiled_graph
+
+
+def clone_preserve_strides(x: torch.Tensor):
+    """
+    Return a clone of ``x`` that lives in new memory but reports the same
+    size and strides as ``x``.
+    """
+    # Length of the flat, unit-stride window that is cloned: one more than
+    # the sum of (size - 1) * stride over all dimensions.
+    span = 1
+    for dim_size, dim_stride in zip(x.size(), x.stride()):
+        span += (dim_size - 1) * dim_stride
+    flat_copy = torch.as_strided(x, (span,), (1,)).clone()
+    # Re-apply the original geometry on top of the cloned window.
+    return torch.as_strided(flat_copy, x.size(), x.stride())
+
+
+def copy_misaligned_inputs(
+    new_inputs: List[torch.Tensor], check_inputs_idxs: Sequence[int]
+) -> None:
+    """
+    Replace, in place, each checked entry of ``new_inputs`` whose data
+    pointer is not a multiple of ALIGNMENT with a stride-preserving clone.
+
+    Only the positions listed in ``check_inputs_idxs`` are inspected.
+    """
+    for position in check_inputs_idxs:
+        candidate = new_inputs[position]
+        if candidate.data_ptr() % ALIGNMENT != 0:
+            new_inputs[position] = clone_preserve_strides(candidate)
+
+
+def get_input_idxs_to_check(
+    inputs: Union[List[torch.Tensor], Sequence[int]],
+    static_input_idxs: Sequence[int],
+) -> Sequence[int]:
+    """
+    Collect the positions of the inputs whose alignment must be checked.
+
+    A position is selected when the input is a tensor on a "cuda" device
+    and either it is not listed in ``static_input_idxs`` or its storage
+    offset, converted to bytes, is not a multiple of ALIGNMENT.
+    """
+
+    def offset_is_aligned(tensor):
+        byte_offset = tensor.storage_offset() * get_dtype_size(tensor.dtype)
+        return byte_offset % ALIGNMENT == 0
+
+    selected = []
+    for position, candidate in enumerate(inputs):
+        if not isinstance(candidate, torch.Tensor):
+            continue
+        # Static inputs are skipped only when their offset is aligned.
+        if position in static_input_idxs and offset_is_aligned(candidate):
+            continue
+        if candidate.device.type != "cuda":
+            continue
+        selected.append(position)
+    return selected
+
+
+def align_inputs_from_check_idxs(
+    model: Callable[[List[torch.Tensor]], Any], inputs_to_check: Sequence[int]
+):
+    """
+    Wrap ``model`` so that misaligned inputs are cloned before each call.
+
+    When ``inputs_to_check`` is empty, ``model`` itself is returned and no
+    wrapper is created.
+    """
+    if len(inputs_to_check) != 0:
+
+        def run(new_inputs):
+            # Fix up the listed positions in place, then forward the list.
+            copy_misaligned_inputs(new_inputs, inputs_to_check)
+            return model(new_inputs)
+
+        return run
+
+    return model
+
+
+def align_inputs(
+    model: Callable[[List[torch.Tensor]], Any],
+    inputs: List[torch.Tensor],
+    static_input_idxs: Sequence[int] = (),
+):
+    """
+    Derive the positions to alignment-check from ``inputs`` and
+    ``static_input_idxs`` and return ``model`` wrapped accordingly.
+    """
+    return align_inputs_from_check_idxs(
+        model, get_input_idxs_to_check(inputs, static_input_idxs)
+    )
+
+
+@dynamo_utils.dynamo_timed
+def cudagraphify(
+    model: torch.fx.GraphModule,
+    inputs: List[torch.Tensor],
+    static_input_idxs: Sequence[int] = (),
+    *,
+    device_index: int,
+    stack_traces: List[Optional[str]],
+    is_backward: bool,
+    is_inference: bool,
+    constants: Tuple[torch.Tensor, ...] = (),
+):
+    """
+    Wrap ``model`` with a cudagraph implementation.
+
+    The cudagraph-trees implementation is used when
+    ``config.triton.cudagraph_trees`` is set (the keyword-only arguments are
+    forwarded to it); otherwise ``cudagraphify_impl`` from this module is
+    used. If any entry of ``inputs`` is a FakeTensor, the work is postponed
+    until the returned callable is first invoked with its runtime inputs.
+    """
+    from torch._inductor.cudagraph_trees import (
+        cudagraphify_impl as new_cudagraphify_impl,
+    )
+
+    cudagraphify_fn: Callable[..., Any]
+    if not config.triton.cudagraph_trees:
+        cudagraphify_fn = cudagraphify_impl
+    else:
+        cudagraphify_fn = functools.partial(
+            new_cudagraphify_impl,
+            device_index=device_index,
+            stack_traces=stack_traces,
+            is_backward=is_backward,
+            is_inference=is_inference,
+            constants=constants,
+        )
+
+    def capture(graph_inputs):
+        return cudagraphify_fn(model, graph_inputs, static_input_idxs)
+
+    # No fake tensors among the inputs: cudagraphify right away.
+    if all(not isinstance(inp, FakeTensor) for inp in inputs):
+        return capture(inputs)
+
+    # Otherwise defer cudagraphs until we get real inputs at runtime.
+    compiled_fn = None
+
+    def run(new_inputs):
+        nonlocal compiled_fn
+        if compiled_fn is None:
+            # First call: build the cudagraphified callable while keeping
+            # the RNG state as it was before the build.
+            with dynamo_utils.preserve_rng_state():
+                compiled_fn = capture(new_inputs)
+        return compiled_fn(new_inputs)
+
+    return run
+
+
+def remove_unaligned_input_idxs(
+    inputs: Union[List[torch.Tensor], Sequence[int]],
+    static_input_idxs: Sequence[int],
+):
+    """
+    Drop from ``static_input_idxs`` every index whose input is not an
+    aligned tensor.
+
+    We require all static inputs to be aligned; an index that is removed
+    here is no longer treated as static by the caller, which then handles
+    that input like any other non-static input.
+
+    Args:
+        inputs: the full list of graph inputs.
+        static_input_idxs: positions in ``inputs`` that are assumed static.
+
+    Returns:
+        ``static_input_idxs`` itself when every listed input is a tensor
+        whose data pointer is a multiple of ALIGNMENT; otherwise a new list
+        holding only the indices that satisfy this.
+    """
+    # Look each input up by its index. Pairing static_input_idxs with
+    # inputs positionally would test the wrong tensor whenever the static
+    # indices are not exactly 0..k-1.
+    aligned_static_input_idxs = [
+        idx
+        for idx in static_input_idxs
+        if isinstance(inputs[idx], torch.Tensor)
+        and (inputs[idx].data_ptr() % ALIGNMENT) == 0
+    ]
+    if len(aligned_static_input_idxs) != len(static_input_idxs):
+        return aligned_static_input_idxs
+    return static_input_idxs
+
+
+def static_input(x: torch.Tensor):
+    """
+    Allocate a new tensor with the same size, strides, dtype and device as
+    ``x``. The contents are not copied from ``x``.
+    """
+    # TODO(jansel): figure out why this version doesn't work:
+    # return torch.empty_strided(x.size(), x.stride(), dtype=x.dtype, device=x.device)
+    span = 1 + sum(
+        (dim_size - 1) * dim_stride
+        for dim_size, dim_stride in zip(x.size(), x.stride())
+    )
+    storage = torch.empty(span, dtype=x.dtype, device=x.device)
+    return torch.as_strided(storage, x.size(), x.stride())
+
+
+def index_expanded_dims_and_copy_(
+    dst: torch.Tensor,
+    src: torch.Tensor,
+    expanded_dims: List[int],
+):
+    """
+    Apply ``index_expanded_dims`` with ``expanded_dims`` to both ``dst`` and
+    ``src``, then copy the indexed source into the indexed destination.
+    """
+    dst_view = index_expanded_dims(dst, expanded_dims)
+    src_view = index_expanded_dims(src, expanded_dims)
+    dst_view.copy_(src_view)
+
+
+def cudagraphify_impl(
+    model: torch.fx.GraphModule,
+    inputs: List[torch.Tensor],
+    static_input_idxs: Sequence[int] = (),
+):
+    """
+    Record ``model`` into a single CUDA graph and return a callable that
+    replays it.
+
+    Assumes inputs[static_input_idxs[i]] are always the same memory address
+
+    The returned callable takes a list of new inputs, copies the non-static
+    tensor inputs into the buffers that were used during recording, empties
+    the passed-in list, replays the graph and returns the outputs recorded
+    at capture time.
+    """
+    # The indices to alignment-check are computed from the caller's
+    # static_input_idxs, before that sequence is filtered on the next line.
+    check_input_idxs = get_input_idxs_to_check(inputs, static_input_idxs)
+    static_input_idxs = remove_unaligned_input_idxs(inputs, static_input_idxs)
+    copy_misaligned_inputs(inputs, check_input_idxs)
+
+    assert isinstance(inputs, list)
+
+    # Expanded dims are only looked up for non-static inputs; static inputs
+    # get an empty list.
+    inps_expanded_dims = [
+        get_expanded_dims(x) if idx not in static_input_idxs else []
+        for idx, x in enumerate(inputs)
+    ]
+
+    # allocate static tensor inputs
+    # Non-tensors pass through unchanged, tensors at static indices are
+    # reused (detached), every other tensor gets a new buffer with matching
+    # size and strides.
+    static_inputs = [
+        x
+        if not isinstance(x, torch.Tensor)
+        else static_input(x)
+        if idx not in static_input_idxs
+        else x.detach()
+        for idx, x in enumerate(inputs)
+    ]
+
+    # copy over input values for fresh allocations
+    for idx, (x, expanded_dims) in enumerate(zip(inputs, inps_expanded_dims)):
+        if isinstance(x, torch.Tensor) and idx not in static_input_idxs:
+            index_expanded_dims_and_copy_(static_inputs[idx], x, expanded_dims)
+
+    # warmup
+    # Run the model once on a separate stream before recording.
+    torch.cuda.synchronize()
+    stream = torch.cuda.Stream()
+    stream.wait_stream(torch.cuda.current_stream())
+    # copy static_inputs because it will be cleared in model
+    with torch.cuda.stream(stream):
+        model(list(static_inputs))
+    stream.synchronize()
+    torch.cuda.current_stream().wait_stream(stream)
+    torch.cuda.synchronize()
+
+    # record
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph, stream=stream, capture_error_mode="thread_local"):
+        static_outputs = model(list(static_inputs))
+    # Normalize a single output to a one-element tuple.
+    if not isinstance(static_outputs, (list, tuple)):
+        static_outputs = (static_outputs,)
+
+    if config.size_asserts:
+
+        # Checked variant: verifies the input count and that static inputs
+        # still have the data pointer seen at recording time.
+        def run(new_inputs):
+            assert len(static_inputs) == len(new_inputs)
+            for idx, (dst, src, expanded_dims) in enumerate(
+                zip(static_inputs, new_inputs, inps_expanded_dims)
+            ):
+                if not isinstance(dst, torch.Tensor):
+                    pass
+                elif idx in static_input_idxs:
+                    assert dst.data_ptr() == src.data_ptr()
+                else:
+                    # TODO - could make one single op of multiple slices
+                    # and avoid dispatch.
+                    # Could also pre-index the `dst` tensors
+                    index_expanded_dims_and_copy_(dst, src, expanded_dims)
+            # Empty the caller's list before replaying.
+            new_inputs.clear()
+            graph.replay()
+            return static_outputs
+
+    else:
+        # Unchecked variant: the positions to copy are precomputed once.
+        # NOTE(review): copy_indices is not filtered by type, so every
+        # non-static position is copied - confirm non-tensor inputs cannot
+        # reach this path.
+        copy_indices = [
+            idx for idx in range(len(static_inputs)) if idx not in static_input_idxs
+        ]
+
+        def run(new_inputs):
+            for idx in copy_indices:
+                expanded_dims = inps_expanded_dims[idx]
+                index_expanded_dims_and_copy_(
+                    static_inputs[idx], new_inputs[idx], expanded_dims
+                )
+            # Empty the caller's list before replaying.
+            new_inputs.clear()
+            graph.replay()
+            return static_outputs
+
+    # Clone misaligned inputs at check_input_idxs before every call to run.
+    return align_inputs_from_check_idxs(run, check_input_idxs)
+
+
+def compile_fx_aot(
+    model_: torch.fx.GraphModule,
+    example_inputs_: List[torch.Tensor],
+    inner_compile: Callable[..., Any] = compile_fx_inner,
+    config_patches: Optional[Dict[str, Any]] = None,
+):
+    """
+    Compile ``model_`` through ``compile_fx`` with AOT compilation enabled
+    and return the path of the compiled library.
+
+    ``cpp_wrapper`` is always forced to True in the config patches. When no
+    ``aot_inductor.output_path`` is given in the patches or in the global
+    config, a hash of the model code is used for it. An
+    ``extern_node_serializer`` entry in the patches is removed from them and
+    passed to ``inner_compile`` instead.
+    """
+    # Work on a private copy so the caller's dict is never modified.
+    patches: Dict[str, Any] = {} if config_patches is None else dict(config_patches)
+    patches["cpp_wrapper"] = True
+
+    output_path_configured = (
+        "aot_inductor.output_path" in patches or config.aot_inductor.output_path
+    )
+    if not output_path_configured:
+        patches["aot_inductor.output_path"] = code_hash(model_.code)
+
+    extern_node_serializer = patches.pop("extern_node_serializer", None)
+    aot_inner_compile = functools.partial(
+        inner_compile,
+        aot_mode=True,
+        extern_node_serializer=extern_node_serializer,
+    )
+
+    with V.set_aot_compilation(True):
+        compiled_lib_path = compile_fx(
+            model_,
+            example_inputs_,
+            inner_compile=aot_inner_compile,
+            config_patches=patches,
+        )
+        assert os.path.exists(
+            compiled_lib_path
+        ), f"AOTInductor compiled library does not exist at {compiled_lib_path}"
+        return compiled_lib_path
+
+
+# Module-level counter starting at 0; compile_fx calls next() on it to obtain
+# the graph_id it passes to the inner compilers.
+_graph_counter = count(0)
+
+
+def fw_compiler_freezing(
+    aot_autograd_model: torch.fx.GraphModule,
+    aot_example_inputs: List[torch.Tensor],
+    dynamo_model: torch.fx.GraphModule,
+    num_example_inputs: int,
+    inner_compile: Callable[..., Any],
+    cudagraphs: BoxedBool,
+    graph_id: int,
+    forward_device: BoxedDeviceIndex,
+):
+    """
+    Inference compiler used when freezing is enabled.
+
+    Runs the joint graph passes on ``aot_autograd_model``, freezes it via
+    ``freeze`` (which also reports which argument positions are preserved),
+    and compiles the frozen graph with ``inner_compile`` in inference mode.
+
+    Returns:
+        Under AOT compilation, the result of ``inner_compile`` directly.
+        Otherwise a boxed-call wrapper that keeps only the preserved
+        arguments, clears the caller's argument list and calls the compiled
+        function.
+    """
+    from torch._inductor.freezing import convert_conv_weights_to_channels_last, freeze
+
+    # partition_fn won't be called
+    _recursive_joint_graph_passes(aot_autograd_model)
+
+    layout_opt = GraphLowering.decide_layout_opt(aot_autograd_model, is_inference=True)
+    if layout_opt:
+        # make sure meta['val'] is properly setup
+        fake_tensor_prop(aot_autograd_model, aot_example_inputs, True)
+        convert_conv_weights_to_channels_last(aot_autograd_model)
+
+    opt_model, preserved_arg_indices = freeze(
+        dynamo_model,
+        aot_autograd_model,
+        aot_example_inputs,  # type: ignore[arg-type]
+    )
+
+    # Keep only the example inputs whose positions were preserved by freeze.
+    aot_example_inputs = [aot_example_inputs[ind] for ind in preserved_arg_indices]
+    # Every preserved argument beyond the num_example_inputs user inputs is
+    # counted as fixed.
+    num_fixed = len(preserved_arg_indices) - num_example_inputs
+
+    fake_mode = detect_fake_mode(aot_example_inputs)
+
+    # for freezing, all graph outputs should be user visible
+    *_, model_outputs_node = opt_model.graph.nodes
+    model_outputs = model_outputs_node.args[0]
+    user_visible_outputs = [
+        n.name for n in model_outputs if isinstance(n, torch.fx.Node)
+    ]
+
+    # constant params will be real tensors, not fake
+    tracing_context = torch._guards.TracingContext.try_get()
+    if tracing_context is not None:
+        params_flat = tracing_context.params_flat
+        assert params_flat is not None
+        # Null out, in place, every entry whose position was not preserved.
+        for i in range(len(params_flat)):
+            if i not in preserved_arg_indices:
+                params_flat[i] = None
+
+    # Temporarily allow non-fake inputs on the fake mode while compiling.
+    with mock.patch.object(fake_mode, "allow_non_fake_inputs", True):
+        optimized_function = inner_compile(
+            opt_model,
+            aot_example_inputs,
+            num_fixed=num_fixed,
+            cudagraphs=cudagraphs,
+            graph_id=graph_id,
+            is_inference=True,
+            boxed_forward_device_index=forward_device,
+            layout_opt=layout_opt,
+            user_visible_outputs=user_visible_outputs,
+        )
+
+    # aot_inductor codegens a call that takes in just the inputs, so we don't return a wrapper
+    # that drops constant-ified params
+    if V.aot_compilation is True:
+        return optimized_function
+
+    def wrapper(args):
+        # Select the preserved arguments, then empty the caller's list.
+        args_new = [args[i] for i in preserved_arg_indices]
+        args.clear()
+        return optimized_function(args_new)
+
+    wrapper._boxed_call = True  # type: ignore[attr-defined]
+
+    return wrapper
+
+
+@_use_lazy_graph_module(dynamo_config.use_lazy_graph_module)
+def compile_fx(
+    model_: torch.fx.GraphModule,
+    example_inputs_: List[torch.Tensor],
+    inner_compile: Callable[..., Any] = compile_fx_inner,
+    config_patches: Optional[Dict[str, Any]] = None,
+    decompositions: Optional[Dict[OpOverload, Callable[..., Any]]] = None,
+):
+    """
+    Main entrypoint to compile a given FX graph.
+
+    The function re-enters itself to peel off one concern at a time (config
+    patches, cpp_wrapper mode, non-tuple outputs, dynamo.export graphs,
+    nested inputs) and finally hands the model to ``aot_autograd`` with
+    forward, backward and inference compilers built around ``inner_compile``.
+
+    Args:
+        model_: the graph to compile.
+        example_inputs_: example inputs for ``model_``.
+        inner_compile: callable used to compile each individual graph.
+        config_patches: inductor config overrides applied while compiling.
+        decompositions: decomposition table; ``select_decomp_table()`` is
+            used when this is None.
+
+    Returns:
+        The result of whichever path is taken: a wrapper from one of the
+        helper paths, the inference compiler's result under AOT
+        compilation, or the result of calling the ``aot_autograd`` backend.
+    """
+    # Apply the patches, then recurse without them so the rest of the
+    # function only ever sees the patched config.
+    if config_patches:
+        with config.patch(config_patches):
+            return compile_fx(
+                model_,
+                example_inputs_,
+                # need extra layer of patching as backwards is compiled out of scope
+                inner_compile=config.patch(config_patches)(inner_compile),
+                decompositions=decompositions,
+            )
+
+    # cpp_wrapper mode: turn the flag off in the config and recurse, passing
+    # cpp_wrapper=True to inner_compile instead.
+    if config.cpp_wrapper:
+        with config.patch(
+            {
+                "cpp_wrapper": False,
+                "triton.autotune_cublasLt": False,
+                "triton.cudagraphs": False,
+                "triton.store_cubin": True,
+            }
+        ), V.set_real_inputs(example_inputs_):
+            inputs_ = example_inputs_
+            if isinstance(model_, torch.fx.GraphModule):
+                # Use the placeholders' meta["val"] as inputs when every
+                # placeholder has one.
+                fake_inputs = [
+                    node.meta.get("val")
+                    for node in model_.graph.nodes
+                    if node.op == "placeholder"
+                ]
+                if all(v is not None for v in fake_inputs):
+                    # Validate devices before switching to fake tensors.
+                    for idx, fi, i in zip(count(), fake_inputs, inputs_):
+                        if fi.device != i.device:
+                            raise ValueError(
+                                f"Device mismatch between fake input and example input at position #{idx}: "
+                                f"{fi.device} vs {i.device}. If the model was exported via torch.export(), "
+                                "make sure torch.export() and torch.aot_compile() run on the same device."
+                            )
+                    inputs_ = fake_inputs
+            return compile_fx(
+                model_,
+                inputs_,
+                inner_compile=functools.partial(inner_compile, cpp_wrapper=True),
+                decompositions=decompositions,
+            )
+
+    # Re-entry point handed to the helpers below; it carries the current
+    # inner_compile and decompositions.
+    recursive_compile_fx = functools.partial(
+        compile_fx,
+        inner_compile=inner_compile,
+        decompositions=decompositions,
+    )
+
+    if not graph_returns_tuple(model_):
+        return make_graph_return_tuple(
+            model_,
+            example_inputs_,
+            recursive_compile_fx,
+        )
+
+    if isinstance(model_, torch.fx.GraphModule):
+        if isinstance(model_.graph._codegen, _PyTreeCodeGen):
+            # this graph is the result of dynamo.export()
+            return handle_dynamo_export_graph(
+                model_,
+                example_inputs_,
+                recursive_compile_fx,
+            )
+
+        model_ = _recursive_pre_grad_passes(model_, example_inputs_)
+        optimus_scuba_log["inductor_pre_grad"] = counters["inductor"]
+        signpost_event(
+            "optimus",
+            "compile_fx.pre_grad_passes",
+            optimus_scuba_log,
+        )
+
+    # Nested (list/tuple/dict) inputs are flattened and compilation restarts.
+    if any(isinstance(x, (list, tuple, dict)) for x in example_inputs_):
+        return flatten_graph_inputs(
+            model_,
+            example_inputs_,
+            recursive_compile_fx,
+        )
+
+    assert not config._raise_error_for_testing
+    num_example_inputs = len(example_inputs_)
+    # Boxed values shared by the forward, backward and inference compilers.
+    cudagraphs = BoxedBool(config.triton.cudagraphs)
+    forward_device = BoxedDeviceIndex(None)
+
+    graph_id = next(_graph_counter)
+
+    decompositions = (
+        decompositions if decompositions is not None else select_decomp_table()
+    )
+
+    @dynamo_utils.dynamo_timed
+    def fw_compiler_base(
+        model: torch.fx.GraphModule,
+        example_inputs: List[torch.Tensor],
+        is_inference: bool,
+    ):
+        if is_inference:
+            # partition_fn won't be called
+            _recursive_joint_graph_passes(model)
+
+        fixed = torch._inductor.utils.num_fw_fixed_arguments(
+            num_example_inputs, len(example_inputs)
+        )
+        user_visible_outputs = set()
+
+        if config.keep_output_stride:
+            *_, model_outputs_node = model.graph.nodes
+            assert model_outputs_node.op == "output"
+            model_outputs = pytree.arg_tree_leaves(*model_outputs_node.args)
+            num_model_outputs = len(model_outputs)
+
+            context = torch._guards.TracingContext.try_get()
+            # See Note [User Outputs in the inductor graph]
+            if context is not None and context.fw_metadata and not is_inference:
+                original_output_start_index = (
+                    context.fw_metadata.num_mutated_inp_runtime_indices
+                )
+            else:
+                original_output_start_index = 0
+
+            if isinstance(model_, torch.fx.GraphModule):
+                *_, orig_model_outputs_node = model_.graph.nodes
+                assert orig_model_outputs_node.op == "output"
+                orig_model_outputs, _ = pytree.tree_flatten(
+                    orig_model_outputs_node.args
+                )
+                num_orig_model_outputs = len(orig_model_outputs)
+            else:
+                num_orig_model_outputs = num_model_outputs
+
+            assert num_orig_model_outputs <= num_model_outputs
+
+            # Note [User Outputs in the inductor graph]
+            # We make the following assumptions
+            # For inference
+            #   len(orig_model_outputs) == len(model_outputs)
+            # For training
+            #   len(orig_model_outputs) <= len(model_outputs)
+            # During training, most of the time the model_outputs starts with
+            # original module's outputs followed by saved activations.
+            # But this may not be true if the model has inplace updated tensors.
+            # AOTAutograd will make those tensors being returned before the original
+            # module's output.
+            # To make things safe, we'll use original_output_start_index field
+            # set by AOTAutograd to decide where the original module outputs start.
+            orig_output_end_idx = original_output_start_index + num_orig_model_outputs
+            # Sanity check: we are about to splice out the "user" outputs from the full set
+            # of "graph" outputs. Make sure we're within bounds.
+            assert orig_output_end_idx <= num_model_outputs
+
+            user_visible_outputs = {
+                n.name
+                for n in model_outputs[original_output_start_index:orig_output_end_idx]
+                if isinstance(n, torch.fx.Node)
+            }
+
+        return inner_compile(
+            model,
+            example_inputs,
+            num_fixed=fixed,
+            cudagraphs=cudagraphs,
+            graph_id=graph_id,
+            is_inference=is_inference,
+            boxed_forward_device_index=forward_device,
+            user_visible_outputs=user_visible_outputs,
+        )
+
+    fw_compiler = functools.partial(fw_compiler_base, is_inference=False)
+
+    # The freezing compiler is only selected when freezing is enabled and
+    # grad mode is off at this point.
+    if config.freezing and not torch.is_grad_enabled():
+        inference_compiler = functools.partial(
+            fw_compiler_freezing,
+            dynamo_model=model_,
+            num_example_inputs=num_example_inputs,
+            inner_compile=inner_compile,
+            cudagraphs=cudagraphs,
+            graph_id=graph_id,
+            forward_device=forward_device,
+        )
+    else:
+        inference_compiler = functools.partial(fw_compiler_base, is_inference=True)
+
+    def partition_fn(graph, joint_inputs, **kwargs):
+        # Run the joint graph passes before partitioning the joint graph.
+        _recursive_joint_graph_passes(graph)
+        return min_cut_rematerialization_partition(
+            graph, joint_inputs, **kwargs, compiler="inductor"
+        )
+
+    @dynamo_utils.dynamo_timed
+    @dynamo_utils.maybe_cprofile
+    def bw_compiler(model: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+        fixed = count_tangents(model)
+        return inner_compile(
+            model,
+            example_inputs,
+            num_fixed=fixed,
+            cudagraphs=cudagraphs,
+            is_backward=True,
+            graph_id=graph_id,
+            boxed_forward_device_index=forward_device,
+        )
+
+    # TODO: can add logging before/after the call to create_aot_dispatcher_function
+    # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
+    # once torchdynamo is merged into pytorch
+
+    # Reuse the fake mode / tracing context found on the inputs or currently
+    # active; create new ones only when none is found.
+    fake_mode = detect_fake_mode(example_inputs_) or torch._subclasses.FakeTensorMode(
+        allow_non_fake_inputs=True
+    )
+    tracing_context = (
+        torch._guards.TracingContext.try_get()
+        or torch._guards.TracingContext(fake_mode)
+    )
+
+    # AOT compilation: export the module, unlift it and call the inference
+    # compiler directly instead of going through aot_autograd.
+    if V.aot_compilation is True:
+        gm, graph_signature = aot_export_module(
+            model_, example_inputs_, trace_joint=False, decompositions=decompositions
+        )
+        unlifted_gm = _unlift_graph(model_, gm, graph_signature)
+        if "dynamo_flat_name_to_original_fqn" in model_.meta:
+            unlifted_gm.meta["dynamo_flat_name_to_original_fqn"] = model_.meta[
+                "dynamo_flat_name_to_original_fqn"
+            ]
+        with V.set_fake_mode(fake_mode), compiled_autograd.disable():
+            return inference_compiler(unlifted_gm, example_inputs_)
+
+    with V.set_fake_mode(fake_mode), torch._guards.tracing(
+        tracing_context
+    ), compiled_autograd.disable():
+        return aot_autograd(
+            fw_compiler=fw_compiler,
+            bw_compiler=bw_compiler,
+            inference_compiler=inference_compiler,
+            decompositions=decompositions,
+            partition_fn=partition_fn,
+            keep_inference_input_mutations=True,
+        )(model_, example_inputs_)
+
+
+def _shape_env_from_inputs(inputs: List[torch.Tensor]):
+    """
+    Find the shape environment associated with ``inputs``.
+
+    The shape env of the detected fake mode is preferred; failing that, the
+    shape env of the first ``torch.SymInt`` input is used. Returns None
+    when neither is available.
+    """
+    fake_mode = detect_fake_mode(inputs)
+
+    # TODO(voz): It would be nice to enable this assert, but there are lots of tests that
+    # pass in real inputs for now.
+    # if len(inputs) > 0:
+    # assert fake_mode is not None, breakpoint()
+
+    if fake_mode is not None:
+        return fake_mode.shape_env
+
+    # When there are no tensor inputs, get shape_env from the first SymInt.
+    first_symint = next(
+        (inp for inp in inputs if isinstance(inp, torch.SymInt)), None
+    )
+    if first_symint is not None:
+        return first_symint.node.shape_env
+
+    # TODO(voz): Should we always have one anyway?
+    return None
+
+
+def graph_returns_tuple(gm: torch.fx.GraphModule):
+    """
+    Tell whether an FX graph returns a tuple.
+
+    Anything that is not a ``GraphModule`` cannot be inspected and is
+    assumed to return a tuple.
+    """
+    if not isinstance(gm, torch.fx.GraphModule):
+        return True  # can't check this, assume true
+
+    (returned,) = output_node(gm).args
+    if isinstance(returned, (list, tuple)):
+        return True
+
+    # Remaining case: the result is one node with multiple outputs, which is
+    # recognized through the schema of the node's target.
+    if not isinstance(returned, torch.fx.node.Node):
+        return False
+    if not hasattr(returned.target, "_schema"):
+        return False
+    schema_returns = returned.target._schema.returns
+    return len(schema_returns) > 1 and all(
+        str(ret.type) == "Tensor" for ret in schema_returns
+    )
+
+
+def make_graph_return_tuple(
+    gm: torch.fx.GraphModule,
+    inputs: List[torch.Tensor],
+    compile_gm: Callable[..., Any],
+):
+    """
+    Rewrite ``gm`` in place so that it returns its flattened outputs, compile
+    it with ``compile_gm``, and return a wrapper that restores the original
+    output structure. This is only needed for graphs not created by
+    torchdynamo that return non-tuples.
+    """
+    old_output = output_node(gm)
+    (returned,) = old_output.args
+    flat_returned, spec = pytree.tree_flatten(returned)
+
+    # Insert the flattened output ahead of the old output node, then remove
+    # the old one.
+    with gm.graph.inserting_before(old_output):
+        gm.graph.output(flat_returned)
+    gm.graph.erase_node(old_output)
+    assert graph_returns_tuple(gm)
+
+    compiled_fn = compile_gm(gm, inputs)
+
+    @functools.wraps(compiled_fn)
+    def wrapper(*args, **kwargs):
+        flat_result = compiled_fn(*args, **kwargs)
+        return pytree.tree_unflatten(flat_result, spec)
+
+    return wrapper
+
+
+def flatten_graph_inputs(gm: torch.fx.GraphModule, inputs, compile_gm):
+    """
+    Flatten ``inputs`` and compile a module wrapping ``gm`` that accepts the
+    flat arguments and rebuilds the nested structure before calling ``gm``.
+    This is only needed for graphs not created by torchdynamo that take
+    bumpy inputs.
+
+    Returns a callable that takes the original (nested) arguments.
+    """
+    flat_inputs, spec = pytree.tree_flatten(inputs)
+
+    class GmWrapper(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.gm = gm
+
+        def forward(self, *args):
+            # Rebuild the nested argument structure recorded in spec.
+            nested_args = pytree.tree_unflatten(list(args), spec)
+            return self.gm(*nested_args)
+
+    compiled_fn = compile_gm(GmWrapper(), flat_inputs)
+
+    @functools.wraps(compiled_fn)
+    def wrapper(*args):
+        # note this doesn't check the spec, assuming it is the same
+        flat_args = pytree.arg_tree_leaves(*args)
+        return compiled_fn(*flat_args)
+
+    return wrapper
+
+
+def handle_dynamo_export_graph(
+    gm: torch.fx.GraphModule,
+    inputs: List[torch.Tensor],
+    compile_gm: Callable[..., Any],
+):
+    """
+    `torch._dynamo.export` embeds pytrees in the FX graph codegen object.
+    Swap that codegen for a plain one so inductor can compile the graph,
+    and return a wrapper that applies the original codegen's input and
+    output processing around the compiled function.
+    """
+    original_codegen = gm.graph._codegen
+    gm.graph._codegen = torch.fx.graph.CodeGen()
+    gm.recompile()
+
+    processed_inputs = original_codegen.process_inputs(*inputs)
+    compiled_fn = compile_gm(gm, processed_inputs)
+
+    @functools.wraps(compiled_fn)
+    def wrapper(*args):
+        processed_args = original_codegen.process_inputs(*args)
+        raw_outputs = compiled_fn(*processed_args)
+        return original_codegen.process_outputs(raw_outputs)
+
+    return wrapper
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/cudagraph_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/cudagraph_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..21ada6547b7b00f056775cd9551ea497960b9847
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/cudagraph_utils.py
@@ -0,0 +1,105 @@
+import dataclasses
+from typing import Dict, Iterable, Optional
+
+import torch
+from torch._inductor.codecache import CompiledFxGraph
+
+
+def get_mutating_use_stack_trace(placeholder_node: torch.fx.Node) -> Optional[str]:
+    """
+    Return the stack trace recorded on the use of ``placeholder_node`` that
+    is taken to be the mutation, or None when no such trace is found.
+    """
+    users = placeholder_node.users
+
+    # reinplaced uses might have a single, non-copy_ use
+    if len(users) == 1:
+        (only_use,) = users
+        return only_use.meta.get("stack_trace", None)
+
+    # Otherwise take the first non-empty trace among the copy_ uses.
+    copy_traces = (
+        use.meta.get("stack_trace", None)
+        for use in users
+        if use.target == torch.ops.aten.copy_.default
+    )
+    return next((trace for trace in copy_traces if trace), None)
+
+
+def format_default_skip_message(reason: str) -> str:
+    """Build the standard "skipping cudagraphs" message for ``reason``."""
+    return "skipping cudagraphs due to {}".format(reason)
+
+
+def get_mutation_stack_trace(
+    gm: torch.fx.GraphModule, mutation_indices: Iterable[int]
+) -> str:
+    """
+    Build the cudagraph skip message for mutated inputs.
+
+    ``mutation_indices`` index into the graph's placeholder nodes. The stack
+    trace of the first placeholder that yields one is included in the
+    message; without any trace the default message is returned.
+    """
+    placeholders = [node for node in gm.graph.nodes if node.op == "placeholder"]
+
+    # Lazily walk the indices and stop at the first non-empty trace.
+    candidate_traces = (
+        get_mutating_use_stack_trace(placeholders[idx]) for idx in mutation_indices
+    )
+    stack_trace = next((trace for trace in candidate_traces if trace), None)
+
+    if not stack_trace:
+        return format_default_skip_message("mutated inputs")
+
+    return f"skipping cudagraphs due to mutation on input. Found from : \n {stack_trace}"
+
+
+def check_for_mutation(
+    gm: torch.fx.GraphModule, compiled_graph: CompiledFxGraph, num_fixed: int
+) -> Optional[str]:
+    """
+    Return a cudagraph skip message when the compiled graph mutates inputs
+    in a way that prevents cudagraphs, or None when it does not.
+    """
+    if not torch._inductor.config.triton.cudagraph_trees:
+        # Without cudagraph trees any mutated input counts: restricting the
+        # check to non-fixed inputs doesnt work for non-trees because the
+        # warmup run would apply mutation twice.
+        if len(compiled_graph.mutated_inputs) == 0:
+            return None
+        return format_default_skip_message("mutated inputs")
+
+    # With trees, mutation of the first num_fixed inputs (parameters/static
+    # inputs) is ignored; only later indices are considered.
+    mutation_indices = [
+        idx for idx in compiled_graph.mutated_input_idxs if idx >= num_fixed
+    ]
+    if len(mutation_indices) == 0:
+        return None
+
+    return get_mutation_stack_trace(gm, mutation_indices)
+
+
+def get_use_stack_trace(node) -> Optional[str]:
+    """
+    Return the first non-empty ``stack_trace`` found in the meta of the
+    users of ``node``, or None when no user has one.
+    """
+    user_traces = (use.meta.get("stack_trace", None) for use in node.users)
+    return next((trace for trace in user_traces if trace), None)
+
+
+def check_multiple_devices_or_any_cpu_nodes(
+    device_node_mapping: Dict[torch.device, torch.fx.Node]
+) -> Optional[str]:
+    """
+    Return a cudagraph skip message when the mapping has a node for the cpu
+    device or does not consist of exactly one "cuda" device; otherwise
+    return None.
+    """
+    cpu_node = device_node_mapping.get(torch.device("cpu"))
+    if cpu_node:
+        # Mention where the cpu node is used when a stack trace is known.
+        stack_trace = get_use_stack_trace(cpu_node)
+        if stack_trace:
+            reason = f"cpu device. Found from : \n {stack_trace}"
+        else:
+            reason = "cpu device"
+        return format_default_skip_message(reason)
+
+    devices = list(device_node_mapping)
+    if len(devices) == 1 and devices[0].type == "cuda":
+        return None
+
+    return format_default_skip_message(
+        "multiple devices: " + ", ".join(map(repr, devices))
+    )
+
+
+def check_lowering_disable_cudagraph(
+    device_node_mapping: Dict[torch.device, torch.fx.Node]
+):
+    """
+    Return the reason cudagraphs must be disabled for this device/node
+    mapping, as reported by the device check, or None when there is none.
+    """
+    disable_reason = check_multiple_devices_or_any_cpu_nodes(device_node_mapping)
+    return disable_reason
+
+
+@dataclasses.dataclass
+class BoxedDeviceIndex:
+    """Mutable holder for an optional device index."""
+
+    # Current device index, or None when no index is stored.
+    value: Optional[int]
+
+    def set(self, device_idx: Optional[int]):
+        """Store ``device_idx``, which must be an int or None."""
+        assert isinstance(device_idx, (int, type(None)))
+        self.value = device_idx
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/dependencies.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/dependencies.py
new file mode 100644
index 0000000000000000000000000000000000000000..44d6b19f62d9f5b67aa2ab82606ead1b42bc9595
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/dependencies.py
@@ -0,0 +1,506 @@
+import collections
+import dataclasses
+import itertools
+import logging
+import re
+import typing
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+from unittest.mock import patch
+
+import sympy
+
+import torch
+from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols
+
+from .codegen.common import index_prevent_reordering
+from .utils import (
+    get_dtype_size,
+    reduction_num_outputs,
+    sympy_index_symbol,
+    sympy_str,
+    sympy_subs,
+    VarRanges,
+)
+from .virtualized import OpsHandler, ReductionType, V
+
+log = logging.getLogger(__name__)
+is_indirect = re.compile(r"indirect|tmp").search
+Dep = Union["MemoryDep", "StarDep", "WeakDep"]
+
+
+class MemoryDep(typing.NamedTuple):
+    """Dependency on buffer ``name`` addressed by the sympy expression ``index``."""
+
+    name: str
+    index: sympy.Expr  # type: ignore[assignment]
+    # ``var_names`` and ``size`` are paired positionally (see ``ranges``).
+    var_names: Tuple[sympy.Symbol, ...]
+    size: Tuple[sympy.Expr, ...]
+
+    def __repr__(self):
+        return f"MemoryDep({self.name!r}, {self.index}, {self.ranges})"
+
+    @property
+    def ranges(self) -> Dict[sympy.Symbol, sympy.Expr]:
+        """Map each variable to its size, e.g. ``{c0: 128, c1: 512, ...}``."""
+        return dict(zip(self.var_names, self.size))
+
+    def get_numel(self) -> sympy.Expr:
+        """Return the number of elements covered by this dependency.
+
+        Indirect deps use the numel that ``V.graph`` reports for the buffer.
+        Otherwise the result is the product of the sizes of those variables
+        that actually appear in ``index``.
+        """
+        if self.is_indirect():
+            numel = V.graph.get_numel(self.name)
+        else:
+            vars = set(self.index.free_symbols)
+            numel = sympy.Integer(1)
+            for var, size in zip(self.var_names, self.size):
+                if var in vars:
+                    numel = numel * size
+        return numel
+
+    def rename(self, renames: Dict[str, str]) -> "MemoryDep":
+        """Return a copy pointing at ``renames[self.name]``, or ``self`` if not renamed."""
+        if self.name in renames:
+            return MemoryDep(
+                renames[self.name], self.index, var_names=self.var_names, size=self.size
+            )
+        return self
+
+    def numbytes_hint(self):
+        """Return the size hint of ``get_numel()`` times the buffer's dtype size."""
+        return V.graph.sizevars.size_hint(self.get_numel()) * get_dtype_size(
+            V.graph.get_dtype(self.name)
+        )
+
+    def has_unbacked_symbols(self):
+        """Return True if ``get_numel()`` contains any free unbacked symbol."""
+        return len(free_unbacked_symbols(self.get_numel())) > 0
+
+    def is_contiguous(self) -> bool:
+        """Return True if ``index`` is a bare symbol that is one of ``var_names``."""
+        return isinstance(self.index, sympy.Symbol) and self.index in self.var_names
+
+    def is_scalar(self) -> bool:
+        """Return True if ``index`` does not vary with the iteration variables.
+
+        A bare symbol qualifies when it is not in ``var_names`` and the dep is
+        not indirect; any other index qualifies only if it is an integer constant.
+        """
+        if isinstance(self.index, sympy.Symbol):
+            return self.index not in self.var_names and not self.is_indirect()
+        return isinstance(self.index, (int, sympy.Integer))
+
+    def is_indirect(self) -> bool:
+        """Return True if any free symbol of ``index`` has "indirect" or "tmp" in its name."""
+        # Inside this body ``is_indirect`` resolves to the module-level regex
+        # search function, not to this method.
+        return any(is_indirect(v.name) for v in self.index.free_symbols)  # type: ignore[attr-defined]
+
+
+class StarDep(typing.NamedTuple):
+    """Dependency on the whole buffer ``name``; it carries no index expression."""
+
+    # depends on the entire buffer
+    name: str
+
+    @property
+    def index(self):
+        """Always raises: a StarDep has no index."""
+        raise NotImplementedError("StarDep does not have an index")
+
+    def get_numel(self) -> sympy.Expr:
+        """Return the numel that ``V.graph`` reports for the buffer."""
+        return V.graph.get_numel(self.name)
+
+    def rename(self, renames: Dict[str, str]) -> "StarDep":
+        """Return a StarDep on ``renames[self.name]``, or ``self`` if not renamed."""
+        if self.name in renames:
+            return StarDep(renames[self.name])
+        return self
+
+    def numbytes_hint(self):
+        """Return the size hint of ``get_numel()`` times the buffer's dtype size."""
+        return V.graph.sizevars.size_hint(self.get_numel()) * get_dtype_size(
+            V.graph.get_dtype(self.name)
+        )
+
+    def has_unbacked_symbols(self):
+        """Return True if ``get_numel()`` contains any free unbacked symbol."""
+        return len(free_unbacked_symbols(self.get_numel())) > 0
+
+    def is_contiguous(self) -> bool:
+        """Always False for a StarDep."""
+        return False
+
+    def is_scalar(self) -> bool:
+        """Always False for a StarDep."""
+        return False
+
+    def is_indirect(self) -> bool:
+        """Always False for a StarDep."""
+        return False
+
+
+# Used for tracking mutation ordering
+# if A reads a buffer and B mutates it
+# B must be ordered after A
+#
+# It is weak because if it turns out A's read is never used, we can still
+# eliminate it
+class WeakDep(typing.NamedTuple):
+    """Ordering-only dependency on buffer ``name``; it carries no index expression."""
+
+    name: str
+
+    @property
+    def index(self):
+        """Always raises: a WeakDep has no index."""
+        raise NotImplementedError("WeakDep does not have an index")
+
+    def get_numel(self) -> sympy.Expr:
+        """Return the constant ``sympy.Integer(1)``."""
+        return sympy.Integer(1)
+
+    def rename(self, renames: Dict[str, str]) -> "WeakDep":
+        """Return a WeakDep on ``renames[self.name]``, or ``self`` if not renamed."""
+        if self.name in renames:
+            return WeakDep(renames[self.name])
+        return self
+
+    def numbytes_hint(self):
+        """Return the constant 1."""
+        return 1  # Purely inserted for ordering, not an actual dep
+
+    def has_unbacked_symbols(self):
+        """Always False for a WeakDep."""
+        return False
+
+    def is_contiguous(self) -> bool:
+        """Always False for a WeakDep."""
+        return False
+
+
+class IndexExprDep(typing.NamedTuple):
+    """An index expression together with the variables and sizes it ranges over.
+
+    The three fields have the same layout as the tuple returned by
+    ``_RecordLoadStoreInner.canonicalize``, which is how instances are built.
+    """
+
+    index: sympy.Expr  # type: ignore[assignment]
+    var_names: Tuple[sympy.Symbol, ...]
+    size: Tuple[sympy.Expr, ...]
+
+
+@dataclasses.dataclass
+class ReadWrites:
+    """The sets of dependencies read and written by a piece of IR.
+
+    All transforming methods return a new ``ReadWrites``; none mutates ``self``.
+    """
+
+    reads: Set[Dep]
+    writes: Set[Dep]
+    index_exprs: Set[IndexExprDep]
+    range_vars: Optional[List[sympy.Expr]] = None
+    var_ranges: Optional[VarRanges] = None
+    # Number of times each op name was requested while recording.
+    op_counts: typing.Counter[str] = dataclasses.field(
+        default_factory=collections.Counter
+    )
+
+    def rename(self, renames: typing.Dict[str, str]) -> "ReadWrites":
+        """Return a copy whose reads and writes have ``renames`` applied."""
+        return ReadWrites(
+            {dep.rename(renames) for dep in self.reads},
+            {dep.rename(renames) for dep in self.writes},
+            self.index_exprs,
+            self.range_vars,
+            self.var_ranges,
+            op_counts=self.op_counts,
+        )
+
+    def with_read(self, dep: Dep) -> "ReadWrites":
+        """Return a copy with ``dep`` (a WeakDep or StarDep) added to the reads."""
+        assert isinstance(dep, (WeakDep, StarDep))
+        return ReadWrites(
+            set.union(self.reads, {dep}),
+            self.writes,
+            self.index_exprs,
+            self.range_vars,
+            self.var_ranges,
+            op_counts=self.op_counts,
+        )
+
+    def merge(self, other: "ReadWrites"):
+        """Return the union of ``self`` and ``other``.
+
+        Deps that appear in the combined writes are removed from the combined
+        reads. ``range_vars`` and ``var_ranges`` are not carried over.
+        """
+        reads = set.union(self.reads, other.reads)
+        writes = set.union(self.writes, other.writes)
+        index_exprs = set.union(self.index_exprs, other.index_exprs)
+        op_counts = collections.Counter(self.op_counts)
+        op_counts.update(other.op_counts)
+        return ReadWrites(reads - writes, writes, index_exprs, op_counts=op_counts)
+
+    @staticmethod
+    def merge_list(read_writes: List["ReadWrites"]):
+        """Return the union of every entry in ``read_writes``.
+
+        Deps that appear in the combined writes are removed from the combined
+        reads. An empty list yields an empty ``ReadWrites``.
+        """
+        # Start from an empty set so that an empty list is a valid input;
+        # ``set.union(*[])`` has no receiver and raises TypeError.
+        all_writes = set().union(*[rw.writes for rw in read_writes])
+        all_reads = set().union(*[rw.reads for rw in read_writes]) - all_writes
+        all_index_exprs = set().union(*[rw.index_exprs for rw in read_writes])
+
+        op_counts: typing.Counter[Any] = collections.Counter()
+        for rw in read_writes:
+            op_counts.update(rw.op_counts)
+
+        return ReadWrites(all_reads, all_writes, all_index_exprs, op_counts=op_counts)
+
+    def remove_reads(self, rem_reads):
+        """Return a copy with every dep in ``rem_reads`` removed from the reads."""
+        return ReadWrites(
+            self.reads - rem_reads,
+            self.writes,
+            self.index_exprs,
+            self.range_vars,
+            self.var_ranges,
+            op_counts=self.op_counts,
+        )
+
+    def reads_and_writes(self):
+        """Return an iterator over all reads followed by all writes."""
+        return itertools.chain(self.reads, self.writes)
+
+
+class _RecordLoadStoreInner(V.MockHandler):  # type: ignore[name-defined]
+    """Ops handler that records loads, stores and index expressions as deps.
+
+    Each overridden op returns a string rendering of the call; the useful side
+    effect is the growth of ``_reads``, ``_writes`` and ``_index_exprs``.
+    """
+
+    def __init__(self, var_ranges: VarRanges, normalize: bool):
+        super().__init__()
+        self._reads: Set[Dep] = set()
+        self._writes: Set[MemoryDep] = set()
+        self._index_exprs: Set[IndexExprDep] = set()
+        self._var_ranges: VarRanges = var_ranges
+        # When True, canonicalize() re-simplifies the loops and renames variables.
+        self._normalize: bool = normalize
+
+    def canonicalize(
+        self, index: sympy.Expr
+    ) -> Tuple[sympy.Expr, Tuple[sympy.Symbol, ...], Tuple[sympy.Expr, ...]]:
+        """Return ``(index, var_names, sizes)`` describing ``index`` over the loop vars.
+
+        Without normalization the index is returned unchanged and variables
+        whose simplified size equals 1 are dropped. With normalization the loops
+        are simplified, the variables are renamed using
+        ``canonicalization_prefix()`` and trailing variables that do not occur
+        in the rewritten index are removed.
+        """
+        if not self._normalize:
+            sizes = [V.graph.sizevars.simplify(x) for x in self._var_ranges.values()]
+            var_names = tuple(
+                k for k, v in zip(self._var_ranges.keys(), sizes) if v != 1
+            )
+            sizes = tuple(v for v in sizes if v != 1)
+            return index, var_names, sizes  # type: ignore[return-value]
+
+        # Try to further simplify the indexes even if simplify_loops didn't
+        # convert it to the simplest form because of the interference from
+        # different indexing formulas.
+        free_symbols = index.free_symbols
+        var_ranges = {
+            k: V.graph.sizevars.simplify(v)
+            for k, v in self._var_ranges.items()
+            # TODO(jansel): explore this further normalization
+            # if k in free_symbols
+        }
+        index_vars = [*var_ranges.keys()]
+        sizes = tuple(var_ranges.values())
+        new_sizes, reindex, prune = V.graph.sizevars._simplify_loops(
+            index_vars,
+            sizes,
+            index_prevent_reordering([index], index_vars, sizes),
+        )
+
+        # assign new variables each dimension to deal with numbering mismatches
+        # d0, d1, d2 could become d0, d2 -- which won't match d0, d1
+        new_vars, add_var = var_builder(canonicalization_prefix())
+        replacement = dict(zip(index_vars, reindex([add_var(x) for x in new_sizes])))
+        index = sympy_subs(sympy.expand(index), replacement)
+
+        new_vars = [*new_vars.keys()]
+        new_sizes = [*new_sizes]
+        free_symbols = index.free_symbols
+        while new_vars and new_vars[-1] not in free_symbols:
+            # Reduction has last (reduced) dim in its sizes, but
+            # downstream users won't.  Normalize this away.
+            new_vars.pop()
+            new_sizes.pop()
+        return index, tuple(new_vars), tuple(new_sizes)  # type: ignore[arg-type]
+
+    def load(self, name: str, index: sympy.Expr) -> str:
+        """Record a MemoryDep read of ``name`` at the canonicalized ``index``."""
+        self._reads.add(MemoryDep(name, *self.canonicalize(index)))
+        return f"load({name}, {sympy_str(index)})"
+
+    def load_seed(self, name: str, index: int):
+        """Record a read of ``name`` at the constant integer ``index``."""
+        assert isinstance(index, int)
+        return self.load(name, sympy.Integer(index))
+
+    def store(self, name: str, index: sympy.Expr, value: str, mode=None) -> str:
+        """Record a MemoryDep write of ``name`` at the canonicalized ``index``."""
+        self._writes.add(MemoryDep(name, *self.canonicalize(index)))
+        return f"store({name}, {sympy_str(index)}, {value}, {mode})"
+
+    def store_reduction(self, name: str, index, value) -> str:
+        """Record a reduction result as an ordinary store to ``name``."""
+        return self.store(name, index, f"store_reduction({value})")
+
+    def index_expr(self, index: sympy.Expr, dtype) -> str:
+        """Record the canonicalized ``index`` as an IndexExprDep."""
+        self._index_exprs.add(IndexExprDep(*self.canonicalize(index)))
+        return f"index_expr({sympy_str(index)}, {dtype})"
+
+    def bucketize(
+        self,
+        values,
+        offsets_name: str,
+        offsets_size: sympy.Expr,
+        indexing_dtype: torch.dtype,
+        right: bool,
+    ):
+        """Record a whole-buffer (StarDep) read of ``offsets_name``."""
+        self._reads.add(StarDep(offsets_name))
+        return f"bucketize({values}, {offsets_name}, {sympy_str(offsets_size)}, {indexing_dtype}, {right})"
+
+
+class _OpCounter:
+    """Shim to count how many times each op is used"""
+
+    def __init__(self, inner):
+        super().__init__()
+        # Handler that every attribute lookup is forwarded to.
+        self.parent_handler = inner
+        self._op_counts: typing.Counter[Any] = collections.Counter()
+
+    def __getattr__(self, name):
+        # Python calls __getattr__ only when normal lookup fails, so the two
+        # instance attributes set in __init__ are never counted. The count is
+        # incremented on attribute lookup, before the returned op is called.
+        self._op_counts[name] += 1
+        return getattr(self.parent_handler, name)
+
+
+class RecordLoadStore(V.KernelFormatterHandler):  # type: ignore[name-defined]
+    """Kernel formatter handler whose parent records deps and counts ops.
+
+    The handler chain is ``self -> _OpCounter -> _RecordLoadStoreInner``.
+    """
+
+    def __init__(self, var_ranges: VarRanges, normalize: bool):
+        parent_handler = _RecordLoadStoreInner(
+            var_ranges=var_ranges, normalize=normalize
+        )
+        # Wrap the recorder so that every op lookup is counted as well.
+        parent_handler = _OpCounter(parent_handler)
+        super().__init__(parent_handler=parent_handler)
+
+
+def var_builder(prefix: str) -> Tuple[VarRanges, Callable[[sympy.Expr], sympy.Symbol]]:
+    """Return ``(var_ranges, add_var)`` for allocating numbered index symbols.
+
+    Each ``add_var(length)`` call creates the symbol named ``f"{prefix}{k}"`` for
+    the next counter value ``k`` (starting at 0), records ``length`` for it in
+    the shared ``var_ranges`` dict and returns the symbol.
+    """
+    counter = itertools.count()
+    ranges: VarRanges = {}
+
+    def add_var(length: sympy.Expr) -> sympy.Symbol:
+        symbol = sympy_index_symbol(f"{prefix}{next(counter)}")
+        ranges[symbol] = length
+        return symbol
+
+    return ranges, add_var
+
+
+def index_vars_no_squeeze(*argsizes: Tuple[sympy.Expr, ...], prefix: str):
+    """Create one fresh index symbol per dimension of every size tuple.
+
+    Returns ``(args, var_ranges)``: ``args`` holds one list of symbols per entry
+    of ``argsizes`` and ``var_ranges`` maps each created symbol to its length.
+    """
+    var_ranges, add_var = var_builder(prefix)
+    args: List[List[sympy.Symbol]] = [
+        [add_var(length) for length in size] for size in argsizes
+    ]
+    return args, var_ranges
+
+
+def index_vars_squeeze(*argsizes: Tuple[sympy.Expr, ...], prefix: str = "d"):
+    """Create index expressions for every size tuple after squeezing it.
+
+    Each size is passed through ``SqueezeView.squeezer``, which returns a new
+    size and a ``reindex`` callable (presumably dropping size-1 dimensions;
+    confirm in ``.ir``). A fresh symbol is allocated per entry of the new size,
+    and ``reindex`` maps those symbols back to index expressions.
+
+    Returns:
+        ``(args, var_ranges)``: one list of index expressions per entry of
+        ``argsizes`` and the mapping from each allocated symbol to its length.
+    """
+    from .ir import SqueezeView
+
+    var_ranges, add_var = var_builder(prefix)
+    args: List[List[sympy.Expr]] = []
+    for size in argsizes:
+        new_size, reindex = SqueezeView.squeezer(size)
+        args.append(reindex(list(map(add_var, new_size))))
+    return args, var_ranges
+
+
+def extract_read_writes(
+    fn: Callable[..., Any],
+    *argsizes: Tuple[sympy.Expr, ...],
+    normalize: bool = False,
+    prefix: str = "d",
+):
+    """Run ``fn`` under a recording ops handler and return what it read and wrote.
+
+    Index variables are built from ``argsizes`` with ``index_vars_squeeze`` and
+    passed to ``fn`` positionally. The returned ``ReadWrites`` carries the
+    recorded deps, index expressions and op counts, and ``var_ranges``.
+    ``range_vars`` is the flattened argument list, or empty when ``normalize``
+    is set.
+    """
+    args, var_ranges = index_vars_squeeze(*argsizes, prefix=prefix)
+    recorder = RecordLoadStore(var_ranges, normalize=normalize)
+    with V.set_ops_handler(recorder):
+        fn(*args)
+
+    # Number of vars could differ due to normalization, so none are reported then.
+    range_vars = [] if normalize else list(itertools.chain.from_iterable(args))
+
+    op_counter = recorder.parent_handler
+    inner = op_counter.parent_handler
+    return ReadWrites(
+        reads=set(inner._reads),
+        writes=set(inner._writes),
+        index_exprs=inner._index_exprs,
+        range_vars=range_vars,
+        var_ranges=var_ranges,
+        op_counts=op_counter._op_counts,
+    )
+
+
+def extract_input_node_reduction_ranges(
+    input_node: "torch._inductor.ir.TensorBox",
+) -> Tuple[Optional[List[sympy.Expr]], Optional[List[sympy.Expr]]]:
+    """
+    Returns the size and reduction size of all inputs, if the sizes and reduction_sizes (if exist) are all the same.
+    It's possible that a node has multiple inputs, some are Reduction nodes and others are Pointwise nodes.
+    In this case, reduction_sizes of the Reduction nodes need to be the same.
+    Otherwise returns (None, None).
+    """
+
+    from .ir import ComputedBuffer, Loops
+
+    if isinstance(input_node.data, ComputedBuffer):
+        # Input node has already been realized. Return its size and reduction_size.
+        size = input_node.get_size()
+        reduction_size = input_node.get_reduction_size()
+        if len(reduction_size) > 0:
+            return (size, reduction_size)
+        else:
+            return (None, None)
+
+    if not isinstance(input_node.data.data, Loops):  # type: ignore[attr-defined]
+        # Other IRNodes do not have reduction_ranges.
+        return (None, None)
+
+    # There is one issue: what if there are views / permutations between the input node and its dependent realized nodes?
+    # The current method still uses reduction ranges from the dependent realized node, which is not ideal.
+    # Is there a way to check whether there are permutations inbetween?
+    reads = input_node.get_reads()
+    reduction_size = None
+    size = None
+    # Walk the read dependencies one level at a time until some level
+    # contains a reduction buffer or there is nothing left to follow.
+    while reduction_size is None and len(reads) > 0:
+        # Buffer names already visited on this level.
+        seen = set()
+        # Reads of the non-reduction buffers on this level; next level to visit.
+        new_reads = []
+        for read in reads:
+            if not isinstance(read, MemoryDep):
+                continue
+            if read.name in seen:
+                continue
+            seen.add(read.name)
+            buffer = V.graph.get_buffer(read.name)
+            if buffer is None:
+                continue
+            if (
+                isinstance(buffer, ComputedBuffer)
+                and len(buffer.get_reduction_size()) > 0
+            ):
+                if reduction_size is None:
+                    # First reduction buffer on this level fixes the expected sizes.
+                    reduction_size = buffer.get_reduction_size()
+                    size = buffer.get_size()
+                elif (
+                    reduction_size != buffer.get_reduction_size()
+                    or size != buffer.get_size()
+                ):
+                    # Two reduction buffers disagree: no common answer exists.
+                    return (None, None)
+            else:
+                new_reads.extend(buffer.get_reads())
+        # Stop when following the reads made no progress.
+        if reads == new_reads:
+            return (size, reduction_size)
+        else:
+            reads = new_reads
+    return (size, reduction_size)
+
+
+def canonicalization_prefix():
+    """Return the name prefix ("c") for variables created during canonicalization."""
+    return "c"
+
+
+# ops handler which computes all the free unbacked symbols for an IR
+class FreeUnbackedSymbolsOpsHandler:
+    """Ops handler that only accumulates free unbacked symbols into ``symbols``."""
+
+    symbols: Set[sympy.Symbol]
+
+    def __init__(self):
+        self.symbols = set()
+
+    def __getattr__(self, name: str) -> Callable[..., Any]:
+        """Return a generic op implementation for any op not defined explicitly.
+
+        The returned callable scans its positional and keyword arguments for
+        sympy expressions or booleans, adds their free unbacked symbols to
+        ``self.symbols`` and returns ``None``.
+        """
+
+        def inner(*args, **kwargs):
+            for a in itertools.chain(args, kwargs.values()):
+                if isinstance(a, (sympy.Expr, sympy.logic.boolalg.Boolean)):
+                    self.symbols |= free_unbacked_symbols(a)
+
+        return inner
+
+    def indirect_indexing(self, index_var, size, check=True) -> sympy.Symbol:
+        """Collect symbols from ``size`` and return a symbol named ``(index_var)``."""
+        assert not isinstance(index_var, (sympy.Expr, sympy.logic.boolalg.Boolean))
+        self.symbols |= free_unbacked_symbols(size)
+        return sympy_index_symbol(f"({str(index_var)})")
+
+    def frexp(self, x):
+        """Return a pair of ``None`` placeholders; nothing is collected."""
+        return (None,) * 2
+
+    def reduction(
+        self,
+        dtype: torch.dtype,
+        src_dtype: torch.dtype,
+        reduction_type: ReductionType,
+        value: Union[None, Tuple[None, ...]],
+    ) -> Union[None, Tuple[None, ...]]:
+        """Return ``None`` placeholders matching the reduction's output count.
+
+        A tuple of ``None`` is returned when ``reduction_num_outputs`` reports
+        more than one output, otherwise a single ``None``.
+        """
+        num_values = reduction_num_outputs(reduction_type)
+        return (None,) * num_values if num_values > 1 else None
+
+
+def _typecheck_FreeUnbackedSymbolsOpsHandler(
+    h: FreeUnbackedSymbolsOpsHandler,
+) -> OpsHandler[None]:
+    """Return ``h`` unchanged.
+
+    The annotations state that the handler is usable as an ``OpsHandler[None]``;
+    presumably this exists only so a static type checker verifies that claim.
+    """
+    return h
+
+
+def extract_free_unbacked_symbols(fn: Callable[..., Any], index, rindex=None):
+    """Run ``fn`` under a symbol-collecting handler and return the collected set.
+
+    ``fn`` is called as ``fn(index)``, or as ``fn(index, rindex)`` when
+    ``rindex`` is not ``None``. While it runs, ``FlexibleLayout.allow_indexing``
+    is patched to ``True``.
+    """
+    from .ir import FlexibleLayout
+
+    args = [index, rindex] if rindex is not None else [index]
+    handler = FreeUnbackedSymbolsOpsHandler()
+    # NB: I cargo culted the allow_indexing patch here, I don't understand why
+    # people do this all over
+    with V.set_ops_handler(handler), patch.object(
+        FlexibleLayout, "allow_indexing", True
+    ):
+        fn(*args)
+    return handler.symbols
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/exc.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/exc.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9076e1c1808bb4d9a5a37b2774fbd9d839d36bd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/exc.py
@@ -0,0 +1,98 @@
+from __future__ import annotations
+
+import os
+import tempfile
+import textwrap
+from functools import lru_cache
+
+if os.environ.get("TORCHINDUCTOR_WRITE_MISSING_OPS") == "1":
+    # Opt-in debugging aid: append the string form of each missing op to
+    # missing_ops.txt in the system temp directory. Because of the unbounded
+    # lru_cache, repeated calls with an equal target write only once per process.
+
+    @lru_cache(None)
+    def _record_missing_op(target):
+        with open(f"{tempfile.gettempdir()}/missing_ops.txt", "a") as fd:
+            fd.write(str(target) + "\n")
+
+else:
+    # Recording is disabled: same signature, no side effects.
+
+    def _record_missing_op(target):  # type: ignore[misc]
+        pass
+
+
+class OperatorIssue(RuntimeError):
+    """Base class for errors that describe a problem with one operator call."""
+
+    @staticmethod
+    def operator_str(target, args, kwargs):
+        """Render an operator call as an indented, multi-line description.
+
+        The result has one "target" line, one "args[i]" line per positional
+        argument and, when ``kwargs`` is non-empty, one "kwargs" line. Every
+        line is indented by two spaces.
+        """
+        lines = [f"target: {target}"]
+        lines.extend(f"args[{i}]: {arg}" for i, arg in enumerate(args))
+        if kwargs:
+            lines.append(f"kwargs: {kwargs}")
+        return textwrap.indent("\n".join(lines), "  ")
+
+
+class MissingOperatorWithoutDecomp(OperatorIssue):
+    """Error with a "missing lowering" message for the given operator call."""
+
+    def __init__(self, target, args, kwargs):
+        # Passes the target to _record_missing_op before building the message.
+        _record_missing_op(target)
+        super().__init__(f"missing lowering\n{self.operator_str(target, args, kwargs)}")
+
+
+class MissingOperatorWithDecomp(OperatorIssue):
+    """Error with a "missing decomposition" message for the given operator call.
+
+    The message ends with a hint to add the operator to the ``decompositions``
+    list in ``torch._inductor.decompositions``.
+    """
+
+    def __init__(self, target, args, kwargs):
+        # Passes the target to _record_missing_op before building the message.
+        _record_missing_op(target)
+        super().__init__(
+            f"missing decomposition\n{self.operator_str(target, args, kwargs)}"
+            + textwrap.dedent(
+                f"""
+
+                There is a decomposition available for {target} in
+                torch._decomp.get_decompositions().  Please add this operator to the
+                `decompositions` list in torch._inductor.decompositions
+                """
+            )
+        )
+
+
+class LoweringException(OperatorIssue):
+    """Error that pairs an exception ``exc`` with the operator call it is reported for.
+
+    The message starts with the type name and text of ``exc``, followed by the
+    rendered operator call.
+    """
+
+    def __init__(self, exc: Exception, target, args, kwargs):
+        super().__init__(
+            f"{type(exc).__name__}: {exc}\n{self.operator_str(target, args, kwargs)}"
+        )
+
+
+class InvalidCxxCompiler(RuntimeError):
+    """Error whose message says that no working C++ compiler was found.
+
+    The message names the config option (``cpp.cxx`` of the config module) and
+    includes its current value.
+    """
+
+    def __init__(self):
+        # Imported here rather than at module top; presumably to avoid an
+        # import cycle (TODO confirm).
+        from . import config
+
+        super().__init__(
+            f"No working C++ compiler found in {config.__name__}.cpp.cxx: {config.cpp.cxx}"
+        )
+
+
+class CppWrapperCodeGenError(RuntimeError):
+    """Error whose message is ``msg`` prefixed with "C++ wrapper codegen error: "."""
+
+    def __init__(self, msg: str):
+        super().__init__(f"C++ wrapper codegen error: {msg}")
+
+
+class CppCompileError(RuntimeError):
+    """Error for a failed C++ compile; the message embeds the command and its output."""
+
+    def __init__(self, cmd: list[str], output: str):
+        """Build the error message.
+
+        Args:
+            cmd: Command line tokens; joined with single spaces in the message.
+            output: Compiler output, as ``str`` or as UTF-8 encoded ``bytes``.
+        """
+        if isinstance(output, bytes):
+            # Replace undecodable bytes instead of raising, so that a
+            # UnicodeDecodeError cannot mask the compile failure being reported.
+            output = output.decode("utf-8", errors="replace")
+
+        super().__init__(
+            textwrap.dedent(
+                """
+                    C++ compile error
+
+                    Command:
+                    {cmd}
+
+                    Output:
+                    {output}
+                """
+            )
+            .strip()
+            .format(cmd=" ".join(cmd), output=output)
+        )
+
+
+class CUDACompileError(CppCompileError):
+    """CppCompileError subclass with no added behavior; it only provides a distinct type."""
+
+    pass
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cff2844855f9ba1d2328f7036aca61f68da96c5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_utils.py
@@ -0,0 +1,220 @@
+import operator
+from collections import defaultdict
+from typing import Any, Callable, DefaultDict, Dict, Optional, Tuple, Type
+
+import torch
+import torch.fx
+from torch.fx.experimental.symbolic_shapes import statically_known_true, sym_eq
+from torch.utils import _pytree as pytree
+from torch.utils._pytree import tree_map
+from .virtualized import V
+
+
+def matches_module_function_pattern(
+    pattern: Tuple[Type[torch.nn.modules.Module], Callable[..., Any]],
+    node: torch.fx.node.Node,
+    modules: Dict[str, torch.nn.modules.Module],
+) -> bool:
+    """Check whether ``node`` completes a (nn.Module, function/method) pattern.
+
+    Works for length-2 patterns with one module and one function or method.
+    ``pattern[0]`` is the exact module type expected for the ``call_module``
+    node found in ``node.args[0]``; ``pattern[1]`` is the target expected on
+    ``node`` itself. The module node must have at most one user.
+    """
+    if len(node.args) == 0:
+        return False
+    producer = node.args[0]
+    if not (isinstance(producer, torch.fx.Node) and isinstance(node, torch.fx.Node)):
+        return False
+    # The first node must be a call_module that resolves to exactly pattern[0].
+    producer_matches = (
+        producer.op == "call_module"
+        and isinstance(producer.target, str)
+        and producer.target in modules
+        and type(modules[producer.target]) is pattern[0]
+    )
+    if not producer_matches:
+        return False
+    # The second node must be a call_function or call_method targeting pattern[1].
+    if node.op not in ("call_function", "call_method"):
+        return False
+    if node.target != pattern[1]:
+        return False
+    # The module output may only be used by the current node.
+    return len(producer.users) <= 1
+
+
+class FakeTensorUpdater:
+    """
+    The main idea here is that it's difficult to maintain accurate fake
+    tensors (our primary form of metadata) for each node in our graph as we
+    transform it.
+
+    The most reliable way to obtain this information is by rerunning
+    faketensor propagation. However, in general, faketensor propagation is
+    fairly expensive. So, instead we'd like to only rerun faketensor
+    propagation on nodes that have changed.
+
+    In order to detect which nodes have changed, we first hash its node,
+    target, and argument lists (which are immutable in FX).
+
+    Then, whenever we call incremental_update, we check which FX nodes have a
+    new hash, and recompute the faketensor metadata for that node. Then, we
+    continue to recursively compute the faketensors for all users until the
+    fake tensors stop changing.
+    """
+
+    def __init__(self, graph: torch.fx.Graph):
+        """Record the hash of every node currently in ``graph`` as already processed."""
+        self.processed_hashes = set()
+        self.graph = graph
+
+        for node in self.graph.nodes:
+            self.processed_hashes.add(self.hash_node(node))
+
+    def hash_node(self, node: torch.fx.Node):
+        """Return the key used to detect that ``node`` changed since it was last seen."""
+        # todo(chilli): Not a great hash function
+        return (node, node.target, id(node.args), id(node.kwargs))
+
+    def incremental_update(self):
+        """Recompute ``meta["val"]`` for changed aten nodes and propagate to users.
+
+        A node counts as changed when its ``hash_node`` key is not in
+        ``processed_hashes``. Each changed aten node is re-run under
+        ``V.fake_mode`` on the fake values of its arguments. If the result
+        differs from the stored value, the value is replaced and the node's
+        users are queued for the same treatment.
+        """
+        processed = set()
+        # How many nodes currently use each storage; the key None collects
+        # nodes for which get_node_storage returns None.
+        existing_storages: DefaultDict[Optional[int], int] = defaultdict(int)
+        for node in self.graph.nodes:
+            existing_storages[get_node_storage(node)] += 1
+
+        def is_intlist_same(new, old):
+            return statically_known_true(sym_eq(new, old))
+
+        def is_fake_tensor_same(new, old):
+            if type(new) != type(old):
+                return False
+            if isinstance(new, (list, tuple)):
+                if len(new) != len(old):
+                    return False
+                return all(
+                    is_fake_tensor_same(new_i, old_i) for new_i, old_i in zip(new, old)
+                )
+            assert isinstance(new, torch.Tensor)
+            if not is_intlist_same(new.shape, old.shape) or new.layout != old.layout:
+                return False
+            if new.layout == torch.strided and (
+                not is_intlist_same(new.stride(), old.stride())
+                or not statically_known_true(
+                    new.storage_offset() == old.storage_offset()
+                )
+            ):
+                return False
+
+            if get_storage(new) == get_storage(old):
+                return True
+
+            # This is the case where it returns a completely fresh storage that's used nowhere else.
+            if (
+                existing_storages[get_storage(old)] == 1
+                and get_storage(new) not in existing_storages
+            ):
+                return True
+            return False
+
+        def is_aten_node(node):
+            return node.op == "call_function" and isinstance(
+                node.target, torch._ops.OpOverload
+            )
+
+        for node in self.graph.nodes:
+            if self.hash_node(node) in self.processed_hashes:
+                continue
+
+            if not is_aten_node(node):
+                continue
+
+            processing = [node]
+            while processing:
+                updating_node = processing.pop()
+                if updating_node in processed:
+                    continue
+                # Only aten ops can be re-executed through ``target(*args)``
+                # below. Skipping aten nodes here instead would drop the seed
+                # node itself and turn the whole update into a no-op.
+                if not is_aten_node(updating_node):
+                    continue
+
+                is_valid, args, kwargs = get_fake_args_kwargs(updating_node)
+                if not is_valid:
+                    continue
+                with V.fake_mode:
+                    new_fake_tensor = updating_node.target(*args, **kwargs)
+                if "val" in updating_node.meta and is_fake_tensor_same(
+                    new_fake_tensor, updating_node.meta["val"]
+                ):
+                    continue
+                updating_node.meta["val"] = new_fake_tensor
+
+                # todo(chilli): This code path is not exercised by our existing
+                # tests - add a test
+                # get_node_storage reads ``node.meta``, so it must receive the
+                # node whose value was just refreshed, not the raw tensor.
+                existing_storages[get_node_storage(updating_node)] += 1
+                processed.add(updating_node)
+                processing.extend(updating_node.users)
+
+                self.processed_hashes.add(self.hash_node(updating_node))
+
+
+def get_storage(t: torch.Tensor) -> int:
+    """Return the ``_cdata`` of ``t``'s untyped storage, used as a storage identity key."""
+    return t.untyped_storage()._cdata
+
+
+def get_node_storage(node: torch.fx.Node) -> Optional[int]:
+    """Return the storage key of ``node.meta["val"]``, or ``None`` if unavailable.
+
+    ``None`` is returned when the node has no "val" entry, when the value is
+    not a ``torch.Tensor``, or when the tensor has no storage.
+    """
+    val = node.meta.get("val")
+    if isinstance(val, torch.Tensor) and torch._C._has_storage(val):
+        return get_storage(val)
+    return None
+
+
+def get_fake(x):
+    """Return ``x.meta["val"]`` for an fx node that has one; otherwise return ``x``."""
+    if isinstance(x, torch.fx.Node) and "val" in x.meta:
+        return x.meta["val"]
+    return x
+
+
+def get_fake_args_kwargs(
+    x: torch.fx.Node,
+) -> Tuple[bool, Tuple[Any, ...], Dict[str, Any]]:
+    """Replace the fx nodes in ``x``'s arguments with their recorded fake values.
+
+    Returns:
+        ``(is_valid, args, kwargs)``. ``is_valid`` is ``False`` when at least
+        one argument leaf is still an fx node, i.e. that input has no
+        ``meta["val"]`` to substitute; otherwise it is ``True``.
+    """
+    args, kwargs = tree_map(get_fake, (x.args, x.kwargs))
+    has_unresolved_node = any(
+        isinstance(leaf, torch.fx.Node)
+        for leaf in pytree.arg_tree_leaves(*args, **kwargs)
+    )
+    return not has_unresolved_node, args, kwargs
+
+
+def is_node_realized(node: torch.fx.Node) -> bool:
+    """Returns true if a node is always realized when lowered to inductor IR.
+
+    NOTE: This may return some false negatives. e.g. it doesn't
+    handle buffers realized heuristically during lowering, or
+    buffers realized indirectly through view ops.
+    """
+    from torch._inductor.lowering import fallbacks, needs_realized_inputs
+
+    def is_buffer(node: torch.fx.Node) -> bool:
+        # True for placeholders, outputs and fallback kernels, looking through
+        # getitem nodes to the op that produced the indexed value.
+        if node.op == "call_function" and node.target is operator.getitem:
+            # For nodes with multiple outputs, we get the fx graph:
+            #     foo = torch.ops.aten.foo(...)
+            #     getitem = foo[0]
+            #     getitem_1 = foo[1]
+            # where we need to check if foo is a fallback kernel
+            return is_buffer(node.args[0])  # type: ignore[arg-type]
+        return node.op in ("placeholder", "output") or node.target in fallbacks
+
+    if is_buffer(node):
+        return True
+
+    def realizes_inputs(node: torch.fx.Node) -> bool:
+        # True for the graph output and for ops listed in needs_realized_inputs.
+        return node.op == "output" or node.target in needs_realized_inputs
+
+    # A single user that realizes its inputs is enough.
+    if any(realizes_inputs(user) for user in node.users):
+        return True
+
+    # Otherwise, assume node isn't realized
+    return False
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/index_propagation.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/index_propagation.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a02b670982b6157a6989e1b8252c7c2cdc59087
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/index_propagation.py
@@ -0,0 +1,277 @@
+"""This file implements the IndexPropagation ops handler, which wraps an
+underlying handler to add a limited form of constant propagation, as well as
+propagation of sympy expressions downstream of ops.index_expr calls.
+
+For example, say we have the IR:
+
+   tmp0 = ops.index_expr(x, torch.int32)
+   tmp1 = ops.constant(2, torch.int32)
+   tmp2 = ops.mul(tmp0, tmp1)
+   tmp3 = ops.indirect_indexing(tmp2, x_size)
+   tmp4 = ops.load("buf0", tmp3)
+
+The underlying handler would just see:
+
+   ops.load("buf0", x * 2)
+
+This is limited by the set of operators handled in the sympy expression
+printers. So simple operations like minimum and maximum cannot be translated to
+SymPy expressions yet, despite sympy.Min and sympy.Max existing.
+
+"""
+import itertools
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Literal, Optional, overload, Tuple, Union
+
+import sympy
+
+from typing_extensions import TypeAlias
+
+import torch
+from torch._prims_common import is_boolean_dtype, is_integer_dtype
+from torch.utils._sympy.functions import FloorDiv, ModularIndexing, Where
+
+
+@dataclass
+class TypedExpr:
+    """A SymPy expression with associated type"""
+
+    expr: sympy.Expr
+    dtype: torch.dtype
+
+
+class SymPyOps:
+    """An ops handler where all IR values are SymPy expressions
+
+    When a value cannot be represented as a SymPy expression, the method is
+    either not defined, or returns NotImplemented
+
+    """
+
+    @staticmethod
+    def identity(value: Any) -> Any:
+        return value
+
+    @staticmethod
+    def constant(value: Union[int, float, bool], dtype: torch.dtype) -> TypedExpr:
+        if is_boolean_dtype(dtype):
+            expr = sympy.Integer(bool(value))
+        elif is_integer_dtype(dtype):
+            expr = sympy.Integer(int(value))
+        else:
+            expr = sympy.Float(float(value))
+        return TypedExpr(expr, dtype)
+
+    @staticmethod
+    def index_expr(value: sympy.Expr, dtype: torch.dtype) -> Union[int, TypedExpr]:
+        if isinstance(value, int):
+            value = sympy.Integer(value)
+        return TypedExpr(value, dtype)
+
+    @staticmethod
+    def to_dtype(
+        value: Any, dtype: torch.dtype, src_dtype: Optional[torch.dtype] = None
+    ) -> Union[int, TypedExpr]:
+        if isinstance(value.expr, (sympy.Integer, sympy.Float)):
+            return SymPyOps.constant(value.expr, dtype)
+        elif is_integer_dtype(dtype) and is_integer_dtype(value.dtype):
+            return SymPyOps.index_expr(value.expr, dtype)
+        else:
+            # TODO: Inductor doesn't handle floating point in sympy expressions well at the moment
+            return NotImplemented
+
+    @staticmethod
+    def square(x: TypedExpr) -> TypedExpr:
+        return TypedExpr(x.expr * x.expr, x.dtype)
+
+    @staticmethod
+    def add(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        return TypedExpr(x.expr + y.expr, result_type)
+
+    @staticmethod
+    def sub(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        return TypedExpr(x.expr - y.expr, result_type)
+
+    @staticmethod
+    def mul(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        return TypedExpr(x.expr * y.expr, result_type)
+
+    @staticmethod
+    def neg(x: TypedExpr) -> TypedExpr:
+        return TypedExpr(-x.expr, x.dtype)
+
+    @staticmethod
+    def floordiv(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        if not is_integer_dtype(result_type):
+            return NotImplemented
+
+        return TypedExpr(FloorDiv(x.expr, y.expr), result_type)
+
+    @staticmethod
+    def mod(x: TypedExpr, y: TypedExpr) -> Optional[TypedExpr]:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        if not is_integer_dtype(result_type):
+            return NotImplemented
+
+        result_expr = ModularIndexing(x.expr, sympy.Integer(1), y.expr)
+        return TypedExpr(result_expr, result_type)
+
+    @staticmethod
+    def remainder(x: TypedExpr, y: TypedExpr) -> Optional[TypedExpr]:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        if not is_integer_dtype(result_type):
+            return NotImplemented
+        # In these cases, remainder in Python == remainder in C++, so this transformation
+        # is sound
+        if (
+            x.expr.is_nonnegative is not None
+            and x.expr.is_nonnegative == y.expr.is_positive
+        ):
+            result_expr = ModularIndexing(x.expr, sympy.Integer(1), y.expr)
+            return TypedExpr(result_expr, result_type)
+        return NotImplemented
+
+    @staticmethod
+    def minimum(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        return TypedExpr(sympy.Min(x.expr, y.expr), result_type)
+
+    @staticmethod
+    def maximum(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        return TypedExpr(sympy.Max(x.expr, y.expr), result_type)
+
+
+@dataclass
+class IndexPropVar:
+    value: Any  # Either an IR value, or TypedExpr if is_symbolic is true
+    is_symbolic: bool = False
+
+    @staticmethod
+    def new_symbolic(expr: TypedExpr) -> "IndexPropVar":
+        return IndexPropVar(expr, is_symbolic=True)
+
+    def __post_init__(self):
+        assert not self.is_symbolic or isinstance(
+            self.value, TypedExpr
+        ), "Symbolic IndexPropVar must contain a TypedExpr"
+
+
+IndexPropResult: TypeAlias = Union[IndexPropVar, Tuple["IndexPropResult", ...]]
+
+
+class IndexPropagation:
+    """Ops wrapper that tries to propagate constant and index_expr values through the computation.
+
+    This aims to maximize the compile time simplification possible, and convert
+    indirect indexing from arange into normal static indexing.
+
+    """
+
+    def __init__(self, inner: Any):
+        self._inner = inner
+
+    def materialize_expr(self, expr: sympy.Expr, dtype: torch.dtype) -> Any:
+        # Construct a new constant/index_expr from the SymPy expression
+        if isinstance(expr, sympy.Integer):
+            return self._inner.constant(int(expr), dtype)
+        elif expr.is_number:
+            return self._inner.constant(float(expr), dtype)
+        return self._inner.index_expr(expr, dtype)
+
+    def unwrap(self, a: Union[Any, IndexPropVar]) -> Any:
+        if isinstance(a, (list, tuple)):
+            return tuple(self.unwrap(v) for v in a)
+
+        if not isinstance(a, IndexPropVar):
+            return a
+
+        # Prefer the sympy representation if possible
+        if a.is_symbolic:
+            return self.materialize_expr(a.value.expr, a.value.dtype)
+
+        return a.value
+
+    def wrap(self, a) -> IndexPropResult:
+        if isinstance(a, (list, tuple)):
+            return tuple(self.wrap(v) for v in a)
+        return IndexPropVar(a)
+
+    @overload
+    def fallback(
+        self,
+        name: Literal["indirect_indexing"],
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+    ) -> IndexPropVar:
+        ...
+
+    @overload
+    def fallback(
+        self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> IndexPropResult:
+        ...
+
+    def fallback(
+        self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> IndexPropResult:
+        # Fallback to the wrapped handler
+        new_args = [self.unwrap(a) for a in args]
+        new_kwargs = {k: self.unwrap(v) for k, v in kwargs.items()}
+        return self.wrap(getattr(self._inner, name)(*new_args, **new_kwargs))
+
+    def propagate_sympy(
+        self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> IndexPropResult:
+        # Build a new SymPy expression from this ops call
+        def unwrap(a: Union[Any, IndexPropVar]) -> Any:
+            if not isinstance(a, IndexPropVar):
+                return a
+            return a.value
+
+        new_args = [unwrap(a) for a in args]
+        new_kwargs = {k: unwrap(v) for k, v in kwargs.items()}
+        new_expr = getattr(SymPyOps, name)(*new_args, **new_kwargs)
+        is_valid_expr = new_expr is not NotImplemented and (
+            # Inductor doesn't expect floating point in sympy expressions, but
+            # allow floating point constants to be propagated
+            isinstance(new_expr.expr, sympy.Number)
+            or new_expr.expr.is_integer
+        )
+        if not is_valid_expr:
+            return self.fallback(name, args, kwargs)
+        return IndexPropVar.new_symbolic(new_expr)
+
+    def __getattr__(self, name: str) -> Callable[..., IndexPropResult]:
+        def inner(*args: Any, **kwargs: Any) -> IndexPropResult:
+            if not hasattr(SymPyOps, name):
+                return self.fallback(name, args, kwargs)
+
+            var_arguments = [
+                a
+                for a in itertools.chain(args, kwargs.values())
+                if isinstance(a, IndexPropVar)
+            ]
+            if not all(v.is_symbolic for v in var_arguments):
+                return self.fallback(name, args, kwargs)
+
+            return self.propagate_sympy(name, args, kwargs)
+
+        return inner
+
+    def indirect_indexing(
+        self, index: Union[Any, IndexPropVar], size: Any, check: bool = True
+    ) -> Any:
+        # nb. We do index + Where(...) rather than Where(idx >= 0, idx, idx + sz) because we don't have CSE
+        #     for SymPy expressions, so we don't want to repeat idx too much
+
+        # indirect_indexing returns a sympy value, so no need to wrap in IndexPropVar here
+        if isinstance(index, IndexPropVar) and index.is_symbolic:
+            # If we are turning an indirect indexing into direct, we need to wrap it.
+            index = index.value.expr
+            return index + Where(index >= 0, 0, size)
+        return self.fallback("indirect_indexing", (index, size, check), {}).value
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/metrics.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9a2fcca5f2fb9bbb2118b5b833be61670ee6d3d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/metrics.py
@@ -0,0 +1,419 @@
+from __future__ import annotations
+
+import csv
+import inspect
+import os
+import re
+from dataclasses import dataclass
+from functools import lru_cache
+
+from typing import Dict, List, Set, Tuple, TYPE_CHECKING, Union
+
+from torch._inductor import config
+from torch._inductor.utils import get_benchmark_name
+
+# Prevent circular import
+if TYPE_CHECKING:
+    from torch._inductor.scheduler import (
+        BaseSchedulerNode,
+        ExternKernelSchedulerNode,
+        NopKernelSchedulerNode,
+        SchedulerNode,
+    )
+
+# counter for tracking how many kernels have been generated
+generated_kernel_count = 0
+generated_cpp_vec_kernel_count = 0
+num_bytes_accessed = 0
+nodes_num_elem: List[
+    Tuple[
+        Union[NopKernelSchedulerNode, SchedulerNode, ExternKernelSchedulerNode],
+        int,
+    ]
+] = []
+node_runtimes: List[Tuple[BaseSchedulerNode, float]] = []
+
+# counters for tracking fusions
+ir_nodes_pre_fusion = 0
+
+# counters for tracking to_dtype inserted
+cpp_to_dtype_count = 0
+
+# counters for tracking cpp_wrapper disabled
+disable_cpp_wrapper = 0
+
+
+# reset all counters
+def reset():
+    global generated_kernel_count
+    global generated_cpp_vec_kernel_count
+    global num_bytes_accessed, nodes_num_elem
+    global ir_nodes_pre_fusion
+    global cpp_to_dtype_count
+    global disable_cpp_wrapper
+
+    generated_kernel_count = 0
+    generated_cpp_vec_kernel_count = 0
+    num_bytes_accessed = 0
+    nodes_num_elem.clear()
+    node_runtimes.clear()
+    ir_nodes_pre_fusion = 0
+    cpp_to_dtype_count = 0
+    disable_cpp_wrapper = 0
+
+
+@dataclass
+class CachedMetricsDeltas:
+    """
+    The subset of metrics we want to update across cache hits, e.g., the
+    FxGraphCache.
+    """
+
+    generated_kernel_count: int
+    generated_cpp_vec_kernel_count: int
+    ir_nodes_pre_fusion: int
+    cpp_to_dtype_count: int
+
+
+class CachedMetricsHelper:
+    """
+    A helper class to help calculate and apply counter deltas for those
+    metrics we want to save with cache entries (e.g., FxGraphCache) and
+    apply on a cache hit.
+    """
+
+    def __init__(self):
+        global generated_kernel_count
+        global generated_cpp_vec_kernel_count
+        global ir_nodes_pre_fusion
+        global cpp_to_dtype_count
+
+        self.generated_kernel_count = generated_kernel_count
+        self.generated_cpp_vec_kernel_count = generated_cpp_vec_kernel_count
+        self.ir_nodes_pre_fusion = ir_nodes_pre_fusion
+        self.cpp_to_dtype_count = cpp_to_dtype_count
+
+    def get_deltas(self) -> CachedMetricsDeltas:
+        global generated_kernel_count
+        global generated_cpp_vec_kernel_count
+        global ir_nodes_pre_fusion
+        global cpp_to_dtype_count
+
+        return CachedMetricsDeltas(
+            generated_kernel_count - self.generated_kernel_count,
+            generated_cpp_vec_kernel_count - self.generated_cpp_vec_kernel_count,
+            ir_nodes_pre_fusion - self.ir_nodes_pre_fusion,
+            cpp_to_dtype_count - self.cpp_to_dtype_count,
+        )
+
+    @staticmethod
+    def apply_deltas(delta: CachedMetricsDeltas):
+        global generated_kernel_count
+        global generated_cpp_vec_kernel_count
+        global ir_nodes_pre_fusion
+        global cpp_to_dtype_count
+
+        generated_kernel_count += delta.generated_kernel_count
+        generated_cpp_vec_kernel_count += delta.generated_cpp_vec_kernel_count
+        ir_nodes_pre_fusion += delta.ir_nodes_pre_fusion
+        cpp_to_dtype_count += delta.cpp_to_dtype_count
+
+
+REGISTERED_METRIC_TABLES: Dict[str, MetricTable] = {}
+
+
+@dataclass
+class MetricTable:
+    table_name: str
+    column_names: List[str]
+
+    num_rows_added: int = 0
+
+    def add_row(self, row_fn):
+        if self.table_name not in enabled_metric_tables():
+            return
+
+        row_dict = row_fn()
+        assert len(self.column_names) == len(
+            row_dict
+        ), f"{len(self.column_names)} v.s. {len(row_dict)}"
+        assert set(self.column_names) == set(
+            row_dict.keys()
+        ), f"{set(self.column_names)} v.s. {set(row_dict.keys())}"
+
+        row = [
+            get_benchmark_name(),
+        ]
+        row += [row_dict[column_name] for column_name in self.column_names]
+        self._write_row(row)
+
+    def output_filename(self):
+        return f"metric_table_{self.table_name}.csv"
+
+    def write_header(self):
+        filename = self.output_filename()
+        with open(filename, "w") as fd:
+            writer = csv.writer(fd, lineterminator="\n")
+            writer.writerow(["model_name"] + self.column_names)
+
+    def _write_row(self, row):
+        filename = self.output_filename()
+        if self.num_rows_added == 0 and not os.path.exists(filename):
+            self.write_header()
+
+        self.num_rows_added += 1
+
+        for idx, orig_val in enumerate(row):
+            if isinstance(orig_val, float):
+                new_val = f"{orig_val:.6f}"
+            elif orig_val is None:
+                new_val = ""
+            else:
+                new_val = orig_val
+            row[idx] = new_val
+
+        with open(filename, "a") as fd:
+            writer = csv.writer(fd, lineterminator="\n")
+            writer.writerow(row)
+
+    @staticmethod
+    def register_table(name, column_names):
+        table = MetricTable(name, column_names)
+        REGISTERED_METRIC_TABLES[name] = table
+
+
+MetricTable.register_table(
+    "slow_fusion",
+    [
+        "kernel1_path",
+        "kernel1_latency",
+        "kernel2_path",
+        "kernel2_latency",
+        "fused_kernel_path",
+        "fused_kernel_latency",
+        "slow_down_ratio",
+    ],
+)
+
+# track the fusion statistics for each graph
+MetricTable.register_table(
+    "graph_stats",
+    [
+        "graph_id",
+        "num_nodes_before_fusion",
+        "num_nodes_after_fusion",
+    ],
+)
+
+# track the perf difference between persistent reduction and non-persistent
+# reductions
+MetricTable.register_table(
+    "persistent_red_perf",
+    [
+        "kernel1_name",
+        "kernel2_name",
+        "kernel1_latency",
+        "kernel2_latency",
+        "size_hints",
+        "reduction_hint",
+        "speedup",
+    ],
+)
+
+# Log metadata for pointwise/reduction kernels. E.g., model name, kernel path, numel, rnumel, reduction hint
+MetricTable.register_table(
+    "kernel_metadata",
+    [
+        "kernel_name",
+        "kernel_path",
+        "kernel_category",  # pointwise/reduction/foreach etc.
+        "size_hints",
+        "reduction_hint",
+        "line_of_code",
+        "num_load",
+        "num_store",
+        "num_for_loop",
+        "num_atomic_add",
+        "num_args",
+        # xyz numel can be different to size_hints since size_hints are rounded
+        # up to the nearest power of 2.
+        # Inductor kernel will burn in the xyz numel in kernel code for static
+        # shape kernels.
+        # Logging them will be helpful to find unaligned shape for reduction
+        "xnumel",
+        "ynumel",
+        "rnumel",
+        "kernel_args_num_gb",
+    ],
+)
+
+
+def _parse_kernel_fn_code(kernel_module_code):
+    """
+    The kernel_module_code is the python module that contains kernel function code.
+    kernel function is the proper triton kernel function annotated with
+    @triton.jit
+    """
+    from .codecache import PyCodeCache
+    from .wrapper_benchmark import get_triton_kernel
+
+    mod = PyCodeCache.load(kernel_module_code)
+    kernel = get_triton_kernel(mod)
+    # kernel is a CachingAutotune; kernel.fn is the JITFunction;
+    # kernel.fn.fn is the function being decorated by triton.jit
+    return inspect.getsource(kernel.fn.fn)
+
+
+def _parse_kernel_line_of_code(proper_kernel_fn_code):
+    """
+    Return the number of lines of code in the kernel, excluding the decorators.
+    """
+    return len(proper_kernel_fn_code.splitlines())
+
+
+def _parse_size_hints(kernel_module_code, kernel_category):
+    if kernel_category == "foreach":
+        # foreach kernel does not have size_hints
+        return None
+    m = re.search(r"size_hints=(\[[0-9, ]*\]),", kernel_module_code)
+    assert m, "size_hints missing!"
+    return m.group(1)
+
+
+def _parse_reduction_hint(kernel_category, kernel_module_code):
+    if kernel_category not in ("reduction", "persistent_reduction"):
+        return None
+    m = re.search(r"reduction_hint=ReductionHint\.(\w*),", kernel_module_code)
+    assert m, "reduction_hint not found in kernel source code!"
+    return m.group(1)
+
+
+def _count_pattern(proper_kernel_fn_code, pattern):
+    return proper_kernel_fn_code.count(pattern)
+
+
+def _count_args(proper_kernel_fn_code):
+    def_line = proper_kernel_fn_code.splitlines()[0]
+    assert def_line.startswith("def ")
+    start_idx = def_line.index("(")
+    end_idx = def_line.index("):")
+    decl_csv = def_line[start_idx + 1 : end_idx]
+    comps = decl_csv.split(",")
+    return len(comps)
+
+
+def _parse_proper_kernel_fn_code(kernel_fn_code):
+    """
+    Skip decorators.
+    """
+    start_pos = kernel_fn_code.index("def ")
+    return kernel_fn_code[start_pos:]
+
+
+def _parse_numel(proper_kernel_fn_code, numel_arg_name):
+    m = re.search(f"{numel_arg_name} = ([\\d]+)", proper_kernel_fn_code)
+    if m:
+        return int(m.group(1))
+    else:
+        return None
+
+
+def _parse_kernel_args_num_gb(kernel_fn_code, kernel_category):
+    """
+    inductor meta looks like:
+        inductor_meta={... 'mutated_arg_names': [], 'no_x_dim': False, 'kernel_num_gb': 2.0},
+    """
+    m = re.search(r".kernel_num_gb.:\s*([0-9.]+)", kernel_fn_code)
+    if m:
+        return float(m.group(1))
+    else:
+        """
+        There are a few cases where the kernel_num_gb field can be missing:
+        1. the field will be missing if config.benchmark_kernel and
+           config.profile_bandwidth are false
+        2. even if config.benchmark_kernel or config.profile_bandwidth is true,
+           a foreach kernel does not have the kernel_num_gb field in its metadata
+        """
+        return None
+
+
+def log_kernel_metadata(kernel_name, kernel_path, kernel_module_code):
+    """
+    A utility to log kernel metadata. We may parse metadata from kernel source code here.
+
+    It's fine to parse the generated kernel code here since the logging is
+    disabled by default. When enabled, the parsing would hurt compilation time.
+    """
+    from .wrapper_benchmark import get_kernel_category_by_source_code
+
+    kernel_category = get_kernel_category_by_source_code(kernel_module_code)
+    reduction_hint = _parse_reduction_hint(kernel_category, kernel_module_code)
+    size_hints = _parse_size_hints(kernel_module_code, kernel_category)
+    kernel_fn_code = _parse_kernel_fn_code(kernel_module_code)
+
+    proper_kernel_fn_code = _parse_proper_kernel_fn_code(kernel_fn_code)
+
+    # the number of lines of code excluding the decorators
+    kernel_line_of_code = _parse_kernel_line_of_code(proper_kernel_fn_code)
+
+    get_metric_table("kernel_metadata").add_row(
+        lambda: {
+            "kernel_name": kernel_name,
+            "kernel_path": kernel_path,
+            "kernel_category": kernel_category,
+            "size_hints": size_hints,
+            "reduction_hint": reduction_hint,
+            "line_of_code": kernel_line_of_code,
+            "num_load": _count_pattern(proper_kernel_fn_code, "tl.load"),
+            "num_store": _count_pattern(proper_kernel_fn_code, "tl.store"),
+            "num_for_loop": _count_pattern(proper_kernel_fn_code, "for "),
+            "num_atomic_add": _count_pattern(proper_kernel_fn_code, "tl.atomic_add"),
+            "num_args": _count_args(proper_kernel_fn_code),
+            "xnumel": _parse_numel(proper_kernel_fn_code, "xnumel"),
+            "ynumel": _parse_numel(proper_kernel_fn_code, "ynumel"),
+            "rnumel": _parse_numel(proper_kernel_fn_code, "rnumel"),
+            "kernel_args_num_gb": _parse_kernel_args_num_gb(
+                kernel_fn_code, kernel_category
+            ),
+        }
+    )
+
+
+def purge_old_log_files():
+    """
+    Purge the old log file at the beginning when the benchmark script runs.
+    Should do it in the parent process rather than the child processes running
+    each individual model.
+    """
+    for name, table in REGISTERED_METRIC_TABLES.items():
+        if name in enabled_metric_tables():
+            filename = table.output_filename()
+            if os.path.exists(filename):
+                os.unlink(filename)
+
+            table.write_header()
+
+
+@lru_cache
+def enabled_metric_tables() -> Set[str]:
+    config_str = config.enabled_metric_tables
+
+    enabled = set()
+    for name in config_str.split(","):
+        name = name.strip()
+        if not name:
+            continue
+        assert (
+            name in REGISTERED_METRIC_TABLES
+        ), f"Metric table name {name} is not registered"
+        enabled.add(name)
+    return enabled
+
+
+def is_metric_table_enabled(name):
+    return name in enabled_metric_tables()
+
+
+def get_metric_table(name):
+    assert name in REGISTERED_METRIC_TABLES, f"Metric table {name} is not defined"
+    return REGISTERED_METRIC_TABLES[name]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/triton_helpers.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/triton_helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f7f3145542b3af36ffc84cb2fc426520e147097
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/triton_helpers.py
@@ -0,0 +1,344 @@
+import triton
+import triton.language as tl
+
+# In the latest triton, math functions were shuffled around into different modules:
+# https://github.com/openai/triton/pull/3172
+if hasattr(tl.extra.cuda, "libdevice"):
+    libdevice = tl.extra.cuda.libdevice
+    math = tl.math
+else:
+    libdevice = tl.math
+    math = tl
+
+
+@triton.jit
+def promote_to_tensor(x):
+    # Addition promotes to tensor for us
+    return x + tl.zeros((1,), tl.int1)
+
+
+@triton.jit
+def is_floating(x):
+    return promote_to_tensor(x).dtype.is_floating()
+
+
+@triton.jit
+def _prod_accumulate(a, b):
+    return a * b
+
+
+@triton.jit
+def prod(input, axis):
+    return tl.reduce(input, axis, _prod_accumulate)
+
+
+@triton.jit
+def minimum(a, b):
+    mask = a < b
+    if is_floating(a):
+        mask |= a != a
+    return tl.where(mask, a, b)
+
+
+@triton.jit
+def maximum(a, b):
+    mask = a > b
+    if is_floating(a):
+        mask |= a != a
+    return tl.where(mask, a, b)
+
+
+@triton.jit
+def min2(a, dim):
+    return tl.reduce(a, dim, minimum)
+
+
+@triton.jit
+def max2(a, dim):
+    return tl.reduce(a, dim, maximum)
+
+
+@triton.jit
+def minimum_with_index(a_value, a_index, b_value, b_index):
+    mask = a_value < b_value
+    equal = a_value == b_value
+    if is_floating(a_value):
+        a_isnan = a_value != a_value
+        b_isnan = b_value != b_value
+        mask |= a_isnan and not b_isnan
+        # Consider NaNs as equal
+        equal |= a_isnan and b_isnan
+
+    # Prefer lowest index if values are equal
+    mask |= equal & (a_index < b_index)
+    return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index)
+
+
+@triton.jit
+def maximum_with_index(a_value, a_index, b_value, b_index):
+    mask = a_value > b_value
+    equal = a_value == b_value
+    if is_floating(a_value):
+        a_isnan = a_value != a_value
+        b_isnan = b_value != b_value
+        mask |= a_isnan and not b_isnan
+        # Consider NaNs as equal
+        equal |= a_isnan and b_isnan
+
+    # Prefer lowest index if values are equal
+    mask |= equal & (a_index < b_index)
+    return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index)
+
+
+@triton.jit
+def min_with_index(value, index, dim):
+    return tl.reduce((value, index), dim, minimum_with_index)
+
+
+@triton.jit
+def max_with_index(value, index, dim):
+    return tl.reduce((value, index), dim, maximum_with_index)
+
+
+@triton.jit
+def welford_reduce(value, mean, m2, weight, first_iteration):
+    if first_iteration:
+        new_weight = tl.full(weight.shape, 1, weight.dtype)
+        new_mean = value
+        new_m2 = tl.zeros_like(m2)
+    else:
+        delta = value - mean
+        new_weight = weight + 1
+        new_mean = mean + delta / new_weight
+        new_m2 = m2 + delta * (value - new_mean)
+    return new_mean, new_m2, new_weight
+
+
+@triton.jit
+def welford_combine(mean_1, m2_1, weight_1, mean_2, m2_2, weight_2):
+    delta = mean_2 - mean_1
+    new_weight = weight_1 + weight_2
+    w2_over_w = tl.where(new_weight == 0.0, 0.0, weight_2 / new_weight)
+    return (
+        mean_1 + delta * w2_over_w,
+        m2_1 + m2_2 + delta * delta * weight_1 * w2_over_w,
+        new_weight,
+    )
+
+
+@triton.jit
+def welford(mean, m2, weight, dim):
+    return tl.reduce((mean, m2, weight), dim, welford_combine)
+
+
+@triton.jit
+def device_assert_then(cond, msg, r):
+    tl.device_assert(cond, msg)
+    return r
+
+
+@triton.jit
+def randint64(seed, offset, low, high):
+    r0, r1, r2, r3 = tl.randint4x(seed, offset)
+    r0 = r0.to(tl.uint64)
+    r1 = r1.to(tl.uint64)
+    result = r0 | (r1 << 32)
+    size = high - low
+    result = result % size.to(tl.uint64)
+    result = result.to(tl.int64) + low
+    return result
+
+
+@triton.jit
+def _any_combine(a, b):
+    return a | b
+
+
+@triton.jit
+def any(a, dim):
+    return tl.reduce(a, dim, _any_combine)
+
+
+@triton.jit
+def bucketize_binary_search(
+    values,  # 1D tensor
+    offsets_ptr,
+    indexing_dtype,
+    right,  # bool: if true, use intervals closed on the left; see [Note: Inductor bucketize op]
+    OFFSETS_SIZE: int,
+    BLOCK_SHAPE,  # tuple/list of block shape
+):
+    """
+    See [Note: Inductor bucketize op]
+    """
+
+    low = tl.zeros(BLOCK_SHAPE, dtype=indexing_dtype)
+    high = tl.full(BLOCK_SHAPE, OFFSETS_SIZE, dtype=indexing_dtype)
+
+    full_range = OFFSETS_SIZE + 1
+    while full_range > 1:
+        mid = (high + low) // 2
+        mask = mid < OFFSETS_SIZE
+        bucket_upper_bound = tl.load(offsets_ptr + mid, mask=mask)
+        if right:
+            is_above = values >= bucket_upper_bound
+        else:
+            is_above = values > bucket_upper_bound
+
+        low = tl.where(is_above & mask, mid + 1, low)
+        high = tl.where(is_above, high, mid)
+
+        full_range = (full_range + 1) // 2
+
+    return low
+
+
+@triton.jit
+def pack_value_flag(
+    value,
+    flag,
+    DTYPE_VALUE_AS_UINT: tl.constexpr,
+    DTYPE_PACK: tl.constexpr,
+):
+    # Workaround for triton bug, tensor.to doesn't unwrap constexpr values
+    DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT)
+    bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth
+    uv = value.to(DTYPE_VALUE_AS_UINT, bitcast=True).to(DTYPE_PACK)
+    return flag.to(DTYPE_PACK) | (uv << bitwidth)
+
+
+@triton.jit
+def unpack_value(
+    pack,
+    DTYPE_VALUE,
+    DTYPE_VALUE_AS_UINT,
+):
+    # Workaround for triton bug, tensor.to doesn't unwrap constexpr values
+    DTYPE_VALUE = tl.core._constexpr_to_value(DTYPE_VALUE)
+    DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT)
+    bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth
+    value_uint = (pack >> bitwidth).to(DTYPE_VALUE_AS_UINT)
+    return value_uint.to(DTYPE_VALUE, bitcast=True)
+
+
+@triton.jit
+def unpack_flag(pack, DTYPE_FLAG):
+    return pack.to(DTYPE_FLAG)
+
+
+@triton.jit
+def exclusive_scan_decoupled_lookback(
+    scratch_base,
+    block_value,
+    index,
+    combine_fn,
+    init,
+    DTYPE_VALUE_AS_UINT: tl.constexpr,
+    DTYPE_PACK: tl.constexpr,
+):
+    """Compute exclusive scan of a scalar value between blocks
+
+    Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back
+
+    scratch_base: Pointer to scratch space in global memory
+    block_value: Scalar value for this block
+    index: Scalar index of this block relative to the current scan
+    combine_fn: Function ``(value, value) -> value`` which is scanned over
+    init: Scalar value equal to the identity of combine_fn
+    DTYPE_VALUE_AS_UINT: A tl.uint{n} type equal in size to ``block_value``
+    DTYPE_PACK: Unsigned type twice the width of block_value
+
+    NOTE: This function is limited to values which are 32-bits or less.
+    """
+    DTYPE_VALUE = block_value.dtype
+    pack = pack_value_flag(
+        block_value,
+        tl.full(block_value.shape, 1, DTYPE_VALUE_AS_UINT),
+        DTYPE_VALUE_AS_UINT,
+        DTYPE_PACK,
+    )
+    tl.atomic_xchg(scratch_base + index, pack, sem="relaxed")
+
+    exclusive_prefix = init
+    test_target = index - 1
+    while test_target >= 0:
+        # Emulate tl.atomic_load: an atomic_add of 0 reads the slot atomically
+        flag = tl.full([], 0, DTYPE_VALUE_AS_UINT)
+        while flag == 0:
+            pack = tl.atomic_add(scratch_base + test_target, 0, sem="relaxed")
+            flag = unpack_flag(pack, DTYPE_VALUE_AS_UINT)
+
+        value = unpack_value(pack, DTYPE_VALUE, DTYPE_VALUE_AS_UINT)
+        exclusive_prefix = combine_fn(value, exclusive_prefix)
+
+        if flag == 2:
+            test_target = -1
+        else:
+            test_target = test_target - 1
+
+    # Make inclusive block sum visible to other blocks
+    inclusive_prefix = combine_fn(exclusive_prefix, block_value)
+    pack = pack_value_flag(
+        inclusive_prefix,
+        tl.full([], 2, DTYPE_VALUE_AS_UINT),
+        DTYPE_VALUE_AS_UINT,
+        DTYPE_PACK,
+    )
+    tl.atomic_xchg(scratch_base + index, pack, sem="relaxed")
+    return exclusive_prefix
+
+
+@triton.jit
+def exclusive_scan_decoupled_lookback_64(
+    scratch_base, block_value, index, combine_fn, init
+):
+    """Compute exclusive scan of a scalar value between blocks
+
+    Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back
+
+    scratch_base: Pointer to scratch space in global memory
+    block_value: Scalar value for this block, must be 64-bits wide
+    index: Scalar index of this block relative to the current scan
+    combine_fn: Function ``(value, value) -> value`` which is scanned over
+    init: Scalar value equal to the identity of combine_fn
+    """
+    block_value_u64 = block_value.to(tl.uint64, bitcast=True)
+    tl.store(scratch_base + 3 * index + 1, block_value_u64)
+    tl.debug_barrier()
+    flag_one = tl.full([], 1, tl.uint64)
+    tl.atomic_xchg(scratch_base + 3 * index + 0, flag_one, sem="release")
+
+    exclusive_prefix = init
+    test_target = index - 1
+    while test_target >= 0:
+        flag = tl.full([], 0, tl.uint64)
+        while flag == 0:
+            flag = tl.atomic_add(scratch_base + 3 * test_target + 0, 0, sem="acquire")
+
+        value_u64 = tl.load(scratch_base + 3 * test_target + flag.to(tl.int32))
+        value = value_u64.to(block_value.dtype, bitcast=True)
+        exclusive_prefix = combine_fn(value, exclusive_prefix)
+
+        if flag == 2:
+            test_target = -1
+        else:
+            test_target = test_target - 1
+
+    # Make inclusive block sum visible to other blocks
+    inclusive_prefix = combine_fn(exclusive_prefix, block_value)
+    inclusive_prefix_u64 = inclusive_prefix.to(tl.uint64, bitcast=True)
+    tl.store(scratch_base + 3 * index + 2, inclusive_prefix_u64)
+    tl.debug_barrier()
+    flag_two = tl.full([], 2, tl.uint64)
+    tl.atomic_xchg(scratch_base + 3 * index + 0, flag_two, sem="release")
+
+    return exclusive_prefix
+
+
+@triton.jit
+def frexp(x):
+    # TODO(isuruf): use inline_asm_elementwise here
+    y = libdevice.ilogb(x) + 1
+    exponent = tl.where(x == 0, 0, y)
+    mantissa = tl.where(x == 0, 0, libdevice.ldexp(x, -y))
+    return mantissa, exponent
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdf19e8d7362272c50d36ff36fcc7c5918a98afb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h
@@ -0,0 +1,401 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include <ATen/mps/MPSAllocatorInterface.h>
+#include <ATen/mps/MPSEvent.h>
+#include <ATen/mps/MPSStream.h>
+
+#include <cstdio>
+#include <mutex>
+#include <set>
+#include <unordered_set>
+#include <mach/vm_page_size.h>
+#include <c10/util/flat_hash_map.h>
+
+// this implementation is based on CUDACachingAllocator.
+// It utilizes Metal Heaps to improve the performance with buffer allocation.
+// Do not include this header. Use MPSAllocatorInterface.h instead.
+// TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
+namespace at::mps::HeapAllocator {
+
+static const size_t kMaxSmallAlloc = MB(1);    // largest "small" allocation is 1 MiB
+static const size_t kMinLargeAlloc = MB(10);   // allocations between 1 and 10 MiB may use kLargeHeap
+static const size_t kRoundLarge    = MB(2);    // round up large allocations to 2 MiB
+static const size_t kSmallHeap     = MB(8);    // "small" allocations are packed in 8 MiB heaps
+static const size_t kLargeHeap     = MB(32);   // "large" allocations may be packed in 32 MiB heaps
+static const size_t kXLargeHeapD   = MB(128);  // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
+static const size_t kXLargeHeapU   = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
+static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation
+
+// buffer pools could be customized with a combination of usage flags
+enum UsageFlags : uint32_t {
+  PRIVATE = 0,
+  SMALL   = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
+  SHARED  = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
+  MANAGED = (1 << 2), // managed storage mode
+  HAZARD  = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
+  SCALAR  = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
+};
+// debug verbosity flags
+enum DebugVerbosity : uint32_t {
+  SILENT      = 0,
+  PROFILING   = (1 << 0), // print generic profiling data for total system memory usage
+  ALLOCATIONS = (1 << 1), // print buffer allocations
+  RECYCLES    = (1 << 2), // print buffer recycling
+  RELEASES    = (1 << 3), // print buffer releases
+  LARGE_ONLY  = (1 << 4), // only log large buffer pool transactions
+};
+
+struct HeapBlock;
+
+struct BufferBlock {
+  id<MTLBuffer> buffer; // wrapped Metal buffer (nullptr when constructed from a size only)
+  void* cpu_ptr = nullptr; // stores the pointer to CPU mapping of a Shared MTLBuffer
+  size_t size; // size after alignment
+  size_t requested_size; // requested size (before alignment)
+  // buffer shape is used for retrieving base of views in cached graphs
+  std::vector<int64_t> shape;
+  bool in_use = false;
+  HeapBlock* heap; // heap passed to the constructor (nullptr by default)
+  id_t buf_id; // unique id taken from buffer_counter, or 0 when no MTLBuffer was given
+  // counter used to select least recently used buffers as candidates for garbage collection
+  uint32_t gc_count = 0;
+  uint32_t use_count = 0;
+  // counter to assign unique ids to buffer blocks
+  static uint64_t buffer_counter;
+  // Metal events used to sync GPU/CPU operations on the shared-storage buffers
+  MPSEventPtr event;
+
+  BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr,
+              HeapBlock* Heap = nullptr) :
+              buffer(Buffer), size(Size), requested_size(RequestedSize),
+              heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { } // buffer_counter is only advanced when a real buffer is wrapped
+
+  static bool Comparator(const BufferBlock* a, const BufferBlock* b) { // "less than": orders by size first, then by buffer address as tie-break
+    return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer;
+  }
+  static size_t alignUp(size_t Size, size_t Alignment) { // rounds Size up to a multiple of Alignment (bit-mask arithmetic)
+    assert(((Alignment - 1) & Alignment) == 0); // rejects alignments with more than one bit set (i.e., not a power of two)
+    return ((Size + Alignment - 1) & ~(Alignment - 1));
+  }
+  uint32_t retainCount() const { return [buffer retainCount]; } // Objective-C retain count of the wrapped MTLBuffer
+};
+typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*);
+
+struct BufferPool;
+struct AllocParams {
+  AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) : // Alloc_Size seeds search_key; Requested_Size is stored as-is
+              search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { }
+  size_t size() const { return search_key.size; } // the Alloc_Size given to the constructor
+
+  BufferBlock search_key; // BufferBlock built from the size only (no MTLBuffer); presumably the key used to search a pool's buffers -- TODO confirm
+  BufferPool* pool; // pool for this request; forwarded to new HeapBlocks in HeapBlock::createHeapBlock
+  BufferBlock* buffer_block = nullptr; // nullptr on construction; presumably set to the block that satisfies the request -- TODO confirm
+  size_t requested_size; // the Requested_Size given to the constructor
+  // true if we exceed the low watermark limit. In this case
+  // we apply strategies to relieve the pressure before allocation.
+  bool has_memory_pressure = false;
+  // true if we're allocating on a unified memory device
+  bool has_unified_memory = true;
+};
+
+struct HeapBlock {
+  id<MTLHeap> heap;
+  struct { size_t total, available; } size;
+  BufferPool* pool;
+  unsigned int n_buffers = 0;
+  id_t heap_id;
+  // indicates if we split this heap to sub-allocate 'several' buffers (otherwise single buffer)
+  bool is_split;
+  // counter to assign unique ids to heap blocks
+  static uint64_t heap_counter;
+
+  HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool *Pool = nullptr) :
+            heap(Heap), size({.total = Size, .available = Size}), pool(Pool),
+            heap_id(Heap ? ++heap_counter : 0), is_split(true) { }
+
+  static MTLResourceOptions getOptions(uint32_t usage) {
+    // TODO: check the caching performance of write-combined mode
+    MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache;
+
+    if (usage & UsageFlags::MANAGED)
+      options |= MTLResourceStorageModeManaged;
+    else if (usage & UsageFlags::SHARED)
+      options |= MTLResourceStorageModeShared;
+    else
+      options |= MTLResourceStorageModePrivate;
+
+    options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
+
+    return options;
+  }
+
+  static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) {
+    HeapBlock *heapBlock = nullptr;
+    bool is_split = true;
+    const size_t size = params.size();
+    MTLHeapDescriptor *d = [MTLHeapDescriptor new];
+    if (d) {
+      const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
+      if (size <= kMaxSmallAlloc) {
+        d.size = kSmallHeap;
+      } else if (size < kMinLargeAlloc) {
+        d.size = kLargeHeap;
+      } else if (size < kXLargeHeap / 2 && !params.has_memory_pressure) {
+        d.size = kXLargeHeap;
+      } else {
+        d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
+        is_split = false;
+      }
+      d.storageMode = (usage & UsageFlags::SHARED) ? MTLStorageModeShared : MTLStorageModePrivate;
+      d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
+      // this automatically handles Metal buffer access synchronizations at the
+      // cost of slightly lower performance.
+      d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
+      d.resourceOptions = getOptions(usage);
+      d.type = MTLHeapTypeAutomatic;
+      id<MTLHeap> heap = [device newHeapWithDescriptor: d];
+      if (heap) {
+        [heap setPurgeableState:MTLPurgeableStateNonVolatile];
+        const size_t heap_size = heapAvailableSize(heap);
+        heapBlock = new HeapBlock(heap_size, heap, params.pool);
+        if (heapBlock) {
+          heapBlock->is_split = is_split;
+        }
+      }
+      [d release];
+    }
+    return heapBlock;
+  }
+  static bool Comparator(const HeapBlock* a, const HeapBlock* b) {
+    return (a->size.available != b->size.available) ? a->size.available < b->size.available :
+                                                      (uintptr_t)a->heap < (uintptr_t)b->heap;
+  }
+  static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) {
+    return [heap maxAvailableSizeWithAlignment:Alignment];
+  }
+  NSUInteger Size() {
+    return [heap size];
+  }
+  id<MTLBuffer> newMTLBuffer(size_t length, uint32_t usage) {
+    id<MTLBuffer> buf = [heap newBufferWithLength:length options:getOptions(usage)];
+    if (buf) {
+      updateAvailableSize();
+      n_buffers++;
+    }
+    return buf;
+  }
+  // returns the retainCount before releasing the buffer
+  uint32_t releaseMTLBuffer(id<MTLBuffer>& buffer) {
+    const uint32_t retainCount = [buffer retainCount];
+    [buffer release];
+    buffer = nil;
+    updateAvailableSize();
+    n_buffers--;
+    return retainCount;
+  }
+  // returns the retainCount before releasing the heap
+  uint32_t releaseMTLHeap() {
+    const uint32_t retainCount = [heap retainCount];
+    TORCH_INTERNAL_ASSERT(!n_buffers); // assert if heap isn't empty
+    [heap setPurgeableState:MTLPurgeableStateEmpty];
+    [heap release];
+    heap = nil;
+    size.available = 0;
+    return retainCount;
+  }
+  uint32_t retainCount() const { return [heap retainCount]; }
+  void updateAvailableSize() { size.available = heapAvailableSize(heap); }
+};
+typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);
+
+struct BufferPool {
+  enum class Kind {
+    PRIVATE_SMALL,
+    PRIVATE_LARGE,
+    SHARED_SMALL,
+    SHARED_LARGE,
+    SCALAR,
+  };
+
+  BufferPool(const id<MTLDevice> Device, uint32_t Usage) :
+             device(Device), usage(Usage),
+             heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { }
+
+  const id<MTLDevice> device;
+  // usage flags to customize the pool for various purposes (see UsageFlags enum)
+  const uint32_t usage;
+  // total number of buffers in the pool
+  uint32_t n_buffers = 0;
+  // total allocations size on this pool
+  size_t allocated_size = 0;
+  // total memory available in the pool
+  size_t available_size = 0;
+  // list of heaps ordered by their "available" (not total) memory size
+  std::set<HeapBlock*, HeapComparison> heaps;
+  // list of only "available" buffers in the pool (i.e., buffers not in-use)
+  std::set<BufferBlock*, BufferComparison> available_buffers;
+  // list of buffers that are in a state of "limbo" where they've already been freed
+  // from PyTorch-side, but were not returned to pool due to still being
+  // in-use by command buffers with retainCount > 1. In this state, the buffer is
+  // neither ready to be recycled, nor could be returned to pool as available.
+  // These buffers will be returned to pool once the command buffer's
+  // completionHandler callbacks are called.
+  std::unordered_set<BufferBlock*> buffers_pending_free;
+  // list of heaps pending size update
+  std::unordered_set<HeapBlock*> heaps_pending_update;
+};
+
+class MPSHeapAllocatorImpl {
+public:
+  explicit MPSHeapAllocatorImpl() :
+    m_device(at::mps::MPSDevice::getInstance()->device()),
+    m_max_buffer_size([m_device maxBufferLength]),
+    m_stream(getDefaultMPSStream()),
+    m_event_pool(getMPSEventPool()) {
+    init_allocator();
+  }
+  ~MPSHeapAllocatorImpl() {
+    emptyCache();
+  }
+  // interface exposed to at::Allocator
+  id<MTLBuffer> malloc(size_t size, uint32_t usage);
+  // frees a buffer and returns it into buffer pool
+  void free(void* ptr);
+  // releases all the cached buffers and their associated heaps
+  void emptyCache();
+  // free inactive buffers that are pending to be freed
+  void freeInactiveBuffers();
+  // returns true if buffer was allocated from the shared pool
+  bool isSharedBuffer(const void* ptr);
+  // get the requested unaligned size of an MTLBuffer
+  ssize_t getUnalignedBufferSize(const void* ptr);
+  // set the shape of a base tensor from a view tensor
+  void setBufferShape(const void* ptr, const IntArrayRef& shape);
+  // retrieve the shape of a base tensor from a view tensor
+  IntArrayRef getBufferShape(const void* ptr);
+  // get the unique ID of the buffer
+  id_t getBufferId(const void* ptr);
+  // allocate a buffer from a specialized pool to import CPU scalars into GPU
+  id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
+  // returns a CPU-mapping of the input buffer and its retainCount,
+  // only if it has Shared storage mode and was allocated by MPSAllocator
+  std::pair<const void*, uint32_t> getSharedBufferPtr(const void* buffer);
+  // records events for a list of MTLBuffers (list is used to lock the mutex once)
+  // returns true if it records any event (which requires the passed buffers to exist and be shared-storage)
+  bool recordEvents(c10::ArrayRef<const void*> buffers);
+  // waits for the event to signal the completion of GPU execution
+  // on the passed shared buffers (list is used to lock the mutex once)
+  // returns true if actually waited on any event
+  bool waitForEvents(c10::ArrayRef<const void*> buffers);
+  // this indicates how far (in Megabytes) the current total allocations are from the
+  // low watermark limit which is used to detect if we're under memory pressure
+  // This returns zero if we've reached the low watermark limit
+  ssize_t getLowWatermarkValue();
+  // (see m_low_watermark_ratio for description)
+  void setLowWatermarkRatio(double ratio);
+  // (see m_high_watermark_ratio for description)
+  void setHighWatermarkRatio(double ratio);
+  // (see m_low_watermark_limit for description)
+  size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
+  // (see m_max_total_allowed_size for description)
+  size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
+  // (see m_total_allocated_memory for description)
+  size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
+  // (see m_current_allocated_memory for description)
+  size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
+  // total GPU memory allocated in the process by Metal driver; including
+  // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
+  size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
+  // (see enum DebugVerbosity for description)
+  uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
+  // returns the device that we allocate from
+  inline id<MTLDevice> Device() const { return m_device; }
+
+  // TODO: make a common function to do size unit conversions in PyTorch.
+  inline std::string format_size(uint64_t size) const;
+
+private:
+  // (see m_high_watermark_ratio for description)
+  constexpr static double default_high_watermark_ratio = 1.7;
+  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
+  constexpr static double default_high_watermark_upper_bound = 2.0;
+  // (see m_low_watermark_ratio for description)
+  // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
+  constexpr static double default_low_watermark_ratio_unified  = 1.4;
+  constexpr static double default_low_watermark_ratio_discrete = 1.0;
+
+  const id<MTLDevice> m_device;
+  std::recursive_mutex m_mutex;
+  // allocated buffers by device pointer
+  ska::flat_hash_map<const void*, BufferBlock*> m_allocated_buffers;
+  // using a container for pools to simplify iterating them
+  ska::flat_hash_map<BufferPool::Kind, std::unique_ptr<BufferPool>> m_pools;
+  // total memory allocated by HeapAllocator (including blocks in pools)
+  size_t m_total_allocated_memory = 0;
+  // currently active memory allocations in use (i.e., blocks not in pools)
+  size_t m_current_allocated_memory = 0;
+  // max buffer size allowed by Metal
+  size_t m_max_buffer_size = 0;
+  // maximum total size allowed to be allocated
+  size_t m_max_total_allowed_size = 0;
+  // high watermark ratio is a hard limit for the total allowed allocations
+  // 0. : disables high watermark limit (may cause system failure if system-wide OOM occurs)
+  // 1. : recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize)
+  // >1.: allows limits beyond the device.recommendedMaxWorkingSetSize
+  // e.g., value 0.95 means we allocate up to 95% of recommended maximum
+  // allocation size; beyond that, the allocations would fail with OOM error.
+  double m_high_watermark_ratio;
+  // low watermark ratio is a soft limit to attempt limiting memory allocations up to the lower watermark
+  // level by garbage collection or committing command buffers more frequently (a.k.a, adaptive commit).
+  // Value between 0 and m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection)
+  // e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of recommended maximum
+  // allocation size.
+  double m_low_watermark_ratio;
+  // low watermark size limit (in Bytes) at the time we initialize the allocator
+  size_t m_low_watermark_limit;
+  // use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity
+  uint32_t m_debug_verbosity;
+  // default MPS stream
+  MPSStream* m_stream;
+  // we hold a reference to MPSEventPool so it could get destroyed after MPSAllocator
+  std::shared_ptr<MPSEventPool> m_event_pool;
+
+  void init_allocator();
+  void init_buffer_pools();
+  HeapBlock* get_free_heap(AllocParams& params);
+  bool get_free_buffer(AllocParams& params);
+  BufferBlock* get_allocated_buffer_block(const void* ptr);
+  BufferBlock* alloc_buffer_block(size_t size, uint32_t usage);
+  bool alloc_buffer(AllocParams& params);
+  void free_buffer(BufferBlock* buffer_block);
+  // returns true if the container heap is also released
+  bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true);
+  void release_buffers(BufferPool& pool);
+  bool release_available_cached_buffers(AllocParams& params);
+  bool release_cached_buffers();
+  // free unused cached blocks to reclaim GPU memory if memory pressure is high
+  void garbage_collect_cached_buffers(AllocParams& params);
+  // returns the suitable buffer pool type for the usage or
+  // requested/allocated sizes
+  BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage);
+  // returns the aligned allocation size that is optimized
+  // for the buffers to get reused frequently
+  size_t get_allocation_size(size_t size, uint32_t usage) const;
+  // maximum size of device memory available for allocation in current process
+  // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
+  size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; }
+  // there are implicit allocations from MPS backend, so we need to query the 'device' for
+  // total allocated size instead of manually tracking in MPSAllocator
+  size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }
+
+  bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
+    for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
+      MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event);
+    }
+    return true;
+  }
+};
+
+} // namespace at::mps::HeapAllocator
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..e30a02c3fb213400eb587654b411fca2ee2b06c4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h
@@ -0,0 +1,61 @@
+//  Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/util/Registry.h>
+#include <ATen/core/ATen_fwd.h>
+
+#define MB(x) ((x) * 1048576UL) // x mebibytes expressed in bytes; the argument is parenthesized so that MB(a + b) scales the whole expression
+
+namespace at::mps {
+
+// this is a public interface to access MPSAllocator.
+// Do not declare methods that would depend on MPS or Metal frameworks.
+class IMPSAllocator : public c10::Allocator {
+public:
+  // see the comments in MPSAllocator.h for the description of these methods.
+  virtual void emptyCache() const = 0;
+  virtual void freeInactiveBuffers() const = 0;
+  virtual ssize_t getUnalignedBufferSize(const void* ptr) const = 0;
+  virtual IntArrayRef getBufferShape(const void* ptr) const = 0;
+  virtual id_t getBufferId(const void* ptr) const = 0;
+  virtual void setBufferShape(const void* ptr, const IntArrayRef& shape) const = 0;
+  virtual bool isSharedBuffer(const void* ptr) const = 0;
+  virtual bool isSharedStorageSupported() const = 0;
+  virtual c10::DataPtr allocScalarBufferWithValue(void* value, size_t size) const = 0;
+  virtual std::string formatSize(size_t size) const = 0;
+  virtual void setLowWatermarkRatio(double ratio) const = 0;
+  virtual void setHighWatermarkRatio(double ratio) const = 0;
+  virtual ssize_t getLowWatermarkValue() const = 0;
+  virtual size_t getLowWatermarkLimit() const = 0;
+  virtual size_t getHighWatermarkLimit() const = 0;
+  virtual size_t getTotalAllocatedMemory() const = 0;
+  virtual size_t getCurrentAllocatedMemory() const = 0;
+  virtual size_t getDriverAllocatedMemory() const = 0;
+  virtual std::pair<const void*, uint32_t> getSharedBufferPtr(const void* ptr) const = 0;
+  virtual bool recordEvents(c10::ArrayRef<const void*> buffers) const = 0;
+  virtual bool waitForEvents(c10::ArrayRef<const void*> buffers) const = 0;
+};
+
+class IMpsAllocatorCallback {
+ public:
+  enum class EventType {
+    ALLOCATED, // buffer got allocated to be used immediately
+    RECYCLED,  // buffer pulled from free list to be reused
+    FREED,     // buffer put to free list for future recycling
+    RELEASED,  // buffer memory released
+    ALLOCATION_FAILED // buffer allocation failed
+  };
+  virtual ~IMpsAllocatorCallback() = default;
+  virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0;
+};
+
+// MPS allocator executes every registered callback on allocator events (see IMpsAllocatorCallback::EventType), e.g. when a block of memory is freed.
+C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
+#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) \
+  C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__);
+
+IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false);
+
+} // namespace at::mps
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h
new file mode 100644
index 0000000000000000000000000000000000000000..880ff1c75d12e17ecf719f3de875c2217f852e51
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h
@@ -0,0 +1,100 @@
+//  Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#include <ATen/mps/MPSStream.h>
+#include <ctime>
+#include <stack>
+
+namespace at::mps {
+
+// NOTE: don't create instances of this class directly.
+// Use MPSEventPool to acquire instances of MPSEvent.
+class MPSEvent {
+public:
+  explicit MPSEvent(id_t ID, MPSStream* stream, bool enable_timing);
+  ~MPSEvent();
+
+  // records an event on the stream
+  void record(bool needsLock, bool syncEvent = false);
+  // makes all future work submitted to the stream wait for this event.
+  bool wait(bool needsLock, bool syncEvent = false);
+  // schedules a notifyListener callback for the event.
+  bool notify(bool needsLock, MTLSharedEventNotificationBlock block);
+  // checks if events are already signaled.
+  bool query() const;
+  // blocks the CPU thread until all the GPU work that was scheduled
+  // prior to recording this event is completed.
+  bool synchronize();
+  // resets this event with new parameters in case it gets reused from the event pool
+  void reset(MPSStream* stream, bool enable_timing);
+  // returns the unique ID of the event instance
+  id_t getID() const { return m_id; }
+  // returns the completion timestamp of the event
+  uint64_t getCompletionTime() const { return m_completion_time; }
+  // if already recorded, waits for cpu_sync_cv to be signaled
+  void waitForCpuSync();
+
+private:
+  id_t m_id;
+  // enables measuring the completion time of the notifyListener of this event
+  bool m_enable_timing;
+  uint64_t m_signalCounter = 0;
+  MPSStream* m_stream = nullptr;
+  MTLSharedEvent_t m_event = nullptr;
+  MTLSharedEventListener* m_listener = nullptr;
+  // used to sync the events created on this Stream with CPU
+  std::mutex m_cpu_sync_mutex{};
+  std::condition_variable m_cpu_sync_cv{};
+  // CondVar predicate to sync the events created on this Stream with CPU
+  bool m_cpu_sync_completed = false;
+  // used to compute elapsed time
+  uint64_t m_completion_time = 0;
+
+  void recordLocked(bool syncEvent);
+  bool waitLocked(bool syncEvent);
+  bool notifyLocked(MTLSharedEventNotificationBlock block);
+  void notifyCpuSync();
+  static uint64_t getTime() {
+    return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
+  }
+};
+
+typedef std::unique_ptr<MPSEvent, std::function<void(MPSEvent*)>> MPSEventPtr;
+
+class MPSEventPool {
+public:
+  explicit MPSEventPool(MPSStream* default_stream);
+  ~MPSEventPool();
+
+  MPSEventPtr acquireEvent(bool enable_timing, MPSStream* stream);
+  void emptyCache();
+
+  // these are mainly used for MPSHooks and torch.mps.Event() bindings
+  id_t acquireEvent(bool enable_timing);
+  void releaseEvent(id_t event_id);
+  void recordEvent(id_t event_id, bool syncEvent);
+  void waitForEvent(id_t event_id, bool syncEvent);
+  void synchronizeEvent(id_t event_id);
+  bool queryEvent(id_t event_id);
+  // returns elapsed time between two recorded events in milliseconds
+  double elapsedTime(id_t start_event_id, id_t end_event_id);
+
+private:
+  MPSStream* m_default_stream = nullptr;
+  std::recursive_mutex m_mutex;
+  std::stack<std::unique_ptr<MPSEvent>> m_pool{};
+  // dictionary to associate event IDs with event objects
+  // used to retain in-use events out of the pool
+  // for torch.mps.Event() bindings.
+  std::unordered_map<id_t, MPSEventPtr> m_in_use_events{};
+  uint64_t m_event_counter = 0;
+  std::function<void(MPSEvent*)> m_default_deleter;
+
+  MPSEvent* getInUseEvent(id_t event_id, bool locked = true);
+};
+
+// shared_ptr is used to get MPSEventPool destroyed after dependent instances
+std::shared_ptr<MPSEventPool> getMPSEventPool();
+
+} // namespace at::mps
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..994c50ad9e61c6b0634d80ec3921824f662de59c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h
@@ -0,0 +1,393 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include <ATen/Tensor.h>
+#include <ATen/mps/MPSStream.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
+
+#include <os/signpost.h>
+#include <os/log.h>
+
+#include <sstream>
+#include <string>
+#include <atomic>
+#include <unordered_map>
+#include <utility>
+#include <ctime>
+
+namespace at::mps {
+
+namespace Profiler {
+
+// Common base of all profiling records (graphs, kernels, copies, CPU fallbacks).
+struct BaseInfo {
+  // the kinds of work a profiling record can describe
+  enum class Type {
+    GRAPH,
+    KERNEL,
+    COPY,
+    CPU_FALLBACK,
+  };
+
+  BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle)
+      : type(infoType),
+        profileId(Id),
+        handle(Handle) {}
+  virtual ~BaseInfo() = default;
+
+  // kind of this profiling record
+  Type type;
+  // unique profile ID for execution instances of operations or copies
+  uint64_t profileId;
+  // IDs generated by os_signpost. Event-based and interval-based signposts
+  // may be active at the same time, so each gets its own ID.
+  os_signpost_id_t eventSignpostId = 0;
+  os_signpost_id_t intervalSignpostId = 0;
+  // accumulated GPU time in ms (obtained from CompletionHandler's "GPUEndTime - GPUStartTime")
+  std::atomic<double> totalGpuTime{0.0};
+  // accumulated Scheduling time in ms (obtained from CompletionHandler's "KernelEndTime - KernelStartTime")
+  std::atomic<double> totalSchedulingTime{0.0};
+  // set once the operation or copy execution has completed
+  std::atomic_bool completed{false};
+  // handle identifying the instance this record belongs to (usually the pointer)
+  const uintptr_t handle;
+
+  virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const;
+
+  // Builds a description string for a tensor (format: Device:ScalarType[tensor.sizes()]).
+  // An undefined tensor yields "undefined". When includeBufferId is set and the
+  // tensor lives on MPS, "(buf#<id>:<retainCount>)" is inserted after the device name
+  // (see comments for INCLUDE_BUFFER_ID).
+  static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) {
+    if (!tensor.defined()) {
+      return "undefined";
+    }
+    const auto devType = tensor.device().type();
+    std::stringstream out;
+    out << c10::DeviceTypeName(devType);
+    if (includeBufferId && devType == at::kMPS) {
+      // the storage's data pointer is reinterpreted as the Metal buffer object
+      id<MTLBuffer> mtlBuffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
+      out << "(buf#" << (getIMPSAllocator()->getBufferId(mtlBuffer))
+          << ":" << mtlBuffer.retainCount << ")";
+    }
+    out << ":" << tensor.scalar_type() << tensor.sizes();
+    return out.str();
+  }
+
+  // current reading of the raw monotonic clock (nanosecond-resolution API)
+  static uint64_t getTime() {
+    const uint64_t now = clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
+    return now;
+  }
+};
+
+// Profiling record for a graph or kernel execution.
+struct OperationInfo : BaseInfo {
+  // IsGraph selects between the GRAPH and KERNEL record types;
+  // the handle pointer is stored as an integer in the base class.
+  OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey)
+      : BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)),
+        strKey(StrKey) {}
+
+  uint64_t runCount = 0;
+  std::string strKey;
+
+  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
+
+  // Builds a description string for a kernel: the kernel name followed by
+  // ":<tensor string>" for every tensor in the list.
+  static std::string buildKernelString(const std::string& kernelName,
+                                       const TensorList& tensors,
+                                       bool includeBufferId = false) {
+    std::stringstream out;
+    out << kernelName;
+    for (const Tensor& t : tensors) {
+      const std::string tensorDesc = BaseInfo::buildTensorString(t, includeBufferId);
+      out << ":" << tensorDesc;
+    }
+    return out.str();
+  }
+};
+
+// Profiling record for an operation that falls back to the CPU.
+struct CpuFbInfo : BaseInfo {
+  // CPU fallback records carry no handle (0 is passed to the base class)
+  CpuFbInfo(uint64_t Id, const std::string& OpName)
+      : BaseInfo(Type::CPU_FALLBACK, Id, 0),
+        opName(OpName) {}
+
+  uint64_t runCount = 0;
+  // the current and total overhead of copies in bytes required to convert the Op's
+  // input tensors from MPS to CPU and then output from CPU back to MPS
+  size_t currentCopyOverhead = 0;
+  size_t totalCopyOverhead = 0;
+  std::string opName;
+  std::string strKey;
+  uint64_t startTime = 0;
+
+  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
+
+  // Recomputes currentCopyOverhead as the summed byte size of all defined
+  // tensors in the list, and adds that amount to totalCopyOverhead.
+  void updateCopyOverhead(const TensorList& tensors) {
+    size_t bytes = 0;
+    for (const Tensor& t : tensors) {
+      if (!t.defined()) {
+        continue;
+      }
+      bytes += t.nbytes();
+    }
+    currentCopyOverhead = bytes;
+    totalCopyOverhead += bytes;
+  }
+};
+
+// Profiling record for a copy between MPS and/or CPU storage.
+struct CopyInfo : BaseInfo {
+  // direction of the copy
+  enum class Kind {
+    MPS_TO_MPS,
+    MPS_TO_CPU,
+    CPU_TO_MPS,
+  };
+
+  // the kind is initialized to MPS_TO_MPS by this constructor
+  CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter)
+      : BaseInfo(Type::COPY, Id, uintptr_t(Handle)),
+        kind(Kind::MPS_TO_MPS),
+        length(Length),
+        isNonBlocking(IsNonBlocking),
+        usesBlitter(UsesBlitter) {}
+
+  Kind kind;
+  size_t length;
+  bool isNonBlocking;
+  bool usesBlitter;
+  std::string srcStrKey;
+  std::string dstStrKey;
+  // for copies that don't use blitters, we measure CPU time
+  uint64_t startTime = 0;
+
+  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
+
+  static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false);
+
+  // Tells whether the storage is on the MPS device. When a tensor is given its
+  // device type decides; otherwise the raw buffer is looked up in the MPS allocator.
+  static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) {
+    if (!tensor.has_value()) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer);
+      // getUnalignedBufferSize() returns -1 if input buffer is not on MPS device
+      return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0;
+    }
+    return tensor->device().type() == at::kMPS;
+  }
+
+  // Classifies a copy by where its source and destination live.
+  static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer,
+                          const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) {
+    const bool srcOnMPS = isStorageOnMPS(srcBuffer, srcTensor);
+    const bool dstOnMPS = isStorageOnMPS(dstBuffer, dstTensor);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(srcOnMPS || dstOnMPS);
+    // same side on both ends: reported as MPS_TO_MPS
+    if (srcOnMPS == dstOnMPS) {
+      return Kind::MPS_TO_MPS;
+    }
+    // exactly one end is on MPS
+    return srcOnMPS ? Kind::MPS_TO_CPU : Kind::CPU_TO_MPS;
+  }
+};
+
+struct CopyStat : CopyInfo {
+  explicit CopyStat(std::string CopyKindStr) : // the CopyInfo base is built with a null handle, zero length/id and both flags false
+          CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {}
+  // total number of copies
+  size_t totalCount = 0;
+  // number of scalar copies (i.e., copies smaller than sizeof(int64))
+  size_t scalarsCount = 0;
+  // number of blocking copies (i.e., copies that require syncing to GPU)
+  size_t blockingCount = 0;
+  // number of copies that used memcpy() instead of the Metal Blit Encoder
+  size_t memcpyCount = 0;
+  // accumulated GPU time in ms for the scalar copies
+  std::atomic<double> scalarsGpuTime{0.0};
+  // copy kind as a string
+  std::string kindStr;
+};
+
+// Profiler for the MPS backend. It can emit OS signposts (events and/or
+// intervals) and log per-execution info and aggregate statistics for
+// operations (graphs/kernels), copies and CPU fallbacks. What is traced and
+// logged is selected by the ProfileOptions, SignpostTypes and LogOptions
+// bit flags declared below.
+class MPSProfiler {
+public:
+  // lower 16 bits used for profiler options
+  enum ProfileOptions : uint32_t {
+    OPTIONS_NONE = 0,
+    // ALL_* means all signpost types (RUN_OPERATION|BLIT_COPY|CPU_FALLBACK, etc.)
+    // (provided for convenience, to avoid OR-ing the bit flags manually)
+    // trace all signpost types using events
+    ALL_SIGNPOST_EVENTS    = (1 << 0),
+    // trace all signpost types using intervals
+    ALL_SIGNPOST_INTERVALS = (1 << 1),
+    // always wait for command buffer to finish executing after each commit
+    WAIT_UNTIL_COMPLETED   = (1 << 2),
+    // for interval-based signposts, include the scheduling portion of
+    // Graph/Kernel/Copy executions as well.
+    // if this flag is disabled, only "GPU run time" is included in the interval,
+    // and not the schedule time.
+    INCLUDE_SCHEDULE_INTERVAL = (1 << 3),
+
+    // use these if you need to trace signpost types individually (rarely required)
+    // trace signpost using intervals
+    USE_INTERVALS = (1 << 4),
+    // trace signpost by emitting events
+    USE_EVENTS    = (1 << 5),
+    // used for sanity check (change this when a new option is added)
+    OPTIONS_COUNT = (USE_EVENTS << 1) - 1,
+  };
+
+  // when adding new types, #define the type string in MPSProfiler.mm as well.
+  // upper 16 bits used for event types
+  enum SignpostTypes : uint32_t {
+    SIGNPOST_NONE = 0,
+    // trace signposts for PyTorch operation executions
+    RUN_OPERATION = (1 << 16),
+    // trace signposts for blitter copies
+    BLIT_COPY     = (1 << 17),
+    // trace signposts for ops that fall back on CPU
+    CPU_FALLBACK  = (1 << 18),
+    // used for sanity check (change this when a new type is added)
+    SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1,
+  };
+
+  enum LogOptions : uint32_t {
+    LOG_NONE = 0,
+
+    // Info logging options during execution
+    // -------------------------------------
+    // prints operation info (id/key/run_count) during execution
+    OPERATION_INFO      = (1 << 0),
+    // prints copy info (src/dst tensors/buffers, size, etc.) during execution
+    COPY_INFO           = (1 << 1),
+    // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution
+    CPU_FALLBACK_INFO   = (1 << 2),
+
+    // Profiling Statistics logging options when process terminates
+    // ------------------------------------------------------------
+    // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before process terminates
+    // (provided for convenience, to avoid combining the following stats bit flags manually)
+    ALL_STATS           = (1 << 3),
+    // prints operation stats (GPU times, run count, etc.) before process terminates
+    OPERATION_STATS     = (1 << 4),
+    // prints copies stats (GPU times, copy kinds, sizes, etc.) before process terminates
+    COPY_STATS          = (1 << 5),
+    // prints CPU Fallback stats (CPU times, run times, size of MPS<->CPU copies
+    // for tensors, etc.) before process terminates
+    CPU_FALLBACK_STATS  = (1 << 6),
+
+    // Metadata format options when logging the info
+    // ---------------------------------------------
+    // if enabled, includes GPU run time in metadata (i.e., GPUEndTime-GPUStartTime
+    // from Metal Command Buffers) (e.g., [GPU=0.324 ms])
+    INCLUDE_GPU_TIME    = (1 << 7),
+    // if enabled, includes GPU scheduling time in metadata separately
+    // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers)
+    // e.g., [GPU=0.324 ms, KRNL=0.036 ms]
+    INCLUDE_KERNEL_TIME = (1 << 8),
+    // if enabled, includes the unique buffer ID in metadata for the storage
+    // of a tensor that was allocated on MPSAllocator. This is useful (along with
+    // the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers that are involved
+    // with various operations.
+    INCLUDE_BUFFER_ID   = (1 << 9),
+
+    // used for sanity check (change this when a new option is added)
+    LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1,
+  };
+
+  explicit MPSProfiler();
+  ~MPSProfiler();
+
+  // the handle is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
+  // the beginProfile*() functions return a profileId which is unique per graph/kernel/copy
+  uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph);
+  uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors);
+  uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer,
+                            const OptionalTensorRef srcTensor,
+                            const OptionalTensorRef dstTensor,
+                            size_t length, bool isNonBlocking, bool usesBlitter = true);
+  uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors);
+  void beginProfileGPUInterval(const void* handle);
+
+  void endProfileCopy(uint64_t profileId, SyncType syncType);
+  void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE);
+  void endProfileCPUFallback(const std::string& opName);
+
+  // these are used to hook into Python bindings for torch.mps.profiler module.
+  // this enables generating OS Signpost traces from MPSProfiler on-demand
+  // during runtime (instead of environment variables).
+  // The "mode" could be either "interval", "event", or both "interval,event"
+  // for interval-based and/or event-based signpost tracing.
+  // (the parameter type is spelled std::string, like everywhere else in this
+  // header, instead of relying on an unqualified "string" being in scope)
+  void StartTrace(const std::string& mode, bool waitUntilCompleted);
+  void StopTrace();
+
+  // convenience functions to indicate whether signpost tracing or
+  // logging are enabled for the SignpostTypes
+  bool isOperationProfilingEnabled() const {
+    return (m_signpost_types & SignpostTypes::RUN_OPERATION) ||
+           (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS));
+  }
+  bool isCopyProfilingEnabled() const {
+    return (m_signpost_types & SignpostTypes::BLIT_COPY) ||
+           (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS));
+  }
+  bool isCPUFallbackProfilingEnabled() const {
+    return (m_signpost_types & SignpostTypes::CPU_FALLBACK) ||
+           (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS));
+  }
+  // true when at least one signpost type is enabled
+  bool isSignpostTracingEnabled() const {
+    return (m_signpost_types != SignpostTypes::SIGNPOST_NONE);
+  }
+
+ private:
+  // indicates which signpost types are enabled and traced by MPS profiler.
+  uint32_t m_signpost_types = 0;
+  uint32_t m_profile_options = 0;
+  uint32_t m_log_options = 0;
+  uint64_t m_kernel_counter = 0;
+  uint64_t m_graph_counter = 0;
+  uint64_t m_cpu_fb_counter = 0;
+  uint64_t m_copy_counter = 0;
+  // technically, it's possible to trace both events and intervals at the same time
+  // so we use separate os_log categories for them
+  os_log_t m_os_log_events;
+  os_log_t m_os_log_intervals;
+  // stats logging could run either from destructor or signal handler
+  // so this is used to check if logging has already started.
+  std::atomic_bool hasLoggedStats{false};
+  // indicates there are pending completionHandler callbacks that haven't been called yet.
+  std::atomic_bool hasPendingCompletionHandlers{false};
+  // used to capture sigint signal to log profiling stats
+  static struct sigaction currentSigint, previousSigint;
+
+  // We use the following lists for two reasons:
+  // 1- for interval-based signposts the "begin" point won't be in same function
+  // as the "end" point where we need to be able to retrieve signpost's info
+  // 2- if Operations info need to be logged when process ends using LogOptions::OPERATION_INFO.
+
+  // the pointer key for this map is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
+  // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
+  std::unordered_map<uintptr_t, std::unique_ptr<OperationInfo>> m_op_info_list{};
+  // the string key for this map is the op name that we fall back to execute on CPU
+  // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
+  std::unordered_map<std::string, std::unique_ptr<CpuFbInfo>> m_cpu_fb_info_list{};
+  // this list contains the info for copies, and its key is the unique profileId
+  // which is generated from m_copy_counter
+  // The copyInfo list is not retained.
+  std::unordered_map<uint64_t, std::unique_ptr<CopyInfo>> m_copy_info_list{};
+  // a short list that contains copy stats
+  std::unordered_map<CopyInfo::Kind, std::unique_ptr<CopyStat>> m_copy_stat_list{};
+
+  void initialize();
+  void beginProfileExecution(BaseInfo& info, bool cpuExecution = false);
+  void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id,
+                           os_signpost_id_t interval_signpost_id,
+                           double gpuTime, double schedulingTime);
+  void addProfilerScheduledHandler(BaseInfo& info);
+  void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType);
+  void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
+                         const std::string& msg) const;
+  void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
+                             const std::string& msg) const;
+  void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const;
+
+  void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime);
+  // returns true if logging the profiling info "during the execution" is enabled
+  bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded);
+  // logs all the profiling stats that are enabled
+  void logProfilingStats();
+  // logs kernel profiling stats when the process ends.
+  void logOperationsProfilingStats(std::FILE* f) const;
+  // logs CPU Fallback profiling stats when the process ends.
+  void logCPUFallbackProfilingStats(std::FILE* f) const;
+  // logs copy profiling stats when the process ends.
+  void logCopyProfilingStats(std::FILE* f) const;
+
+  os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr);
+  static SignpostTypes getSignpostType(BaseInfo::Type infoType);
+  static void handleIntSignal(int signal);
+};
+
+} // namespace Profiler
+
+Profiler::MPSProfiler& getMPSProfiler(); // accessor returning a reference to an MPSProfiler (NOTE(review): presumably a single shared instance; confirm in the implementation)
+
+} // namespace at::mps
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h
new file mode 100644
index 0000000000000000000000000000000000000000..efbe7ce1b9d1ca650dffe238740e3f08a950f07f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h
@@ -0,0 +1,321 @@
+#pragma once
+
+#include <c10/util/Optional.h>
+#include <c10/util/string_view.h>
+#include <ATen/Config.h>
+#include <ATen/native/DispatchStub.h>
+
+// Forward declare TI
+namespace at {
+class Tensor;
+struct TensorIterator;
+
+namespace native {
+enum class TransposeType;
+}
+
+}
+
+namespace at::native {
+
+enum class LapackLstsqDriverType : int64_t { Gels, Gelsd, Gelsy, Gelss}; // least-squares driver selector; each enumerator has a matching lapackLstsq_impl specialization below
+
+#if AT_BUILD_WITH_LAPACK()
+// Per-batch function templates (only declared in this header) that are used
+// in the implementation of batched linear algebra operations
+
+template <class scalar_t>
+void lapackCholesky(char uplo, int n, scalar_t *a, int lda, int *info);
+
+template <class scalar_t>
+void lapackCholeskyInverse(char uplo, int n, scalar_t *a, int lda, int *info);
+
+template <class scalar_t, class value_t=scalar_t> // NOTE(review): value_t is presumably the real-valued counterpart of a complex scalar_t; confirm
+void lapackEig(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *w, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, value_t *rwork, int *info);
+
+template <class scalar_t>
+void lapackGeqrf(int m, int n, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info);
+
+template <class scalar_t>
+void lapackOrgqr(int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info);
+
+template <class scalar_t>
+void lapackOrmqr(char side, char trans, int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *c, int ldc, scalar_t *work, int lwork, int *info);
+
+template <class scalar_t, class value_t = scalar_t>
+void lapackSyevd(char jobz, char uplo, int n, scalar_t* a, int lda, value_t* w, scalar_t* work, int lwork, value_t* rwork, int lrwork, int* iwork, int liwork, int* info);
+
+template <class scalar_t> // least-squares driver used for LapackLstsqDriverType::Gels
+void lapackGels(char trans, int m, int n, int nrhs,
+    scalar_t *a, int lda, scalar_t *b, int ldb,
+    scalar_t *work, int lwork, int *info);
+
+template <class scalar_t, class value_t = scalar_t> // least-squares driver used for LapackLstsqDriverType::Gelsd
+void lapackGelsd(int m, int n, int nrhs,
+    scalar_t *a, int lda, scalar_t *b, int ldb,
+    value_t *s, value_t rcond, int *rank,
+    scalar_t* work, int lwork,
+    value_t *rwork, int* iwork, int *info);
+
+template <class scalar_t, class value_t = scalar_t> // least-squares driver used for LapackLstsqDriverType::Gelsy
+void lapackGelsy(int m, int n, int nrhs,
+    scalar_t *a, int lda, scalar_t *b, int ldb,
+    int *jpvt, value_t rcond, int *rank,
+    scalar_t *work, int lwork, value_t* rwork, int *info);
+
+template <class scalar_t, class value_t = scalar_t> // least-squares driver used for LapackLstsqDriverType::Gelss
+void lapackGelss(int m, int n, int nrhs,
+    scalar_t *a, int lda, scalar_t *b, int ldb,
+    value_t *s, value_t rcond, int *rank,
+    scalar_t *work, int lwork,
+    value_t *rwork, int *info);
+
+template <LapackLstsqDriverType, class scalar_t, class value_t = scalar_t>
+struct lapackLstsq_impl; // primary template is declared only; the per-driver partial specializations provide call()
+
+template <class scalar_t, class value_t>
+struct lapackLstsq_impl<LapackLstsqDriverType::Gels, scalar_t, value_t> { // Gels driver
+  static void call( // accepts the arguments of every driver flavor; forwards only those lapackGels takes
+      char trans, int m, int n, int nrhs,
+      scalar_t *a, int lda, scalar_t *b, int ldb,
+      scalar_t *work, int lwork, int *info, // Gels flavor
+      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
+      value_t *s, // Gelss flavor
+      int *iwork // Gelsd flavor
+      ) {
+    lapackGels<scalar_t>( // jpvt, rcond, rank, rwork, s and iwork are not used by this driver
+        trans, m, n, nrhs,
+        a, lda, b, ldb,
+        work, lwork, info);
+  }
+};
+
+template <class scalar_t, class value_t>
+struct lapackLstsq_impl<LapackLstsqDriverType::Gelsy, scalar_t, value_t> { // Gelsy driver
+  static void call( // accepts the arguments of every driver flavor; forwards only those lapackGelsy takes
+      char trans, int m, int n, int nrhs,
+      scalar_t *a, int lda, scalar_t *b, int ldb,
+      scalar_t *work, int lwork, int *info, // Gels flavor
+      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
+      value_t *s, // Gelss flavor
+      int *iwork // Gelsd flavor
+      ) {
+    lapackGelsy<scalar_t, value_t>( // trans, s and iwork are not used by this driver
+        m, n, nrhs,
+        a, lda, b, ldb,
+        jpvt, rcond, rank,
+        work, lwork, rwork, info);
+  }
+};
+
+template <class scalar_t, class value_t>
+struct lapackLstsq_impl<LapackLstsqDriverType::Gelsd, scalar_t, value_t> { // Gelsd driver
+  static void call( // accepts the arguments of every driver flavor; forwards only those lapackGelsd takes
+      char trans, int m, int n, int nrhs,
+      scalar_t *a, int lda, scalar_t *b, int ldb,
+      scalar_t *work, int lwork, int *info, // Gels flavor
+      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
+      value_t *s, // Gelss flavor
+      int *iwork // Gelsd flavor
+      ) {
+    lapackGelsd<scalar_t, value_t>( // trans and jpvt are not used by this driver
+        m, n, nrhs,
+        a, lda, b, ldb,
+        s, rcond, rank,
+        work, lwork,
+        rwork, iwork, info);
+  }
+};
+
+template <class scalar_t, class value_t>
+struct lapackLstsq_impl<LapackLstsqDriverType::Gelss, scalar_t, value_t> { // Gelss driver
+  static void call( // accepts the arguments of every driver flavor; forwards only those lapackGelss takes
+      char trans, int m, int n, int nrhs,
+      scalar_t *a, int lda, scalar_t *b, int ldb,
+      scalar_t *work, int lwork, int *info, // Gels flavor
+      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
+      value_t *s, // Gelss flavor
+      int *iwork // Gelsd flavor
+      ) {
+    lapackGelss<scalar_t, value_t>( // trans, jpvt and iwork are not used by this driver
+        m, n, nrhs,
+        a, lda, b, ldb,
+        s, rcond, rank,
+        work, lwork,
+        rwork, info);
+  }
+};
+
+// Compile-time dispatcher over the least-squares drivers: picks the
+// lapackLstsq_impl specialization that matches driver_type and hands it the
+// full argument set (each specialization uses only the arguments of its driver).
+template <LapackLstsqDriverType driver_type, class scalar_t, class value_t = scalar_t>
+void lapackLstsq(
+    char trans, int m, int n, int nrhs,
+    scalar_t *a, int lda, scalar_t *b, int ldb,
+    scalar_t *work, int lwork, int *info, // Gels flavor
+    int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
+    value_t *s, // Gelss flavor
+    int *iwork // Gelsd flavor
+    ) {
+  using selected_driver = lapackLstsq_impl<driver_type, scalar_t, value_t>;
+  selected_driver::call(
+      trans, m, n, nrhs, a, lda, b, ldb,
+      work, lwork, info,
+      jpvt, rcond, rank, rwork,
+      s, iwork);
+}
+
+template <class scalar_t> // declaration only, like the other lapack* templates in this header
+void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info);
+
+template <class scalar_t>
+void lapackLu(int m, int n, scalar_t *a, int lda, int *ipiv, int *info);
+
+template <class scalar_t> // Hermitian variant; same parameter list as lapackLdlSymmetric
+void lapackLdlHermitian(
+    char uplo,
+    int n,
+    scalar_t* a,
+    int lda,
+    int* ipiv,
+    scalar_t* work,
+    int lwork,
+    int* info);
+
+template <class scalar_t> // symmetric variant; same parameter list as lapackLdlHermitian
+void lapackLdlSymmetric(
+    char uplo,
+    int n,
+    scalar_t* a,
+    int lda,
+    int* ipiv,
+    scalar_t* work,
+    int lwork,
+    int* info);
+
+template <class scalar_t> // Hermitian variant; same parameter list as lapackLdlSolveSymmetric
+void lapackLdlSolveHermitian(
+    char uplo,
+    int n,
+    int nrhs,
+    scalar_t* a,
+    int lda,
+    int* ipiv,
+    scalar_t* b,
+    int ldb,
+    int* info);
+
+template <class scalar_t> // symmetric variant; same parameter list as lapackLdlSolveHermitian
+void lapackLdlSolveSymmetric(
+    char uplo,
+    int n,
+    int nrhs,
+    scalar_t* a,
+    int lda,
+    int* ipiv,
+    scalar_t* b,
+    int ldb,
+    int* info);
+
+template<class scalar_t, class value_t=scalar_t> // s and rwork use value_t, the other buffers use scalar_t
+void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info);
+#endif
+
+#if AT_BUILD_WITH_BLAS()
+template <class scalar_t> // only declared when AT_BUILD_WITH_BLAS() is enabled
+void blasTriangularSolve(char side, char uplo, char trans, char diag, int n, int nrhs, scalar_t* a, int lda, scalar_t* b, int ldb);
+#endif
+
+using cholesky_fn = void (*)(const Tensor& /*input*/, const Tensor& /*info*/, bool /*upper*/); // kernel signature for cholesky_stub
+DECLARE_DISPATCH(cholesky_fn, cholesky_stub);
+
+using cholesky_inverse_fn = Tensor& (*)(Tensor& /*result*/, Tensor& /*infos*/, bool /*upper*/); // kernel signature for cholesky_inverse_stub
+
+DECLARE_DISPATCH(cholesky_inverse_fn, cholesky_inverse_stub);
+
+using linalg_eig_fn = void (*)(Tensor& /*eigenvalues*/, Tensor& /*eigenvectors*/, Tensor& /*infos*/, const Tensor& /*input*/, bool /*compute_eigenvectors*/); // kernel signature for linalg_eig_stub
+
+DECLARE_DISPATCH(linalg_eig_fn, linalg_eig_stub);
+
+using geqrf_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/); // kernel signature for geqrf_stub
+DECLARE_DISPATCH(geqrf_fn, geqrf_stub);
+
+using orgqr_fn = Tensor& (*)(Tensor& /*result*/, const Tensor& /*tau*/); // kernel signature for orgqr_stub
+DECLARE_DISPATCH(orgqr_fn, orgqr_stub);
+
+using ormqr_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/, const Tensor& /*other*/, bool /*left*/, bool /*transpose*/); // kernel signature for ormqr_stub
+DECLARE_DISPATCH(ormqr_fn, ormqr_stub);
+
+using linalg_eigh_fn = void (*)( // kernel signature for linalg_eigh_stub
+    const Tensor& /*eigenvalues*/,
+    const Tensor& /*eigenvectors*/,
+    const Tensor& /*infos*/,
+    bool /*upper*/,
+    bool /*compute_eigenvectors*/);
+DECLARE_DISPATCH(linalg_eigh_fn, linalg_eigh_stub);
+
+using lstsq_fn = void (*)( // kernel signature for lstsq_stub
+    const Tensor& /*a*/,
+    Tensor& /*b*/,
+    Tensor& /*rank*/,
+    Tensor& /*singular_values*/,
+    Tensor& /*infos*/,
+    double /*rcond*/,
+    std::string /*driver_name*/);
+DECLARE_DISPATCH(lstsq_fn, lstsq_stub);
+
+using triangular_solve_fn = void (*)( // kernel signature for triangular_solve_stub
+    const Tensor& /*A*/,
+    const Tensor& /*B*/,
+    bool /*left*/,
+    bool /*upper*/,
+    TransposeType /*transpose*/,
+    bool /*unitriangular*/);
+DECLARE_DISPATCH(triangular_solve_fn, triangular_solve_stub);
+
+using lu_factor_fn = void (*)( // kernel signature for lu_factor_stub
+    const Tensor& /*input*/,
+    const Tensor& /*pivots*/,
+    const Tensor& /*infos*/,
+    bool /*compute_pivots*/);
+DECLARE_DISPATCH(lu_factor_fn, lu_factor_stub);
+
+using unpack_pivots_fn = void(*)( // kernel signature for unpack_pivots_stub
+  TensorIterator& iter,
+  const int64_t dim_size,
+  const int64_t max_pivot);
+DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub);
+
+using lu_solve_fn = void (*)( // kernel signature for lu_solve_stub
+    const Tensor& /*LU*/,
+    const Tensor& /*pivots*/,
+    const Tensor& /*B*/,
+    TransposeType /*trans*/);
+DECLARE_DISPATCH(lu_solve_fn, lu_solve_stub);
+
+using ldl_factor_fn = void (*)( // kernel signature for ldl_factor_stub
+    const Tensor& /*LD*/,
+    const Tensor& /*pivots*/,
+    const Tensor& /*info*/,
+    bool /*upper*/,
+    bool /*hermitian*/);
+DECLARE_DISPATCH(ldl_factor_fn, ldl_factor_stub);
+
+using svd_fn = void (*)( // kernel signature for svd_stub
+    const Tensor& /*A*/,
+    const bool /*full_matrices*/,
+    const bool /*compute_uv*/,
+    const c10::optional<c10::string_view>& /*driver*/,
+    const Tensor& /*U*/,
+    const Tensor& /*S*/,
+    const Tensor& /*Vh*/,
+    const Tensor& /*info*/);
+DECLARE_DISPATCH(svd_fn, svd_stub);
+
+using ldl_solve_fn = void (*)( // kernel signature for ldl_solve_stub
+    const Tensor& /*LD*/,
+    const Tensor& /*pivots*/,
+    const Tensor& /*result*/,
+    bool /*upper*/,
+    bool /*hermitian*/);
+DECLARE_DISPATCH(ldl_solve_fn, ldl_solve_stub);
+} // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2e61f280bf5922e911cdbc180a3503d75d8a06f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h
@@ -0,0 +1,139 @@
+#include <ATen/core/Tensor.h>
+#include <ATen/Config.h>
+#include <cstdint>
+
+#ifdef USE_FBGEMM
+#include <fbgemm/FbgemmEmbedding.h>
+#endif
+
+namespace at::native {
+
+void check_arguments( // all Tensor arguments are read-only (const references)
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const int64_t mode,
+    const c10::optional<Tensor>& per_sample_weights,
+    bool include_last_offset);
+
+void make_bag_size_out( // bag_size_out is the only mutable Tensor argument
+    Tensor& bag_size_out,
+    const Tensor& offsets,
+    const Tensor& indices,
+    const int64_t mode,
+    const bool include_last_offset,
+    const bool requires_grad);
+
+void make_max_indices_out( // max_indices_out is the only mutable Tensor argument
+    Tensor& max_indices_out,
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const Tensor& bag_size,
+    const int64_t mode,
+    bool include_last_offset);
+
+void make_offset2bag_out( // offset2bag and output are the mutable Tensor arguments
+    Tensor& offset2bag,
+    Tensor& output,
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const int64_t mode,
+    const c10::optional<Tensor>& per_sample_weights,
+    const int64_t padding_idx = -1);
+
+#ifdef USE_FBGEMM
+
+// Storage mixin holding one FBGEMM embedding kernel together with the block
+// size it was generated for. A default-constructed instance holds no kernel
+// (blockSize == -1, callback == nullptr).
+template<bool has_weight, typename TIndex, typename TData>
+struct _CallbackAndBlockSize {
+    using TCallback = typename fbgemm::EmbeddingSpMDMKernelSignature<TData, TIndex, TIndex, TData>::Type;
+
+    int64_t blockSize = -1;
+    TCallback callback = nullptr;
+
+    // Generates the FBGEMM kernel for the given block size with the fixed
+    // option set used by this header.
+    static TCallback generateCallback(int64_t block_size) {
+        constexpr bool normalize_by_lengths = false;
+        constexpr int prefetch = 16;
+        constexpr bool is_weight_positional = false;
+        constexpr bool use_offsets = true;
+        return fbgemm::GenerateEmbeddingSpMDM<TData, TIndex, TIndex, TData>(
+                block_size,
+                has_weight,
+                normalize_by_lengths,
+                prefetch,
+                is_weight_positional,
+                use_offsets);
+    }
+
+    _CallbackAndBlockSize() = default;
+
+    // With a block size, the kernel is generated eagerly; without one the
+    // members keep their "empty" values (-1 / nullptr).
+    explicit _CallbackAndBlockSize(c10::optional<int64_t> maybe_block_size)
+      : blockSize(maybe_block_size.has_value() ? maybe_block_size.value() : -1)
+      , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr)
+    {}
+};
+
+// Kernel cache built by privately inheriting from one storage mixin per
+// supported (has_weight, index type, data type) combination.
+template<typename... StorageMixins>
+struct _EmbeddingBagKernelCacheImpl : private StorageMixins... {
+
+    _EmbeddingBagKernelCacheImpl() = default;
+    // every mixin stores the kernel and block size for its own combination
+    explicit _EmbeddingBagKernelCacheImpl(c10::optional<int64_t> maybe_block_size)
+      : StorageMixins(maybe_block_size)...
+    {}
+
+    // this method is thread safe (call sites may call from different threads)
+    template<bool has_weight, typename TIndex, typename TData>
+    typename _CallbackAndBlockSize<has_weight, TIndex, TData>::TCallback
+    getCallback(int64_t block_size) const {
+        using Slot = _CallbackAndBlockSize<has_weight, TIndex, TData>;
+        // cache hit: the matching mixin already holds the kernel for this block size
+        if (block_size == Slot::blockSize) {
+            return Slot::callback;
+        }
+        // cache miss: regenerate the kernel without writing it back into the
+        // cache, so that no locking is needed
+        return Slot::generateCallback(block_size);
+    }
+};
+
+// instantiates the cache with the list of storage mixins, one
+// for each of the 8 _EmbeddingBagKernelCache* usages in the EmbeddingBag.cpp impl file
+using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl<
+    _CallbackAndBlockSize<true, int32_t, float>,
+    _CallbackAndBlockSize<false, int32_t, float>,
+    _CallbackAndBlockSize<true, int64_t, float>,
+    _CallbackAndBlockSize<false, int64_t, float>,
+    _CallbackAndBlockSize<true, int32_t, unsigned short>,
+    _CallbackAndBlockSize<false, int32_t, unsigned short>,
+    _CallbackAndBlockSize<true, int64_t, unsigned short>,
+    _CallbackAndBlockSize<false, int64_t, unsigned short>>;
+#else
+struct _EmbeddingBagKernelCache { // empty stand-in used when USE_FBGEMM is not defined
+    explicit _EmbeddingBagKernelCache(c10::optional<int64_t> /* maybe_block_size */) {} // the argument is accepted and ignored
+};
+#endif
+
+void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, // mutable Tensor& / Tensor* arguments come first
+    Tensor& bag_size, Tensor* max_indices,
+    const Tensor &weight, const Tensor &indices,
+    const Tensor &offsets, const int64_t mode = 0,
+    const c10::optional<Tensor>& per_sample_weights = c10::nullopt,
+    bool include_last_offset = false,
+    int64_t padding_idx = -1,
+    _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); // the kernel cache is optional (defaults to nullptr)
+
+void _embedding_bag_cpu_out( // unlike the impl variant above, only the kernel cache argument has a default
+    at::Tensor& output,
+    at::Tensor& offset2bag,
+    at::Tensor& bag_size,
+    at::Tensor* p_max_indices,
+    const at::Tensor& weight,
+    const at::Tensor& indices,
+    const at::Tensor& offsets,
+    const bool scale_grad_by_freq,
+    const int64_t mode,
+    const bool sparse,
+    const c10::optional<at::Tensor>& per_sample_weights,
+    const bool include_last_offset,
+    const c10::optional<int64_t>& padding_idx,
+    _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr);
+
+} // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Fill.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6de9580ae7c33340d2929c4c5f743e4aaf42339
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Fill.h
@@ -0,0 +1,21 @@
+// Functions that fill Tensors with constants. Implementations are in Fill.cpp.
+
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+class Tensor;
+struct TensorIterator;
+
+namespace native {
+
+DECLARE_DISPATCH(void(*)(TensorIterator&, const c10::Scalar&), fill_stub);  // stub signature: (iterator, fill value)
+// Declaration only; per the comment at the top of this header, implementations are in Fill.cpp.
+Tensor& fill_out(Tensor& self, const Scalar& value);
+
+}} // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LossMulti.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LossMulti.h
new file mode 100644
index 0000000000000000000000000000000000000000..f21269620f25345644a91dd59009f7e2c8c99121
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LossMulti.h
@@ -0,0 +1,72 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/Dispatch.h>
+#include <ATen/TensorUtils.h>
+
+namespace at::native {
+namespace {
+  // Shape check for multilabel margin loss: validates input/target via TORCH_CHECK and
+  // writes nframe/dim (ndims <= 1: nframe = 1; otherwise input.size(0) / input.size(1)).
+  static C10_UNUSED void multilabel_margin_loss_shape_check(
+    int64_t& nframe,
+    int64_t& dim,
+    const int64_t& ndims,
+    const Tensor& input,
+    const Tensor& target) {
+    const bool nonempty_matrix = ndims == 2 && input.size(1) != 0;
+    const bool nonempty_vector = ndims == 1 && input.size(0) != 0;
+    TORCH_CHECK(
+        nonempty_matrix || nonempty_vector || ndims == 0,
+        "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ",
+        input.sizes());
+    const bool batched = ndims > 1;
+    nframe = batched ? input.size(0) : 1;
+    dim = batched ? input.size(1) : (ndims == 0 ? 1 : input.size(0));
+    if (batched) {
+      TORCH_CHECK(
+          target.dim() == 2 && target.size(0) == nframe && target.size(1) == dim,
+          "inconsistent target size: ", target.sizes(), " for input of size: ",
+          input.sizes());
+      return;
+    }
+    TORCH_CHECK(
+        target.dim() <= 1 && target.numel() == dim,
+        "inconsistent target size: ", target.sizes(), " for input of size: ", input.sizes());
+  }
+
+  // Shape check for multi margin loss: validates input/target/weight via TORCH_CHECK and
+  // writes nframe/dim (ndims <= 1: nframe = 1; otherwise input.size(0) / input.size(1)).
+  static C10_UNUSED void multi_margin_loss_shape_check(
+    int64_t& nframe,
+    int64_t& dim,
+    const int64_t& ndims,
+    const Tensor& input,
+    const Tensor& target,
+    const c10::optional<Tensor>& weight) {
+    const bool nonempty_matrix = ndims == 2 && input.size(1) != 0;
+    const bool nonempty_vector = ndims == 1 && input.size(0) != 0;
+    TORCH_CHECK(
+        nonempty_matrix || nonempty_vector || ndims == 0,
+        "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ",
+        input.sizes());
+    const bool batched = ndims > 1;
+    nframe = batched ? input.size(0) : 1;
+    dim = batched ? input.size(1) : (ndims == 0 ? 1 : input.size(0));
+
+    TORCH_CHECK(
+        target.dim() <= 1 && target.numel() == nframe,
+        "inconsistent target size, expected ", nframe, " but got ",
+        target.sizes());
+    if (!weight || !weight->defined()) {
+      return;  // the weight check only applies to a present, defined weight tensor
+    }
+    TORCH_CHECK(
+        weight->dim() <= 1 && weight->numel() == dim,
+        "inconsistent weight size, expected ", dim, " but got ",
+        weight->sizes());
+  }
+
+
+}  // anonymous namespace
+} // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Normalization.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Normalization.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cd4dcde370522874311f43dbcdfca0e16bc5035
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Normalization.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <ATen/TensorIterator.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at::native {
+
+using renorm_scale_factor_fn = void (*) (TensorIteratorBase& iter, double maxnorm);  // kernel signature: (iterator, maxnorm)
+DECLARE_DISPATCH(renorm_scale_factor_fn, renorm_scale_factor_stub);  // stub declared with that signature
+
+}  // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pow.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pow.h
new file mode 100644
index 0000000000000000000000000000000000000000..068482ee300c73d7bd185482f85d109e400e9cc8
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pow.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+
+struct TensorIterator;
+struct TensorIteratorBase;
+
+namespace native {
+
+#if defined(__CUDACC__) || defined(__HIPCC__)  // compiler defines __CUDACC__ or __HIPCC__
+#define HOST_DEVICE __host__ __device__
+#else  // neither macro defined: the qualifier expands to nothing
+#define HOST_DEVICE
+#endif
+
+// Integer exponentiation by repeated squaring: `a` is squared and `b` halved until `b`
+// reaches zero, and the accumulator is multiplied by `a` whenever the low bit of `b` is set.
+// Integral power in pytorch allows negative exponents, giving truncated integral results
+// (e.g. since 2**-1==0.5, the truncated integral result is zero); the signed `powi`
+// overload handles b < 0 itself before calling this helper.
+template <class T,
+  typename std::enable_if<std::is_integral<T>::value, T>::type* = nullptr>
+static inline HOST_DEVICE __ubsan_ignore_signed_int_overflow__ T powi_impl(T a, T b) {
+  T acc = 1;
+  for (; b != 0; b /= 2, a *= a) {
+    if ((b & 1) != 0) {
+      acc *= a;
+    }
+  }
+  return acc;
+}
+
+template <class T,
+  typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, T>::type* = nullptr>  // unsigned integral T only
+static inline HOST_DEVICE T powi(T a, T b) {
+  return powi_impl(a, b);  // an unsigned exponent is never negative, so forward directly
+}
+
+// Signed integral pow: b < 0 yields 1 (a == 1), +/-1 by parity of b (a == -1), else 0.
+template <class T,
+  typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, T>::type* = nullptr>
+static inline HOST_DEVICE T powi(T a, T b) {
+  if (b >= 0) {
+    return powi_impl(a, b);
+  }
+  if (a == 1) {
+    return 1;
+  }
+  if (a == -1) {
+    // Parity is read from b itself: negating b first overflows when b is the minimum of T.
+    return (b % static_cast<T>(2)) != 0 ? -1 : 1;
+  }
+  return 0;
+}
+
+using pow_tensor_tensor_fn = void (*)(TensorIteratorBase&);  // kernel taking only the iterator
+using pow_tensor_scalar_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);  // kernel taking the iterator and a Scalar
+// Dispatch stubs, one per kernel signature above.
+DECLARE_DISPATCH(pow_tensor_tensor_fn, pow_tensor_tensor_stub);
+DECLARE_DISPATCH(pow_tensor_scalar_fn, pow_tensor_scalar_stub);
+
+} // namespace native
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceOps.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..604d6ae8a74ef2ee617d11868859c90541efb3d3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceOps.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Optional.h>
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+struct TensorIterator;
+class Tensor;
+}
+
+namespace at::native {
+
+using reduce_fn = void(*)(TensorIterator &);
+// Stubs that all share the reduce_fn signature (iterator only).
+DECLARE_DISPATCH(reduce_fn, sum_stub);
+DECLARE_DISPATCH(reduce_fn, nansum_stub);
+DECLARE_DISPATCH(reduce_fn, prod_stub);
+DECLARE_DISPATCH(reduce_fn, mean_stub);
+DECLARE_DISPATCH(reduce_fn, and_stub);
+DECLARE_DISPATCH(reduce_fn, or_stub);
+DECLARE_DISPATCH(reduce_fn, min_values_stub);
+DECLARE_DISPATCH(reduce_fn, max_values_stub);
+DECLARE_DISPATCH(reduce_fn, argmax_stub);
+DECLARE_DISPATCH(reduce_fn, argmin_stub);
+// std/var kernel signature: iterator plus a correction value and a take_sqrt flag.
+using reduce_std_var_function =
+    void (*)(TensorIterator&, double correction, bool take_sqrt);
+DECLARE_DISPATCH(reduce_std_var_function, std_var_stub);
+// Tensor-based norm kernel signature: (Tensor&, const Tensor&, Scalar, optional int64_t).
+using reduce_norm_fn =
+    void (*)(Tensor&, const Tensor&, const c10::Scalar&, c10::optional<int64_t>);
+DECLARE_DISPATCH(reduce_norm_fn, norm_kernel);
+// Iterator-based norm kernel signature: iterator plus one Scalar.
+using reduce_fn_flag = void(*)(TensorIterator &, const c10::Scalar&);
+DECLARE_DISPATCH(reduce_fn_flag, norm_stub);
+// Cumulative kernels: both signatures take two tensors and an int64_t; they differ in the constness of the first tensor.
+using structured_cum_fn = void (*)(const Tensor&, const Tensor&, int64_t);
+using cum_fn = void (*)(Tensor&, const Tensor&, int64_t);
+DECLARE_DISPATCH(structured_cum_fn, cumsum_stub);
+DECLARE_DISPATCH(structured_cum_fn, cumprod_stub);
+DECLARE_DISPATCH(cum_fn, logcumsumexp_stub);
+// aminmax stubs: each takes two mutable Tensor& in addition to the const input tensor.
+DECLARE_DISPATCH(void (*)(const Tensor&, int64_t, bool, Tensor&, Tensor&), aminmax_stub);
+DECLARE_DISPATCH(void (*)(const Tensor&, Tensor&, Tensor&), aminmax_allreduce_stub);
+
+// Used in cuda/Normalization.cu; returns references to the two result tensors as a tuple.
+TORCH_API std::tuple<Tensor&,Tensor&> var_mean_out(
+    Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim,
+    int64_t correction, bool keepdim);
+
+} // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..17e42ebe84a0e8b0906a76ba9c937c6c46027caa
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h
@@ -0,0 +1,55 @@
+/// Tensor-agnostic helper operations used by the core functions of the `SobolEngine`.
+#pragma once  // this header defines inline functions and constants; guard against double inclusion
+#include <ATen/core/Tensor.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/arange.h>
+#include <ATen/ops/mul.h>
+#include <ATen/ops/pow.h>
+#endif
+
+namespace at::native::sobol_utils {
+
+/// Returns the minimum number of bits needed to represent `n`; any `n` <= 0 yields 0.
+inline int64_t bit_length(const int64_t n) {
+  int64_t rest = n, width = 0;
+  while (rest > 0) { rest /= 2; ++width; }
+  return width;
+}
+
+/// Returns the zero-indexed position of the rightmost zero in the bit representation
+/// of `n`: the number of halvings performed while the value satisfies `% 2 == 1`.
+inline int64_t rightmost_zero(const int64_t n) {
+  int64_t rest = n, pos = 0;
+  while (rest % 2 == 1) { rest /= 2; ++pos; }
+  return pos;
+}
+
+/// Returns the `length` bits of `n` starting at bit `pos`. The mask is built in 64-bit
+/// arithmetic: a plain `1 << length` is an `int` shift and is wrong for length >= 31.
+inline int64_t bitsubseq(const int64_t n, const int64_t pos, const int64_t length) {
+  return (n >> pos) & ((int64_t{1} << length) - 1);
+}
+
+/// Multiplies `bmat` by 2^(k-1), ..., 2^0 along its last dimension (k = bmat.size(-1)) and sums over that dimension
+inline at::Tensor cdot_pow2(const at::Tensor& bmat) {
+  const at::Tensor exponents = at::arange(bmat.size(-1) - 1, -1, -1, bmat.options());
+  const at::Tensor weights = at::pow(2, exponents).expand_as(bmat);
+  return at::mul(weights, bmat).sum(-1);
+}
+
+/// All definitions below this point are data. They are constants and should not be
+/// modified without notice.
+
+constexpr int64_t MAXDIM = 21201;  // extent of `poly` and first extent of `initsobolstate`
+constexpr int64_t MAXDEG = 18;  // second extent of `initsobolstate`
+constexpr int64_t MAXBIT = 30;
+constexpr int64_t LARGEST_NUMBER = 1 << MAXBIT;  // 2^30
+constexpr float RECIPD = 1.0 / LARGEST_NUMBER;  // reciprocal of LARGEST_NUMBER, stored as float
+// Tables declared extern here; their definitions are not in this header.
+extern const int64_t poly[MAXDIM];
+extern const int64_t initsobolstate[MAXDIM][MAXDEG];
+
+} // namespace at::native::sobol_utils
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorCompare.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorCompare.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4dfa689b1d216cb697076781935afb81a587fae
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorCompare.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+class Tensor;
+struct TensorIterator;
+struct TensorIteratorBase;
+}
+
+namespace at::native {
+
+using reduce_minmax_fn =
+    void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool);
+using structured_reduce_minmax_fn =
+    void (*)(const Tensor&, const Tensor&, const Tensor&, int64_t, bool);
+// max/min stubs use the structured signature (all three tensors const).
+DECLARE_DISPATCH(structured_reduce_minmax_fn, max_stub);
+DECLARE_DISPATCH(structured_reduce_minmax_fn, min_stub);
+// where: kernel taking a TensorIterator only.
+using where_fn = void (*)(TensorIterator &);
+DECLARE_DISPATCH(where_fn, where_kernel);
+// isposinf / isneginf share one signature taking a TensorIteratorBase.
+using is_infinity_op_fn = void (*)(TensorIteratorBase &);
+DECLARE_DISPATCH(is_infinity_op_fn, isposinf_stub);
+DECLARE_DISPATCH(is_infinity_op_fn, isneginf_stub);
+// mode: same parameter list as reduce_minmax_fn.
+using mode_fn = void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool);
+DECLARE_DISPATCH(mode_fn, mode_stub);
+// clamp with tensor bounds: kernel taking a TensorIteratorBase only.
+using clamp_tensor_fn = void (*)(TensorIteratorBase &);
+DECLARE_DISPATCH(clamp_tensor_fn, clamp_stub);
+
+namespace detail {
+    enum class ClampLimits {Min, Max, MinMax};  // which bound(s) a clamp applies
+}
+// Scalar clamp stubs: the two-bound stub takes Scalars by const reference, the one-bound stubs by value.
+DECLARE_DISPATCH(void (*)(TensorIteratorBase &, const c10::Scalar&, const c10::Scalar&), clamp_scalar_stub);
+DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_min_scalar_stub);
+DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_max_scalar_stub);
+// isin: kernel taking three const tensors and one bool.
+using isin_default_fn = void (*)(const Tensor&, const Tensor&, bool, const Tensor&);
+DECLARE_DISPATCH(isin_default_fn, isin_default_stub);
+
+} // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIterator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..e55d2a58d709926a24467a0056323096e0890fa9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIterator.h
@@ -0,0 +1,2 @@
+#pragma once
+#include <ATen/TensorIterator.h>
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc56fa6457e75bc980747afc9d2d72257d6c093b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h
@@ -0,0 +1,57 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/native/LinearAlgebraUtils.h>
+namespace at::native {
+
+/*
+ * Returns the product of the sizes of all batch dimensions (every dimension except
+ * the last two) of `batched_matrices`, skipping dimensions whose stride is 0.
+ */
+static inline int64_t batchCountTrilTriu(const Tensor& batched_matrices) {
+  const int64_t num_batch_dims = batched_matrices.ndimension() - 2;
+  int64_t count = 1;
+  for (int64_t d = 0; d < num_batch_dims; ++d) {
+    const bool zero_stride = batched_matrices.stride(d) == 0;
+    if (!zero_stride) { count *= batched_matrices.size(d); }
+  }
+  return count;
+}
+
+/* Checks a necessary property for the triu and tril implementations, hence the name.
+ * Returns a (flag, tensor) pair:
+ *  - contiguous input: (true, tensor) when its strides equal
+ *    batched_matrix_contiguous_strides(sizes), else (false, as_strided view with those strides);
+ *  - non-contiguous input with allow_zero_stride and at most 3 dimensions: (true, tensor);
+ *  - otherwise the batch dimensions are walked from innermost to outermost, and the first
+ *    stride that differs from the expected dense stride yields (false, tensor.contiguous()).
+ */
+static inline std::tuple<bool, Tensor> checkTrilTriuBatchContiguous(const Tensor& tensor, bool allow_zero_stride) {
+  if (tensor.is_contiguous()) {
+    auto dense_strides = batched_matrix_contiguous_strides(tensor.sizes());
+    const bool already_dense = tensor.strides() == dense_strides;
+    if (already_dense) {
+      return std::make_tuple(true, tensor);
+    }
+    return std::make_tuple(false, tensor.as_strided(tensor.sizes(), dense_strides));
+  }
+  const int64_t ndim = tensor.dim();
+  if (allow_zero_stride && ndim <= 3) {
+    return std::make_tuple(true, tensor);
+  }
+  int64_t want = tensor.size(-1) * tensor.size(-2);
+  for (int64_t d = ndim - 3; d >= 0; --d) {
+    // With allow_zero_stride, a leading dimension of stride 0 or size 1 is not checked.
+    const bool skip = allow_zero_stride && d == 0 &&
+        (tensor.stride(d) == 0 || tensor.size(d) == 1);
+    if (skip) {
+      continue;
+    }
+    if (tensor.stride(d) != want) {
+      return std::make_tuple(false, tensor.contiguous());
+    }
+    want *= tensor.size(d);
+  }
+  return std::make_tuple(true, tensor);
+}
+
+}  // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IsContiguous.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IsContiguous.h
new file mode 100644
index 0000000000000000000000000000000000000000..192177cc9bcfb0988171fb68554ab56a7120ed4c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IsContiguous.h
@@ -0,0 +1,62 @@
+#pragma once
+
+namespace at { namespace native { inline namespace CPU_CAPABILITY {
+
+// Recursive check: strides[stride_index] must equal sizeof(argument n - 1), or 0 when s == n.
+// n: arity, traits: function_traits (see FunctionTraits.h), s: scalar argument index or -1
+template <int n, int stride_index, typename traits, int s=-1>
+struct IsContiguous {
+  static bool eval(const int64_t* strides) {
+    using arg_t = typename traits::template arg<n - 1>::type;
+    const auto expected = (s == n) ? 0 : sizeof(arg_t);
+    if (strides[stride_index] != expected) { return false; }
+    return IsContiguous<n - 1, stride_index - 1, traits, s>::eval(strides);
+  }
+};
+
+// Base case reached when an output exists: only strides[0] is left, checked against result_type
+template <typename traits, int s>
+struct IsContiguous<0, 0, traits, s> {
+  static bool eval(const int64_t* strides) {
+    return strides[0] == sizeof(typename traits::result_type);
+  }
+};
+
+// Base case reached when there is no output: nothing is left to check, so the result is true
+template <typename traits, int s>
+struct IsContiguous<0, -1, traits, s> {
+  static bool eval(const int64_t* /*strides*/) {
+    return true;
+  }
+};
+
+// output and all inputs are contiguous (void result_type: argument k is checked at strides[k])
+template <typename traits,
+    typename std::enable_if<std::is_void<typename traits::result_type>::value>::type* = nullptr>
+static inline bool is_contiguous(const int64_t* strides) {
+  return IsContiguous<traits::arity, traits::arity - 1, traits>::eval(strides);
+}
+// Non-void result_type: strides[0] is checked against result_type, argument k at strides[k + 1]
+template <typename traits,
+    typename std::enable_if<!std::is_void<typename traits::result_type>::value>::type* = nullptr>
+static inline bool is_contiguous(const int64_t* strides) {
+  return IsContiguous<traits::arity, traits::arity, traits>::eval(strides);
+}
+
+// input at `s` is scalar (stride 0); output and other inputs are contiguous
+// NB: output is typically at strides[0] so first input corresponds to s=1
+template <typename traits, int s,
+    typename std::enable_if<std::is_void<typename traits::result_type>::value>::type* = nullptr>
+static inline bool is_contiguous_scalar(const int64_t* strides) {
+  static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");  // s is 1-based
+  return IsContiguous<traits::arity, traits::arity - 1, traits, s>::eval(strides);
+}
+// Same check for a non-void result_type: the recursion starts one stride slot later.
+template <typename traits, int s,
+    typename std::enable_if<!std::is_void<typename traits::result_type>::value>::type* = nullptr>
+static inline bool is_contiguous_scalar(const int64_t* strides) {
+  static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");  // s is 1-based
+  return IsContiguous<traits::arity, traits::arity, traits, s>::eval(strides);
+}
+
+}}}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee9fac647ad6241c97e28a7af6f091d5d613bc3a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+#include <cstdint>
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using forward_fn = void (*)(const Tensor&, const Tensor&);  // two-tensor kernel signature
+using backward_fn = void(*)(const Tensor &, const Tensor &, const Tensor&);  // three-tensor kernel signature
+// Stubs for the "lastdim" kernels, which take no dimension argument.
+DECLARE_DISPATCH(forward_fn, softmax_lastdim_kernel);
+DECLARE_DISPATCH(forward_fn, log_softmax_lastdim_kernel);
+DECLARE_DISPATCH(backward_fn, softmax_backward_lastdim_kernel);
+DECLARE_DISPATCH(backward_fn, log_softmax_backward_lastdim_kernel);
+// Same tensor parameters as above plus a trailing int64_t.
+using forward_fn_with_dim = void(*)(const Tensor &, const Tensor &, const int64_t);
+using backward_fn_with_dim =
+    void (*)(const Tensor&, const Tensor&, const Tensor&, const int64_t);
+// Stubs for the kernels that take the extra int64_t argument.
+DECLARE_DISPATCH(forward_fn_with_dim, softmax_kernel);
+DECLARE_DISPATCH(forward_fn_with_dim, log_softmax_kernel);
+DECLARE_DISPATCH(backward_fn_with_dim, softmax_backward_kernel);
+DECLARE_DISPATCH(backward_fn_with_dim, log_softmax_backward_kernel);
+}
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e764cc4ce803905a88363f00291a2066d76bb274
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh
@@ -0,0 +1,296 @@
+#pragma once
+#include <ATen/jit_macros.h>
+
+// Jiterator functions are guarded behind this macro
+#if AT_USE_JITERATOR()
+
+#include <ATen/OpMathType.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/core/Array.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/OffsetCalculator.cuh>
+#include <ATen/native/cuda/jit_utils.h>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+#include <ATen/native/cuda/thread_constants.h>
+
+#include <ATen/native/cuda/Loops.cuh>
+
+#include <c10/macros/Macros.h>
+#include <c10/core/ScalarType.h>
+#include <c10/util/SmallBuffer.h>
+
+#include <initializer_list>
+#include <type_traits>
+#include <tuple>
+#include <mutex>
+
+namespace at {
+namespace native {
+
+template <typename Tuple, std::size_t... I>
+constexpr auto tuple_to_array_helper(Tuple& t, std::index_sequence<I...> seq) {  // returns an array of pointers into `t`
+    constexpr auto size = seq.size();
+    (void)t; // warning : unused parameter when tuple is empty.
+    return std::array<void*, size>{static_cast<void*>(&std::get<I>(t))...};  // one void* per index in I, in order
+}
+
+// Converts a tuple into a std::array<void*, N> that holds the address of each
+// tuple element, for passing the arguments to a CUDA kernel.
+// NOTE: the tuple is taken by reference, so the pointers in the returned
+// array are only valid while the tuple is alive.
+// The index sequence below covers every element of the tuple.
+template <typename ...Args>
+constexpr auto tuple_to_array(std::tuple<Args...>& extra_args) {
+    using element_indices = std::make_index_sequence<sizeof...(Args)>;
+    return tuple_to_array_helper(extra_args, element_indices{});
+}
+
+struct JittedVecKernelCache {  // one cached function per vectorization width
+  // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
+  at::cuda::jit::NvrtcFunction vec1;  // used when vec_size == 1
+  at::cuda::jit::NvrtcFunction vec2;  // used when vec_size == 2
+  at::cuda::jit::NvrtcFunction vec4;  // used when vec_size == 4
+};
+
+struct JittedKernelVariantCache {  // one slot per launch case in jitted_gpu_kernel_generic
+  JittedVecKernelCache vec;  // case 1: no dynamic casting, contiguous
+  at::cuda::jit::NvrtcFunction noncontiguous;  // case 2: no dynamic casting, noncontiguous
+  at::cuda::jit::NvrtcFunction dynamic_contiguous;  // case 3: dynamic casting, contiguous
+  at::cuda::jit::NvrtcFunction dynamic_noncontiguous;  // case 4: dynamic casting, noncontiguous
+};
+
+// Concatenates `args` followed by `extra_args` into one buffer of argument pointers.
+inline c10::SmallBuffer<void*, 64> pack_kernel_args(
+    std::initializer_list<void*> args, c10::ArrayRef<void*> extra_args) {
+  c10::SmallBuffer<void*, 64> packed(args.size() + extra_args.size());
+  void** tail = std::copy(args.begin(), args.end(), packed.data());
+  std::copy(extra_args.begin(), extra_args.end(), tail);
+  return packed;
+}
+
+template<typename array_t,
+         typename inp_calc_t,
+         typename out_calc_t,
+         typename loader_t,
+         typename storer_t>
+void launch_jitted_unrolled_kernel(
+    std::mutex &jiterator_mutex,
+    at::cuda::jit::NvrtcFunction &fn_cache,
+    const at::cuda::jit::KernelDescriptor &desc,
+    int64_t N,
+    array_t data,
+    inp_calc_t ic,
+    out_calc_t oc,
+    loader_t l,
+    storer_t s,
+    bool contiguous,
+    at::cuda::jit::BinaryFuncVariant scalar_pos,
+    void* scalar_val,
+    c10::ArrayRef<void*> extra_args) {
+  // Compiles the kernel into `fn_cache` if it is still empty, then launches it over N elements.
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
+  // Casting the result to 32 bits is safe: N was just checked to fit in int32_t and the intermediate is int64
+  const uint32_t grid = (N + block_work_size() - 1) / block_work_size();
+  // fn_cache.function is tested once without the lock and again after taking it, before compiling.
+  if (!fn_cache.function) {
+    const std::lock_guard<std::mutex> lock{jiterator_mutex};
+    if (!fn_cache.function) {
+      constexpr bool dynamic_casting = !std::is_same<decltype(l), memory::LoadWithoutCast>() ||  // derived from the loader/storer types
+                                       !std::is_same<decltype(s), memory::StoreWithoutCast>();
+      auto code = at::cuda::jit::generate_code(
+          desc, contiguous, dynamic_casting, scalar_pos);
+      fn_cache = at::cuda::jit::jit_pwise_function(code, desc.name);
+    }
+  }
+  // Arguments are passed as pointers to the by-value locals, followed by scalar_val and extra_args.
+  auto args = pack_kernel_args({&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args);
+  at::cuda::jit::launch_jitted_pwise_function(fn_cache, args.data(), {grid, 1u, 1u},
+  {num_threads(), 1u, 1u});
+}
+
+template<int arity, typename array_t>
+void launch_jitted_vectorized_kernel(  // picks a cache slot by vec_size, compiles it if empty, then launches
+    std::mutex &jiterator_mutex, JittedVecKernelCache &fn_cache,
+    const at::cuda::jit::KernelDescriptor &desc, int64_t N, array_t data,
+    at::cuda::jit::BinaryFuncVariant scalar_pos,
+    void *scalar_val, c10::ArrayRef<void*> extra_args) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
+  // N is still int64_t for the computation, but it's always safe to cast result to int
+  const uint32_t grid = (N + block_work_size() - 1) / block_work_size();
+  const int vec_size = at::cuda::jit::can_vectorize_up_to(
+      desc, c10::ArrayRef<char*>(data.data, data.size()));
+
+  // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
+  //   fn_ptr is set to the appropriate function based on the vec size and GPU used
+  at::cuda::jit::NvrtcFunction* fn_ptr;
+  if (vec_size == 4) {
+    fn_ptr = &fn_cache.vec4;
+  } else if (vec_size == 2) {
+    fn_ptr = &fn_cache.vec2;
+  } else if (vec_size ==1) {
+    fn_ptr = &fn_cache.vec1;
+  } else {
+    TORCH_INTERNAL_ASSERT(false, "unexpected vec_size for jitter vectorized kernel");  // any other vec_size is rejected
+  }
+
+  bool vectorized = vec_size > 1;
+  // The slot is tested once without the lock and again after taking it, before compiling.
+  if (!fn_ptr->function) {
+    const std::lock_guard<std::mutex> lock{jiterator_mutex};
+    if (!fn_ptr->function) { // cache miss!
+
+      // Generates program
+      auto code = at::cuda::jit::generate_code(
+          desc, /*contiguous=*/true, /*dynamic_casting=*/false,
+          scalar_pos, vectorized, vec_size);
+      std::string kernel_name = vectorized ? desc.name + "_vectorized" + std::to_string(vec_size) : desc.name;  // suffix only when vectorized
+
+      // Acquires the program
+      *fn_ptr = at::cuda::jit::jit_pwise_function(code, kernel_name);
+    }
+  }
+  // The vectorized launch passes only N, data and the scalar; the vec_size == 1 launch also passes calculators and loader/storer.
+  if (vectorized) {
+    auto args = pack_kernel_args({&N, &data, scalar_val}, extra_args);
+    at::cuda::jit::launch_jitted_pwise_function(
+        *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u});
+  } else {
+// NVCC complains about unused variables l and s.
+// It should be false positive in most cases, so we suppress the warnings.
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177
+    auto ic = TrivialOffsetCalculator<arity>();
+    auto oc = TrivialOffsetCalculator<1>();
+    auto l = memory::LoadWithoutCast();
+    auto s = memory::StoreWithoutCast();
+
+    auto args = pack_kernel_args(
+        {&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args);
+    at::cuda::jit::launch_jitted_pwise_function(
+        *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u});
+#pragma nv_diagnostic pop
+  }
+}
+
+template <int arity>
+void jitted_gpu_kernel_generic(  // chooses one of four launch paths from (dynamic_casting, iter.is_contiguous())
+    std::mutex &jiterator_mutex,
+    JittedKernelVariantCache &cache,
+    const at::cuda::jit::KernelDescriptor &desc,
+    at::cuda::jit::BinaryFuncVariant scalar_pos,
+    c10::ArrayRef<void*> extra_args,
+    TensorIteratorBase& iter,
+    const bool dynamic_casting,
+    void *scalar_val) {
+  TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+  // Collects the data pointer of every iterator tensor: arity inputs plus the single output.
+  constexpr int ntensors = arity + 1;
+  at::detail::Array<char*, ntensors> data;
+  for (auto i : c10::irange(ntensors)) {
+    data[i] = (char*)iter.data_ptr(i);
+  }
+
+  int64_t numel = iter.numel();
+  bool contiguous = iter.is_contiguous();
+
+  // Decides which of 4 kernel types to launch
+  // Variations are:
+  //   - Case 1: no dynamic casting and contiguous
+  //   - Case 2: no dynamic casting and noncontiguous
+  //   - Case 3: dynamic casting and contiguous
+  //   - Case 4: dynamic casting and noncontiguous
+  // These cases align with the non-jitted CUDALoops.cuh cases in gpu_kernel_impl
+
+  if (!dynamic_casting) {
+    if (contiguous) {
+      // Case 1: no dynamic casting and contiguous
+      launch_jitted_vectorized_kernel<arity>(
+          jiterator_mutex, cache.vec, desc,
+          numel, data, scalar_pos, scalar_val, extra_args);
+      return;
+    }
+
+    // Case 2: no dynamic casting and noncontiguous
+    auto input_offset_calculator = make_input_offset_calculator<arity>(iter);
+    auto output_offset_calculator = make_output_offset_calculator(iter);
+    auto loader = memory::LoadWithoutCast();
+    auto storer = memory::StoreWithoutCast();
+    launch_jitted_unrolled_kernel(
+        jiterator_mutex, cache.noncontiguous, desc, numel, data,
+        input_offset_calculator, output_offset_calculator, loader,
+        storer, contiguous, scalar_pos, scalar_val, extra_args);  // contiguous is false on this path
+    return;
+  }
+
+  // Cases 3 and 4 are handled below
+  // Both require construction of a storer (this asserts 1 output) and one or more loaders
+
+  // Creates store cast to output (the zeroth tensor in TensorIterator)
+  auto storer = memory::StoreWithCast<1>(iter);
+
+  // Creates load casts from inputs (note offset indexing into the iterators 1...n tensors)
+  auto loader = memory::LoadWithCast<arity>(iter);
+
+  if (contiguous) {
+    // Case 3: dynamic casting and contiguous
+    auto input_offset_calculator = TrivialOffsetCalculator<arity>();
+    auto output_offset_calculator = TrivialOffsetCalculator<1>();
+    launch_jitted_unrolled_kernel(
+        jiterator_mutex, cache.dynamic_contiguous, desc, numel, data, input_offset_calculator,
+        output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args);
+    return;
+  }
+
+  // Case 4: dynamic casting and noncontiguous
+  auto input_offset_calculator = make_input_offset_calculator<arity>(iter);
+  auto output_offset_calculator = make_output_offset_calculator(iter);
+  launch_jitted_unrolled_kernel(
+      jiterator_mutex, cache.dynamic_noncontiguous, desc, numel, data, input_offset_calculator,
+      output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args);
+}
+
+// NOTE: static to reduce chances of name collision.
+template <
+    char const* name,
+    typename result_type,
+    typename f_inputs_type,
+    int arity,
+    at::cuda::jit::BinaryFuncVariant scalar_pos =
+        at::cuda::jit::BinaryFuncVariant::NoScalar,
+    typename... ExtraArgs>
+static void jitted_gpu_kernel_impl(
+    TensorIteratorBase& iter,
+    const std::string &f,
+    const bool dynamic_casting,
+    at::opmath_type<f_inputs_type> scalar_val,
+    std::tuple<ExtraArgs...> extra_args) {
+  // The mutex, caches and descriptor below are function-local statics: one set per template instantiation.
+  // TODO: Memory use can probably be optimized by re-using kernels across GPUs with
+  //   the same compute capability
+  static std::mutex jiterator_mutex;
+  static std::vector<JittedKernelVariantCache> device_caches(c10::cuda::device_count());  // sized once, on the first call
+
+  constexpr int nInputs = arity;
+  constexpr int nOutputs = 1;  // TODO: Support more than 1 output
+  static const auto desc = at::cuda::jit::make_kernel_descriptor<  // built once; `f` from later calls is not used to rebuild it
+    result_type, f_inputs_type, ExtraArgs...>(name, f, nInputs, nOutputs);
+  // Selects the cache by the iterator's device index (the index is not range-checked here).
+  auto &cache = device_caches[iter.device().index()];
+  auto extra_args_array = tuple_to_array(extra_args);  // pointers into the by-value `extra_args` parameter
+  return jitted_gpu_kernel_generic<arity>(
+      jiterator_mutex,
+      cache,
+      desc,
+      scalar_pos,
+      extra_args_array,
+      iter,
+      dynamic_casting,
+      &scalar_val
+    );
+}
+
+}}  // at::native
+
+#endif // AT_USE_JITERATOR()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h
new file mode 100644
index 0000000000000000000000000000000000000000..aace9c30b0a7e9d08de71c4baf1490d45ff6d36e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h
@@ -0,0 +1,32 @@
+#pragma once
+#include <array>
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at {
+namespace native {
+// Grid-sampler kernel launchers. Declarations only: no definition lives in this header.
+void launch_grid_sampler_2d_forward_kernel(
+    const TensorBase &output, const TensorBase &input, const TensorBase &grid,
+    int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+// 3-D forward launcher; takes the same parameter list as the 2-D one.
+void launch_grid_sampler_3d_forward_kernel(
+    const TensorBase &output, const TensorBase &input, const TensorBase &grid,
+    int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+// Backward launchers add the gradient tensors and a two-element output_mask.
+void launch_grid_sampler_2d_backward_kernel(
+    const TensorBase &grad_input, const TensorBase &grad_grid,
+    const TensorBase &grad_output, const TensorBase &input,
+    const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
+    bool align_corners, std::array<bool, 2> output_mask);  // presumably selects grad_input / grad_grid — confirm in the definition
+
+void launch_grid_sampler_3d_backward_kernel(
+    const TensorBase &grad_input, const TensorBase &grad_grid,
+    const TensorBase &grad_output, const TensorBase &input,
+    const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
+    bool align_corners, std::array<bool, 2> output_mask);
+
+}}  // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f0a620e27fcd5d55520fa672beb34faf8eaa5510
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh
@@ -0,0 +1,384 @@
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+#include <c10/core/DynamicCast.h>
+#include <c10/util/Exception.h>
+#include <c10/util/TypeCast.h>
+#include <c10/macros/Macros.h>
+#include <ATen/core/Array.h>
+#include <ATen/detail/FunctionTraits.h>
+#include <ATen/cuda/detail/OffsetCalculator.cuh>
+#include <ATen/native/cuda/thread_constants.h>
+
+#include <thrust/tuple.h>
+
+// References:
+// https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access/
+
+namespace at { namespace native { namespace memory {
+
+namespace detail {
+
+// What does the `static_unroll` do?
+//
+// We want to do something like:
+//
+//    using args_t = typename traits::ArgsTuple;
+//    args_t args;
+//    #pragma unroll
+//    for (int i = 0; i < traits::arity; i++) {
+//      std::get<i>(args) = ....
+//    }
+//
+// but unfortunately the above code does not work because
+// the template argument has to be a compile time constant
+// so `static_unroll` is created to simulate `#pragma unroll`
+// using template metaprogramming.
+
+template<template<int i> typename func, int end, int current=0>
+struct static_unroll {
+  template<typename... Args>
+  static inline C10_HOST_DEVICE void with_args(Args&&... args) {
+    func<current>::apply(std::forward<Args>(args)...);  // run step `current` of the compile-time loop
+    static_unroll<func, end, current+1>::with_args(args...);  // recurse with args as lvalues (already forwarded above, so an rvalue may be moved-from)
+  }
+};
+
+template<template<int i> typename func, int end>
+struct static_unroll<func, end, end> {  // recursion terminator: selected when current == end
+  template<typename... Args>
+  static inline C10_HOST_DEVICE void with_args(Args... args) {}  // nothing left to apply
+};
+
+// helper structs to be used with static_unroll to load arguments
+// one by one
+
+template<int arg_index>
+struct vectorized_load_helper {
+  template <typename args_t, typename policy_t>
+  static __device__ void apply(policy_t &self, args_t *args, int idx) {
+    using arg_t = std::tuple_element_t<arg_index, args_t>;
+    // `data` holds the data_ptr for tensors [output, input0, input1, ...], so we
+    // need a +1 offset to get the input
+    auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + block_work_size() * idx;  // first element of block `idx` for this input
+    auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get<arg_index>(args[thread_unroll_idx]); };  // reference to this argument's slot in the given tuple
+    self.load_single_arg(args_accessor, ptr);  // the policy does the actual loads
+  }
+};
+
+template<int arg_index>
+struct unroll_load_helper {
+  template <typename args_t, typename policy_t, typename offset_t, typename loader_t>
+  static __device__ void apply(policy_t &self, args_t *args, offset_t offset, loader_t loader, int j, int num_outputs) {
+    using arg_t = std::tuple_element_t<arg_index, args_t>;
+    // `data` holds the data_ptr for tensors [outputs..., input0, input1, ...], so we
+    // need a +num_outputs offset to get the input
+    std::get<arg_index>(args[j]) = loader.template load<arg_t>(self.data[arg_index + num_outputs], offset[arg_index], arg_index);  // fills argument `arg_index` of the j-th tuple
+  }
+};
+
+template <int current>
+struct multi_outputs_store_helper {
+  template<int ntensors, int num_outputs, typename ...Args>
+  C10_HOST_DEVICE static void apply(
+      at::detail::Array<char*, ntensors> data,
+      at::detail::Array<uint32_t, num_outputs> offsets,
+      thrust::tuple<Args...> ret) {
+    using T = typename thrust::tuple_element<current, thrust::tuple<Args...>>::type;  // type of the `current`-th tuple element
+    T *to = reinterpret_cast<T *>(data[current]) + offsets[current];  // offset is applied in elements of T, not bytes
+    *to = thrust::get<current>(ret);  // plain assignment of the `current`-th result
+  }
+};
+
+}  // namespace detail
+
+struct LoadWithoutCast {
+  template<typename scalar_t>
+  __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) {  // `arg` is not used by this loader
+    return c10::load(reinterpret_cast<scalar_t *>(base_ptr) + offset);  // `offset` is in elements of scalar_t
+  }
+};
+
+template <int N>
+struct LoadWithCast {
+  using array_t = at::detail::Array<at::ScalarType, std::max<int>(N, 1)>;  // at least one slot, even when N == 0
+  using size_array_t = at::detail::Array<uint32_t, std::max<int>(N, 1)>;
+
+  array_t dtypes;  // iter.dtype() of each input
+  size_array_t element_sizes;  // c10::elementSize of each input dtype
+
+  LoadWithCast(const TensorIteratorBase& iter) {
+    CUDA_KERNEL_ASSERT(iter.ninputs() == N);
+    #pragma unroll
+    for (auto i = 0; i < N; ++i) {
+      this->dtypes[i] = iter.dtype(i + iter.noutputs());  // input i is operand i + noutputs() of the iterator
+      element_sizes[i] = c10::elementSize(iter.dtype(i + iter.noutputs()));
+    }
+  }
+
+  template<typename scalar_t>
+  __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) {
+    void *ptr = base_ptr + element_sizes[arg] * offset;  // element offset scaled to bytes for input `arg`
+    return c10::fetch_and_cast<scalar_t>(dtypes[arg], ptr);  // uses the dtype recorded for input `arg`
+  }
+};
+
+struct StoreWithoutCast {
+  template<typename scalar_t>
+  __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) {  // `arg` is not used by this storer
+    *(reinterpret_cast<scalar_t *>(base_ptr) + offset) = value;  // `offset` is in elements of scalar_t
+  }
+};
+
+template <int N = 1>
+struct StoreWithCast {
+  using array_t = at::detail::Array<at::ScalarType, std::max<int>(N, 1)>;  // at least one slot, even when N == 0
+  using size_array_t = at::detail::Array<uint32_t, std::max<int>(N, 1)>;
+
+  array_t dtypes;  // iter.dtype() of each output
+  size_array_t element_sizes;  // c10::elementSize of each output dtype
+
+  StoreWithCast(const TensorIteratorBase& iter) {
+    CUDA_KERNEL_ASSERT(iter.noutputs() == N);
+    #pragma unroll
+    for (auto i = 0; i < N; ++i) {
+      this->dtypes[i] = iter.dtype(i);  // output i is operand i of the iterator
+      element_sizes[i] = c10::elementSize(iter.dtype(i));
+    }
+  }
+
+  template<typename scalar_t>
+  __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) {
+    void *ptr = base_ptr + element_sizes[arg] * offset;  // element offset scaled to bytes for output `arg`
+    c10::cast_and_store<scalar_t>(dtypes[arg], ptr, value);  // uses the dtype recorded for output `arg`
+  }
+};
+
+// aligned vector generates vectorized load/store on CUDA
+template<typename scalar_t, int vec_size>
+struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {  // alignment equals the total size of the payload
+  scalar_t val[vec_size];  // the vec_size scalars, in order
+};
+
+template <int vec_size, typename scalar_t>
+__device__ aligned_vector<scalar_t, vec_size> load_vector(const scalar_t *base_ptr, uint32_t offset) {
+  using vec_t = aligned_vector<scalar_t, vec_size>;
+  auto *from = reinterpret_cast<const vec_t *>(base_ptr);  // view the scalar array as an array of vectors
+  return from[offset];  // `offset` counts whole vectors, not scalars
+}
+
+template <int vec_size>
+__device__ aligned_vector<bool, vec_size> load_vector(const bool *base_ptr, uint32_t offset) {
+  // See NOTE [Loading boolean values]
+  auto tmp = load_vector<vec_size>(reinterpret_cast<const uint8_t*>(base_ptr), offset);  // read the storage as uint8_t first
+  aligned_vector<bool, vec_size> ret;
+  for (int i = 0; i < vec_size; ++i) {
+    ret.val[i] = bool(tmp.val[i]);  // any non-zero byte converts to true
+  }
+  return ret;
+}
+
+namespace policies {
+
+// Assumption:
+// all tensors are contiguous, that is: stride == sizeof(type) for all tensors
+template<typename data_t, typename inp_calc_t, typename out_calc_t, typename loader_t, typename storer_t, int num_outputs = 1>
+struct unroll {
+  // Load/store policy that resolves each element through the offset calculators and checks it against `remaining`.
+  data_t data;  // data pointers; store() writes through data[0]
+  int remaining;  // upper bound that per-thread element indices are checked against
+  inp_calc_t input_offset_calculator;
+  out_calc_t output_offset_calculator;
+  loader_t loader;
+  storer_t storer;
+
+  __device__ unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s):
+    data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {}
+
+  __device__ inline bool check_inbounds(int thread_work_elem) {
+    return ((int)(threadIdx.x  + thread_work_elem*num_threads()) < remaining);  // index of this thread's thread_work_elem-th item
+  }
+
+  template<typename args_t>
+  __device__ inline void load(args_t *args, int idx) {
+    constexpr int arity = std::tuple_size<args_t>::value;
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < thread_work_size(); i++) {
+      if (thread_idx >= remaining) {
+        return;  // the rest of this thread's items are out of range as well
+      }
+      int linear_idx = thread_idx + block_work_size() * idx;  // position within the whole problem for block `idx`
+      auto offset = input_offset_calculator.get(linear_idx);
+      detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);  // fills every argument of args[i]
+      thread_idx += num_threads();  // consecutive items of one thread are num_threads() apart
+    }
+  }
+
+  template<typename scalar_t>
+  __device__ inline void store(scalar_t *from, int idx) {
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < thread_work_size(); i++) {
+      if (thread_idx >= remaining) {
+        return;
+      }
+      int linear_idx = thread_idx + block_work_size() * idx;
+      int offset = output_offset_calculator.get(linear_idx)[0];  // only the first output offset is used
+      storer.store(from[i], data[0], offset);
+      thread_idx += num_threads();
+    }
+  }
+};
+
+// Assumption:
+// all tensors are contiguous, that is: stride == sizeof(type) for all tensors
+// Note:
+// Functions in vectorized policy do not do boundary check. It assumes the whole block
+// has its job to do. So the remainders should be handled by the caller manually.
+template <int vec_size, typename data_t>  // vec_size: number of scalars, can be 1, 2, or 4.
+struct vectorized {
+
+  static_assert(thread_work_size() % vec_size == 0, "The workload per thread must be a multiple of vec_size");
+  static constexpr int loop_size = thread_work_size() / vec_size;  // vector loads/stores per thread
+
+  data_t data;  // data pointers; store() writes through data[0]
+
+  __device__ vectorized(data_t data) : data(data) {}
+
+  __device__ inline constexpr bool check_inbounds(int thread_work_elem) {
+    return true;  // this policy never bounds-checks
+  }
+
+  template<typename accessor_t, typename scalar_t>
+  __device__ inline void load_single_arg(accessor_t to, scalar_t *from) {
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < loop_size; i++) {
+      int index = thread_idx + i * num_threads();  // index in units of whole vectors
+      auto v = load_vector<vec_size>(from, index);
+      #pragma unroll
+      for (int j = 0; j < vec_size; j++) {
+        to(vec_size * i + j) = v.val[j];  // scatter the vector into the thread's per-item slots
+      }
+    }
+  }
+
+  template<typename args_t>
+  __device__ inline void load(args_t *args, int idx) {
+    constexpr int arity = std::tuple_size<args_t>::value;
+    detail::static_unroll<detail::vectorized_load_helper, arity>::with_args(*this, args, idx);  // one load_single_arg per argument
+  }
+
+  template<typename scalar_t>
+  __device__ inline void store(scalar_t *from, int idx) {
+    using vec_t = aligned_vector<scalar_t, vec_size>;
+    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size() * idx;  // first output element of block `idx`
+    vec_t *to_ = reinterpret_cast<vec_t *>(to);
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < loop_size; i++) {
+      int index = thread_idx + i * num_threads();  // index in units of whole vectors
+      vec_t v;
+      for (int j = 0; j < vec_size; j++) {
+        v.val[j] = from[vec_size * i + j];  // gather vec_size results into one vector
+      }
+      to_[index] = v;  // single vector-wide store
+    }
+  }
+};
+
+template <typename data_t, typename inp_calc_t, typename out_calc_t, int num_outputs>
+struct multi_outputs_unroll {
+  //multi_outputs_unroll struct members and check_inbounds and load methods are copypasted from unroll struct
+  //we don't use inheritance because of compiler bug in cuda 10.2+
+  data_t data;  // data pointers; the first num_outputs entries are written by store()
+  int remaining;  // upper bound that per-thread element indices are checked against
+  inp_calc_t input_offset_calculator;
+  out_calc_t output_offset_calculator;
+  LoadWithoutCast loader;  // fixed: this policy never casts on load
+  StoreWithoutCast storer;
+
+  __device__ multi_outputs_unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc):
+  data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {}
+
+  __device__ inline bool check_inbounds(int thread_work_elem) {
+    return ((int)(threadIdx.x  + thread_work_elem*num_threads()) < remaining);
+  }
+
+  template<typename args_t>
+  __device__ inline void load(args_t *args, int idx) {
+    constexpr int arity = std::tuple_size<args_t>::value;
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < thread_work_size(); i++) {
+      if (thread_idx >= remaining) {
+        return;
+      }
+      int linear_idx = thread_idx + block_work_size() * idx;
+      auto offset = input_offset_calculator.get(linear_idx);
+      detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);  // inputs start at data[num_outputs]
+      thread_idx += num_threads();
+    }
+  }
+
+
+  template <typename return_t>
+  __device__ inline void store(return_t *from, int idx) {
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < thread_work_size(); i++) {
+      if (thread_idx >= this->remaining) {
+        return;
+      }
+      int linear_idx = thread_idx + block_work_size() * idx;
+      auto offsets = this->output_offset_calculator.get(linear_idx);  // one offset per output
+      memory::detail::static_unroll<detail::multi_outputs_store_helper, num_outputs>::with_args(this->data, offsets, from[i]);  // writes element k of from[i] to output k
+      thread_idx += num_threads();
+    }
+  }
+};
+
+}  // namespace policies
+
+// This is only used in host, but we will wrap this into some templates
+// which is C10_HOST_DEVICE, so we have to make this C10_HOST_DEVICE
+// in order to compile
+template<typename scalar_t>
+inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) {
+  // Widest vector width (4, 2, or 1 scalars) whose alignment `pointer` satisfies.
+  constexpr int align4 = std::alignment_of<aligned_vector<scalar_t, 4>>::value;
+  constexpr int align2 = std::alignment_of<aligned_vector<scalar_t, 2>>::value;
+  const uint64_t addr = reinterpret_cast<uint64_t>(pointer);
+  if (addr % align4 == 0) {
+    return 4;
+  }
+  const bool two_ok = (addr % align2 == 0);
+  return two_ok ? 2 : 1;
+}
+
+template<int i>
+struct can_vectorize_up_to_helper {
+  template <typename array_t, typename traits>
+  static C10_HOST_DEVICE void apply(int &result, array_t pointers, traits _) {  // `_` only carries the traits type
+    using arg_t = typename traits::template arg<i>::type;
+    // `pointers` holds the data_ptr for tensors [output, input0, input1, ...], so we
+    // need a +1 offset to get the input
+    result = std::min<int>(result, can_vectorize_up_to<arg_t>(pointers[i + 1]));  // keep the narrowest width seen so far
+  }
+};
+
+template<typename func_t, typename array_t>
+inline int can_vectorize_up_to(array_t pointers) {
+  using traits = function_traits<func_t>;
+  using return_t = typename traits::result_type;
+  constexpr int arity = traits::arity;
+  int result = can_vectorize_up_to<return_t>(pointers[0]);  // start from the output pointer's limit
+  // We need to get the type for each argument of `func_t`, this can only
+  // be done at compile time.
+  detail::static_unroll<can_vectorize_up_to_helper, arity>::with_args(result, pointers, traits());  // lowers `result` per input pointer
+  return result;  // minimum width over the output and all inputs
+}
+
+}}} // namespace at::native::memory
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..17f14444abd14a03de30f57d3be7254f51a957f9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh
@@ -0,0 +1,379 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+#include <vector>
+
+namespace at::native {
+
+namespace {
+
+static constexpr int64_t kILP = 4;
+static constexpr int64_t kChunkSize = 65536;
+static constexpr int64_t kBlockSize = 512;
+
+// TODO(crcrpar): Add `n>5` for `low prec params & their higher prec copy`
+// TensorListMetadata has to be < 4KB - the limit for kernel launch argument
+static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
+static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
+static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30};
+static constexpr int depth_to_max_tensors_scalarlist_of_complex_double[2] = {
+    72,
+    60};
+
+template <typename T>
+__device__ __forceinline__ bool is_aligned(T* p) {
+  return ((uint64_t)p) % (kILP * sizeof(T)) == 0;  // true when p is a multiple of kILP elements' worth of bytes
+}
+
+template <typename T>
+__device__ __forceinline__ void load_store(
+    T* dst,
+    T* src,
+    int64_t dst_offset,  // in units of kILP-element vectors, not single elements
+    int64_t src_offset) {
+  using LT = at::native::memory::aligned_vector<T, kILP>;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];  // copies kILP elements as one aligned_vector assignment
+}
+
+template <int n>
+struct TensorListMetadata {
+  const void* addresses[n][depth_to_max_tensors[n - 1]];  // [tensor list][tensor slot] data pointers
+  int64_t numel_for_tensor[depth_to_max_tensors[n - 1]];  // element count per tensor slot
+  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];  // per block: tensor slot it works on
+  int block_to_chunk[depth_to_max_blocks[n - 1]];  // per block: chunk index within that tensor
+  int start_tensor_this_launch;  // set by multi_tensor_apply to the list index of the launch's first tensor
+};
+
+template <typename scalar_vals_t, int n>
+struct TensorListScalarListMetadata {
+  const void* addresses[n][depth_to_max_tensors_scalarlist[n - 1]];  // [tensor list][tensor slot] data pointers
+  int64_t numel_for_tensor[depth_to_max_tensors_scalarlist[n - 1]];  // element count per tensor slot
+  scalar_vals_t scalar_vals[depth_to_max_tensors_scalarlist[n - 1]];  // one scalar per tensor slot
+  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];  // per block: tensor slot it works on
+  int block_to_chunk[depth_to_max_blocks[n - 1]];  // per block: chunk index within that tensor
+};
+
+// note(mkozuki): `n` of 1&2 violate the limit of cuda kernel argument size of
+// 4kb with `c10::complex<double>`
+template <>
+struct TensorListScalarListMetadata<c10::complex<double>, 1> {  // same layout as the primary template, with 72 tensor slots
+  const void* addresses[1]
+                       [depth_to_max_tensors_scalarlist_of_complex_double[0]];
+  int64_t
+      numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[0]];
+  c10::complex<double>
+      scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[0]];
+  unsigned char block_to_tensor[depth_to_max_blocks[1 - 1]];  // block count is the same as for other scalar types
+  int block_to_chunk[depth_to_max_blocks[1 - 1]];
+};
+
+template <>
+struct TensorListScalarListMetadata<c10::complex<double>, 2> {  // same layout as the primary template, with 60 tensor slots
+  const void* addresses[2]
+                       [depth_to_max_tensors_scalarlist_of_complex_double[1]];
+  int64_t
+      numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[1]];
+  c10::complex<double>
+      scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[1]];
+  unsigned char block_to_tensor[depth_to_max_blocks[2 - 1]];  // block count is the same as for other scalar types
+  int block_to_chunk[depth_to_max_blocks[2 - 1]];
+};
+
+// NOTE(crcrpar): This is a conservative resolution to handle `state_steps`
+// whose each element is `at::Tensor` of 1 element representing the number of
+// `step`s called so far.
+template <int n>
+struct FusedOptimizerTensorListMetadata {
+  const void* addresses[n][depth_to_max_tensors[n - 1]];  // [tensor list][tensor slot] data pointers
+  int64_t numel_for_tensor[depth_to_max_tensors[n - 1]];  // element count per tensor slot
+  const void* state_steps_addresses[depth_to_max_tensors_scalarlist[n - 1]];  // NOTE(review): 96 slots at n == 1 vs 110 in `addresses`; equal for n >= 2
+  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];  // per block: tensor slot it works on
+  int block_to_chunk[depth_to_max_blocks[n - 1]];  // per block: chunk index within that tensor
+  int start_tensor_this_launch;
+};
+
+template <typename T, typename U, typename... ArgTypes>
+C10_LAUNCH_BOUNDS_1(kBlockSize)
+__global__ void multi_tensor_apply_kernel(
+    T tensorListMeta,  // passed by value as a kernel argument
+    U callable,
+    ArgTypes... args) {
+  // Hand the chunk information to the user-supplied functor to process however
+  // it likes.
+  callable(kChunkSize, tensorListMeta, args...);  // kChunkSize is always the first argument the functor receives
+}
+
+} // namespace
+
+// multi_tensor_apply enables horizontal fusion across lists of tensors.
+// For example, whereas you once had a for-loop of a + b = c, where a, b,
+// and c are individual tensors in lists as, bs, and cs, you can now with
+// fewer kernel launches compute as + bs = cs.
+//
+// You can also imagine bs to be a scalar list vs a tensor list.
+//
+// The function below takes in tensor lists, scalars, and a callable and
+// chunks up the computation to launch as few kernels as possible by iterating
+// through every "chunk" in every tensor (thus the nested for loops). In the
+// simplest case, everything gets bundled into just one kernel launch, but
+// due to blocksize constraints, we may need to launch multiple kernels.
+// Each kernel launch is defined by one tensorListMeta construct, which we
+// use to track and reset the necessary metadata for each launch.
+template <int depth, typename scalar_T, typename T, typename... ArgTypes>
+void multi_tensor_apply(
+    std::vector<std::vector<at::Tensor>>& tensor_lists,
+    at::ArrayRef<Scalar> scalars,  // indexed with the same index as the tensors in tensor_lists[0]
+    T callable,
+    ArgTypes... args) {
+  TORCH_CHECK(
+      tensor_lists.size() == depth,
+      "Number of tensor lists has to match the depth.");
+  const size_t n_tensors = tensor_lists[0].size();
+  using scalar_vals_t = typename T::opmath_t;  // storage type of the per-tensor scalars
+  TensorListScalarListMetadata<scalar_vals_t, depth> tensorListMeta;
+
+  int loc_block_info = 0;  // blocks queued for the next launch
+  int loc_tensor_info = 0;  // tensor slots filled for the next launch
+  for (size_t t = 0; t < n_tensors; t++) {
+    // short-circuit to avoid adding empty tensors to tensorListMeta
+    if (tensor_lists[0][t].numel() == 0) {
+      continue;
+    }
+    tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t].to<scalar_T>();  // converted to scalar_T, stored as scalar_vals_t
+    tensorListMeta.numel_for_tensor[loc_tensor_info] =
+        tensor_lists[0][t].numel();
+    for (int d = 0; d < depth; d++) {
+      tensorListMeta.addresses[d][loc_tensor_info] =
+          tensor_lists[d][t].const_data_ptr();
+    }
+    loc_tensor_info++;
+
+    // now we enter [chunking territory].
+    // we will launch a kernel when EITHER the blocks get filled up OR
+    // the tensors get filled up. There will always be at least one block
+    // per tensor since the zero-sized ones will not enter the loop, so
+    // the nested forloop within represents iterating through the chunks
+    // of a single tensor.
+    const auto numel = tensor_lists[0][t].numel();
+    const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0);  // ceil(numel / kChunkSize)
+    for (auto chunk = 0; chunk < chunks; chunk++) {
+      tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;  // slot of the tensor added just above
+      tensorListMeta.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;
+
+      // a tensor is not considered full unless all its chunks have been
+      // processed
+      const bool tensors_full =
+          (loc_tensor_info == depth_to_max_tensors_scalarlist[depth - 1] &&
+           chunk == chunks - 1);
+      const bool blocks_full =
+          (loc_block_info == depth_to_max_blocks[depth - 1]);
+
+      if (tensors_full || blocks_full) {
+        multi_tensor_apply_kernel<<<
+            loc_block_info,
+            kBlockSize,
+            0,
+            at::cuda::getCurrentCUDAStream()>>>(
+            tensorListMeta, callable, args...);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+        // Reset.
+        loc_block_info = 0;
+        // all chunks have already been handled in the kernel
+        if (chunk == chunks - 1) {
+          loc_tensor_info = 0;
+        } else { // blocks were full and tensor chunks remain
+          tensorListMeta.numel_for_tensor[0] =
+              tensorListMeta.numel_for_tensor[loc_tensor_info - 1];  // carry the unfinished tensor over into slot 0
+          tensorListMeta.scalar_vals[0] =
+              tensorListMeta.scalar_vals[loc_tensor_info - 1];
+          for (int d = 0; d < depth; d++) {
+            tensorListMeta.addresses[d][0] =
+                tensorListMeta.addresses[d][loc_tensor_info - 1];
+          }
+          loc_tensor_info = 1;
+        }
+      }
+    }
+  }
+
+  // note: [finishing what we started]
+  // if there's remaining work to be done but the tensors/blocks aren't full
+  // yet we are at the end, submit the kernel to do the work!
+  if (loc_block_info != 0) {
+    multi_tensor_apply_kernel<<<
+        loc_block_info,
+        kBlockSize,
+        0,
+        at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+}
+
+template <int depth, typename T, typename... ArgTypes>
+void multi_tensor_apply(
+    std::vector<std::vector<at::Tensor>>& tensor_lists,
+    T callable,
+    ArgTypes... args) {
+  TORCH_CHECK(
+      tensor_lists.size() == depth,
+      "Number of tensor lists has to match the depth.");
+  const size_t n_tensors = tensor_lists[0].size();
+  TensorListMetadata<depth> tensorListMeta;
+  tensorListMeta.start_tensor_this_launch = 0;  // list index of the first tensor in the upcoming launch
+
+  int loc_block_info = 0;  // blocks queued for the next launch
+  int loc_tensor_info = 0;  // tensor slots filled for the next launch
+  for (size_t t = 0; t < n_tensors; t++) {
+    // short-circuit to avoid adding empty tensors to tensorListMeta
+    if (tensor_lists[0][t].numel() == 0) {
+      continue;
+    }
+    tensorListMeta.numel_for_tensor[loc_tensor_info] =
+        tensor_lists[0][t].numel();
+    for (int d = 0; d < depth; d++) {
+      tensorListMeta.addresses[d][loc_tensor_info] =
+          tensor_lists[d][t].const_data_ptr();
+    }
+    loc_tensor_info++;
+
+    // see note: [chunking territory].
+    const auto numel = tensor_lists[0][t].numel();
+    const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0);  // ceil(numel / kChunkSize)
+    for (auto chunk = 0; chunk < chunks; chunk++) {
+      tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;  // slot of the tensor added just above
+      tensorListMeta.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;
+
+      const bool tensors_full =
+          (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
+           chunk == chunks - 1);  // a tensor only counts once its last chunk is queued
+      const bool blocks_full =
+          (loc_block_info == depth_to_max_blocks[depth - 1]);
+
+      if (tensors_full || blocks_full) {
+        multi_tensor_apply_kernel<<<
+            loc_block_info,
+            kBlockSize,
+            0,
+            at::cuda::getCurrentCUDAStream()>>>(
+            tensorListMeta, callable, args...);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+        // Reset.
+        loc_block_info = 0;
+        if (chunk == chunks - 1) {
+          loc_tensor_info = 0;
+          tensorListMeta.start_tensor_this_launch = t + 1;  // tensor t is finished; next launch starts after it
+        } else {
+          tensorListMeta.numel_for_tensor[0] =
+              tensorListMeta.numel_for_tensor[loc_tensor_info - 1];  // carry the unfinished tensor over into slot 0
+          for (int d = 0; d < depth; d++) {
+            tensorListMeta.addresses[d][0] =
+                tensorListMeta.addresses[d][loc_tensor_info - 1];
+          }
+          loc_tensor_info = 1;
+          tensorListMeta.start_tensor_this_launch = t;  // tensor t still has chunks left for the next launch
+        }
+      }
+    }
+  }
+
+  // see note: [finishing what we started]
+  if (loc_block_info != 0) {
+    multi_tensor_apply_kernel<<<
+        loc_block_info,
+        kBlockSize,
+        0,
+        at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+}
+
+template <int depth, typename T, typename... ArgTypes>
+void multi_tensor_apply_for_fused_optimizer(
+    std::vector<std::vector<at::Tensor>>& tensor_lists,
+    at::TensorList state_steps,  // indexed with the same index as the tensors in tensor_lists[0]
+    T callable,
+    ArgTypes... args) {
+  TORCH_CHECK(
+      tensor_lists.size() == depth,
+      "Number of tensor lists has to match the depth");
+  const auto num_tensors = tensor_lists[0].size();
+  FusedOptimizerTensorListMetadata<depth> tensorListMeta;  // NOTE(review): start_tensor_this_launch is never assigned in this function
+  // Batches the non-empty tensors and their state_steps pointers into as few kernel launches as fit.
+  int loc_block_info = 0;  // blocks queued for the next launch
+  int loc_tensor_info = 0;  // tensor slots filled for the next launch
+  for (const auto& tensor_index : c10::irange(num_tensors)) {
+    // short-circuit to avoid adding empty tensors to tensorListMeta
+    if (tensor_lists[0][tensor_index].numel() == 0) {
+      continue;
+    }
+    tensorListMeta.state_steps_addresses[loc_tensor_info] =
+        state_steps[tensor_index].const_data_ptr();
+    tensorListMeta.numel_for_tensor[loc_tensor_info] =
+        tensor_lists[0][tensor_index].numel();
+    for (const auto& d : c10::irange(depth)) {
+      tensorListMeta.addresses[d][loc_tensor_info] =
+          tensor_lists[d][tensor_index].const_data_ptr();
+    }
+    loc_tensor_info++;
+
+    // see above note: [chunking territory]
+    const auto numel = tensor_lists[0][tensor_index].numel();
+    const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0);  // ceil(numel / kChunkSize)
+    TORCH_CHECK(chunks > -1);
+    for (const auto& chunk : c10::irange(chunks)) {
+      tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;  // slot of the tensor added just above
+      tensorListMeta.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;
+
+      const auto tensor_full =  // capped by state_steps_addresses, the smallest per-tensor array (96 vs 110 slots at depth 1)
+          (loc_tensor_info == depth_to_max_tensors_scalarlist[depth - 1] &&
+           chunk == chunks - 1);
+      const auto blocks_full = loc_block_info == depth_to_max_blocks[depth - 1];
+
+      if (tensor_full || blocks_full) {
+        multi_tensor_apply_kernel<<<
+            loc_block_info,
+            kBlockSize,
+            0,
+            at::cuda::getCurrentCUDAStream()>>>(
+            tensorListMeta, callable, args...);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+        // Reset.
+        loc_block_info = 0;
+        if (chunk == chunks - 1) {
+          loc_tensor_info = 0;  // every queued tensor was fully handled by this launch
+        } else {
+          tensorListMeta.numel_for_tensor[0] =
+              tensorListMeta.numel_for_tensor[loc_tensor_info - 1];  // carry the unfinished tensor over into slot 0
+          tensorListMeta.state_steps_addresses[0] =
+              tensorListMeta.state_steps_addresses[loc_tensor_info - 1];
+          for (const auto& d : c10::irange(depth)) {
+            tensorListMeta.addresses[d][0] =
+                tensorListMeta.addresses[d][loc_tensor_info - 1];
+          }
+          loc_tensor_info = 1;
+        }
+      }
+    }
+  }
+
+  // see above note: [finishing what we started]
+  if (loc_block_info != 0) {
+    multi_tensor_apply_kernel<<<
+        loc_block_info,
+        kBlockSize,
+        0,
+        at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+}
+
+} // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Resize.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Resize.h
new file mode 100644
index 0000000000000000000000000000000000000000..569b145fa61d991472f589a777b2f74b4a277857
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Resize.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <ATen/EmptyTensor.h>
+#include <ATen/native/ResizeCommon.h>
+
+#include <c10/cuda/CUDAGuard.h>
+
+namespace at { namespace native {
+
+TORCH_CUDA_CPP_API void resize_bytes_cuda(StorageImpl* storage, size_t size_bytes);
+
+static inline void maybe_resize_storage_cuda(TensorImpl* self, size_t new_size_bytes) {
+  // Grow-only helper: the storage behind `self` is enlarged when it holds
+  // fewer than `new_size_bytes` bytes and is otherwise left untouched.
+  // Tensors with zero elements are skipped entirely, since resizing a
+  // storage for them makes no sense and can break when storage_offset is
+  // positive but new_size is 0 (same comment is in Resize.h).
+  if (self->numel() == 0) {
+    return;
+  }
+  const Storage &backing = self->unsafe_storage();
+  TORCH_CHECK(backing, "Tensor: invalid null storage");
+  const bool too_small = (backing.nbytes() < new_size_bytes);
+  if (too_small) {
+    resize_bytes_cuda(backing.unsafeGetStorageImpl(), new_size_bytes);
+  }
+}
+
+inline TensorImpl* resize_impl_cuda_(
+    TensorImpl* self,
+    IntArrayRef size,
+    at::OptionalIntArrayRef stride,  // when empty, the tensor is made contiguous for `size`
+    bool device_guard = true) {
+  if (self->sizes() == size && (!stride || self->strides() == stride)) {
+    return self;  // nothing to do: same sizes and, if strides were given, same strides
+  }
+
+  // NB: We don't need to hold the device guard when calling from TH
+  cuda::OptionalCUDAGuard guard;
+  if (device_guard) {
+    guard.set_index(self->storage().device().index());  // guard on the device that owns the storage
+  }
+
+  const auto itemsize = self->dtype().itemsize();
+  const auto storage_offset = self->storage_offset();
+  size_t storage_size = 1;  // always overwritten by one of the two branches below
+  if (stride) {
+    self->set_sizes_and_strides(size, *stride);
+    storage_size = at::detail::computeStorageNbytes(
+        size, *stride, itemsize, storage_offset);
+  } else {
+    self->set_sizes_contiguous(size);
+    storage_size = at::detail::computeStorageNbytesContiguous(
+        size, itemsize, storage_offset);
+  }
+  maybe_resize_storage_cuda(self, storage_size);  // grows the storage only if it is too small
+
+  return self;  // same pointer that was passed in
+}
+
+}}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..656b4ce2c2bbac167457f31e8f554a5e409a2940
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h
@@ -0,0 +1,17 @@
+#pragma once
+#include <cstdint>
+#include <ATen/core/TensorBase.h>
+#include <ATen/native/cuda/SortStable.h>
+
+namespace at {
+namespace native {
+
+inline bool should_use_small_sort(const TensorBase &self, int64_t dim) {
+  const int64_t extent = self.size(dim);  // length of `self` along `dim`
+  return extent <= 4096;  // threshold is inclusive
+}
+void sortKeyValueInplace(  // declaration only; the definition is not in this header
+    const TensorBase &key, const TensorBase &value, int dim,
+    bool descending, bool stable=false);  // `stable` is optional and defaults to false
+
+}}  // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..27cc91b152968bcbd1ecdf65da40e4a15494da28
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh
@@ -0,0 +1,40 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adam_amsgrad_cuda_impl_(  // declaration only; overload taking `lr` as a double
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,  // extra list, not present in _fused_adam_cuda_impl_
+    at::TensorList state_steps,
+    const double lr,  // scalar here; the sibling overload takes a tensor instead
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,  // optional; may be empty
+    const c10::optional<at::Tensor>& found_inf);  // optional; may be empty
+
+void _fused_adam_amsgrad_cuda_impl_(  // declaration only; overload taking `lr` as a tensor
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,  // extra list, not present in _fused_adam_cuda_impl_
+    at::TensorList state_steps,
+    const at::Tensor& lr,  // tensor here; the sibling overload takes a plain double
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,  // optional; may be empty
+    const c10::optional<at::Tensor>& found_inf);  // optional; may be empty
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e31f877227c32fd85de677e3b4b3331b322d1a6f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh
@@ -0,0 +1,38 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adam_cuda_impl_(  // declaration only; overload taking `lr` as a double
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,  // scalar here; the sibling overload takes a tensor instead
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,  // optional; may be empty
+    const c10::optional<at::Tensor>& found_inf);  // optional; may be empty
+
+void _fused_adam_cuda_impl_(  // declaration only; overload taking `lr` as a tensor
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const at::Tensor& lr,  // tensor here; the sibling overload takes a plain double
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,  // optional; may be empty
+    const c10::optional<at::Tensor>& found_inf);  // optional; may be empty
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..451545117b5133ea8d7c94e8ce8e7ce0920fd2f1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh
@@ -0,0 +1,40 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adamw_amsgrad_cuda_impl_(  // declaration only; overload taking `lr` as a double
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,  // extra list, not present in _fused_adamw_cuda_impl_
+    at::TensorList state_steps,
+    const double lr,  // scalar here; the sibling overload takes a tensor instead
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,  // optional; may be empty
+    const c10::optional<at::Tensor>& found_inf);  // optional; may be empty
+
+void _fused_adamw_amsgrad_cuda_impl_(  // declaration only; overload taking `lr` as a tensor
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,  // extra list, not present in _fused_adamw_cuda_impl_
+    at::TensorList state_steps,
+    const at::Tensor& lr,  // tensor here; the sibling overload takes a plain double
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,  // optional; may be empty
+    const c10::optional<at::Tensor>& found_inf);  // optional; may be empty
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ebe9732346bf4e69b66cb514747af99e4ab249e6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh
@@ -0,0 +1,38 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adamw_cuda_impl_(  // declaration only; overload taking `lr` as a double
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,  // scalar here; the sibling overload takes a tensor instead
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,  // optional; may be empty
+    const c10::optional<at::Tensor>& found_inf);  // optional; may be empty
+
+void _fused_adamw_cuda_impl_(  // declaration only; overload taking `lr` as a tensor
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const at::Tensor& lr,  // tensor here; the sibling overload takes a plain double
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,  // optional; may be empty
+    const c10::optional<at::Tensor>& found_inf);  // optional; may be empty
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a38edb538256d6102a4a611e6c9582ddae1901d2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh
@@ -0,0 +1,680 @@
+namespace at {
+namespace cuda {
+// Windows doesn't like large string literals, so the template is split in two.
+const std::string reduction_template_0 = R"ESCAPE(
+  #define C10_HOST_DEVICE __host__ __device__
+  #define C10_DEVICE __device__
+  #if defined(__clang__) && defined(__HIP__)
+  #ifndef __forceinline__
+  #define __forceinline__ inline __attribute__((always_inline))
+  #endif
+  // until ROCm support for kernel asserts is restored
+  #define assert(expr) (static_cast<void>(0))
+  #endif
+
+  template <typename T>
+  __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
+  {
+  #if defined(__clang__) && defined(__HIP__)
+    return __shfl_down(value, delta, width);
+  #else
+    return __shfl_down_sync(mask, value, delta, width);
+  #endif
+  }
+
+
+  #if ${complex}
+  template <typename T>
+  __device__ __forceinline__ std::complex<T> WARP_SHFL_DOWN(std::complex<T> value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
+  {
+    return std::complex<T>(
+  #if defined(__clang__) && defined(__HIP__)
+        __shfl_down(value.real(), delta, width),
+        __shfl_down(value.imag(), delta, width));
+  #else
+        __shfl_down_sync(mask, value.real(), delta, width),
+        __shfl_down_sync(mask, value.imag(), delta, width));
+  #endif
+  }
+  #endif
+
+  // aligned vector generates vectorized load/store on CUDA
+  template<typename scalar_t, int vec_size>
+  struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
+    scalar_t val[vec_size];
+  };
+
+
+  C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) {
+    // get GCD of num and denom using Euclid's algorithm.
+    // Can replace this with std::gcd if we ever support c++17.
+    size_t a = denominator;
+    size_t b = numerator;
+    while (b != 0) {
+        a %= b;
+        // swap(a,b)
+        size_t tmp = a;
+        a = b;
+        b = tmp;
+    }
+
+    // a is now the GCD
+    numerator /= a;
+    denominator /= a;
+  }
+
+
+
+
+  struct ReduceConfig {
+  //has to match host-side ReduceConfig in the eager code
+  static constexpr int BLOCK_X = 0;
+  static constexpr int BLOCK_Y = 1;
+  static constexpr int CTA = 2;
+
+  static constexpr int input_vec_size = 4;
+  int element_size_bytes;
+  int num_inputs;
+  int num_outputs;
+  int step_input = 1;
+  int step_output = 1;
+  int ctas_per_output = 1;
+  int input_mult[3] = {0, 0, 0};
+  int output_mult[2] = {0, 0};
+
+  int block_width;
+  int block_height;
+  int num_threads;
+
+  bool vectorize_input = false;
+  int output_vec_size = 1;
+
+  C10_HOST_DEVICE bool should_block_x_reduce() const {
+    return input_mult[BLOCK_X] != 0;
+  }
+
+  C10_HOST_DEVICE bool should_block_y_reduce() const {
+    return input_mult[BLOCK_Y] != 0;
+  }
+
+  C10_HOST_DEVICE bool should_global_reduce() const {
+    return input_mult[CTA] != 0;
+  }
+
+  C10_DEVICE bool should_store(int output_idx) const {
+    return output_idx < num_outputs &&
+      (!should_block_x_reduce() || threadIdx.x == 0) &&
+      (!should_block_y_reduce() || threadIdx.y == 0);
+  }
+
+  C10_DEVICE bool should_reduce_tail() const {
+    return (!should_block_y_reduce() || threadIdx.y == 0) &&
+      (!should_global_reduce() || blockIdx.y == 0);
+  }
+
+  C10_HOST_DEVICE int input_idx() const {
+    int lane = threadIdx.x;
+    int warp = threadIdx.y;
+    int cta2 = blockIdx.y;
+    return (lane * input_mult[BLOCK_X] +
+            warp * input_mult[BLOCK_Y] +
+            cta2 * input_mult[CTA]);
+  }
+
+  template <int output_vec_size>
+  C10_HOST_DEVICE int output_idx() const {
+    int lane = threadIdx.x;
+    int warp = threadIdx.y;
+    int cta1 = blockIdx.x;
+    return (lane * output_mult[BLOCK_X] +
+            warp * output_mult[BLOCK_Y] +
+            cta1 * step_output) * output_vec_size;
+  }
+
+  C10_DEVICE int shared_memory_offset(int offset) const {
+    return threadIdx.x + (threadIdx.y + offset) * blockDim.x;
+  }
+
+  C10_DEVICE int staging_memory_offset(int cta2) const {
+    int offset = cta2 + blockIdx.x * gridDim.y;
+    if (!should_block_x_reduce()) {
+      offset = threadIdx.x + offset * blockDim.x;
+    }
+    return offset;
+  }
+
+
+  };
+
+
+//TODO this will need to be different for more generic reduction functions
+namespace reducer {
+
+  using scalar_t = ${scalar_type};
+  using arg_t = ${reduction_accum_type};
+  using out_scalar_t = ${result_type};
+
+
+  inline __device__ ${functor}
+
+  inline __device__ out_scalar_t project(arg_t arg) {
+    return (out_scalar_t) arg;
+  }
+
+  inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) {
+    return WARP_SHFL_DOWN(arg, offset);
+  }
+
+  inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) {
+    return acc;
+  }
+
+  // wrap a normal reduction that ignores the index
+  inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) {
+     return combine(acc, val);
+  }
+}
+
+
+struct ReduceJitOp {
+  using scalar_t = ${scalar_type};
+  using arg_t = ${reduction_accum_type};
+  using out_scalar_t = ${result_type};
+
+  using InputCalculator = OffsetCalculator<1>;
+  using OutputCalculator = OffsetCalculator<2>;
+
+//   static constexpr bool can_accumulate_in_output =
+//     std::is_convertible<arg_t, out_scalar_t>::value
+//     && std::is_convertible<out_scalar_t, arg_t>::value;
+
+  static constexpr int input_vec_size = ReduceConfig::input_vec_size;
+
+  arg_t ident;
+  ReduceConfig config;
+  InputCalculator input_calc;
+  OutputCalculator output_calc;
+  const void* src;
+  const char* dst[2]; //it accepts at most two destinations
+  // acc_buf used for accumulation among sub Tensor Iterator when accumulation on
+  // output is not permissible
+  void* acc_buf;
+  // cta_buf used for accumulation between blocks during global reduction
+  void* cta_buf;
+  int* semaphores;
+  int64_t base_idx;
+  bool accumulate;
+  bool final_output;
+  int noutputs;
+
+
+  C10_DEVICE void run() const {
+    extern __shared__ char shared_memory[];
+    uint32_t output_idx = config.output_idx<${output_vec_size}>();
+    uint32_t input_idx = config.input_idx();
+    auto base_offsets1 = output_calc.get(output_idx)[1];
+
+    using arg_vec_t = Array<arg_t, ${output_vec_size}>;
+    arg_vec_t value;
+
+    if (output_idx < config.num_outputs && input_idx < config.num_inputs) {
+      const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1);
+
+      value = thread_reduce<${output_vec_size}>(input_slice);
+    }
+
+    if (config.should_block_y_reduce()) {
+      value = block_y_reduce<${output_vec_size}>(value, shared_memory);
+    }
+    if (config.should_block_x_reduce()) {
+      value = block_x_reduce<${output_vec_size}>(value, shared_memory);
+    }
+
+    using out_ptr_vec_t = Array<out_scalar_t*, ${output_vec_size}>;
+    using offset_vec_t = Array<uint32_t, ${output_vec_size}>;
+    offset_vec_t base_offsets;
+    out_ptr_vec_t out;
+
+    #pragma unroll
+    for (int i = 0; i < ${output_vec_size}; i++) {
+      base_offsets[i] = output_calc.get(output_idx + i)[0];
+      out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
+    }
+
+    arg_vec_t* acc = nullptr;
+    if (acc_buf != nullptr) {
+      size_t numerator = sizeof(arg_t);
+      size_t denominator = sizeof(out_scalar_t);
+      reduce_fraction(numerator, denominator);
+      acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator));
+    }
+
+    if (config.should_global_reduce()) {
+      value = global_reduce<${output_vec_size}>(value, acc, shared_memory);
+    } else if (config.should_store(output_idx)) {
+      if (accumulate) {
+        #pragma unroll
+        for (int i = 0; i < ${output_vec_size}; i++) {
+          value[i] = reducer::translate_idx(value[i], base_idx);
+        }
+      }
+
+      if (acc == nullptr) {
+        if (accumulate) {
+          value = accumulate_in_output<${output_vec_size}>(out, value);
+        }
+        if (final_output) {
+          set_results_to_output<${output_vec_size}>(value, base_offsets);
+        } else {
+          #pragma unroll
+          for (int i = 0; i < ${output_vec_size}; i++) {
+            *(out[i]) = get_accumulated_output(out[i], value[i]);
+          }
+        }
+      } else {
+        if (accumulate) {
+          #pragma unroll
+          for (int i = 0; i < ${output_vec_size}; i++) {
+            value[i] = reducer::combine((*acc)[i], value[i]);
+          }
+        }
+        if (final_output) {
+          set_results_to_output<${output_vec_size}>(value, base_offsets);
+        } else {
+          *acc = value;
+        }
+      }
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE Array<arg_t, output_vec_size> thread_reduce(const scalar_t* data) const {
+    if (config.vectorize_input) {
+      assert(output_vec_size == 1);
+      // reduce at the header of input_slice where memory is not aligned,
+      // so that thread_reduce will have an aligned memory to work on.
+      return {input_vectorized_thread_reduce_impl(data)};
+    } else {
+      uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t);
+      bool is_contiguous = (input_calc.dims == 1 && element_stride == 1);
+      if (is_contiguous) {
+        return thread_reduce_impl<output_vec_size>(data, [](uint32_t idx) { return idx; });
+      } else if (input_calc.dims == 1) {
+        return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return idx * element_stride; });
+      } else {
+        return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); });
+      }
+    }
+  }
+
+  C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const {
+    uint32_t end = config.num_inputs;
+
+    // Handle the head of input slice where data is not aligned
+    arg_t value = ident;
+    constexpr int align_bytes = alignof(aligned_vector<scalar_t, input_vec_size>);
+    constexpr int align_elements = align_bytes / sizeof(scalar_t);
+    int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t);
+    if (shift > 0) {
+      data -= shift;
+      end += shift;
+      if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){
+        value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift);
+      }
+      end -= align_elements;
+      data += align_elements;
+      shift = align_elements - shift;
+    }
+
+    // Do the vectorized reduction
+    using load_t = aligned_vector<scalar_t, input_vec_size>;
+
+    uint32_t idx = config.input_idx();
+    const uint32_t stride = config.step_input;
+
+    // Multiple accumulators to remove dependency between unrolled loops.
+    arg_t value_list[input_vec_size];
+    value_list[0] = value;
+
+    #pragma unroll
+    for (int i = 1; i < input_vec_size; i++) {
+      value_list[i] = ident;
+    }
+
+    scalar_t values[input_vec_size];
+
+    load_t *values_vector = reinterpret_cast<load_t*>(&values[0]);
+
+    while (idx * input_vec_size + input_vec_size - 1 < end) {
+      *values_vector = reinterpret_cast<const load_t*>(data)[idx];
+      #pragma unroll
+      for (uint32_t i = 0; i < input_vec_size; i++) {
+        value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i);
+      }
+      idx += stride;
+    }
+
+    // tail
+    uint32_t tail_start = end - end % input_vec_size;
+    if (config.should_reduce_tail()) {
+      int idx = tail_start + threadIdx.x;
+      if (idx < end) {
+        value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift);
+      }
+    }
+
+    // combine accumulators
+    #pragma unroll
+    for (int i = 1; i < input_vec_size; i++) {
+      value_list[0] = reducer::combine(value_list[0], value_list[i]);
+    }
+    return value_list[0];
+  }
+
+  template <int output_vec_size, typename offset_calc_t>
+  C10_DEVICE Array<arg_t, output_vec_size> thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const {
+    uint32_t idx = config.input_idx();
+    const uint32_t end = config.num_inputs;
+    const uint32_t stride = config.step_input;
+    const int vt0=${vt0};
+
+    using arg_vec_t = Array<arg_t, output_vec_size>;
+    using load_t = aligned_vector<scalar_t, output_vec_size>;
+    const load_t* data = reinterpret_cast<const load_t*>(data_);
+
+    // Multiple accumulators to remove dependency between unrolled loops.
+    arg_vec_t value_list[vt0];
+
+    #pragma unroll
+    for (int i = 0; i < vt0; i++) {
+      #pragma unroll
+      for (int j = 0; j < output_vec_size; j++) {
+        value_list[i][j] = ident;
+      }
+    }
+
+    load_t values[vt0];
+
+    while (idx + (vt0 - 1) * stride < end) {
+      #pragma unroll
+      for (uint32_t i = 0; i < vt0; i++) {
+        values[i] = data[calc(idx + i * stride) / output_vec_size];
+      }
+      #pragma unroll
+      for (uint32_t i = 0; i < vt0; i++) {
+        #pragma unroll
+        for (uint32_t j = 0; j < output_vec_size; j++) {
+          value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride);
+        }
+      }
+      idx += stride * vt0;
+    }
+
+    // tail
+    int idx_ = idx;
+    #pragma unroll
+    for (uint32_t i = 0; i < vt0; i++) {
+      if (idx >= end) {
+        break;
+      }
+      values[i] = data[calc(idx) / output_vec_size];
+      idx += stride;
+    }
+    idx = idx_;
+    #pragma unroll
+    for (uint32_t i = 0; i < vt0; i++) {
+      if (idx >= end) {
+        break;
+      }
+      #pragma unroll
+      for (uint32_t j = 0; j < output_vec_size; j++) {
+        value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx);
+      }
+      idx += stride;
+    }
+
+    // combine accumulators
+    #pragma unroll
+    for (int i = 1; i < vt0; i++) {
+      #pragma unroll
+      for (uint32_t j = 0; j < output_vec_size; j++) {
+        value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]);
+      }
+    }
+    return value_list[0];
+  }
+  template <int output_vec_size>
+  C10_DEVICE Array<arg_t, output_vec_size> block_x_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
+    using args_vec_t = Array<arg_t, output_vec_size>;
+    int dim_x = blockDim.x;
+    args_vec_t* shared = (args_vec_t*)shared_memory;
+    if (dim_x > warpSize) {
+      int address_base = threadIdx.x + threadIdx.y*blockDim.x;
+      shared[address_base] = value;
+      for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) {
+        __syncthreads();
+        if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) {
+          args_vec_t other = shared[address_base + offset];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = reducer::combine(value[i], other[i]);
+          }
+          shared[address_base] = value;
+        }
+      }
+      dim_x = warpSize;
+    }
+
+    __syncthreads();
+
+    for (int offset = 1; offset < dim_x; offset <<= 1) {
+      #pragma unroll
+      for (int i = 0; i < output_vec_size; i++) {
+        arg_t other = reducer::warp_shfl_down(value[i], offset);
+        value[i] = reducer::combine(value[i], other);
+      }
+    }
+    return value;
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE Array<arg_t, output_vec_size> block_y_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
+    using args_vec_t = Array<arg_t, output_vec_size>;
+    args_vec_t* shared = (args_vec_t*)shared_memory;
+    shared[config.shared_memory_offset(0)] = value;
+    for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
+      __syncthreads();
+      if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
+        args_vec_t other = shared[config.shared_memory_offset(offset)];
+        #pragma unroll
+        for (int i = 0; i < output_vec_size; i++) {
+          value[i] = reducer::combine(value[i], other[i]);
+        }
+        shared[config.shared_memory_offset(0)] = value;
+      }
+    }
+    return value;
+  }
+  )ESCAPE";
+
+  const std::string reduction_template_1 = R"ESCAPE(
+
+  C10_DEVICE bool mark_block_finished() const {
+    __shared__ bool is_last_block_done_shared;
+
+    __syncthreads();
+    if (threadIdx.x == 0 && threadIdx.y == 0) {
+      int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1);
+      is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1);
+    }
+
+    __syncthreads();
+
+    return is_last_block_done_shared;
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE Array<arg_t, output_vec_size> accumulate_in_output(
+    Array<out_scalar_t*, output_vec_size> out,
+    Array<arg_t, output_vec_size> value
+  ) const {
+    Array<arg_t, output_vec_size> ret;
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      ret[i] = reducer::combine(*(out[i]), value[i]);
+    }
+    return ret;
+  }
+
+
+  C10_DEVICE out_scalar_t get_accumulated_output(
+    out_scalar_t* out, arg_t value
+  ) const {
+    assert(!final_output);
+    return (out_scalar_t)value;
+  }
+
+  template<class T>
+  C10_DEVICE void set_results(const T x, const uint32_t base_offset) const {
+    assert(noutputs == 1);
+    auto res = (out_scalar_t*)((char*)dst[0] + base_offset);
+    *res = x;
+  }
+
+//TODO - multi-output reduction - we won't be able to use thrust::pair
+//just explicitly specify typed output reads/writes
+//Currently implemented for max of two outputs
+//   template<class T1, class T2>
+//   C10_DEVICE void set_results(const thrust::pair<T1, T2> x, const index_t base_offset) const {
+//     if (noutputs >= 1) {
+//       auto res0 = (T1*)((char*)dst[0] + base_offset);
+//       *res0 = x.first;
+//     }
+//     if (noutputs >= 2) {
+//       // base offset is computed assuming element size being sizeof(T1), so we need to make a
+//       // correction to obtain the correct base offset
+//       auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2));
+//       *res1 = x.second;
+//     }
+//   }
+
+  template <int output_vec_size>
+  C10_DEVICE void set_results_to_output(Array<arg_t, output_vec_size> value, Array<uint32_t, output_vec_size> base_offset) const {
+    assert(final_output);
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      set_results(reducer::project(value[i]), base_offset[i]);
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE Array<arg_t, output_vec_size> global_reduce(Array<arg_t, output_vec_size> value, Array<arg_t, output_vec_size> *acc, char* shared_memory) const {
+    using arg_vec_t = Array<arg_t, output_vec_size>;
+    using out_ptr_vec_t = Array<out_scalar_t*, output_vec_size>;
+    using offset_vec_t = Array<uint32_t, output_vec_size>;
+
+    arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf;
+    uint32_t output_idx = config.output_idx<output_vec_size>();
+    offset_vec_t base_offsets;
+    out_ptr_vec_t out;
+
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      base_offsets[i] = output_calc.get(output_idx + i)[0];
+      out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
+    }
+
+    bool should_store = config.should_store(output_idx);
+    if (should_store) {
+      uint32_t offset = config.staging_memory_offset(blockIdx.y);
+      reduce_buffer[offset] = value;
+    }
+
+    __threadfence(); // make sure writes are globally visible
+    __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
+    bool is_last_block_done = mark_block_finished();
+
+    if (is_last_block_done) {
+      value = ident;
+      if (config.should_block_x_reduce()) {
+        uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x;
+        uint32_t step = blockDim.x * blockDim.y;
+        for (; input_offset < config.ctas_per_output; input_offset += step) {
+          uint32_t idx = config.staging_memory_offset(input_offset);
+          arg_vec_t next = reduce_buffer[idx];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = reducer::combine(value[i], next[i]);
+          }
+        }
+      } else {
+        uint32_t input_offset = threadIdx.y;
+        uint32_t step = blockDim.y;
+        for (; input_offset < config.ctas_per_output; input_offset += step) {
+          uint32_t idx = config.staging_memory_offset(input_offset);
+          arg_vec_t next = reduce_buffer[idx];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = reducer::combine(value[i], next[i]);
+          }
+        }
+      }
+      value = block_y_reduce(value, shared_memory);
+      if (config.should_block_x_reduce()) {
+        value = block_x_reduce<output_vec_size>(value, shared_memory);
+      }
+      if (should_store) {
+        if (accumulate) {
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = reducer::translate_idx(value[i], base_idx);
+          }
+        }
+
+        if (acc == nullptr) {
+          if (accumulate) {
+            value = accumulate_in_output<output_vec_size>(out, value);
+          }
+          if (final_output) {
+            set_results_to_output<output_vec_size>(value, base_offsets);
+          } else {
+            #pragma unroll
+            for (int i = 0; i < output_vec_size; i++) {
+              *(out[i]) = get_accumulated_output(out[i], value[i]);
+            }
+          }
+        } else {
+          if (accumulate) {
+            #pragma unroll
+            for (int i = 0; i < output_vec_size; i++) {
+              value[i] = reducer::combine((*acc)[i], value[i]);
+            }
+          }
+          if (final_output) {
+            set_results_to_output<output_vec_size>(value, base_offsets);
+          } else {
+            *acc = value;
+          }
+        }
+      }
+    }
+
+    return value;
+  }
+};
+
+extern "C"
+__launch_bounds__(${max_threads_lb}, 4)
+__global__ void reduction_${name}_kernel(ReduceJitOp r){
+  r.run();
+}
+)ESCAPE";
+
+const std::string reduction_template = reduction_template_0 + reduction_template_1;  // halves joined in order
+// Returns the complete template string by reference (no copy is made).
+// NOTE(review): defined non-inline in a header; looks like it assumes a single including TU — confirm.
+const std::string &get_reduction_template() {
+  return reduction_template;
+}
+
+}}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/thread_constants.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/thread_constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..651053d663e4c204753cdfa4ae31ed60fed34152
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/thread_constants.h
@@ -0,0 +1,22 @@
+#pragma once
+#include <c10/macros/Macros.h>
+
+// Marks a lambda as executable on both the host and device. The __host__
+// attribute is important so that we can access static type information from
+// the host, even if the function is typically only executed on the device.
+#ifndef GPU_LAMBDA
+#define GPU_LAMBDA __host__ __device__
+#endif
+
+#if defined(USE_ROCM)
+constexpr int num_threads() {  // USE_ROCM build: fixed value, returned as int
+  return 256;
+}
+#else
+constexpr uint32_t num_threads() {  // otherwise: C10_WARP_SIZE * 4, returned as uint32_t
+  return C10_WARP_SIZE * 4;
+}
+#endif
+// thread_work_size() is the constant 4; block_work_size() scales it by num_threads().
+constexpr int thread_work_size() { return 4; }
+constexpr int block_work_size() { return thread_work_size() * num_threads(); }
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/OperationUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/OperationUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..8384f4391f9a854c45f625b171dc28d6e2400e4a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/OperationUtils.h
@@ -0,0 +1,394 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/Tensor.h>
+#include <ATen/Utils.h>
+#include <ATen/mps/MPSStream.h>
+#include <ATen/native/mps/TensorFactory.h>
+#include <c10/util/Optional.h>
+#include <c10/core/ScalarType.h>
+#include <torch/library.h>
+#include <exception>
+#include <unordered_map>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/zeros.h>
+#include <ATen/ops/zeros_like.h>
+#endif
+
+#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
+
+// Fwd declarations
+namespace at {
+  struct TensorIteratorBase;
+}
+using namespace at::mps;
+
+namespace at::native::mps {
+
+void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)());
+
+struct MPSScalar { // scalar payload (union `value`) together with its ScalarType, size and backing buffer
+  id<MTLBuffer> getMTLBuffer() const { return __builtin_bit_cast(id<MTLBuffer>, buffer.get()); } // reinterprets the pointer held by `buffer` as an MTLBuffer handle
+
+  size_t size = 0; // presumably the byte size of the stored value - TODO confirm against getMPSScalar
+  ScalarType type = ScalarType::Undefined; // stays Undefined until a producer fills it in
+  c10::DataPtr buffer; // stores MTLBuffer (frees buffer if MPSScalar instance goes out of scope)
+  union {
+    float f; // MPS doesn't support 'double'
+    at::Half h;
+    int64_t i;
+    bool b;
+    c10::complex<float> cf;
+    c10::complex<at::Half> ch;
+    at::BFloat16 bf16;
+  } value {}; // value-initialized; presumably `type` tells which member is meaningful - verify in getMPSScalar
+};
+
+void runMPSGraph(MPSStream* mpsStream,
+    MPSGraph* mpsGraph,
+    NSDictionary* feeds,
+    NSDictionary* results);
+
+MPSDataType getMPSDataType(ScalarType scalar_type);
+static inline MPSDataType getMPSDataType(const Tensor& t) { // Tensor convenience overload: forwards t.scalar_type()
+  return getMPSDataType(t.scalar_type());
+}
+MPSDataType getMPSScalarType(ScalarType scalar_type);
+static inline MPSDataType getMPSScalarType(const Tensor& t) { // Tensor convenience overload: forwards t.scalar_type()
+  return getMPSScalarType(t.scalar_type());
+}
+MPSScalar   getMPSScalar(const Scalar& scalar, ScalarType type);
+std::string getMPSTypeString(ScalarType scalar_type, bool short_name = false);
+static inline std::string getMPSTypeString(const Tensor& t, bool short_name = false) { // Tensor convenience overload; short_name is passed through unchanged
+  return getMPSTypeString(t.scalar_type(), short_name);
+}
+std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type);
+NSArray<NSNumber*>* getTensorAxes(const Tensor& t);
+NSArray<NSNumber*>* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim);
+std::string getMPSShapeString(MPSShape* shape);
+std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true);
+std::string getArrayRefString(const IntArrayRef s);
+// use has_storage() on the returned tensor to determine if src actually is a view
+Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst);
+Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output);
+bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape);
+MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType);
+MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input, bool includesInt64 = false);
+MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input, bool includesInt64 = false);
+
+// The MPSShape could vary based on memory format
+MPSShape* getMPSShape(const Tensor& t, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);
+MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);
+
+static inline id<MTLBuffer> getMTLBufferStorage(const at::Tensor& tensor) { // returns the tensor's storage pointer viewed as an MTLBuffer handle
+  return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data()); // bit cast only; storage_offset() is not applied here
+}
+
+class Placeholder { // pairs an MPSGraphTensor with optional MPSGraphTensorData and a Tensor member
+ public:
+  Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {} // empty: no graph tensor, no data
+  Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr), _tensor(Tensor()) {} // graph tensor only; isIntermediate() is true
+  Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr,
+              bool gatherTensorData = true, MPSDataType dataType = MPSDataTypeInvalid); // defined out of line; not visible in this header
+  MPSGraphTensor* getMPSGraphTensor() {
+    return _placeholder;
+  }
+  MPSGraphTensorData* getMPSGraphTensorData() {
+    return _value; // may be nullptr, see isIntermediate()
+  }
+  bool isIntermediate() { // true when no tensor data is attached
+    return _value == nullptr;
+  }
+
+ private:
+  MPSGraphTensor* _placeholder;
+  MPSGraphTensorData* _value;
+  Tensor _tensor; // presumably keeps the source tensor alive while _value refers to it - TODO confirm in the .mm
+};
+
+void resize_tensor(Tensor* output);
+Tensor wrapped_scalar_tensor_mps(const Scalar& scalar, const Device device);
+MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor);
+MPSGraphTensor* convertNHWCtoNCHW(MPSGraph *mpsGraph, MPSGraphTensor* tensor);
+MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, ScalarType toType);
+MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, MPSDataType toType);
+MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStream, const Tensor& tensor);
+MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar);
+
+MPSGraph* make_mps_graph();
+void printTensorNDArray(const Tensor& t);
+MPSNDArray* ndArrayFromTensor(const Tensor& tensor, MPSShape *shape, MPSDataType mpsType);
+
+MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType);
+MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType, MPSShape* mpsShape);
+MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, const Tensor& tensor);
+MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType);
+MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, const Scalar& scalar);
+
+string get_mem_format_string(c10::MemoryFormat memory_format);
+
+using MPSCacheKey = uint64_t;
+
+// Derive from this class to cache a graph together with its inputs/outputs.
+// It can hold any NSObject; the object is retained on construction and released on destruction.
+struct MPSCachedGraph
+{
+  MPSCachedGraph(NSObject *object) : _object([object retain]) {}
+  virtual ~MPSCachedGraph() {
+   [_object release];
+   _object = nullptr;
+  }
+
+  template<typename T>
+  inline T* as() { // unchecked downcast: the caller must know the dynamic type is T
+    return static_cast<T*>(this);
+  }
+
+  MPSGraph *graph() const { return (MPSGraph *)_object; } // unchecked cast of the stored object to MPSGraph
+  NSObject *object() const { return _object; }
+private:
+  NSObject *_object = nullptr; // NOTE(review): copy operations are not deleted; a copy would release this object twice
+};
+
+struct MPSUnaryCachedGraph : public MPSCachedGraph // cached graph with one input and one output tensor slot
+{
+  MPSUnaryCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+  MPSGraphTensor *inputTensor_ = nil;
+  MPSGraphTensor *outputTensor_ = nil;
+};
+
+struct MPSUnaryGradCachedGraph : public MPSCachedGraph // slots for the backward pass of a unary op
+{
+  MPSUnaryGradCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+  MPSGraphTensor *gradOutputTensor_ = nil;
+  MPSGraphTensor *inputTensor_ = nil;
+  MPSGraphTensor *outputTensor_ = nil; // some backward input is actually the forward's output
+  MPSGraphTensor *gradInputTensor_ = nil;
+};
+
+struct MPSBinaryCachedGraph : public MPSCachedGraph // cached graph with two inputs (input, other) and one output slot
+{
+  MPSBinaryCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+  MPSGraphTensor *inputTensor_ = nil;
+  MPSGraphTensor *otherTensor_ = nil;
+  MPSGraphTensor *outputTensor_ = nil;
+};
+
+struct MPSBinaryGradCachedGraph : public MPSCachedGraph // slots for the backward pass of a binary op
+{
+  MPSBinaryGradCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+  MPSGraphTensor *gradOutputTensor_ = nil;
+  MPSGraphTensor *inputTensor_ = nil;
+  MPSGraphTensor *otherTensor_ = nil;
+  MPSGraphTensor *gradInputTensor_ = nil;
+};
+
+// TODO: Improve the overall design of MPSGraphCache.
+// https://github.com/pytorch/pytorch/issues/77176
+// Cache mapping string keys (hashed to MPSCacheKey) to MPSCachedGraph objects; lookups and insertions run on serialQueue_.
+struct MPSGraphCache
+{
+  typedef MPSCachedGraph * (^CreateCachedGraphBlock)();
+  // A cached graph plus the original key string, kept so that hash collisions can be detected.
+  struct CacheEntry {
+    CacheEntry(const std::string& key, MPSCachedGraph *cachedGraph) : cachedGraph_(cachedGraph), key_(key) {}
+    MPSCachedGraph* cachedGraph_ = nullptr;
+    std::string key_;
+  };
+
+ public:
+  // Lazily creates the shared instance on first call. NOTE(review): the null check below is not synchronized.
+  static MPSGraphCache* getInstance() {
+    if(_instance_cache == nullptr) {
+      _instance_cache = new MPSGraphCache();
+    }
+    return _instance_cache;
+  }
+  // Releases the queue and deletes every cached graph; the cache owns the MPSCachedGraph objects it stores.
+  ~MPSGraphCache() {
+    dispatch_release(serialQueue_);
+
+    for (const auto& i : cache_) {
+      delete i.second.cachedGraph_;
+    }
+  }
+
+  // Disallow the copy constructor and operator= functions
+  MPSGraphCache(const MPSGraphCache&) = delete;
+  void operator=(const MPSGraphCache&) = delete;
+
+  MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock) {
+    // Returns the graph cached under key; if there is none, runs createCacheBlock on the queue and stores its result.
+    __block MPSCachedGraph* cachedGraph = nil;
+
+    MPSCacheKey hash = std::hash<std::string>{}(key);
+
+    dispatch_sync_with_rethrow(serialQueue_, ^() {
+      // verify the cached entry doesn't already exist
+      if (cache_.count(hash) != 0) {
+        auto& entry = cache_.at(hash);
+        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n");
+        cachedGraph = entry.cachedGraph_;
+      } else {
+        cachedGraph = createCacheBlock();
+        CacheEntry entry(key, cachedGraph);
+        cache_.emplace(hash, entry);
+        profileCachedGraph(entry);
+      }
+    });
+    return cachedGraph;
+  }
+
+  template<typename T>
+  inline T* CreateCachedGraphAs(const std::string& key, CreateCachedGraphBlock createCacheBlock) { // unchecked downcast of the result to T*
+    return static_cast<T *>(CreateCachedGraph(key, createCacheBlock));
+  }
+
+  MPSCachedGraph* LookUp(const std::string& key) const {
+    // Returns the graph cached under key, or nullptr when there is no entry for it.
+    __block MPSCachedGraph* cachedGraph = nullptr;
+
+    MPSCacheKey hash = std::hash<std::string>{}(key);
+
+    dispatch_sync_with_rethrow(serialQueue_, ^() { // same wrapper as CreateCachedGraph, so an exception from the assert or profiler call is rethrown to the caller
+
+      if (cache_.count(hash) != 0) {
+        auto& entry = cache_.at(hash);
+        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n");
+        cachedGraph = entry.cachedGraph_;
+        profileCachedGraph(entry);
+      }
+    });
+    return cachedGraph;
+  }
+
+  template<typename T>
+  inline T* LookUpAs(const std::string& key) const { // unchecked downcast of the result to T*; nullptr stays nullptr
+    return static_cast<T *>(LookUp(key));
+  }
+
+ private:
+  MPSGraphCache() {
+    serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL);
+  }
+  // this is defined in OperationUtils.mm to not include
+  // MPSProfiler.h in header OperationUtils.h
+  void profileCachedGraph(const CacheEntry& cacheEntry) const;
+
+  static MPSGraphCache* _instance_cache;
+  std::unordered_map<MPSCacheKey, CacheEntry> cache_; // keyed by the hash of the string key, not the string itself
+  dispatch_queue_t serialQueue_ = nullptr;
+
+};
+
+// Common template: returns the cached graph for key, creating it via instantiate(graph, cachedGraph) when it is missing.
+template<typename T>
+inline T* LookUpOrCreateCachedGraph(const std::string& key, std::function<void(MPSGraph*, T*)> instantiate) {
+  auto cache_ = MPSGraphCache::getInstance();
+  if (auto rc  = cache_->LookUpAs<T>(key)) { // fast path: entry already cached
+    return rc;
+  }
+  return cache_->CreateCachedGraphAs<T>(key, ^mps::MPSCachedGraph*() { // CreateCachedGraph re-checks the cache before invoking this block
+    T* newCachedGraph = nil;
+    @autoreleasepool {
+      // Initialize graph
+      auto mpsGraph = mps::make_mps_graph();
+      newCachedGraph = new T(mpsGraph); // T must be constructible from an MPSGraph*
+      instantiate(mpsGraph, newCachedGraph);
+    }
+    return newCachedGraph;
+  });
+}
+
+// Common math operations
+MPSGraphTensor* log1p(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor);
+// Warns once when input_tensor is kLong and mac_os_13_3_plus is false. Arguments are parenthesized so expression arguments expand safely.
+#define MPS_CHECK_INT64_OP_SUPPORTED(input_tensor, mac_os_13_3_plus, op_name)                                           \
+  if (!(mac_os_13_3_plus) && (input_tensor).scalar_type() == kLong) {                                                   \
+     TORCH_WARN_ONCE("MPS: no support for int64 for ", op_name,                                                         \
+     ", downcasting to a smaller data type (int32/float32). Native support for int64 has been added in macOS 13.3.");   \
+  }
+
+/**
+ * Returns distance from lowest to highest element offset in given tensor.
+ */
+size_t compute_storage_numel_distance(const at::Tensor& t);
+
+/**
+ * Returns true when compute_storage_numel_distance(t) equals t.numel(), i.e. the tensor maps to a contiguous area in the storage.
+ */
+inline bool is_dense_in_storage(const at::Tensor& t) {
+  return compute_storage_numel_distance(t) == static_cast<size_t>(t.numel()); // numel() is cast so both sides compare as size_t
+}
+
+static inline void mtl_setBuffer(id<MTLComputeCommandEncoder> encoder, const Tensor& t, unsigned idx) { // binds t's storage buffer at argument index idx
+  [encoder setBuffer:getMTLBufferStorage(t)
+              offset:t.storage_offset() * t.element_size() // element offset scaled by the element size
+             atIndex:idx];
+}
+
+static inline void mtl_dispatch1DJob(id<MTLComputeCommandEncoder> encoder, // encodes a one-dimensional dispatch of `length` threads
+                                     id<MTLComputePipelineState> cplState,
+                                     uint32_t length) {
+  const uint32_t maxThreadsPerGroup = [cplState maxTotalThreadsPerThreadgroup];
+  auto size = MTLSizeMake(length, 1, 1);
+  auto threadGroupSize = MTLSizeMake(std::min(maxThreadsPerGroup, length), 1, 1); // group width is capped by both the pipeline limit and the job length
+  [encoder dispatchThreads:size threadsPerThreadgroup:threadGroupSize]; // NOTE(review): length == 0 gives a zero-width threadgroup; presumably callers pass length > 0
+}
+
+id<MTLBuffer> generateKernelDataOffsets(id<MTLComputeCommandEncoder> commandEncoder, const TensorIteratorBase& iter, bool use_64bit_index = false);
+
+inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1) { // maps the placeholder's graph tensor to its tensor data
+        return @{ p1.getMPSGraphTensor(): p1.getMPSGraphTensorData() }; // NOTE(review): presumably neither side may be nil here - confirm callers never pass intermediates
+}
+
+inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2) { // two-placeholder overload, one entry per placeholder
+        return @{
+                p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(),
+                p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(),
+         };
+}
+
+inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3) { // three-placeholder overload
+        return @{
+                p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(),
+                p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(),
+                p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(),
+         };
+}
+
+inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3, Placeholder& p4) { // four-placeholder overload
+        return @{
+                p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(),
+                p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(),
+                p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(),
+                p4.getMPSGraphTensor(): p4.getMPSGraphTensorData(),
+         };
+}
+
+inline void runMPSGraph(MPSStream* stream, MPSGraph* graph, NSDictionary* feeds, Placeholder& result) { // single-result convenience overload
+        runMPSGraph(stream, graph, feeds, dictionaryFromPlaceholders(result)); // wraps result in a one-entry dictionary and forwards to the NSDictionary overload
+}
+
+inline bool supportsComplex() { // complex support is gated on the MACOS_VER_14_0_PLUS check
+  return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS); // despite its name, the helper is queried with the 14.0+ enumerator
+}
+
+// MPS does not yet support double types, but starting from macOS 14 it supports bfloat16.
+inline bool supportedFloatingType(ScalarType dtype) { // NOTE: no OS version check is made here; kBFloat16 is always accepted
+  return dtype == kFloat || dtype == kHalf || dtype == kBFloat16;
+}
+
+inline bool supportedFloatingType(const Tensor& t) { // Tensor convenience overload: forwards t.scalar_type()
+  return supportedFloatingType(t.scalar_type());
+}
+
+} // namespace at::native::mps
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/TensorFactory.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/TensorFactory.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6c9da0babbbedc71e41820aabf7c1c71274bd44
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/TensorFactory.h
@@ -0,0 +1,12 @@
+//  Copyright © 2022 Apple Inc.
+// Dispatch switch over the scalar types listed below: Float, Half, Long, Int, Short, Char and Byte.
+#define AT_DISPATCH_MPS_TYPES(TYPE, NAME, ...)                          \
+  AT_DISPATCH_SWITCH(                                                   \
+      TYPE, NAME,                                                       \
+      AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)              \
+      AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)               \
+      AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)               \
+      AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)                \
+      AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)              \
+      AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)               \
+      AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__))
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..cee721d7bc8f6e560b8ad9ef2e995fc23708b25f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h
@@ -0,0 +1,103 @@
+/**
+ * Transformer-specific NestedTensor utility functions.
+ *
+ * Not co-located with NestedTensor core code yet because they only
+ * support specific cases needed in transformers.
+ */
+#pragma once
+
+#include <vector>
+
+#include <c10/macros/Macros.h>
+#include <c10/util/Optional.h>
+
+namespace c10 {
+class Scalar;
+} // namespace c10
+
+namespace at {
+class Tensor;
+namespace native {
+struct NestedTensorImpl;
+
+// Requires that self is a contiguous NestedTensor, other is not a
+// NestedTensor, self.dim() == 3, and other.dim() == 2. Also, self
+// must have a consistent last dimension across its included Tensors
+// and that dimension must match other.size(0).
+Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other);
+
+// Requires that mat1 is a contiguous NestedTensor, self & mat2 are
+// not NestedTensors, mat1.dim() == 3, mat2.dim() == 2, and that mat1
+// has a consistent last dimension across its included Tensors that
+// matches mat2.size(0).
+Tensor NestedTensor_times_Tensor_plus_Tensor_addmm(
+    const Tensor& self,
+    const Tensor& mat1,
+    const Tensor& mat2,
+    const c10::Scalar& beta,
+    const c10::Scalar& alpha,
+    c10::optional<bool> use_gelu = c10::nullopt);
+
+Tensor NestedTensor_add_NestedTensor_in_place(
+    const Tensor& self,
+    const Tensor& other);
+
+TORCH_API Tensor NestedTensor_batch_offsets_from_size_tensor(
+    const Tensor& sizes,
+    int64_t extra_elements);
+
+Tensor NestedTensor_from_padded_tensor_cpu(
+    const Tensor& padded,
+    const NestedTensorImpl& nt);
+
+Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional<int64_t> mask_dim, c10::optional<int64_t> mask_dim_length);
+
+template <typename T>
+void remove_padding_kernelLauncher(
+    const T* input,
+    T* output,
+    const int* offsets,
+    const int* input_sizes,
+    const int* output_sizes,
+    int output_dim,
+    const int batch_size);
+
+template <typename T>
+void remove_padding_transform0213_kernelLauncher(
+    const T* input,
+    T* output,
+    const int* offsets,
+    const int* input_sizes,
+    const int* output_sizes,
+    int output_dim,
+    const int batch_size);
+
+template <typename T>
+void add_padding_kernelLauncher(
+    T* input,
+    T* output,
+    T padding_value,
+    const int* offsets,
+    const int* input_sizes,
+    int input_dim,
+    const std::vector<int64_t>& output_sizes,
+    const int batch_size,
+    const int output_batch_size);
+
+TORCH_API Tensor flash_attention_helper(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    double dropout_p,
+    bool need_attn_weights,
+    bool is_causal);
+
+TORCH_API std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    double dropout_p,
+    bool need_attn_weights,
+    bool is_causal);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ff342a643c38590255d68d10133f171c176f836
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h
@@ -0,0 +1,130 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/quantized/AffineQuantizerBase.h>
+
+namespace at {
+namespace native {
+
+Tensor& quantize_tensor_per_tensor_affine(
+    const Tensor& rtensor,
+    Tensor& qtensor,
+    double scale,
+    int64_t zero_point);
+Tensor& quantize_tensor_per_channel_affine(
+    const Tensor& rtensor,
+    Tensor& qtensor,
+    Tensor scales,
+    Tensor zero_points,
+    int64_t axis);
+
+Tensor& quantize_tensor_per_channel_float_qparams(
+    const Tensor& rtensor,
+    Tensor& qtensor,
+    Tensor scales,
+    Tensor zero_points,
+    int64_t axis);
+
+Tensor& dequantize_tensor_per_tensor_affine(
+    const Tensor& qtensor,
+    Tensor& rtensor,
+    double scale,
+    int64_t zero_point);
+Tensor& dequantize_tensor_per_channel_affine(
+    const Tensor& qtensor,
+    Tensor& rtensor,
+    Tensor scales,
+    Tensor zero_points,
+    int64_t axis);
+Tensor& dequantize_tensor_per_channel_float_qparams(
+    const Tensor& qtensor,
+    Tensor& rtensor,
+    Tensor scales,
+    Tensor zero_points,
+    int64_t axis);
+
+using quantize_tensor_per_tensor_affine_fn =
+    void (*)(const Tensor& rtensor, Tensor& qtensor, double scale, int64_t zero_point);
+
+using quantize_tensor_per_channel_affine_fn = void (*)(
+    const Tensor& rtensor,
+    Tensor& qtensor,
+    const Tensor& scales,
+    const Tensor& zero_points,
+    int64_t axis);
+
+using quantize_tensor_per_channel_float_qparams_fn = void (*)(
+    const Tensor& rtensor,
+    Tensor& qtensor,
+    const Tensor& scales,
+    const Tensor& zero_points,
+    int64_t axis);
+
+using dequantize_tensor_per_tensor_affine_fn =
+    void (*)(const Tensor& qtensor, Tensor& rtensor, double scale, int64_t zero_point);
+
+using dequantize_tensor_per_channel_affine_fn = void (*)(
+    const Tensor& qtensor,
+    Tensor& rtensor,
+    const Tensor& scales,
+    const Tensor& zero_points,
+    int64_t axis);
+
+using dequantize_tensor_per_channel_float_qparams_fn = void (*)(
+    const Tensor& qtensor,
+    Tensor& rtensor,
+    const Tensor& scales,
+    const Tensor& zero_points,
+    int64_t axis);
+
+using quantize_tensor_per_tensor_affine_sub_byte_fn =
+    void (*)(const Tensor& rtensor, Tensor& qtensor, float scale, float zero_point);
+
+using dequantize_tensor_per_tensor_affine_sub_byte_fn =
+    void (*)(const Tensor& qtensor, Tensor& rtensor, float scale, float zero_point);
+
+DECLARE_DISPATCH(
+    quantize_tensor_per_tensor_affine_fn,
+    quantize_tensor_per_tensor_affine_stub);
+DECLARE_DISPATCH(
+    quantize_tensor_per_channel_affine_fn,
+    quantize_tensor_per_channel_affine_stub);
+DECLARE_DISPATCH(
+    quantize_tensor_per_channel_float_qparams_fn,
+    quantize_tensor_per_channel_float_qparams_stub);
+
+DECLARE_DISPATCH(
+    dequantize_tensor_per_tensor_affine_fn,
+    dequantize_tensor_per_tensor_affine_stub);
+DECLARE_DISPATCH(
+    dequantize_tensor_per_channel_affine_fn,
+    dequantize_tensor_per_channel_affine_stub);
+DECLARE_DISPATCH(
+    dequantize_tensor_per_channel_float_qparams_fn,
+    dequantize_tensor_per_channel_float_qparams_stub);
+
+DECLARE_DISPATCH(
+    quantize_tensor_per_tensor_affine_sub_byte_fn,
+    quantize_tensor_per_tensor_affine_sub_byte_stub);
+
+DECLARE_DISPATCH(
+    dequantize_tensor_per_tensor_affine_sub_byte_fn,
+    dequantize_tensor_per_tensor_affine_sub_byte_stub);
+
+template <typename T>
+TORCH_API Tensor quantize_tensor(
+    Tensor rtensor,
+    Tensor qtensor,
+    double scale,
+    int64_t zero_point);
+template <typename T>
+TORCH_API Tensor dequantize_tensor(
+    Tensor qtensor,
+    Tensor rtensor,
+    double scale,
+    int64_t zero_point);
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f8ff918c1d2f3e421922650161aaa41eda9545f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h
@@ -0,0 +1,62 @@
+#pragma once
+#include <ATen/core/List.h>
+#include <ATen/native/ConvUtils.h>
+
+namespace at::native::quantized {
+namespace {
+// MakeConvOutputShape is used from both the CPU and CUDA libraries.
+// Exporting the symbol from torch_cpu would probably take more storage
+// than duplicating the implementation, which will likely be inlined away.
+template <int kSpatialDim>
+at::SmallVector<int64_t, kSpatialDim + 2> MakeConvOutputShape(
+    int N, // mini-batch
+    int M, // output channels
+    const std::array<int64_t, kSpatialDim>& input_image_shape,
+    const std::vector<int64_t>& kernel,
+    const torch::List<int64_t>& stride,
+    const torch::List<int64_t>& padding,
+    const torch::List<int64_t>& dilation);
+
+#if defined(USE_CUDA) || defined(USE_PYTORCH_QNNPACK)
+template <> // 2-D specialization: returns {N, M, Y_H, Y_W}
+at::SmallVector<int64_t, 4> MakeConvOutputShape<2>(
+    int N, // mini-batch
+    int M, // output channels
+    const std::array<int64_t, 2>& input_image_shape, // {H, W}
+    const std::vector<int64_t>& kernel, // kernel extents, indexed like input_image_shape
+    const at::List<int64_t>& stride,
+    const at::List<int64_t>& padding,
+    const at::List<int64_t>& dilation) {
+  const int64_t H = input_image_shape[0]; // kept as int64_t so large extents are not truncated
+  const int64_t W = input_image_shape[1];
+  const int64_t Y_H = // (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1
+      (H + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1;
+  const int64_t Y_W =
+      (W + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1;
+  return {N, M, Y_H, Y_W};
+}
+
+template <> // 3-D specialization: returns {N, M, Y_D, Y_H, Y_W}
+at::SmallVector<int64_t, 5> MakeConvOutputShape<3>(
+    int N, // mini-batch
+    int M, // output channels
+    const std::array<int64_t, 3>& input_image_shape, // {D, H, W}
+    const std::vector<int64_t>& kernel, // kernel extents, indexed like input_image_shape
+    const at::List<int64_t>& stride,
+    const at::List<int64_t>& padding,
+    const at::List<int64_t>& dilation) {
+  const int64_t D = input_image_shape[0]; // kept as int64_t so large extents are not truncated
+  const int64_t H = input_image_shape[1];
+  const int64_t W = input_image_shape[2];
+  const int64_t Y_D = // (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1
+      (D + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1;
+  const int64_t Y_H =
+      (H + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1;
+  const int64_t Y_W =
+      (W + 2 * padding[2] - dilation[2] * (kernel[2] - 1) - 1) / stride[2] + 1;
+  return {N, M, Y_D, Y_H, Y_W};
+}
+
+#endif
+} // anonymous namespace
+} // namespace at::native::quantized
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e240b5a8e9afc61f8828f4162f1b89c7ec06bb7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h
@@ -0,0 +1,14 @@
+#pragma once
+#include <ATen/native/TensorIterator.h>
+
+namespace at {
+namespace native {
+using masked_fill_kernel_quantized_fn = void(*)(TensorIterator& iter, const Scalar& value, double scale, int zero_point); // kernel signature used by masked_fill_kernel_quantized_stub
+using index_put_kernel_quantized_fn = void(*)(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate, double scale, int zero_point); // kernel signature used by index_put_kernel_quantized_stub
+// Dispatch stubs, one per function-pointer type above.
+DECLARE_DISPATCH(masked_fill_kernel_quantized_fn, masked_fill_kernel_quantized_stub);
+DECLARE_DISPATCH(index_put_kernel_quantized_fn, index_put_kernel_quantized_stub);
+
+
+} // native
+} // at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h
new file mode 100644
index 0000000000000000000000000000000000000000..a442628573fecd18da0afdebc10dba2bd9faf8b1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h
@@ -0,0 +1,147 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/core/ivalue.h>
+
+struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { // interface for packed linear parameters; subclasses implement the pure virtuals
+  virtual at::Tensor apply(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) = 0;
+  virtual at::Tensor apply_relu(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) = 0;
+
+  // out variant of LinearPackedParamsBase::apply; this default implementation always throws
+  virtual at::Tensor& apply_out(
+      const at::Tensor& /*input*/,
+      double /*output_scale*/,
+      int64_t /*output_zero_point*/,
+      at::Tensor& output) {
+    throw std::runtime_error(
+        "apply_out is not implemented for this packed "
+        "parameter type");
+    return output; // unreachable: the throw above always exits
+  }
+  // out variant of LinearPackedParamsBase::apply_relu; this default implementation always throws
+  virtual at::Tensor& apply_relu_out(
+      const at::Tensor& /*input*/,
+      double /*output_scale*/,
+      int64_t /*output_zero_point*/,
+      at::Tensor& output) {
+    throw std::runtime_error(
+        "apply_relu_out is not implemented for this packed "
+        "parameter type");
+    return output; // unreachable: the throw above always exits
+  }
+
+  // Corresponding pattern (the ops with `*` are part of the pattern that
+  // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_output_fp32):
+  // input -> q* -> dq* -> linear* ->
+  //         qweight -> dq* /
+  //
+  // After fusion:
+  // input -> quantized::linear_with_input_q_dq_qweight_dq_output_fp32* ->
+  //         qweight /
+  //
+  // Additional Note: the weight is packed as well
+  // Params:
+  //    input: float32 Tensor, will be quantized to quint8 in the op
+  //    input_scale, input_zero_point: presumably the qparams for that quantization (TODO confirm); the packed qint8 weight and bias are presumably this object
+  // Returns:
+  //    float32 Tensor (this default implementation always throws)
+  virtual at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) {
+    throw std::runtime_error(
+        "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed "
+        "parameter type");
+    return {}; // unreachable: the throw above always exits
+  }
+
+  // Corresponding pattern (the ops with `*` are part of the pattern that
+  // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32):
+  // input -> q* -> dq* -> linear* -> relu* ->
+  //         qweight -> dq* /
+  //
+  // After fusion:
+  // input -> quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32* ->
+  //         qweight /
+  //
+  // Additional Note: the weight is packed as well
+  // Params:
+  //    input: float32 Tensor, will be quantized to quint8 in the op
+  // Returns:
+  //    float32 Tensor (this default implementation always throws)
+  virtual at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) {
+    throw std::runtime_error(
+        "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed "
+        "parameter type");
+    return {}; // unreachable: the throw above always exits
+  }
+  // Dynamic variants: pure virtual; reduce_range defaults to false.
+  virtual at::Tensor apply_dynamic(
+      at::Tensor input,
+      bool reduce_range = false) = 0;
+  virtual at::Tensor apply_dynamic_relu(
+      at::Tensor input,
+      bool reduce_range = false) = 0;
+  // out variants of the dynamic entry points; these default implementations always throw
+  virtual at::Tensor& apply_dynamic_out(
+      const at::Tensor& /* input */,
+      at::Tensor& output,
+      bool /* reduce_range */) {
+    throw std::runtime_error(
+        "apply_dynamic_out is not implemented for this packed "
+        "parameter type");
+    return output; // unreachable: the throw above always exits
+  }
+  virtual at::Tensor& apply_dynamic_relu_out(
+      const at::Tensor& /* input */,
+      at::Tensor& output,
+      bool /* reduce_range */) {
+    throw std::runtime_error(
+        "apply_dynamic_relu_out is not implemented for this packed "
+        "parameter type");
+    return output; // unreachable: the throw above always exits
+  }
+
+  virtual std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() = 0; // presumably (weight, optional bias) - TODO confirm in implementations
+
+  virtual c10::optional<at::Tensor> bias() = 0;
+
+  virtual void set_bias(c10::optional<at::Tensor> /*bias*/) { // optional operation: this default implementation always throws
+    throw std::runtime_error(
+        "set_bias is not implemented for this packed "
+        "parameter type");
+  }
+};
+
+template <int kSpatialDim = 2> // kSpatialDim is not referenced by any member declared in this struct
+struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { // interface for packed convolution parameters; every member is pure virtual
+  virtual at::Tensor apply(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) = 0;
+  virtual at::Tensor apply_relu(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) = 0;
+  virtual at::Tensor apply_dynamic(
+      const at::Tensor& input,
+      bool reduce_range) = 0; // unlike LinearPackedParamsBase::apply_dynamic, reduce_range has no default here
+
+  virtual std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() = 0; // presumably (weight, optional bias) - TODO confirm in implementations
+
+  virtual torch::List<int64_t> stride() const = 0;
+  virtual torch::List<int64_t> padding() const = 0;
+  virtual torch::List<int64_t> output_padding() const = 0;
+  virtual torch::List<int64_t> dilation() const = 0;
+  virtual int64_t groups() const = 0;
+  virtual bool transpose() const = 0;
+};
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h
new file mode 100644
index 0000000000000000000000000000000000000000..140b716df2691db2fcc6bab0d52ad380ef10067e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/core/ivalue.h>
+
+// Abstract interface for packed quantized embedding parameters. All members
+// are pure virtual; a concrete subclass provides the implementations.
+struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder {
+  // Embedding-bag lookup; going by the name, for byte (8-bit) packed weights.
+  // Shares its parameter list with embeddingbag_4bit below.
+  virtual at::Tensor embeddingbag_byte(
+    const at::Tensor& indices,
+    const c10::optional<at::Tensor>& offsets,
+    bool pruned_weights,
+    const c10::optional<at::Tensor>& per_sample_weights_,
+    const c10::optional<at::Tensor>& compressed_indices_mapping,
+    bool include_last_offset,
+    bool is_embedding_op) = 0;
+
+  // Embedding-bag lookup; going by the name, for 4-bit packed weights.
+  virtual at::Tensor embeddingbag_4bit(
+    const at::Tensor& indices,
+    const c10::optional<at::Tensor>& offsets,
+    bool pruned_weights,
+    const c10::optional<at::Tensor>& per_sample_weights_,
+    const c10::optional<at::Tensor>& compressed_indices_mapping,
+    bool include_last_offset,
+    bool is_embedding_op) = 0;
+
+  // Returns a single tensor (presumably the unpacked weight -- confirm
+  // against the implementations).
+  virtual at::Tensor unpack() = 0;
+
+  // Read-only accessors for the packing's bit rate and version number.
+  virtual int64_t bit_rate() const = 0;
+  virtual int64_t version() const = 0;
+};
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..88ff258be891fb320cfdc98ebb1d5363c055d631
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h
@@ -0,0 +1,527 @@
+#pragma once
+
+#ifdef USE_PYTORCH_QNNPACK
+#include <ATen/core/Tensor.h>
+#include <c10/util/irange.h>
+#include <pytorch_qnnpack.h>
+#include <qnnpack_func.h>
+#include <ATen/native/quantized/cpu/XnnpackUtils.h>
+#include <ATen/native/quantized/PackedParams.h>
+#include <ATen/native/utils/Factory.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty.h>
+#endif
+
+#include <utility>
+// Number of extra trailing entries added when sizing the per-output-channel
+// zero-point and scale buffers (see make_zero_points_and_scales_tensor in
+// this header). Declared as a mutable inline variable, not a constant.
+inline int kPaddingChannels = 8;
+// Deleter functor that releases a QNNPACK operator handle through
+// pytorch_qnnp_delete_operator. Used in this header as the deleter type of
+// the std::unique_ptr that owns a convolution operator.
+struct QnnpackOperatorDeleter {
+  void operator()(pytorch_qnnp_operator_t qnnp_op) {
+    pytorch_qnnp_delete_operator(qnnp_op);
+  }
+};
+
+// PackedWeight struct for QNNPACK stores the original Weight and Bias as
+// QNNPACK currently does not support an unpack function.
+// For PyTorch Mobile, once the model is scripted and serialized we don't need
+// to call unpack, so we can save some memory by checking for this case and
+// freeing the original weights after packing.
+// Input scale is set to null in the pre-pack step. QNNPACK needs the bias
+// quantized with the input scale, which is only available at runtime in
+// PyTorch. If the input scale changes at runtime, the bias is requantized
+// with the updated scale. For inference we expect the graph to be static, so
+// the input scale should not change across consecutive inference calls.
+struct PackedLinearWeightsQnnp : public LinearPackedParamsBase {
+  // Takes ownership of the packed matrix `w` and moves the remaining
+  // arguments into the members of the same name. The bias is first passed
+  // through allocate_padded_contiguous_if_needed. The quantization scheme
+  // and the weight sizes are read from the stored `orig_weight` (via
+  // `this->`), i.e. after the constructor argument has been moved from.
+  PackedLinearWeightsQnnp(
+      std::unique_ptr<qnnpack::PackBMatrix> w,
+      at::Tensor orig_weight,
+      at::Tensor bias,
+      c10::optional<double> input_scale,
+      at::Tensor w_scales,
+      std::vector<uint8_t>&& w_zps)
+      : w(std::move(w)),
+        orig_weight(std::move(orig_weight)),
+        bias_(at::native::mobile::allocate_padded_contiguous_if_needed(
+            bias, bias.suggest_memory_format())),
+        per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine),
+        input_scale(std::move(input_scale)),
+        w_scales(std::move(w_scales)),
+        w_zero_points(std::move(w_zps)),
+        q_scheme(this->orig_weight.qscheme()) {
+    weight_sizes = this->orig_weight.sizes().vec();
+  }
+
+  // NOTE: the mem-initializers above depend on this declaration order
+  // (orig_weight must be initialized before per_channel_ and q_scheme).
+  std::unique_ptr<qnnpack::PackBMatrix> w;
+  at::Tensor orig_weight;
+  at::Tensor bias_;
+  // True when orig_weight uses the per-channel affine quantization scheme.
+  bool per_channel_;
+  c10::optional<double> input_scale;
+  at::Tensor w_scales;
+  std::vector<uint8_t> w_zero_points;
+  // Not initialized by the constructor; left empty until filled elsewhere.
+  std::vector<float> requantization_scales;
+  // Copy of orig_weight.sizes() taken at construction time.
+  std::vector<int64_t> weight_sizes;
+  c10::QScheme q_scheme;
+
+  // Overrides of the LinearPackedParamsBase interface; defined out of line.
+  at::Tensor apply(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+  at::Tensor apply_relu(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override;
+  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override;
+
+  std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() override;
+
+  // Returns the stored (possibly padded/contiguous) bias tensor.
+  c10::optional<at::Tensor> bias() override {
+    return bias_;
+  }
+
+  // Factory that builds packed parameters from a weight and an optional
+  // bias; defined out of line.
+  static c10::intrusive_ptr<LinearPackedParamsBase> prepack(
+      at::Tensor weight,
+      c10::optional<at::Tensor> bias);
+
+  bool per_channel() const {
+    return per_channel_;
+  }
+
+ private:
+  // Mutex member; its users are in the out-of-line definitions.
+  std::mutex qnnp_mutex_;
+
+#ifdef USE_XNNPACK
+  xnnpack_operator xnnp_linear_op;
+
+  template <typename scalar_t, bool kReluFused>
+  at::Tensor apply_impl_xnnp(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point);
+#endif // USE_XNNPACK
+
+  // Shared implementation templates, selected by the ReluFused flag.
+  template <bool ReluFused>
+  at::Tensor apply_impl(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point);
+
+  template <bool ReluFused>
+  at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range);
+};
+
+// QNNPACK-backed implementation of ConvPackedParamsBase. Besides storing the
+// prepacked weights and the convolution configuration, the constructor
+// allocates a pytorch_qnnp_operator, fills in its geometry fields, selects a
+// micro-kernel type and allocates the operator's zero-padding buffer.
+template <int kSpatialDim = 2>
+struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
+  // Moves the arguments into the members of the same name and sets up
+  // `convolution_op`.
+  //
+  // Throws (via TORCH_INTERNAL_ASSERT) when per-channel weights are combined
+  // with an XZP kernel or when one of the allocations fails. The operator is
+  // handed to `convolution_op` right after it is allocated, so every later
+  // failure path releases it exactly once through the member's deleter.
+  PackedConvWeightsQnnp(
+      std::unique_ptr<qnnpack::PrePackConvWeights> w,
+      at::Tensor orig_weight,
+      at::Tensor bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      bool transpose,
+      c10::optional<double> input_scale,
+      std::vector<int64_t> kernel,
+      at::Tensor w_scale,
+      std::vector<uint8_t>&& w_zps,
+      bool is_per_channel)
+      : w(std::move(w)),
+        orig_weight(std::move(orig_weight)),
+        bias(std::move(bias)),
+        stride_(std::move(stride)),
+        padding_(std::move(padding)),
+        output_padding_(std::move(output_padding)),
+        dilation_(std::move(dilation)),
+        groups_(groups),
+        transpose_(transpose),
+        is_per_channel_(is_per_channel),
+        input_scale(input_scale),
+        kernel_(std::move(kernel)),
+        w_scales(std::move(w_scale)),
+        w_zero_points(std::move(w_zps)) {
+    const bool any_padding = std::any_of(
+        padding_.begin(), padding_.end(), [](const auto& e) { return e != 0; });
+    const size_t kernel_size =
+        std::accumulate(kernel_.begin(), kernel_.end(), 1, std::multiplies<>());
+
+    // The weight layout differs between regular and transposed convolution,
+    // so the per-group channel counts are read from different dimensions.
+    const size_t group_input_channels = transpose
+        ? this->orig_weight.size(0) / groups
+        : this->orig_weight.size(1);
+    const size_t group_output_channels = transpose
+        ? this->orig_weight.size(1)
+        : this->orig_weight.size(0) / groups;
+
+    const size_t kernel_depth = kSpatialDim == 3 ? kernel_[0] : 1;
+    const size_t kernel_height = kernel_[kSpatialDim - 2];
+    const size_t kernel_width = kernel_[kSpatialDim - 1];
+
+    // Micro-kernel selection: transposed convolutions always use the generic
+    // conv kernel; otherwise pick depthwise, GEMM or conv from the geometry.
+    pytorch_qnnp_ukernel_type ukernel_type;
+    if (transpose_) {
+      ukernel_type = pytorch_qnnp_ukernel_type_conv;
+    } else {
+      ukernel_type = pytorch_qnnp_ukernel_type_none;
+
+      const bool has_depthwise_dimensions =
+          (kSpatialDim == 2 &&
+           ((kernel_height == 3 && kernel_width == 3) ||
+            (kernel_height == 5 && kernel_width == 5))) ||
+          (kSpatialDim == 3 && kernel_height == 3 && kernel_width == 3 &&
+           kernel_depth == 3);
+      const bool has_depthwise_grouping =
+          group_input_channels == 1 && group_output_channels == 1 && groups > 1;
+
+      if (has_depthwise_dimensions && has_depthwise_grouping) {
+        ukernel_type = pytorch_qnnp_ukernel_type_dwconv;
+      } else if (
+          kernel_size == 1 &&
+          std::all_of(
+              stride_.begin(),
+              stride_.end(),
+              [](const auto& e) { return e == 1; }) &&
+          !any_padding) {
+        // The SIZE_MAX threshold means the XZP GEMM kernel is effectively
+        // never chosen here.
+        ukernel_type = group_input_channels >= SIZE_MAX
+            ? pytorch_qnnp_ukernel_type_xzp_gemm
+            : pytorch_qnnp_ukernel_type_gemm;
+      } else {
+        ukernel_type = pytorch_qnnp_ukernel_type_conv;
+      }
+    }
+
+    if (is_per_channel && ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) {
+      TORCH_INTERNAL_ASSERT(
+          false, "Per channel quantized weights are not supported for XZP kernels");
+    }
+
+    pytorch_qnnp_operator_t convolution{nullptr};
+    // Initially all the params are set to zero.
+    convolution = static_cast<pytorch_qnnp_operator_t>(
+        calloc(1, sizeof(struct pytorch_qnnp_operator)));
+    if (convolution == nullptr) {
+      // The assert macro concatenates its message arguments, so the size is
+      // passed as a separate argument instead of a printf placeholder.
+      TORCH_INTERNAL_ASSERT(
+          false, "failed to allocate ",
+          sizeof(struct pytorch_qnnp_operator),
+          " bytes for pytorch_qnnp_operator structure");
+    }
+
+    // From here on the operator is owned by convolution_op.
+    convolution_op =
+        std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>(
+            convolution);
+
+    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+    convolution->ukernel_type = ukernel_type;
+    convolution->groups = groups;
+    convolution->group_input_channels = group_input_channels;
+    convolution->group_output_channels = group_output_channels;
+    convolution->kernel_depth = kernel_depth;
+    convolution->kernel_height = kernel_height;
+    convolution->kernel_width = kernel_width;
+    convolution->stride_depth = kSpatialDim == 3 ? stride_[0] : 1;
+    convolution->stride_height = stride_[kSpatialDim - 2];
+    convolution->stride_width = stride_[kSpatialDim - 1];
+    convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1;
+    convolution->dilation_height = dilation_[kSpatialDim - 2];
+    convolution->dilation_width = dilation_[kSpatialDim - 1];
+    convolution->input_padding_height = padding_[kSpatialDim - 2];
+    convolution->input_padding_width = padding_[kSpatialDim - 1];
+    convolution->input_padding_depth = kSpatialDim == 3 ? padding_[0] : 0;
+    convolution->per_channel = is_per_channel_;
+    convolution->transpose = transpose_;
+
+    // Round the per-group input channel count up to a multiple of kr.
+    const uint32_t kr = pytorch_qnnp_params.q8conv.kr;
+    const size_t k_stride = (group_input_channels + (kr - 1)) & -kr;
+
+    size_t zero_size = sizeof(uint8_t) * k_stride;
+    size_t zero_offset = 0;
+
+    // Size the zero-padding buffer. With fewer than 8 channels (or groups,
+    // for depthwise) the buffer gets 8 extra bytes and the zero pointer is
+    // placed 8 bytes into it.
+    if (transpose_) {
+      convolution->adjustment_width = output_padding_[1];
+      convolution->adjustment_height = output_padding_[0];
+      if (group_input_channels < 8) {
+        zero_size += 8;
+        zero_offset = 8;
+      }
+    } else {
+      zero_buffer_size = 0;
+      if (any_padding) {
+        zero_size = 0;
+        zero_offset = 0;
+        if (ukernel_type == pytorch_qnnp_ukernel_type_dwconv) {
+          const uint32_t cr = pytorch_qnnp_params.q8dw9.cr;
+          const size_t group_stride = (groups + (cr - 1)) & -cr;
+          if (groups >= 8) {
+            zero_size = sizeof(uint8_t) * group_stride;
+            zero_offset = 0;
+          } else {
+            zero_size = sizeof(uint8_t) * group_stride + 8;
+            zero_offset = sizeof(uint8_t) * 8;
+          }
+        } else if (
+            ukernel_type == pytorch_qnnp_ukernel_type_conv ||
+            ukernel_type == pytorch_qnnp_ukernel_type_gemm) {
+          if (group_input_channels >= 8) {
+            zero_size = sizeof(uint8_t) * k_stride;
+            zero_offset = 0;
+          } else {
+            zero_size = sizeof(uint8_t) * k_stride + 8;
+            zero_offset = 8;
+          }
+        }
+      }
+    }
+
+    // NOLINTNEXTLINE(clang-analyzer-optin.portability.UnixAPI)
+    void* zero_buffer = malloc(zero_size);
+    if (zero_buffer == nullptr) {
+      // `convolution` is already owned by convolution_op, whose deleter runs
+      // when the exception below unwinds this constructor. Deleting the
+      // operator here as well would free it twice.
+      TORCH_INTERNAL_ASSERT(
+          false, "failed to allocate ", zero_size, " bytes for zero padding");
+    }
+    // Need to set to input zero point
+    // memset(zero_buffer, input_zero_point, zero_size);
+    zero_buffer_size = zero_size;
+    convolution->zero_buffer = zero_buffer;
+    convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);
+  }
+
+  // Owns the QNNPACK operator created in the constructor.
+  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter> convolution_op;
+  #ifdef USE_XNNPACK
+  xnnpack_operator xnnp_convolution_op;
+  #endif  // USE_XNNPACK
+  std::unique_ptr<qnnpack::PrePackConvWeights> w;
+  at::Tensor orig_weight;
+  at::Tensor bias;
+  torch::List<int64_t> stride_;
+  torch::List<int64_t> padding_;
+  torch::List<int64_t> output_padding_;
+  torch::List<int64_t> dilation_;
+  int64_t groups_;
+  bool transpose_;
+  bool is_per_channel_;
+  c10::optional<double> input_scale;
+  std::vector<int64_t> kernel_;
+  at::Tensor w_scales;
+  std::vector<uint8_t> w_zero_points;
+  // Not initialized by the constructor; left empty until filled elsewhere.
+  std::vector<float> requantization_scales;
+  // Size in bytes of the zero-padding buffer allocated in the constructor.
+  size_t zero_buffer_size;
+
+  // Overrides of the ConvPackedParamsBase interface; defined out of line.
+  at::Tensor apply(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_dynamic(
+      const at::Tensor& input,
+      bool reduce_range=false) override;
+
+  std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() override;
+
+  // Factory that builds packed parameters from a weight, an optional bias
+  // and the convolution configuration; defined out of line.
+  static c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> prepack(
+      at::Tensor weight,
+      c10::optional<at::Tensor> bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      bool transpose);
+
+  // Accessors returning the stored configuration values.
+  torch::List<int64_t> stride() const override {
+    return stride_;
+  }
+
+  torch::List<int64_t> padding() const override {
+    return padding_;
+  }
+
+  torch::List<int64_t> output_padding() const override {
+    return output_padding_;
+  }
+
+  torch::List<int64_t> dilation() const override {
+    return dilation_;
+  }
+
+  int64_t groups() const override {
+    return groups_;
+  }
+
+  bool transpose() const override {
+    return transpose_;
+  }
+
+  bool per_channel() const {
+    return is_per_channel_;
+  }
+
+ private:
+  // Mutex member; its users are in the out-of-line definitions.
+  std::mutex qnnp_mutex_;
+  template <bool ReluFused>
+  at::Tensor apply_impl(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point);
+
+#ifdef USE_XNNPACK
+  template <typename scalar_t, bool ReluFused>
+  at::Tensor apply_impl_xnnp(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point);
+#endif // USE_XNNPACK
+};
+
+// Activation selector consumed by activationLimits() in this header: NONE
+// yields the full range of the quantized type, RELU raises the lower limit
+// to the quantized representation of 0.
+enum class Activation : uint8_t { NONE = 0, RELU = 1 };
+
+// Rounds to an integral value with nearbyint semantics (i.e. using the
+// current floating-point rounding mode).
+#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
+// Android builds without __NDK_MAJOR__ defined call the global-namespace C
+// functions instead of std::nearbyint.
+// NOTE(review): T does not appear in the parameter list of the template
+// below, so it cannot be deduced; an unqualified Round(x) call resolves to
+// the non-template double overload. Confirm this is intended.
+template <class T>
+inline float Round(const float x) {
+  return ::nearbyintf(x);
+}
+inline double Round(const double x) {
+  return ::nearbyint(x);
+}
+#else
+// Generic version: forwards to std::nearbyint and converts the result to T.
+template <class T>
+inline T Round(const T x) {
+  return std::nearbyint(x);
+}
+#endif
+
+// Quantizes `value` to type T: divides by `scale`, rounds with Round(), adds
+// `zero_point`, and clamps the result to the representable range of T.
+template<typename T>
+inline T QuantizeValue(float scale, int32_t zero_point, float value) {
+  const int32_t lowest = std::numeric_limits<T>::min();
+  const int32_t highest = std::numeric_limits<T>::max();
+  int32_t quantized = zero_point + static_cast<int32_t>(Round(value / scale));
+  // Clamp from below first, then from above.
+  if (quantized < lowest) {
+    quantized = lowest;
+  }
+  if (quantized > highest) {
+    quantized = highest;
+  }
+  return static_cast<T>(quantized);
+}
+
+// Returns the {min, max} clamping limits, expressed in quantized type T, for
+// the requested activation. NONE gives the full range of T; RELU replaces
+// the lower limit with the quantized value of 0. Any other enumerator value
+// is treated as unreachable.
+template<typename T>
+inline std::pair<T, T> activationLimits(
+    float scale,
+    int32_t zero_point,
+    Activation Ac) {
+  if (Ac == Activation::NONE) {
+    return {std::numeric_limits<T>::min(),
+            std::numeric_limits<T>::max()};
+  }
+  if (Ac == Activation::RELU) {
+    return {QuantizeValue<T>(scale, zero_point, 0.0),
+            std::numeric_limits<T>::max()};
+  }
+#ifdef _MSC_VER
+  __assume(0);
+#else
+  __builtin_unreachable();
+#endif
+}
+
+namespace at {
+namespace native {
+namespace qnnp_avgpool_helper {
+// Declaration only; the definition lives outside this header.
+// Going by the name, a QNNPACK-backed 2-D average pooling entry point.
+Tensor qnnpack_avg_pool2d(
+    Tensor input,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    bool ceil_mode,
+    bool count_include_pad,
+    c10::optional<int64_t> divisor_override);
+} // namespace qnnp_avgpool_helper
+} // namespace native
+} // namespace at
+
+namespace {
+// Fills `requant_scales` with one requantization scale per (padded) output
+// channel:
+//
+//   requant_scales[i] = weight_scales[i] * input_scale / output_scale
+//
+// `requant_scales` is grown when it holds fewer entries than `weight_scales`
+// and is never shrunk. Every computed scale must be positive and a normal
+// floating-point number, otherwise TORCH_CHECK raises. Returns a copy of
+// `requant_scales`.
+C10_UNUSED std::vector<float> generate_requantization_scales(
+    const at::Tensor& weight_scales,
+    const float input_scale,
+    const float output_scale,
+    std::vector<float>& requant_scales) {
+  // Since weight scale is allocated with padding
+  // weight_scales.numel() gives us padded num elements.
+  const auto num_output_channels_padded = weight_scales.numel();
+  float *const weight_scales_data = weight_scales.data_ptr<float>();
+  if (static_cast<int64_t>(requant_scales.size()) < num_output_channels_padded) {
+    requant_scales.resize(num_output_channels_padded);
+  }
+  // The reciprocal does not depend on the channel, so compute it once.
+  const auto inverse_output_scale = 1.f / output_scale;
+  for (const auto i : c10::irange(num_output_channels_padded)) {
+    requant_scales[i] = (weight_scales_data[i] * input_scale) * inverse_output_scale;
+    TORCH_CHECK(
+        (requant_scales[i] > 0.0f && std::isnormal(requant_scales[i])),
+        "failed to create op with requantization scale: ",
+        requant_scales[i],
+        ": requantization scale must be finite and positive");
+  }
+  return requant_scales;
+}
+
+// Builds the per-output-channel weight zero points and scales for a
+// quantized weight tensor.
+//
+// Both outputs hold num_output_channels + kPaddingChannels entries. Real
+// channels receive the weight's zero point shifted by 128 and the weight's
+// scale; padding entries receive zero point 0 and scale 1. Supports the
+// per-tensor-affine and per-channel-affine schemes and asserts on any other.
+// With `transpose` set, the channel count is read from dimension 1 and
+// multiplied by `groups`.
+C10_UNUSED std::pair<std::vector<uint8_t>, at::Tensor> make_zero_points_and_scales_tensor(
+    const at::Tensor& weight_contig,
+    bool transpose = false,
+    uint32_t groups = 1
+  ) {
+  const int out_ch_idx = transpose ? 1 : 0;
+  const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1);
+  // Add 8 to account for buffering needed by QNNPACK.
+  const auto num_output_channels_padded = num_output_channels + kPaddingChannels;
+  const auto qtype = weight_contig.qscheme();
+  std::vector<uint8_t> weight_zp(num_output_channels_padded, 0);
+  // Adjust weight zero point, similar to weight data.
+  if (qtype == at::kPerTensorAffine) {
+    // One zero point for the whole tensor: read it once, not per channel.
+    const auto shifted_zero_point =
+        (uint8_t)(weight_contig.q_zero_point() + 128);
+    for (const auto i : c10::irange(num_output_channels)) {
+      weight_zp[i] = shifted_zero_point;
+    }
+  } else if (qtype == at::kPerChannelAffine) {
+    TORCH_CHECK(
+        weight_contig.q_per_channel_zero_points().scalar_type() == at::kLong,
+        "Per channel zero points dtype must be long int.");
+    const int64_t* per_channel_zero_points =
+      weight_contig.q_per_channel_zero_points().data_ptr<int64_t>();
+    for (const auto i : c10::irange(num_output_channels)) {
+      weight_zp[i] = (uint8_t)(per_channel_zero_points[i] + 128);
+    }
+  } else {
+    TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
+  }
+  at::Tensor weight_scales =
+    at::empty(
+        {num_output_channels_padded},
+        at::device(at::kCPU).dtype(at::kFloat));
+  float *const weight_scales_data = weight_scales.data_ptr<float>();
+  if (qtype == at::kPerTensorAffine) {
+    // One scale for the whole tensor: read it once, not per channel.
+    const float tensor_scale = weight_contig.q_scale();
+    for (const auto i : c10::irange(num_output_channels)) {
+      weight_scales_data[i] = tensor_scale;
+    }
+  } else if (qtype == at::kPerChannelAffine) {
+    TORCH_CHECK(
+        weight_contig.q_per_channel_scales().scalar_type() == at::kDouble,
+        "Per channel scales dtype must be double.");
+    const double *const per_channel_scales =
+      weight_contig.q_per_channel_scales().data_ptr<double>();
+    for (const auto i : c10::irange(num_output_channels)) {
+      weight_scales_data[i] = static_cast<float>(per_channel_scales[i]);
+    }
+  } else {
+    TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
+  }
+  // Padding entries get a neutral scale of 1.
+  for (const auto i : c10::irange(num_output_channels, num_output_channels_padded)) {
+    weight_scales_data[i] = 1.f;
+  }
+  return {weight_zp, weight_scales};
+}
+} // namespace
+
+#endif
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b026c739786a0b68ccf779f2724c1c4607998e1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h
@@ -0,0 +1,239 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/core/List.h>
+#include <ATen/TensorOperators.h>
+#include <c10/util/irange.h>
+#include <algorithm>
+#include <cmath>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/quantize_per_tensor_native.h>
+#include <ATen/ops/quantize_per_channel_native.h>
+#include <ATen/ops/zeros.h>
+#endif
+
+namespace quant_utils {
+namespace {
+  // Decodes the raw 16-bit pattern of an IEEE 754 half-precision number
+  // (1 sign bit, 5 exponent bits, 10 fraction bits) into the
+  // single-precision value it represents.
+  //
+  // Handles normal numbers, signed zeros, subnormals, infinities and NaNs.
+  float RawUint16ToFp16(unsigned short value) {
+    const unsigned short sign_bits = value >> 15;
+    const unsigned short exponent_bits = value >> 10 & 0x1f;
+    const unsigned short significand_bits = value & 0x3ff;
+
+    const float sign = sign_bits ? -1.f : 1.f;
+    // 0.0009765625f = 0x1p-10 = 2^-10
+    const float fraction = significand_bits * 0.0009765625f;
+
+    if (exponent_bits == 0x1f) {
+      // All-ones exponent: infinity when the fraction is zero, NaN otherwise.
+      return significand_bits == 0 ? sign * INFINITY : NAN;
+    }
+    if (exponent_bits == 0) {
+      // Zero exponent: signed zero or a subnormal. These have no implicit
+      // leading 1 and use the fixed exponent -14.
+      return sign * std::ldexp(fraction, -14);
+    }
+    // Normal number: implicit leading 1, exponent bias 15.
+    return sign * std::ldexp(1 + fraction, exponent_bits - 0xf);
+}
+
+// Clamps `*element` in place to the symmetric range [-max_val, max_val].
+// Returns true when the value was outside the range and has been replaced,
+// false when it was left untouched.
+template <typename T>
+bool CheckAndSaturate(T max_val, T* element) {
+  T& current = *element;
+  const bool above = current > max_val;
+  const bool below = !above && current < -max_val;
+  if (above) {
+    current = max_val;
+  } else if (below) {
+    current = -max_val;
+  }
+  return above || below;
+}
+}
+using namespace std;
+// A structure to hold the quantization parameters 'scale' and 'zero_point'.
+// These values are the constants in the quantization equation
+//
+//   real_value = scale * (quantized_value - zero_point)
+//
+// In other words, 'zero_point' is the quantized value that corresponds
+// to the real value 0, and 'scale' is the difference between the real values
+// corresponding to consecutive quantized values.
+struct TensorQuantizationParams {
+  double scale;
+  std::int32_t zero_point;
+  // Not assigned by ChooseQuantizationParams in this header.
+  int precision;
+};
+
+// Use fp16_min as the small-scale cutoff because we don't want to use scales
+// in the fp16 subnormal range. This is to be consistent with the Glow and
+// FakeLowP implementations for NNPI.
+constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f;
+
+// Following implementation should be identical to fbgemm::ChooseQuantizationParams
+//
+// Chooses a scale and an integer zero point that map the real interval
+// [min, max] (extended to contain 0) onto the quantized range [qmin, qmax].
+//
+//   min, max                  real-valued range to represent; min <= max.
+//   qmin, qmax                quantized range; both are halved when
+//                             `reduce_range` is set.
+//   preserve_sparsity         when the range straddles 0, make it symmetric
+//                             and place the zero point midway between qmin
+//                             and qmax.
+//   force_scale_power_of_two  round the scale to a power of two.
+//   reduce_range              halve qmin and qmax before use.
+//
+// Returns the chosen parameters; `precision` is value-initialized to 0.
+// Raises via TORCH_CHECK when min > max, when qmin >= qmax (after the
+// optional halving), or when the resulting scale is not positive.
+inline TensorQuantizationParams ChooseQuantizationParams(
+    float min,
+    float max,
+    int32_t qmin,
+    int32_t qmax,
+    bool preserve_sparsity = false,
+    bool force_scale_power_of_two = false,
+    bool reduce_range = false) {
+  TORCH_CHECK(
+      min <= max,
+      "In ChooseQuantizationParams, min should be less than or equal to max");
+
+  if (reduce_range) {
+    qmin = qmin/2;
+    qmax = qmax/2;
+  }
+  if (min < 0 && max > 0 && preserve_sparsity) {
+    int symmetric_qmin = -((qmax - qmin) / 2 + 1);
+    int symmetric_qmax = (qmax - qmin) / 2;
+    double max_scale =
+        std::max(fabs(min / symmetric_qmin), fabs(max / symmetric_qmax));
+    min = max_scale * symmetric_qmin;
+    max = max_scale * symmetric_qmax;
+  }
+
+  // We extend the [min, max] interval to ensure that it contains 0.
+  // Otherwise, we would not meet the requirement that 0 be an exactly
+  // representable value.
+  min = std::min(min, 0.f);
+  max = std::max(max, 0.f);
+
+  TORCH_CHECK(
+      qmin < qmax,
+      "In ChooseQuantizationParams, qmin should be less than qmax");
+
+  // Use double precision for intermediate computation but use single precision
+  // in final number to reflect the actual number used during quantization.
+  double scale = (static_cast<double>(max) - min) / (qmax - qmin);
+  // If scale is 0 or too small so its reciprocal is infinity, we arbitrary
+  // adjust the scale to 0.1 . We want to avoid scale's reciprocal being
+  // infinity because some of fbgemm code pre-computes scale's reciprocal to do
+  // multiplication instead of division in the time critical part of code.
+  if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) {
+    scale = 0.1;
+  }
+  TORCH_CHECK(scale > 0, "quantization scale should be > 0");
+
+  if (force_scale_power_of_two) {
+    if (scale < 1) {
+      scale = 1.0 / (1 << static_cast<int>(floor(log(1.0 / scale) / log(2))));
+    } else {
+      scale = 1 << static_cast<int>(ceil(log(scale) / log(2)));
+    }
+  }
+
+  // Cut off small scale
+  if (scale < SMALL_SCALE_THRESHOLD) {
+    float org_scale = scale;
+    scale = SMALL_SCALE_THRESHOLD;
+    // Adjust the min and max based on the new scale
+    if (min == 0.0f) {
+      max = SMALL_SCALE_THRESHOLD * (qmax - qmin);
+    } else if (max == 0.0f) {
+      min = -SMALL_SCALE_THRESHOLD * (qmax - qmin);
+    } else {
+      float amplifier = SMALL_SCALE_THRESHOLD / org_scale;
+      min *= amplifier;
+      max *= amplifier;
+    }
+  }
+
+  // Zero-point computation.
+  // First the initial floating-point computation. The zero-point can be
+  // determined from solving an affine equation for any known pair
+  // (real value, corresponding quantized value).
+  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+  // The arithmetic error on the zero point computed from either pair
+  // will be roughly machine_epsilon * (sum of absolute values of terms)
+  // so we want to use the variant that adds the smaller terms.
+  // NOTE(review): the comment above speaks of a *sum* of absolute values,
+  // while the two error terms below *subtract*. Left unchanged because it
+  // affects which zero point is selected; confirm against fbgemm.
+  double zero_point_from_min = qmin - min / static_cast<double>(scale);
+  double zero_point_from_max = qmax - max / static_cast<double>(scale);
+  double zero_point_from_min_error =
+      std::abs(qmin) - std::abs(min / static_cast<double>(scale));
+  double zero_point_from_max_error =
+      std::abs(qmax) - std::abs(max / static_cast<double>(scale));
+  double initial_zero_point =
+      zero_point_from_min_error < zero_point_from_max_error
+      ? zero_point_from_min
+      : zero_point_from_max;
+
+  // for symmetric quantization (preserve_sparsity == true), we force zero_point
+  // to be a middle value between qmin and qmax.
+  // If either min or max is 0, then we just use 0 as zero_point.
+  if (min < 0 && max > 0 && preserve_sparsity) {
+    initial_zero_point = static_cast<double>(qmin + qmax) / 2;
+  }
+
+  // Now we need to nudge the zero point to be an integer
+  // (our zero points are integer, and this is motivated by the requirement
+  // to be able to represent the real value "0" exactly as a quantized value,
+  // which is required in multiple places, for example in Im2col with zero
+  // padding).
+  int32_t nudged_zero_point = 0;
+  if (initial_zero_point < qmin) {
+    nudged_zero_point = qmin;
+  } else if (initial_zero_point > qmax) {
+    nudged_zero_point = qmax;
+  } else {
+    nudged_zero_point = nearbyint(initial_zero_point);
+  }
+
+  // Value-initialize so that `precision`, which is not computed here, is 0
+  // rather than indeterminate.
+  TensorQuantizationParams result{};
+  result.scale = scale;
+  result.zero_point = nudged_zero_point;
+  return result;
+}
+
+// Converts a Conv1d argument list into the two-element form usable by the
+// Conv2d op: position kConv1dSqueezeDim receives `base_value` and position 1
+// receives the original value. Raises via TORCH_CHECK when `arg` is empty.
+constexpr int64_t kConv1dSqueezeDim = 0;
+static C10_UNUSED torch::List<int64_t> MakeArgForConv1d(const torch::List<int64_t>& arg,
+                                             int64_t base_value) {
+  TORCH_CHECK(!arg.empty(), "Argument must have elements.");
+  torch::List<int64_t> conv2d_arg({arg.get(0), base_value});
+  // A single-element list reuses its only entry; otherwise take entry 1.
+  conv2d_arg[1] = arg.size() == 1 ? arg.get(0) : arg.get(1);
+  conv2d_arg[kConv1dSqueezeDim] = base_value;
+  return conv2d_arg;
+}
+
+// FP16 quantization of weights requires the elements to be in the range
+// [5.96e-8, 65504]. Values whose magnitude exceeds the largest FP16 value
+// are saturated in place to +/- that value, and a single warning is emitted
+// when at least one of the N elements was changed.
+inline void HandleWeightsSaturation(int64_t N, float* weight) {
+  const float kFp16Max = RawUint16ToFp16(0x7BFF);
+  bool any_clipped = false;
+  for (const auto idx : c10::irange(N)) {
+    // Saturate first so that every element is processed even after a hit.
+    const bool clipped = CheckAndSaturate<float>(kFp16Max, &weight[idx]);
+    any_clipped = any_clipped || clipped;
+  }
+  if (!any_clipped) {
+    return;
+  }
+  TORCH_WARN("FOUND weight out of range ");
+}
+
+// Util function for quantizing bias to qint32 with zero point 0.
+// The scale is the weight scale multiplied by `input_scale`: per channel
+// (along axis 0) when `is_per_channel` is set, per tensor otherwise.
+inline at::Tensor QuantizeBias(
+    bool is_per_channel,
+    const at::Tensor& bias,
+    const at::Tensor& weight_contig,
+    double input_scale) {
+  if (!is_per_channel) {
+    return at::native::quantize_per_tensor(
+        bias, weight_contig.q_scale() * input_scale, 0, c10::kQInt32);
+  }
+  auto channel_scales = weight_contig.q_per_channel_scales() * input_scale;
+  auto channel_zero_points = at::zeros(channel_scales.sizes(), c10::kInt);
+  return at::native::quantize_per_channel(
+      bias, channel_scales, channel_zero_points, 0, c10::kQInt32);
+}
+
+} // namespace quant_utils
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ef8a3f4f4f4215e2e18005341ee9e35c1f78703
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h
@@ -0,0 +1,258 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/core/IListRef.h>
+#include <ATen/Dispatch.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/native/Activation.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+namespace native {
+
+using qrelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/);
+using qrelu_leaky_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/,
+                                const Scalar& /*negval_*/);
+using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, GeluType /* approximate */);
+using qsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, double output_scale, int64_t output_zero_point);
+using qhardsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/);
+using qclamp_fn = void (*)(
+    const at::Tensor& /*qx*/,
+    const Scalar& min,
+    const Scalar& max,
+    at::Tensor& /*qy*/);
+using qclamp_minmax_fn = void (*)(
+    const at::Tensor& /*qx*/,
+    const Scalar& /*min or max*/,
+    at::Tensor& /*qy*/);
+using qthreshold_fn = void (*)(
+    const at::Tensor& /*qx*/,
+    const Scalar& threshold,
+    const Scalar& value,
+    at::Tensor& /*qy*/);
+using qtanh_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/);
+using qelu_fn = void(*)(
+    const at::Tensor& /*qx*/,
+    const Scalar& /*alpha*/,
+    const Scalar& /*scale*/,
+    const Scalar& /*input_scale*/,
+    at::Tensor& /*qy*/);
+using qbinary_fn =
+    void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Tensor& /*other*/);
+using qadd_scalar_fn =
+    void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Scalar& other /*other*/);
+using qhardswish_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/);
+using qdropout_fn = void(*)(
+    const at::Tensor& /*qx*/,
+    const Scalar& /*p*/,
+    bool training /*training*/,
+    at::Tensor& /*qy*/);
+using qmaxpool_2d_fn = void (*)(
+    const Tensor& qx,
+    int64_t iC, // input/output channels
+    int64_t iH,
+    int64_t iW, // input sizes
+    int64_t oH,
+    int64_t oW, // output sizes
+    int64_t kH,
+    int64_t kW, // kernel size
+    int64_t sH,
+    int64_t sW, // strides
+    int64_t pH,
+    int64_t pW, // padding
+    int64_t dH,
+    int64_t dW, // dilation
+    Tensor& qy);
+using qmaxpool_3d_fn = void (*)(
+    const Tensor& qx,
+    int64_t iC, // input/output channels
+    int64_t iT,
+    int64_t iH,
+    int64_t iW, // input sizes
+    int64_t oT,
+    int64_t oH,
+    int64_t oW, // output sizes
+    int64_t kT,
+    int64_t kH,
+    int64_t kW, // kernel size
+    int64_t sT,
+    int64_t sH,
+    int64_t sW, // strides
+    int64_t pT,
+    int64_t pH,
+    int64_t pW, // padding
+    int64_t dT,
+    int64_t dH,
+    int64_t dW, // dilation
+    Tensor& qy);
+using qadaptive_avg_pool2d_fn = void (*)(
+    const Tensor& qx,
+    Tensor& qy,
+    int64_t sizeB,
+    int64_t sizeC,
+    int64_t isizeH,
+    int64_t isizeW,
+    int64_t osizeH,
+    int64_t osizeW,
+    int64_t istrideB,
+    int64_t istrideC,
+    int64_t istrideH,
+    int64_t istrideW);
+using qadaptive_avg_pool3d_fn = void (*)(
+    const Tensor& qx,
+    Tensor& qy,
+    int64_t sizeB,
+    int64_t sizeC,
+    int64_t isizeD,
+    int64_t isizeH,
+    int64_t isizeW,
+    int64_t osizeD,
+    int64_t osizeH,
+    int64_t osizeW,
+    int64_t istrideB,
+    int64_t istrideC,
+    int64_t istrideD,
+    int64_t istrideH,
+    int64_t istrideW);
+using qavg_pool2d_fn = void (*)( // kernel signature declared for qavg_pool2d_nhwc_stub
+    const Tensor& qx,
+    Tensor& qy,
+    int64_t nBatch,
+    int64_t nInputPlane,
+    int64_t inputWidth, // NOTE: width precedes height here; qmaxpool_2d_fn lists H before W
+    int64_t inputHeight,
+    int64_t outputWidth,
+    int64_t outputHeight,
+    int kW, // kernel/d*/pad values are plain int, unlike the int64_t sizes above
+    int kH,
+    int dW, // NOTE(review): presumably the stride, not a dilation - confirm against the kernel
+    int dH,
+    int padW,
+    int padH,
+    bool count_include_pad,
+    c10::optional<int64_t> divisor_override);
+
+using qavg_pool3d_fn = void (*)( // kernel signature declared for qavg_pool3d_nhwc_stub
+    const Tensor& qx,
+    Tensor& qy,
+    int64_t nBatch,
+    int64_t nInputPlane,
+    int64_t inputWidth, // NOTE: order is W, H, D here; qmaxpool_3d_fn lists T, H, W
+    int64_t inputHeight,
+    int64_t inputDepth,
+    int64_t outputWidth,
+    int64_t outputHeight,
+    int64_t outputDepth,
+    int kW, // kernel/d*/pad values are plain int, unlike the int64_t sizes above
+    int kH,
+    int kD,
+    int dW, // NOTE(review): presumably the stride, not a dilation - confirm against the kernel
+    int dH,
+    int dD,
+    int padW,
+    int padH,
+    int padD,
+    bool count_include_pad,
+    c10::optional<int64_t> divisor_override);
+
+using qupsample_bilinear2d_fn = void (*)(
+    Tensor& output,
+    const Tensor& input,
+    int64_t input_height,
+    int64_t input_width,
+    int64_t output_height,
+    int64_t output_width,
+    int64_t nbatch,
+    int64_t channels,
+    bool align_corners,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w);
+
+using qcat_nhwc_fn = Tensor (*)(
+    const MaterializedITensorListRef& qxs,
+    int64_t dim,
+    double scale,
+    int64_t zero_point);
+using qtopk_fn = void(*)(Tensor&, Tensor&, const Tensor&, int64_t, int64_t, bool, bool);
+
+using qbatch_norm_fn = void(*)(int64_t, int64_t, int64_t, int64_t, int64_t, const Tensor&, const Tensor&, const Tensor&, Tensor&);
+
+using qnormalize_fn = void (*)(
+    const Tensor& /* X */,
+    const Tensor& /* gamma */,
+    const Tensor& /* beta */,
+    bool /* affine_per_channel */,
+    int /* num_channels */,
+    int /* num_groups */,
+    int64_t /* M */,
+    int64_t /* N */,
+    double /* eps */,
+    Tensor* /* Y */);
+
+using qmean_inner_dim_fn = void (*)(
+    const Tensor& /* X */,
+    OptionalIntArrayRef /* opt_dim */,
+    bool /* keepdim */,
+    c10::optional<ScalarType> /* opt_dtype */,
+    Tensor& /* Y */);
+
+using qstd_inner_dim_fn = void (*)(
+    const Tensor& /* X */,
+    OptionalIntArrayRef /* dim */,
+    const c10::optional<Scalar>& /* correction */,
+    bool /* keepdim */,
+    Tensor& /* Y */);
+
+using qnormalize_nhwc_fn = void (*)(
+    const Tensor& /* X */,
+    const Tensor& /* gamma */,
+    const Tensor& /* beta */,
+    bool /* affine_per_channel */,
+    int /* num_channels */,
+    int /* num_groups */,
+    int64_t /* M */,
+    int64_t /* N */,
+    double /* eps */,
+    Tensor* /* Y */);
+
+using qprelu_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/,
+                           const Tensor& /*qw*/);
+
+DECLARE_DISPATCH(qadaptive_avg_pool2d_fn, qadaptive_avg_pool2d_nhwc_stub);
+DECLARE_DISPATCH(qadaptive_avg_pool3d_fn, qadaptive_avg_pool3d_ndhwc_stub);
+DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_relu_stub);
+DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_stub);
+DECLARE_DISPATCH(qavg_pool2d_fn, qavg_pool2d_nhwc_stub);
+DECLARE_DISPATCH(qavg_pool3d_fn, qavg_pool3d_nhwc_stub);
+DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_relu_stub);
+DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_stub);
+DECLARE_DISPATCH(qbinary_fn, qadd_relu_stub);
+DECLARE_DISPATCH(qbinary_fn, qadd_stub);
+DECLARE_DISPATCH(qbinary_fn, qmul_relu_stub);
+DECLARE_DISPATCH(qbinary_fn, qmul_stub);
+DECLARE_DISPATCH(qcat_nhwc_fn, qcat_nhwc_stub);
+DECLARE_DISPATCH(qcat_nhwc_fn, qcat_relu_nhwc_stub);
+DECLARE_DISPATCH(qclamp_fn, qclamp_stub);
+DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_min_stub);
+DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_max_stub);
+DECLARE_DISPATCH(qelu_fn, qelu_stub);
+DECLARE_DISPATCH(qhardsigmoid_fn, qhardsigmoid_stub);
+DECLARE_DISPATCH(qhardswish_fn, qhardswish_stub);
+DECLARE_DISPATCH(qdropout_fn, qdropout_stub);
+DECLARE_DISPATCH(qmaxpool_2d_fn, qmaxpool_2d_nhwc_stub);
+DECLARE_DISPATCH(qmaxpool_3d_fn, qmaxpool_3d_nthwc_stub);
+DECLARE_DISPATCH(qnormalize_fn, quantized_normalize_stub);
+DECLARE_DISPATCH(qnormalize_nhwc_fn, quantized_groupnorm_nhwc_stub);
+DECLARE_DISPATCH(qrelu_fn, qrelu_stub);
+DECLARE_DISPATCH(qrelu_leaky_fn, qrelu_leaky_stub);
+DECLARE_DISPATCH(qgelu_fn, qgelu_stub);
+DECLARE_DISPATCH(qsigmoid_fn, qsigmoid_stub);
+DECLARE_DISPATCH(qtanh_fn, qtanh_stub);
+DECLARE_DISPATCH(qthreshold_fn, qthreshold_stub);
+DECLARE_DISPATCH(qtopk_fn, qtopk_stub);
+DECLARE_DISPATCH(qupsample_bilinear2d_fn, qupsample_bilinear2d_nhwc_stub);
+DECLARE_DISPATCH(qmean_inner_dim_fn, qmean_inner_dim_stub);
+DECLARE_DISPATCH(qstd_inner_dim_fn, qstd_inner_dim_stub);
+DECLARE_DISPATCH(qprelu_fn, qprelu_stub);
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f452a1cc72137beedbbdc0273b7ce87addb4740
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h
@@ -0,0 +1,414 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/core/List.h>
+#include <ATen/native/quantized/cpu/fbgemm_utils.h>
+#include <ATen/native/quantized/cpu/QnnpackUtils.h>
+#include <ATen/native/quantized/cpu/OnednnUtils.h>
+#include <c10/util/irange.h>
+#if !defined(__s390x__) && !defined(__powerpc__)
+#include <cpuinfo.h>
+#endif
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/from_blob.h>
+#endif
+
+
+#include <tuple>
+
+/* Convolution prepacked parameters serialization.
+ *
+ * Version 1
+ *
+ * - Fields:
+ *  1. weight
+ *  2. bias
+ *  3. stride x kSpatialDim
+ *  4. padding x kSpatialDim
+ *  5. dilation x kSpatialDim
+ *  6. groups
+ *
+ * Version 2
+ *
+ * - Fields:
+ *  0. version (string)
+ *  1. list of non-optional tensors
+ *    0: packed parameters (int16_t)
+ *      - kSpatialDim
+ *      - stride x kSpatialDim
+ *      - padding x kSpatialDim
+ *      - dilation x kSpatialDim
+ *      - output_padding x kSpatialDim
+ *      - groups
+ *      - transpose (0 or 1)
+ *    1: weight
+ *  2. list of optional tensors
+ *    0: bias
+ *
+ * Version 3
+ *
+ * - Fields:
+ *  0. version (int64_t)
+ *  1. list of int64_t configuration values
+ *    - kSpatialDim
+ *    - stride x kSpatialDim
+ *    - padding x kSpatialDim
+ *    - dilation x kSpatialDim
+ *    - output_padding x kSpatialDim
+ *    - groups
+ *    - flags (bitmask)
+ *      - (1 << 0) transpose (1 = yes)
+ *  2. list of optional tensors
+ *    0: None (helps with type inference)
+ *    1: weight (this must be present)
+ *    2: bias
+ */
+
+using ConvParamsSerializationTypeV2 = std::tuple<
+  // version, for versions 2 and up
+  std::string,
+  // non-optional tensors
+  std::vector<at::Tensor>,
+  // optional tensors
+  std::vector<c10::optional<at::Tensor>>>;
+
+using ConvParamsSerializationTypeV3 = std::tuple<
+  // version, int for versions 3 and up
+  int64_t,
+  // configuration values
+  std::vector<int64_t>,
+  // optional tensors
+  std::vector<c10::optional<at::Tensor>>>;
+
+// Parses any historical conv packed params format into the current (v3)
+// format. The version is inferred from the first tuple element: a Tensor
+// means v1, the string "2" means v2 and the int 3 means v3. Field counts of
+// v1/v2 states are checked before indexing, so a truncated state raises
+// instead of reading past the end of a container.
+template <uint32_t kSpatialDim>
+ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) {
+  // determine the version based on IValue contents
+  int version = -1;
+  if (v.isTuple()) {
+    const auto& elements = v.toTupleRef().elements();
+    if (!elements.empty()) {
+      const auto& firstElement = elements[0];
+      if (firstElement.isTensor()) {
+        version = 1;
+      } else if (firstElement.isString()) {
+        // note: not parsing the string to automatically handle bad inputs
+        if (firstElement.toStringRef() == "2") {
+          version = 2;
+        }
+      } else if (firstElement.isInt()) {
+        if (firstElement.toInt() == 3) {
+          version = 3;
+        }
+      }
+    }
+  }
+  TORCH_INTERNAL_ASSERT(version != -1, "Unable to parse serialization version");
+
+  // version written for upgraded legacy states (own name: no shadowing)
+  constexpr int64_t kV3 = 3;
+
+  if (version == 1) {
+    // version 1 - convert to version 3 manually
+    const auto& elements = v.toTupleRef().elements();
+    TORCH_CHECK(elements.size() >= 6,
+        "Serialized qconv v1 state needs 6 fields, got ", elements.size());
+
+    at::Tensor weight = elements[0].toTensor();
+    c10::optional<at::Tensor> bias = elements[1].toOptional<at::Tensor>();
+    torch::List<at::Tensor> stride_x_kSpatialDim = elements[2].toTensorList();
+    torch::List<at::Tensor> padding_x_kSpatialDim = elements[3].toTensorList();
+    torch::List<at::Tensor> dilation_x_kSpatialDim = elements[4].toTensorList();
+    at::Tensor groups = elements[5].toTensor();
+
+    std::vector<int64_t> config_vals;
+    config_vals.reserve(
+        stride_x_kSpatialDim.size() + padding_x_kSpatialDim.size() +
+        dilation_x_kSpatialDim.size() + kSpatialDim + 3);
+    config_vals.push_back(kSpatialDim);
+    // v1 keeps each stride/padding/dilation entry in its own tensor
+    const auto append_all = [&](const torch::List<at::Tensor>& per_dim) {
+      for (const auto i : c10::irange(per_dim.size())) {
+        config_vals.push_back(per_dim.get(i)[0].item<int16_t>());
+      }
+    };
+    append_all(stride_x_kSpatialDim);
+    append_all(padding_x_kSpatialDim);
+    append_all(dilation_x_kSpatialDim);
+    // output_padding does not exist in v1, so we fill in a default value
+    for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
+      config_vals.push_back(0);
+    }
+    config_vals.push_back(groups[0].item<int16_t>());
+    // transpose does not exist in v1, so we fill in a default value
+    config_vals.push_back(0);
+
+    std::vector<c10::optional<at::Tensor>> tensors;
+    tensors.emplace_back();
+    tensors.emplace_back(weight);
+    tensors.emplace_back(bias);
+
+    return std::make_tuple(kV3, std::move(config_vals), std::move(tensors));
+  } else if (version == 2) {
+    // version 2
+    const auto& elements = v.toTupleRef().elements();
+    TORCH_CHECK(elements.size() >= 3,
+        "Serialized qconv v2 state needs 3 fields, got ", elements.size());
+    std::vector<at::Tensor> non_optional = elements[1].toTensorList().vec();
+    TORCH_CHECK(non_optional.size() >= 2,
+        "Serialized qconv v2 state needs packed params and weight, got ",
+        non_optional.size(), " tensors");
+    std::vector<c10::optional<at::Tensor>> optional;
+
+    if (elements[2].isTensorList()) {
+      for (const auto& elem : elements[2].toTensorList()) {
+        optional.emplace_back(static_cast<at::Tensor>(elem));
+      }
+    } else {
+      for (const auto& elem : elements[2].toList()) {
+        optional.emplace_back(static_cast<c10::IValue>(elem).toOptional<at::Tensor>());
+      }
+    }
+    // create default optional value for bias
+    if (optional.empty()) {
+      optional.emplace_back();
+    }
+
+    auto config_a = non_optional[0].accessor<int16_t, 1>();
+    std::vector<int64_t> config_vals;
+    config_vals.reserve(config_a.size(0));
+    for (const auto i : c10::irange(config_a.size(0))) {
+      config_vals.emplace_back(config_a[i]);
+    }
+
+    std::vector<c10::optional<at::Tensor>> tensors;
+    tensors.emplace_back();
+    tensors.emplace_back(non_optional[1]);
+    tensors.emplace_back(optional[0]);
+
+    return std::make_tuple(kV3, std::move(config_vals), std::move(tensors));
+  } else if (version == 3) {
+    return v.to<ConvParamsSerializationTypeV3>();
+  } else {
+    TORCH_INTERNAL_ASSERT(false, "Unexpected serialized qconv version: ",
+        version);
+  }
+}
+
+#define QCONV_SERIALIZATION_VERSION 2
+
+#if QCONV_SERIALIZATION_VERSION == 2
+using ConvParamsSerializationType = ConvParamsSerializationTypeV2;
+
+// Serializes conv packed params using the version 2 layout: the version
+// string "2", a tensor list {int16 config values, weight} and an optional
+// tensor list {bias}. Config values are stored as int16_t, so a value that
+// does not fit raises instead of being silently truncated.
+template <uint32_t kSpatialDim>
+ConvParamsSerializationTypeV2 serialize_conv(
+    const c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>>& params) {
+  // create a packed int16_t tensor for conv params
+  std::vector<int16_t> params_vec;
+  const auto append = [&params_vec](int64_t value) {
+    const auto narrowed = static_cast<int16_t>(value);
+    TORCH_CHECK(narrowed == value,
+        "qconv config value ", value, " does not fit in int16_t");
+    params_vec.push_back(narrowed);
+  };
+  append(kSpatialDim);
+  for (const int64_t val : params->stride().vec()) { append(val); }
+  for (const int64_t val : params->padding().vec()) { append(val); }
+  for (const int64_t val : params->dilation().vec()) { append(val); }
+  for (const int64_t val : params->output_padding().vec()) { append(val); }
+  append(params->groups());
+  append(params->transpose());
+  const int64_t vec_size = params_vec.size();
+  at::Tensor params_tensor = at::from_blob(
+      params_vec.data(), {vec_size},
+      at::TensorOptions().dtype(at::kShort))
+    // clone to retain ownership of the data
+    .clone();
+  auto [weight, bias] = params->unpack();
+  std::vector<at::Tensor> non_optional;
+  non_optional.emplace_back(std::move(params_tensor));
+  non_optional.emplace_back(std::move(weight));
+  std::vector<c10::optional<at::Tensor>> optional;
+  optional.emplace_back(std::move(bias));
+  return std::make_tuple(
+      std::string("2"), std::move(non_optional), std::move(optional));
+}
+
+#elif QCONV_SERIALIZATION_VERSION == 3
+using ConvParamsSerializationType = ConvParamsSerializationTypeV3;
+
+// Serializes conv packed params using the version 3 layout: the int 3, the
+// int64 config values, and the optional tensors {empty, weight, bias}.
+template <uint32_t kSpatialDim>
+ConvParamsSerializationTypeV3 serialize_conv(
+    const c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>>& params) {
+  std::vector<int64_t> cfg;
+  cfg.push_back(kSpatialDim);
+  const auto extend = [&cfg](const auto& per_dim) {
+    cfg.insert(cfg.end(), per_dim.begin(), per_dim.end());
+  };
+  extend(params->stride().vec());
+  extend(params->padding().vec());
+  extend(params->dilation().vec());
+  extend(params->output_padding().vec());
+  cfg.push_back(params->groups());
+  cfg.push_back(params->transpose());
+
+  // slot 0 is left empty; weight and bias follow in that order
+  auto unpacked = params->unpack();
+  std::vector<c10::optional<at::Tensor>> packed_tensors;
+  packed_tensors.emplace_back();
+  packed_tensors.emplace_back(std::get<0>(unpacked));
+  packed_tensors.emplace_back(std::get<1>(unpacked));
+
+  int64_t serialized_version = 3;
+  return std::tie(serialized_version, cfg, packed_tensors);
+}
+
+#else
+#error "Invalid qconv serialization version."
+#endif
+
+// Rebuilds packed conv params from a version 3 serialized state, i.e. the
+// tuple (version, config_vals, tensors). config_vals is read in the order
+// [kSpatialDim, stride, padding, dilation, output_padding, groups, flags]
+// and tensors as {unused, weight, bias}. The weight is prepacked by the
+// backend matching at::globalContext().qEngine(); otherwise TORCH_CHECK fails.
+template <uint32_t kSpatialDim>
+c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> deserialize_conv(
+    ConvParamsSerializationTypeV3 state) {
+  const auto& [version, config_vals, tensors] = state;
+  TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version);
+
+  TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors: ", tensors.size());
+  c10::optional<at::Tensor> weight = tensors[1];
+  c10::optional<at::Tensor> bias = tensors[2];
+  TORCH_INTERNAL_ASSERT(weight, "Weight should always be present in serialized qconv.");
+
+  torch::List<int64_t> stride, padding, output_padding, dilation;
+  // skip kSpatialDim
+  int idx = 1;
+  for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
+    stride.emplace_back(config_vals.at(idx));
+    idx++;
+  }
+  for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
+    padding.emplace_back(config_vals.at(idx));
+    idx++;
+  }
+  for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
+    dilation.emplace_back(config_vals.at(idx));
+    idx++;
+  }
+  for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
+    TORCH_INTERNAL_ASSERT(idx < static_cast<int64_t>(config_vals.size()),
+        "Unexpected index = ", idx, " for config_vals of size ",
+        config_vals.size());
+    output_padding.emplace_back(config_vals.at(idx));
+    idx++;
+  }
+  int64_t groups = config_vals.at(idx);
+  idx++;
+  int64_t flags = config_vals.at(idx);
+  idx++;
+  TORCH_INTERNAL_ASSERT(idx == static_cast<int64_t>(config_vals.size()),
+      "Unexpected length of config_vals, expected ", idx,
+      " got ", config_vals.size());
+
+  bool transpose = flags & (1 << 0);
+
+  int64_t other_flags = flags & ~(1 << 0);
+  TORCH_INTERNAL_ASSERT(other_flags == 0, "Unexpected flags set in ", flags, ".");
+
+  auto& ctx = at::globalContext();
+
+#ifdef USE_FBGEMM
+  if (ctx.qEngine() == at::QEngine::X86) {
+#if AT_MKLDNN_ENABLED()
+    bool use_onednn = onednn_utils::should_use_onednn_quant(
+        weight.value(), transpose, groups, output_padding);
+    if (use_onednn) {
+      return PackedConvWeightsOnednn<kSpatialDim>::prepack(
+        weight.value(),
+        bias,
+        stride,
+        padding,
+        output_padding,
+        dilation,
+        groups,
+        transpose
+      );
+    }
+#endif
+    return PackedConvWeight<kSpatialDim>::prepack(
+      weight.value(),
+      bias,
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  } // x86
+  if (ctx.qEngine() == at::QEngine::FBGEMM) {
+    return PackedConvWeight<kSpatialDim>::prepack(
+      weight.value(),
+      bias,
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  }
+#endif // USE_FBGEMM
+#ifdef USE_PYTORCH_QNNPACK
+  if (ctx.qEngine() == at::QEngine::QNNPACK) {
+    TORCH_CHECK(
+        kSpatialDim == 2,
+        "prepack/__setstate__: QNNPACK only supports Conv2d "
+        "now.");
+    return PackedConvWeightsQnnp<kSpatialDim>::prepack(
+      weight.value(),
+      bias,
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  }
+#endif // USE_PYTORCH_QNNPACK
+#if AT_MKLDNN_ENABLED()
+  if (ctx.qEngine() == at::QEngine::ONEDNN) {
+    return PackedConvWeightsOnednn<kSpatialDim>::prepack(
+      weight.value(),
+      bias,
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  }
+#endif // AT_MKLDNN_ENABLED()
+  TORCH_CHECK(
+    false,
+    "Didn't find engine for when deserializing ConvPackedParams: ",
+    toString(ctx.qEngine()));
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..dbfb406ea55dbb50f97b1e86efb52c337af04847
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#ifdef USE_PYTORCH_QNNPACK
+
+namespace at {
+namespace native {
+
+void initQNNPACK();
+
+} // namespace native
+} // namespace at
+
+#endif
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h
new file mode 100644
index 0000000000000000000000000000000000000000..86ed0f530f9c315410f8dbf2bb14d353cb92730c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h
@@ -0,0 +1,34 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <cstdint>
+
+namespace at {
+namespace native {
+Tensor& embedding_bag_byte_rowwise_offsets_out(
+    Tensor& output,
+    const Tensor& weight,
+    const Tensor& indices,
+    const c10::optional<Tensor>& offsets_in,
+    const bool /* scale_grad_by_freq */,
+    const int64_t /* mode */,
+    bool pruned_weights,
+    const c10::optional<Tensor>& per_sample_weights_,
+    const c10::optional<Tensor>& compressed_indices_mapping,
+    bool include_last_offset);
+
+Tensor& embedding_bag_4bit_rowwise_offsets_out(
+    Tensor& output,
+    const Tensor& weight,
+    const Tensor& indices,
+    const c10::optional<Tensor>& offsets_in,
+    const bool /* scale_grad_by_freq */,
+    const int64_t /* mode */,
+    bool pruned_weights,
+    const c10::optional<Tensor>& per_sample_weights_,
+    const c10::optional<Tensor>& compressed_indices_mapping,
+    bool include_last_offset);
+
+Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weight);
+
+} // native
+} // at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/verbose_wrapper.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/verbose_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..59d9682e345b4440e103a1f95c6da42208764aba
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/verbose_wrapper.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include <c10/macros/Export.h>
+
+namespace torch::verbose {
+TORCH_API int _mkl_set_verbose(int enable);
+TORCH_API int _mkldnn_set_verbose(int level);
+} // namespace torch::verbose
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..57a5999e055083a5aebe599ac25fa68736d3e767
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _sparse_sum_backward { // generated descriptor: aten::_sparse_sum_backward, overload ""
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, at::IntArrayRef); // C++ function type of the operator
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_sum_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor")
+  static at::Tensor call(const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim); // same arguments as call(), preceded by an explicit DispatchKeySet
+};
+
+struct TORCH_API _sparse_sum_backward_out { // generated descriptor: aten::_sparse_sum_backward, overload "out"
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::Tensor &); // C++ function type; the trailing Tensor& is `out`
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_sum_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_sum_backward.out(Tensor grad, Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out); // same arguments as call(), preceded by an explicit DispatchKeySet
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f9dcd4e1ed0ddf5157fc17f67554ef0cf9a6d69
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _test_autograd_multiple_dispatch_view_copy { // generated descriptor: aten::_test_autograd_multiple_dispatch_view_copy, overload ""
+  using schema = at::Tensor (const at::Tensor &); // C++ function type of the operator
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_autograd_multiple_dispatch_view_copy")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_autograd_multiple_dispatch_view_copy(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); // same arguments as call(), preceded by an explicit DispatchKeySet
+};
+
+struct TORCH_API _test_autograd_multiple_dispatch_view_copy_out { // generated descriptor: aten::_test_autograd_multiple_dispatch_view_copy, overload "out"
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &); // C++ function type; the trailing Tensor& is `out`
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_autograd_multiple_dispatch_view_copy")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_autograd_multiple_dispatch_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); // same arguments as call(), preceded by an explicit DispatchKeySet
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atan_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atan_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..6332bfdc50e0112e2184f7f02da88f1b15b2f284
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atan_meta_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor atan(const at::Tensor & self);
+TORCH_API at::Tensor & atan_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & atan_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & atan_(at::Tensor & self);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/geqrf_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/geqrf_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..87d3f521d623b05e3ae74ba8e93887ccd05a1831
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/geqrf_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// geqrf entry points for at::cuda. Each returns a two-element tuple; geqrf_out takes the `a`/`tau` outputs first, geqrf_outf takes them last.
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> geqrf(const at::Tensor & self);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> geqrf_out(at::Tensor & a, at::Tensor & tau, const at::Tensor & self);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> geqrf_outf(const at::Tensor & self, at::Tensor & a, at::Tensor & tau);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..036ec5cf8388536fa56c0f19ef6517cb5fdcc8d6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_i0 : public TensorIteratorBase {
+    // Only the meta() member is declared here; its definition is not in this header.
+    // It takes the single input tensor `self`.
+    void meta(const at::Tensor & self);
+};
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..66dee7e90ee1c980cbac2acb5f90325462a797cb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Descriptor for aten::is_pinned: function-type aliases, name/overload/schema strings, and static call/redispatch declarations.
+struct TORCH_API is_pinned {
+  using schema = bool (const at::Tensor &, c10::optional<at::Device>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::is_pinned")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "is_pinned(Tensor self, Device? device=None) -> bool")
+  static bool call(const at::Tensor & self, c10::optional<at::Device> device);
+  static bool redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Device> device); // same as call() plus a leading DispatchKeySet
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isfinite.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isfinite.h
new file mode 100644
index 0000000000000000000000000000000000000000..89d6e402b11a2fb577baafef460e252c3e3ae663
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isfinite.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/isfinite_ops.h>
+
+namespace at {
+
+// Thin inline wrapper: forwards its argument unchanged to at::_ops::isfinite::call.
+// aten::isfinite(Tensor self) -> Tensor
+inline at::Tensor isfinite(const at::Tensor & self) {
+    return at::_ops::isfinite::call(self);
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_eigvals_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_eigvals_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8eabbe79cd4e5198775c977d1e4ab92bfa0ea03
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_eigvals_cpu_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// Only out-variants of linalg_eigvals are declared in this header: `out` first (_out) or `out` last (_outf).
+TORCH_API at::Tensor & linalg_eigvals_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & linalg_eigvals_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logical_or.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logical_or.h
new file mode 100644
index 0000000000000000000000000000000000000000..71b55d18a23e49c239c8221343a32078b53b667b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logical_or.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/logical_or_ops.h>
+
+namespace at {
+
+// Inline wrappers that forward to the call() members of at::_ops::logical_or and at::_ops::logical_or_out.
+// aten::logical_or(Tensor self, Tensor other) -> Tensor
+inline at::Tensor logical_or(const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::logical_or::call(self, other);
+}
+
+// aten::logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logical_or_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::logical_or_out::call(self, other, out); // reordered: `out` is the first parameter here but the last argument to call()
+}
+// aten::logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logical_or_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+    return at::_ops::logical_or_out::call(self, other, out); // arguments forwarded in declaration order
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..37cfa9117a63e9854c5f25d040cc52704dd58237
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Descriptor for the "out" overload of aten::multinomial; its schema adds a trailing at::Tensor & and returns at::Tensor &.
+struct TORCH_API multinomial_out {
+  using schema = at::Tensor & (const at::Tensor &, int64_t, bool, c10::optional<at::Generator>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::multinomial")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, int64_t num_samples, bool replacement, c10::optional<at::Generator> generator, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_samples, bool replacement, c10::optional<at::Generator> generator, at::Tensor & out);
+};
+// Descriptor for the default (empty overload name) aten::multinomial; returns at::Tensor by value.
+struct TORCH_API multinomial {
+  using schema = at::Tensor (const at::Tensor &, int64_t, bool, c10::optional<at::Generator>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::multinomial")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, int64_t num_samples, bool replacement, c10::optional<at::Generator> generator);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_samples, bool replacement, c10::optional<at::Generator> generator);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_dropout_backward_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_dropout_backward_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a0017b5ca6edbf03d09da17c8475060328e83c9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_dropout_backward_cpu_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// Single by-value declaration of native_dropout_backward for at::cpu; no _out/_outf forms are declared in this header.
+TORCH_API at::Tensor native_dropout_backward(const at::Tensor & grad_output, const at::Tensor & mask, double scale);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..9051dffb2d89bcab3da645dc0314068df97695d6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // one declaration follows; its `output_size` parameter defaults to c10::nullopt
+TORCH_API at::Tensor nested_to_padded_tensor(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=c10::nullopt);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/normal_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/normal_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f7d5a090449fd008bc2a70f3d3f26ad18ae683c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/normal_cuda_dispatch.h
@@ -0,0 +1,32 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// normal entry points for at::cuda: normal_ on a mutable `self`, then three overload families (Tensor/double, double/Tensor, Tensor/Tensor for mean/std). The _outf forms carry no default arguments.
+TORCH_API at::Tensor & normal_(at::Tensor & self, double mean=0, double std=1, c10::optional<at::Generator> generator=c10::nullopt);
+TORCH_API at::Tensor normal(const at::Tensor & mean, double std=1, c10::optional<at::Generator> generator=c10::nullopt);
+TORCH_API at::Tensor & normal_out(at::Tensor & out, const at::Tensor & mean, double std=1, c10::optional<at::Generator> generator=c10::nullopt);
+TORCH_API at::Tensor & normal_outf(const at::Tensor & mean, double std, c10::optional<at::Generator> generator, at::Tensor & out);
+TORCH_API at::Tensor normal(double mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt);
+TORCH_API at::Tensor & normal_out(at::Tensor & out, double mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt);
+TORCH_API at::Tensor & normal_outf(double mean, const at::Tensor & std, c10::optional<at::Generator> generator, at::Tensor & out);
+TORCH_API at::Tensor normal(const at::Tensor & mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt);
+TORCH_API at::Tensor & normal_out(at::Tensor & out, const at::Tensor & mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt);
+TORCH_API at::Tensor & normal_outf(const at::Tensor & mean, const at::Tensor & std, c10::optional<at::Generator> generator, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c6bcbfc97fb839cb850562c37df29668341c78c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h
@@ -0,0 +1,27 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// remainder entry points for at::cpu: Tensor/Tensor forms (by-value, _out, _outf, remainder_) plus a Scalar/Tensor overload of remainder.
+TORCH_API at::Tensor remainder(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & remainder_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & remainder_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & remainder_(at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor remainder(const at::Scalar & self, const at::Tensor & other);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..9179e7e3e4599013de49c723c8838e9d9a72e85c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // three select_copy signatures: the two *_symint ones take `index` as c10::SymInt, the sparse_csr one as int64_t
+TORCH_API at::Tensor & select_copy_int_out_symint(const at::Tensor & self, int64_t dim, c10::SymInt index, at::Tensor & out);
+TORCH_API at::Tensor select_copy_sparse_csr(const at::Tensor & self, int64_t dim, int64_t index);
+TORCH_API at::Tensor select_copy_symint(const at::Tensor & self, int64_t dim, c10::SymInt index);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..8797587a5c47da989771e0fd3dd15b2cc2337ecb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// special_erfcx entry points for at::cuda: by-value, out-first (_out), and out-last (_outf) forms.
+TORCH_API at::Tensor special_erfcx(const at::Tensor & self);
+TORCH_API at::Tensor & special_erfcx_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_erfcx_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..efcfe502b8cbaf3c5ac37dc26ad23694656ad573
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_cpu_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// special_laguerre_polynomial_l entry points for at::cpu, all taking Tensor `x` and Tensor `n`: by-value, out-first (_out), and out-last (_outf).
+TORCH_API at::Tensor special_laguerre_polynomial_l(const at::Tensor & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_laguerre_polynomial_l_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_laguerre_polynomial_l_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..83899990b92e9a8e14afc117c27961e0beae7a09
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// Two overloads: one takes a packed at::TensorOptions (default at::kLong) with `offset` defaulting to 0; the other takes dtype/layout/device/pin_memory as separate optionals with no defaults.
+TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong);
+TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..a165c17de22fdf03517f50674497d7f52dcbd1f5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cpu_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// trunc entry points for at::cpu: by-value, out-first (_out), out-last (_outf), and mutable-self (trunc_) forms.
+TORCH_API at::Tensor trunc(const at::Tensor & self);
+TORCH_API at::Tensor & trunc_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & trunc_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & trunc_(at::Tensor & self);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..81f92307c4aead7aa1f466dd68962c87a2383794
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cuda_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// trunc entry points for at::cuda: by-value, out-first (_out), out-last (_outf), and mutable-self (trunc_) forms.
+TORCH_API at::Tensor trunc(const at::Tensor & self);
+TORCH_API at::Tensor & trunc_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & trunc_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & trunc_(at::Tensor & self);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..e49e02330fe45680e654c4c6e28bf84d65e2294e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/upsample_linear1d_meta.h>
+
+namespace at {
+namespace native { // a free upsample_linear1d overload, then two structs derived from at::meta::structured_upsample_linear1d with identical impl() signatures
+TORCH_API at::Tensor upsample_linear1d(const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors);
+struct TORCH_API structured_upsample_linear1d_out_cpu : public at::meta::structured_upsample_linear1d {
+void impl(const at::Tensor & self, at::ArrayRef<int64_t> output_size, bool align_corners, c10::optional<double> scales, const at::Tensor & out); // note: `out` is passed as const at::Tensor &
+};
+struct TORCH_API structured_upsample_linear1d_out_cuda : public at::meta::structured_upsample_linear1d {
+void impl(const at::Tensor & self, at::ArrayRef<int64_t> output_size, bool align_corners, c10::optional<double> scales, const at::Tensor & out); // same signature as the _out_cpu struct's impl()
+};
+} // namespace native
+} // namespace at