koichi12 commited on
Commit
49fc886
·
verified ·
1 Parent(s): 22f3d85

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/autotune_process.py +656 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/coordinate_descent_tuner.py +315 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/debug.py +655 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/decomposition.py +678 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/test_operators.py +24 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/virtualized.py +351 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Activation.h +98 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AdaptivePooling.h +39 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h +173 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h +446 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h +14 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h +394 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h +16 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h +41 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h +160 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h +157 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h +97 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonEmptyUtils.h +27 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Padding.h +62 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/PointwiseOps.h +28 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pool.h +340 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RNN.h +53 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h +48 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h +173 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h +75 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h +544 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h +190 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/StridedRandomAccessor.h +301 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h +92 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h +55 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h +142 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h +52 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorProperties.h +12 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h +105 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TopKImpl.h +98 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TransposeType.h +23 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h +49 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h +112 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h +506 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h +48 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h +35 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h +494 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh +25 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h +672 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Distributions.h +25 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh +22 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh +681 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh +22 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh +321 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh +187 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/autotune_process.py ADDED
@@ -0,0 +1,656 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import dataclasses
5
+ import functools
6
+ import logging
7
+ import os
8
+ import queue
9
+ import time
10
+ import warnings
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ from ctypes import byref, c_size_t, c_void_p
13
+ from multiprocessing.process import BaseProcess
14
+ from multiprocessing.queues import Queue
15
+ from typing import (
16
+ Any,
17
+ Callable,
18
+ Dict,
19
+ Iterable,
20
+ List,
21
+ Optional,
22
+ Sequence,
23
+ TYPE_CHECKING,
24
+ Union,
25
+ )
26
+
27
+ import torch
28
+ from torch import multiprocessing
29
+ from torch._dynamo.testing import rand_strided
30
+
31
+ from torch._inductor import ir
32
+ from torch._inductor.codecache import CUDACodeCache, DLLWrapper, PyCodeCache
33
+
34
+ if TYPE_CHECKING:
35
+ from torch._inductor.select_algorithm import TritonTemplateCaller
36
+
37
+ from . import config
38
+ from .utils import do_bench
39
+ from .virtualized import V
40
+
41
+ CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
42
+ EXIT_HANDLER_REGISTERED = False
43
+
44
+ log = logging.getLogger(__name__)
45
+
46
+
47
# Used to synchronize between parent and child processes
class Ping:
    """Handshake request: the parent sends this to the benchmarking child
    to confirm it is alive; the child answers with a Pong."""
50
+
51
+
52
class Pong:
    """Handshake reply sent by the benchmarking child in response to a Ping."""
54
+
55
+
56
@contextlib.contextmanager
def set_cuda_visible_device(device: Optional[int]):
    """
    Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the
    specified single device. If device is None, don't manipulate the environment.

    The previous value (or absence) of the variable is restored on exit, even
    if the body raises.
    """
    if device is None:
        yield
        return

    saved = os.environ.get(CUDA_VISIBLE_DEVICES)
    os.environ[CUDA_VISIBLE_DEVICES] = str(device)
    try:
        yield
    finally:
        if saved is None:
            os.environ.pop(CUDA_VISIBLE_DEVICES)
        else:
            os.environ[CUDA_VISIBLE_DEVICES] = saved
75
+
76
+
77
@dataclasses.dataclass
class TuningProcess:
    """
    Abstraction for launching a helper process to benchmark kernels. Spawns
    a child process (crash-isolated from the parent) and uses multiprocessing
    queues to send benchmark requests and return results.
    """

    # Device index the child should see (applied via CUDA_VISIBLE_DEVICES);
    # None leaves the environment untouched.
    device: Optional[int] = None
    # Handle to the running child; None until initialize().
    process: Optional[BaseProcess] = None
    # Parent -> child work queue.
    request_queue: Optional[Queue[Any]] = None
    # Child -> parent result queue.
    response_queue: Optional[Queue[Any]] = None

    @staticmethod
    def process_main(
        request_queue: Queue[Any],
        response_queue: Queue[Any],
    ) -> None:
        """
        Entry point for the child process: run the work loop and log (rather
        than propagate) any exception, so the child exits instead of hanging.
        """
        log.debug(
            "Entering TuningProcess child. Visible devices = %s",
            os.environ.get(CUDA_VISIBLE_DEVICES),
        )
        try:
            TuningProcess.workloop(request_queue, response_queue)
        except Exception as ex:
            log.exception("Exception in TuningProcess: %s", ex)

    @staticmethod
    def workloop(request_queue: Queue[Any], response_queue: Queue[Any]) -> None:
        """
        Work loop for the benchmarking subprocess: block on the request queue
        and answer each message until the shutdown sentinel (None) arrives.
        """
        while True:
            obj = request_queue.get()

            if obj is None:
                break  # None is a sentinel for the child to terminate
            elif isinstance(obj, Ping):
                # Warm-up / liveness handshake.
                response_queue.put(Pong())
            elif isinstance(obj, BenchmarkRequest):
                response_queue.put(obj.benchmark())
            else:
                raise RuntimeError(f"Invalid request type {type(obj)}")

    def valid(self) -> bool:
        """
        True if the sub-process has been initialized.
        """
        return (
            self.process is not None
            and self.request_queue is not None
            and self.response_queue is not None
        )

    def clear(self) -> None:
        """
        Reset to an uninitialized state.
        """
        self.process = self.request_queue = self.response_queue = None

    def initialize(self) -> None:
        """
        Create child process, request/response queues, and do the warm up.
        Set the environment to make only the provided GPU device visible
        to the process. No-op if already initialized.
        """
        if self.valid():
            return

        # cuda runtime does not work with "fork", use "spawn" to start processes.
        ctx = multiprocessing.get_context("spawn")
        self.request_queue = ctx.Queue()
        self.response_queue = ctx.Queue()

        self.process = ctx.Process(
            target=self.process_main,
            args=(
                self.request_queue,
                self.response_queue,
            ),
        )
        assert self.process is not None
        # Restrict visibility only around start(); the spawned child captures
        # the environment at this point.
        with set_cuda_visible_device(self.device):
            self.process.start()

    def put(self, obj: Any) -> None:
        """
        Push a work item to the child process.
        """
        # In case of a prior crash, ensure the subprocess is running
        self.initialize()
        assert self.request_queue is not None
        self.request_queue.put(obj)

    def get(self) -> Any:
        """
        Get a response from the child process. Raises queue.Empty if the
        child exited without producing a result.
        """
        assert self.process is not None
        assert self.response_queue is not None
        while True:
            try:
                # Poll with a timeout so a dead child is noticed instead of
                # blocking forever on the queue.
                return self.response_queue.get(timeout=1.0)
            except queue.Empty:
                status = self.process.exitcode
                if status is None:
                    # child process is still running
                    continue
                # child process crashed; clear state so the next put()
                # re-spawns it, then surface the Empty to the caller
                self.clear()
                raise

    def terminate(self) -> None:
        """
        Signal the child process to terminate (does not wait; see wait()).
        """
        if self.valid():
            assert self.process is not None
            assert self.request_queue is not None
            self.request_queue.put(None)

    def wait(self) -> None:
        """
        Wait for the child process to exit.
        """
        if self.process is not None:
            self.process.join()
            self.clear()
208
+
209
+
210
@dataclasses.dataclass
class TuningProcessPool:
    """
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    """

    # Idle sub-processes; a helper thread takes one, benchmarks in it, and
    # returns it. None until initialize().
    processes: Optional[queue.Queue[TuningProcess]] = None
    # Thread pool whose workers each block on an available TuningProcess.
    executor: Optional[ThreadPoolExecutor] = None

    def initialize(self) -> None:
        """
        Start the child processes. No-op if already started.
        """
        # processes and executor are created/destroyed together; anything
        # else indicates inconsistent state.
        assert (self.processes is None) == (self.executor is None)
        if self.processes is not None:
            return

        devices = self.get_device_list()
        log.debug("Sub-process autotune device list: %s", devices)

        # Launch the child processes and push a msg to "warm up"
        self.processes = queue.Queue()
        for device in devices:
            p = TuningProcess(device=device)
            p.initialize()
            p.put(Ping())
            self.processes.put(p)

        # Wait for the initialization to finish
        for p in self.processes.queue:
            assert isinstance(p.get(), Pong)

        # Use a thread pool to manage distributing work to the subprocesses.
        # Threads block on an available process, so it makes sense to match
        # the number of threads with the number of devices.
        self.executor = ThreadPoolExecutor(max_workers=len(devices))

        # Register the exit handler for the parent process so it will terminate
        # the child processes.
        global EXIT_HANDLER_REGISTERED
        if not EXIT_HANDLER_REGISTERED:
            EXIT_HANDLER_REGISTERED = True
            import atexit

            atexit.register(self.terminate)

    def get_device_list(self) -> Sequence[Optional[int]]:
        """
        Gather the list of devices to be used in the pool. Returns [None]
        (a single unrestricted process) unless multi-device autotune is on.
        """
        if not config.autotune_multi_device:
            # Don't use multiple devices
            return [None]

        count = torch.cuda.device_count()

        # If the user specified the visible devices in the env, use those.
        if CUDA_VISIBLE_DEVICES in os.environ:
            devices = [int(d) for d in os.environ[CUDA_VISIBLE_DEVICES].split(",")]
            assert len(devices) <= count
            return devices

        return list(range(count))

    def terminate(self) -> None:
        """
        Signal all child processes to terminate, wait for them, and release
        the thread pool. Safe to call more than once.
        """
        if self.executor is not None:
            self.executor.shutdown()
            self.executor = None

        if self.processes is not None:
            # First signal everyone (non-blocking), then join.
            for p in self.processes.queue:
                p.terminate()
            for p in self.processes.queue:
                p.wait()
            self.processes = None

    def target(self, choice: TritonTemplateCaller) -> float:
        """
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        """
        assert choice.bmreq is not None
        assert self.processes is not None

        process = self.processes.get()
        process.put(choice.bmreq)
        try:
            return process.get()
        except queue.Empty:
            # The child died while benchmarking this choice (see
            # TuningProcess.get); treat the choice as unusable.
            warnings.warn(
                f"Failed to benchmark choice '{choice}'. It will be ignored. "
                "Please debug the root cause in case the choice can bring perf gains."
            )
            # set to INF so this choice will be ignored
            return float("inf")
        finally:
            # Always hand the process back, even after a crash (it re-spawns
            # lazily on the next put()).
            self.processes.put(process)

    def benchmark(
        self,
        choices: List[TritonTemplateCaller],
    ) -> Dict[TritonTemplateCaller, float]:
        """
        Benchmark each choice in a separate process; returns a mapping from
        choice to measured latency (inf for choices that failed).
        """
        assert self.processes is not None, "Tuning process pool is not initialized"
        assert self.executor is not None

        results = {}

        # Use a ThreadExecutorPool to spread the work across the subprocesses and
        # to grab subprocesses as soon as they're free.
        for choice, result in zip(choices, self.executor.map(self.target, choices)):
            results[choice] = result

        return results
332
+
333
+
334
# Module-level singleton pool shared by all autotuning in this process;
# must be initialize()d before benchmark() is called.
tuning_pool = TuningProcessPool()


# Either a concrete IR buffer or just a layout; both carry enough metadata
# for TensorMeta.from_irnodes.
LayoutOrBuffer = Union[ir.Layout, ir.Buffer]
338
+
339
+
340
@dataclasses.dataclass
class TensorMeta:
    """
    Picklable description of a tensor (device/dtype/sizes/strides/offset),
    sufficient to materialize a random tensor with the same layout inside
    the benchmarking subprocess (see to_tensor).
    """

    device: torch.device
    dtype: torch.dtype
    sizes: torch._prims_common.ShapeType
    strides: torch._prims_common.StrideType
    offset: int

    @classmethod
    def from_irnodes(
        cls, irnodes: Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]
    ) -> Union[TensorMeta, List[TensorMeta]]:
        """
        Build a TensorMeta from an IR layout/buffer, or a list of TensorMeta
        from a sequence of them. Symbolic sizes/strides/offsets are resolved
        to concrete hints via V.graph.sizevars.
        """
        if isinstance(irnodes, Sequence):
            result: List[Any] = [cls.from_irnodes(x) for x in irnodes]
            assert all(isinstance(x, TensorMeta) for x in result)
            return result

        node = irnodes
        if isinstance(node, ir.Layout):
            # Wrap a bare layout in a throwaway buffer so the accessors below
            # work uniformly.
            node = ir.Buffer("fake", node)

        dtype = node.get_dtype()
        assert dtype is not None

        return TensorMeta(
            device=node.get_device(),
            dtype=dtype,
            sizes=V.graph.sizevars.size_hints(
                node.get_size(),
                fallback=config.unbacked_symint_fallback,
            ),
            strides=V.graph.sizevars.size_hints(
                node.get_stride(),
                fallback=config.unbacked_symint_fallback,
            ),
            offset=V.graph.sizevars.size_hint(
                node.get_layout().offset,
                fallback=config.unbacked_symint_fallback,
            ),
        )

    def to_tensor(self) -> torch.Tensor:
        """
        Materialize a random tensor matching this metadata (the offset is
        accounted for via extra allocated storage).
        """
        return rand_strided(
            self.sizes,
            self.strides,
            device=self.device,
            dtype=self.dtype,
            extra_size=self.offset,
        )
389
+
390
+
391
@dataclasses.dataclass
class BenchmarkRequest:
    """
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    """

    # NOTE: despite the @dataclass decorator, subclasses and this class define
    # __init__ explicitly; dataclasses does not overwrite a user-defined
    # __init__.
    def __init__(
        self,
        kernel_name: str,
        input_tensor_meta: Union[TensorMeta, List[TensorMeta]],
        output_tensor_meta: Union[TensorMeta, List[TensorMeta]],
        extra_args: Iterable[Any],
    ):
        # the kernel name defined in the module
        self.kernel_name = kernel_name

        # Normalize to a list of input metas.
        if isinstance(input_tensor_meta, TensorMeta):
            input_tensor_meta = [input_tensor_meta]
        self.input_tensor_meta = input_tensor_meta

        # Exactly one output is supported; unwrap a singleton sequence.
        if isinstance(output_tensor_meta, (tuple, list)):
            assert len(output_tensor_meta) == 1
            output_tensor_meta = output_tensor_meta[0]
        self.output_tensor_meta = output_tensor_meta

        self.extra_args = extra_args

    def make_run_fn(
        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
    ) -> Callable[[], None]:
        """Subclasses return a zero-arg callable that launches the kernel."""
        raise NotImplementedError()

    def cleanup_run_fn(self) -> None:
        """Hook for subclasses to release resources created by make_run_fn."""
        pass

    def benchmark(
        self,
        *input_tensors: torch.Tensor,
        output_tensor: Optional[torch.Tensor] = None,
    ) -> float:
        """
        Time the kernel with do_bench and return the measured latency.
        When called without tensors (the subprocess path), inputs/output are
        materialized from the stored TensorMeta.
        """
        debug = log.isEnabledFor(logging.DEBUG)
        if debug:
            start_ts = time.time()

        # create args and out tensor
        if output_tensor is None:
            # Either all tensors are supplied by the caller, or none are.
            assert len(input_tensors) == 0
            input_tensors = tuple(x.to_tensor() for x in self.input_tensor_meta)
            output_tensor = self.output_tensor_meta.to_tensor()

        if debug:
            create_tensor_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
            start_ts = time.time()

        fn = self.make_run_fn(*input_tensors, output_tensor=output_tensor)

        if debug:
            load_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
            start_ts = time.time()

        out = do_bench(fn)
        torch.cuda.synchronize()  # shake out any CUDA errors

        if debug:
            bench_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
            log.debug(
                "InChildProcess %s: load %f, create tensor %f, bench %f",
                str(self),
                load_elapse,  # type: ignore[possibly-undefined]
                create_tensor_elapse,  # type: ignore[possibly-undefined]
                bench_elapse,
            )
        self.cleanup_run_fn()
        return out
469
+
470
+
471
class TestBenchmarkRequest(BenchmarkRequest):
    """
    Supports unit testing. Defined in this file so that the TuningProcess
    sub-process knows how to unpickle these objects.
    """

    def __init__(self, value: Optional[float] = None) -> None:
        # Deliberately does not call BenchmarkRequest.__init__: this stub
        # carries no tensor metadata, only a canned timing value.
        self.value = value

    def benchmark(
        self, *input_tensors: torch.Tensor, output_tensor: Optional[torch.Tensor] = None
    ) -> float:
        """Return the canned value, or raise when configured to fail."""
        value = self.value
        if value is not None:
            return value
        raise Exception("Failed to run")
486
+
487
+
488
class TritonBenchmarkRequest(BenchmarkRequest):
    """
    Benchmark request for a Triton template kernel, loaded by cache key/path
    from PyCodeCache in the benchmarking subprocess.
    """

    # Important: Instances of this class have to be serializable
    # across process boundaries. Do not put CUDA Tensors in here!

    def __init__(
        self,
        kernel_name: str,
        input_tensor_meta: Union[TensorMeta, List[TensorMeta]],
        output_tensor_meta: Union[TensorMeta, List[TensorMeta]],
        extra_args: Iterable[Any],
        module_path: str,  # the path of the module defining the triton kernel
        module_cache_key: str,
        grid: List[int],
        num_stages: int,
        num_warps: int,
        matrix_instr_nonkdim: int = 0,  # only used for hip to choose the shape of mfma instruction.
    ):
        super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args)
        self.module_path = module_path
        self.module_cache_key = module_cache_key
        self.grid = grid
        self.num_stages = num_stages
        self.num_warps = num_warps
        self.matrix_instr_nonkdim = matrix_instr_nonkdim

    def make_run_fn(
        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
    ) -> Callable[[], None]:
        """
        Load the kernel module from the code cache and return a callable that
        launches it with the stored grid/stage/warp configuration.
        """
        mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path)
        log.debug(
            "benchmark module key: %s, path: %s",
            self.module_cache_key,
            self.module_path,
        )

        run_method = getattr(mod, self.kernel_name).run

        # Materialize extra_args exactly once: it is typed as an Iterable, so
        # a one-shot iterator would be exhausted by a first traversal. (The
        # previous code built this list and then re-expanded self.extra_args,
        # which would silently yield no args for a generator input.)
        extra_args = list(self.extra_args)

        # Newer version of triton add warmup argument to JITFunction.run.
        # This code handles backward-compatibility.
        warmup_arg = {}
        import inspect

        if "warmup" in inspect.signature(run_method).parameters:
            warmup_arg["warmup"] = False

        if torch.version.hip and self.matrix_instr_nonkdim != 0:
            return functools.partial(
                run_method,
                *input_tensors,
                output_tensor,
                *extra_args,
                grid=self.grid,
                **warmup_arg,
                num_stages=self.num_stages,
                num_warps=self.num_warps,
                matrix_instr_nonkdim=self.matrix_instr_nonkdim,
            )
        else:
            return functools.partial(
                run_method,
                *input_tensors,
                output_tensor,
                *extra_args,
                grid=self.grid,
                **warmup_arg,
                num_stages=self.num_stages,
                num_warps=self.num_warps,
            )

    def __str__(self) -> str:
        return f"{self.kernel_name=}, {self.module_path=}, {self.module_cache_key=}"
560
+
561
+
562
class CUDABenchmarkRequest(BenchmarkRequest):
    """
    Benchmark request for a CUDA kernel compiled to a shared library via
    CUDACodeCache and invoked through ctypes.
    """

    # Important: Instances of this class have to be serializable
    # across process boundaries. Do not put CUDA Tensors in here!

    def __init__(
        self,
        kernel_name: str,
        input_tensor_meta: Union[TensorMeta, List[TensorMeta]],
        output_tensor_meta: Union[TensorMeta, List[TensorMeta]],
        extra_args: Iterable[Any],
        source_code: str,
    ):
        super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args)
        self.source_code = source_code
        self.workspace_size: int = 0
        self.workspace: Optional[torch.Tensor] = None
        # DLL handle is populated lazily by make_run_fn (not picklable, so it
        # must not be set when this object crosses the process boundary).
        self.DLL: Optional[DLLWrapper] = None
        self.hash_key: str = ""
        self.source_file: str = ""
        # Write the source out now so hash_key/source_file are known up front.
        self.hash_key, self.source_file = CUDACodeCache.write(self.source_code, "so")

    def precompile(self):
        # Prepopulate CUDACodeCache
        # may happen in separate Threadpool
        log.debug("Precompiling %s", self)
        CUDACodeCache.load(self.source_code, "so")
        log.debug("Done precompiling %s", self)

    def make_run_fn(
        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
    ) -> Callable[[], None]:
        """
        Load the compiled kernel and return a callable bound to the tensor
        pointers, extra args, and current CUDA stream. Also queries the
        kernel's required workspace size (which must currently be zero).
        """
        self.DLL, self.hash_key, self.source_file = CUDACodeCache.load(
            self.source_code, "so"
        )
        args = [
            c_void_p(tensor.data_ptr())
            for tensor in list(input_tensors) + [output_tensor]
        ]
        log.debug(
            "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",
            self.kernel_name,
            self.source_file,
            self.hash_key,
            self.DLL,
            args,
            self.extra_args,
        )
        run_method = getattr(self.DLL, self.kernel_name)
        stream_ptr = c_void_p(torch.cuda.current_stream().cuda_stream)

        # Retrieve workspace_size and initialize workspace.
        c_workspace_size = c_size_t()
        run_method(
            *args,  # input ptrs and output ptrs
            *self.extra_args,
            byref(
                c_workspace_size
            ),  # set workspace size ptr to retrieve workspace size
            None,  # null workspace ptr
            stream_ptr,
        )
        self.workspace_size = c_workspace_size.value
        # TODO: Support non-zero workspace_size.
        assert self.workspace_size == 0, (
            "Things need to be fixed to support non-zero workspace_size: "
            "1) max autotune cache needs to store workspace size; "
            "2) memory allocation needs to allocate / deallocate workspace correctly; "
        )

        # Generate partial function.
        return functools.partial(
            run_method,
            *args,
            *self.extra_args,
            None,  # null workspace size ptr
            None,  # set workspace ptr, TODO: update it to a real ptr if workspace_size > 0
            stream_ptr,
        )

    def cleanup_run_fn(self) -> None:
        """Close the DLL opened by make_run_fn and drop the workspace."""
        if self.DLL is not None:
            self.DLL.close()
        self.workspace = None

    def __str__(self) -> str:
        return f"{self.kernel_name=}, {self.source_file=}, {self.hash_key=}"
648
+
649
+
650
def benchmark_in_sub_process(
    choices: List[TritonTemplateCaller],
) -> Dict[TritonTemplateCaller, float]:
    """
    Do benchmarking in a subprocess and return the perf number (latency).

    Thin wrapper over the module-level tuning_pool singleton; the pool must
    have been initialize()d beforehand.
    """
    return tuning_pool.benchmark(choices)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/coordinate_descent_tuner.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import itertools
3
+ import logging
4
+ from typing import Callable, Optional
5
+
6
+ from torch.utils._triton import has_triton
7
+ from .utils import red_text, triton_config_to_hashable
8
+
9
+ if has_triton():
10
+ import triton
11
+ else:
12
+ triton = None
13
+
14
+ from . import config as inductor_config
15
+
16
+ log = logging.getLogger(__name__)
17
+
18
+
19
def get_field(config, name):
    """Read a tuning field from a triton Config: num_warps/num_stages are
    attributes of the Config itself; every other field lives in
    config.kwargs (None when absent)."""
    if name == "num_warps":
        return config.num_warps
    if name == "num_stages":
        return config.num_stages
    return config.kwargs.get(name, None)
26
+
27
+
28
def set_field(config, name, value):
    """Write a tuning field on a triton Config (inverse of get_field):
    num_warps/num_stages are attributes; anything else goes into kwargs."""
    if name in ("num_warps", "num_stages"):
        setattr(config, name, value)
    else:
        config.kwargs[name] = value
35
+
36
+
37
+ class CoordescTuner:
38
+ """
39
+ The coordinate descent tuner. Tune one field/coordinate at a time.
40
+
41
+ TODO will it be necessary to tune multiple fields simultaneously.
42
+
43
+
44
+ TODO: what if both increasing and decreasing a field can improve perf.
45
+ i.e., there are multiple local optima..
46
+ """
47
+
48
+ def __init__(self, is_mm=False, name="unknown", size_hints=None):
49
+ self.is_mm = is_mm # we will tune num_stages for mm
50
+ self.cached_benchmark_results = {}
51
+ self.name = name
52
+ self.size_hints = size_hints
53
+
54
+ def get_xmax(self):
55
+ xmax = inductor_config.triton.max_block["X"]
56
+ if self.size_hints and len(self.size_hints) > 0:
57
+ xmax = min(xmax, self.size_hints[0])
58
+ return xmax
59
+
60
+ def get_ymax(self):
61
+ ymax = inductor_config.triton.max_block["Y"]
62
+ if self.size_hints and len(self.size_hints) > 1:
63
+ ymax = min(ymax, self.size_hints[1])
64
+ return ymax
65
+
66
+ def get_zmax(self):
67
+ zmax = inductor_config.triton.max_block["Z"]
68
+ if self.size_hints and len(self.size_hints) > 2:
69
+ zmax = min(zmax, self.size_hints[2])
70
+ return zmax
71
+
72
+ def get_rmax(self):
73
+ if self.size_hints and len(self.size_hints) > 0:
74
+ return self.size_hints[-1] # the last one is for reduction
75
+ else:
76
+ # large enough. We should not pick this large RBLOCK anyway
77
+ return 2**30
78
+
79
+ def get_warpsmax(self):
80
+ # Currently, CUDA has a maximum of 1024 threads, so 32 is the max
81
+ # number of warps.
82
+ return 1024 // 32
83
+
84
+ def cache_benchmark_result(self, config, timing):
85
+ self.cached_benchmark_results[triton_config_to_hashable(config)] = timing
86
+
87
+ def lookup_in_cache(self, config):
88
+ return self.cached_benchmark_results.get(triton_config_to_hashable(config))
89
+
90
+ def call_func(self, func, config):
91
+ found = self.lookup_in_cache(config)
92
+ if found is not None:
93
+ log.debug(" CACHED")
94
+ return found
95
+ timing = func(config)
96
+ self.cache_benchmark_result(config, timing)
97
+ return timing
98
+
99
+ @property
100
+ def tunable_fields(self):
101
+ out = [
102
+ "XBLOCK",
103
+ "YBLOCK",
104
+ "ZBLOCK",
105
+ # NOTE: we should not tune RBLOCK for persistent reduction.
106
+ # We rely on the fact that persistent reduction's triton.Config
107
+ # does not have the RBLOCK field to guarantee that.
108
+ "RBLOCK",
109
+ # the following 3 are for mm
110
+ "BLOCK_M",
111
+ "BLOCK_N",
112
+ "BLOCK_K",
113
+ "num_warps",
114
+ ]
115
+ if self.is_mm:
116
+ out.append("num_stages")
117
+
118
+ return out
119
+
120
+ def value_too_large(self, name, val):
121
+ if name == "XBLOCK":
122
+ return val > self.get_xmax()
123
+ if name == "YBLOCK":
124
+ return val > self.get_ymax()
125
+ if name == "ZBLOCK":
126
+ return val > self.get_zmax()
127
+ if name == "RBLOCK":
128
+ return val > self.get_rmax()
129
+ if name == "num_warps":
130
+ return val > self.get_warpsmax()
131
+
132
+ return False
133
+
134
+ def get_neighbour_values(self, name, orig_val, radius=1, include_self=False):
135
+ """
136
+ Get neighbour values in 'radius' steps. The original value is not
137
+ returned as it's own neighbour.
138
+ """
139
+ assert radius >= 1
140
+
141
+ def update(cur_val, inc=True):
142
+ if name == "num_stages":
143
+ if inc:
144
+ return cur_val + 1
145
+ else:
146
+ return cur_val - 1
147
+ else:
148
+ if inc:
149
+ return cur_val * 2
150
+ else:
151
+ return cur_val // 2
152
+
153
+ out = []
154
+ # increment loop
155
+ cur_val = orig_val
156
+ for _ in range(radius):
157
+ cur_val = update(cur_val, True)
158
+ if self.value_too_large(name, cur_val):
159
+ break
160
+ out.append(cur_val)
161
+
162
+ # decrement loop
163
+ cur_val = orig_val
164
+ for _ in range(radius):
165
+ cur_val = update(cur_val, False)
166
+ if cur_val <= 0:
167
+ break
168
+ out.append(cur_val)
169
+
170
+ if include_self:
171
+ out.append(orig_val)
172
+ return out
173
+
174
+ @staticmethod
175
+ def has_improvement(baseline, test):
176
+ threshold = 0.001 # 0.1%
177
+ return test is not None and test < baseline * (1 - threshold)
178
+
179
+ def check_all_tuning_directions(
180
+ self,
181
+ func: Callable[["triton.Config"], float],
182
+ best_config,
183
+ best_timing,
184
+ ):
185
+ """
186
+ Check all directions. We only do this once the regular coordinate
187
+ descent tuning find no better choices any more.
188
+ We only have a few tunable fields, so this should be fine.
189
+ """
190
+ candidate_values_list = []
191
+ effective_fields = []
192
+ for field in self.tunable_fields:
193
+ old_value = get_field(best_config, field)
194
+ if old_value is None:
195
+ continue
196
+ candidate_values = self.get_neighbour_values(
197
+ field,
198
+ old_value,
199
+ radius=inductor_config.coordinate_descent_search_radius,
200
+ include_self=True,
201
+ )
202
+ candidate_values_list.append(candidate_values)
203
+ effective_fields.append(field)
204
+
205
+ choices = itertools.product(*candidate_values_list)
206
+ improved = False
207
+ for choice in choices:
208
+ assert len(choice) == len(effective_fields)
209
+ candidate_config = copy.deepcopy(best_config)
210
+ for new_val, field in zip(choice, effective_fields):
211
+ set_field(candidate_config, field, new_val)
212
+ cmp_res, candidate_timing = self.compare_config(
213
+ func, candidate_config, best_config, best_timing
214
+ )
215
+ if cmp_res:
216
+ improved = True
217
+ best_config = candidate_config
218
+ best_timing = candidate_timing
219
+
220
+ return improved, best_config, best_timing
221
+
222
+ def compare_config(self, func, candidate_config, best_config, best_timing):
223
+ """
224
+ Check if candidate_config is better than best_config.
225
+
226
+ Return a touple of (compare_result, candidate_timing).
227
+ compare_result is true iff candidate_config is better.
228
+ """
229
+ log.debug("Try config %s", candidate_config)
230
+ try:
231
+ candidate_timing = self.call_func(func, candidate_config)
232
+ except Exception as e:
233
+ log.debug("Got exception %s", e)
234
+ return False, float("inf")
235
+
236
+ if self.has_improvement(best_timing, candidate_timing):
237
+ log.debug(
238
+ "Tune from %s %f -> %s %f",
239
+ best_config,
240
+ best_timing,
241
+ candidate_config,
242
+ candidate_timing,
243
+ )
244
+
245
+ return True, candidate_timing
246
+ return False, candidate_timing
247
+
248
+ def autotune(
249
+ self,
250
+ func: Callable[["triton.Config"], float],
251
+ baseline_config: "triton.Config",
252
+ baseline_timing: Optional[float] = None,
253
+ ) -> "triton.Config":
254
+ if baseline_timing is None:
255
+ baseline_timing = self.call_func(func, baseline_config)
256
+
257
+ log.debug("= Do coordinate descent tuning for %s =", self.name)
258
+ log.debug(
259
+ "Baseline Config %s, baseline timing %f", baseline_config, baseline_timing
260
+ )
261
+ improved = True
262
+ best_config = baseline_config
263
+ best_timing = baseline_timing
264
+ tunable_fields = self.tunable_fields
265
+
266
+ while improved:
267
+ improved = False
268
+
269
+ for name in tunable_fields:
270
+ cur_val = get_field(best_config, name)
271
+ # some kernel don't have RBLOCK/YBLOCK/ZBLOCK. So cur_val may be None
272
+ if cur_val is None:
273
+ continue
274
+
275
+ # It's possible that candidate_values is empty.
276
+ # E.g., if XBLOCK is 1 initially and size_hint for x is also 1.
277
+ # We would not try either larger or smaller XBLOCK in this case.
278
+ candidate_values = self.get_neighbour_values(name, cur_val)
279
+
280
+ for next_val in candidate_values:
281
+ candidate_config = copy.deepcopy(best_config)
282
+ set_field(candidate_config, name, next_val)
283
+
284
+ cmp_res, candidate_timing = self.compare_config(
285
+ func, candidate_config, best_config, best_timing
286
+ )
287
+ if cmp_res:
288
+ improved = True
289
+ best_config, best_timing = candidate_config, candidate_timing
290
+
291
+ if not improved and inductor_config.coordinate_descent_check_all_directions:
292
+ old_best_timing = best_timing
293
+ improved, best_config, best_timing = self.check_all_tuning_directions(
294
+ func, best_config, best_timing
295
+ )
296
+
297
+ if improved:
298
+ msg = red_text(
299
+ "Coordinate descend tuning found improvement of %.3fx by looking in all directions."
300
+ )
301
+ log.debug(
302
+ msg,
303
+ old_best_timing / best_timing,
304
+ )
305
+
306
+ log.debug(
307
+ "Improve from %s %f -> %s %f, %.3fx",
308
+ baseline_config,
309
+ baseline_timing,
310
+ best_config,
311
+ best_timing,
312
+ baseline_timing / best_timing,
313
+ )
314
+
315
+ return best_config
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/debug.py ADDED
@@ -0,0 +1,655 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import contextlib
3
+ import cProfile
4
+ import dataclasses
5
+ import functools
6
+ import itertools
7
+ import logging
8
+ import os
9
+ import os.path
10
+ import pickle
11
+ import pstats
12
+ import shutil
13
+ import subprocess
14
+ from typing import Any, Dict, List, Optional
15
+ from unittest.mock import patch
16
+
17
+ from functorch.compile import draw_graph, get_aot_graph_name, get_graph_being_compiled
18
+
19
+ import torch
20
+ from torch import fx as fx
21
+
22
+ from torch._dynamo.repro.after_aot import save_graph_repro, wrap_compiler_debug
23
+ from torch._dynamo.utils import get_debug_dir
24
+ from torch.fx.graph_module import GraphModule
25
+ from torch.fx.passes.shape_prop import _extract_tensor_metadata, TensorMetadata
26
+ from torch.fx.passes.tools_common import legalize_graph
27
+ from torch.utils._pytree import tree_map
28
+
29
+ from . import config, ir # noqa: F811, this is needed
30
+ from .scheduler import (
31
+ BaseSchedulerNode,
32
+ FusedSchedulerNode,
33
+ NopKernelSchedulerNode,
34
+ OutputNode,
35
+ SchedulerNode,
36
+ )
37
+ from .virtualized import V
38
+
39
+ log = logging.getLogger(__name__)
40
+
41
+ SchedulerNodeList = List[Any]
42
+ BufMeta = collections.namedtuple("BufMeta", ["name", "n_origin"])
43
+ GRAPHVIZ_COMMAND_SCALABLE = ["dot", "-Gnslimit=2", "-Gnslimit1=2", "-Gmaxiter=5000"]
44
+
45
+
46
+ @functools.lru_cache(None)
47
+ def has_dot() -> bool:
48
+ try:
49
+ subprocess.check_output(["which", "dot"], stderr=subprocess.PIPE)
50
+ return True
51
+ except subprocess.SubprocessError:
52
+ return False
53
+
54
+
55
+ def draw_buffers(nodes: List[BaseSchedulerNode], print_graph=False, fname=None):
56
+ """
57
+ Draw a graph in fname.svg.
58
+ """
59
+ if not has_dot():
60
+ log.warning("draw_buffers() requires `graphviz` package")
61
+ return
62
+
63
+ if fname is None:
64
+ fname = get_graph_being_compiled()
65
+
66
+ graph = create_fx_from_snodes(nodes)
67
+
68
+ for node in graph.nodes:
69
+ if "fusion_meta" not in node.meta:
70
+ continue
71
+ group = node.meta["fusion_meta"].group
72
+ if isinstance(group, tuple):
73
+ if isinstance(group[1], int):
74
+ group = (group[1],)
75
+ else:
76
+ group = group[1]
77
+
78
+ # gather meta data
79
+ dtype = None
80
+ if isinstance(node, ir.ComputedBuffer):
81
+ dtype = node.data.dtype
82
+
83
+ metadata = TensorMetadata(group, dtype, None, None, None, None, None) # type: ignore[arg-type]
84
+ node.meta["tensor_meta"] = metadata
85
+
86
+ if print_graph:
87
+ print(graph)
88
+
89
+ gm = GraphModule({}, graph)
90
+ legalize_graph(gm)
91
+ gm.graph.lint()
92
+ draw_graph(
93
+ gm, fname, clear_meta=False, dot_graph_shape=config.trace.dot_graph_shape
94
+ )
95
+
96
+
97
+ def create_fx_from_snodes(snodes: List[BaseSchedulerNode]) -> fx.Graph:
98
+ """
99
+ Creates a FX Graph from a list of SchedulerNode objects.
100
+ """
101
+
102
+ def get_fake_func(name):
103
+ def func1(*args):
104
+ return 0
105
+
106
+ func1.__name__ = name
107
+ return func1
108
+
109
+ FusionMeta = collections.namedtuple("FusionMeta", ["group", "snode", "type"])
110
+
111
+ buf_to_fx_node = {}
112
+ graph = torch.fx.Graph()
113
+ first_node = None
114
+
115
+ outputs = []
116
+ group: Any = None
117
+ # create call_function node for each Buffer and Kernel
118
+ for snode in snodes:
119
+ if snode.is_extern():
120
+ node_type = "extern"
121
+ group = node_type
122
+ elif snode.is_template():
123
+ node_type = "template"
124
+ group = node_type
125
+ elif isinstance(snode, NopKernelSchedulerNode):
126
+ node_type = "nop"
127
+ group = node_type
128
+ elif isinstance(snode, SchedulerNode):
129
+ node_type = "compute"
130
+ group = snode.group
131
+ elif isinstance(snode, FusedSchedulerNode):
132
+ node_type = "fused"
133
+ group = snode.group
134
+ else:
135
+ raise RuntimeError("Unknown node type")
136
+
137
+ fused_name = torch._inductor.utils.get_fused_kernel_name(
138
+ snode.get_nodes(), "original_aten"
139
+ )
140
+ func_name = f"{node_type}: {fused_name}"
141
+ node_func = get_fake_func(func_name)
142
+ kwargs = {}
143
+ if hasattr(snode, "get_device"):
144
+ kwargs = {"device": snode.get_device()}
145
+ fx_node = graph.call_function(node_func, args=(), kwargs=kwargs)
146
+
147
+ def in_output(snode):
148
+ if isinstance(snode, FusedSchedulerNode):
149
+ return any(in_output(x) for x in snode.snodes)
150
+ return any(isinstance(user.node, OutputNode) for user in snode.users)
151
+
152
+ if in_output(snode):
153
+ outputs.append(fx_node)
154
+ name = snode.get_name()
155
+ fx_node.name = name
156
+
157
+ fx_node.meta["fusion_meta"] = FusionMeta(group, snode, node_type)
158
+
159
+ if isinstance(snode, FusedSchedulerNode):
160
+ for x in snode.snodes:
161
+ buf_to_fx_node[x.get_name()] = fx_node
162
+ buf_to_fx_node[name] = fx_node
163
+
164
+ if first_node is None:
165
+ first_node = fx_node
166
+
167
+ # create edges between nodes
168
+ for snode in snodes:
169
+ name = snode.get_name()
170
+ deps = snode.read_writes.reads
171
+
172
+ fx_node = buf_to_fx_node[name]
173
+ new_args = []
174
+ for dep in deps:
175
+ if dep.name in buf_to_fx_node:
176
+ dep_node = buf_to_fx_node[dep.name]
177
+ else:
178
+ with graph.inserting_before(first_node):
179
+ dep_node = graph.placeholder(dep.name)
180
+ buf_to_fx_node[dep.name] = dep_node
181
+ new_args.append(dep_node)
182
+
183
+ fx_node.args = tuple(new_args)
184
+
185
+ graph.output(outputs[0] if len(outputs) == 1 else tuple(outputs))
186
+ return graph
187
+
188
+
189
+ def update_orig_fx_node_name_to_buf_name(
190
+ nodes: SchedulerNodeList,
191
+ node_name_to_buf_name: Dict[str, str],
192
+ parent_buf_name: Optional[str] = None,
193
+ n_origins: int = 0,
194
+ ):
195
+ if nodes is None:
196
+ return
197
+ for node in nodes:
198
+ # for FusedSchedulerNode, traverse recursively into get_nodes()
199
+ buf_name = node.get_name()
200
+ children_nodes = node.get_nodes()
201
+ if children_nodes is not None and len(children_nodes) > 1:
202
+ update_orig_fx_node_name_to_buf_name(
203
+ children_nodes,
204
+ node_name_to_buf_name,
205
+ buf_name if parent_buf_name is None else parent_buf_name,
206
+ )
207
+ continue
208
+ else:
209
+ assert len(children_nodes) == 1 and children_nodes[0] == node
210
+
211
+ ir_node = node.node
212
+ if ir_node is None or ir_node.origins is None:
213
+ continue
214
+ for origin in ir_node.origins:
215
+ node_name = origin.name
216
+ # when buf1 and buf2 both have origin=node1
217
+ # we draw node1 according to buf1
218
+ if node_name not in node_name_to_buf_name:
219
+ node_name_to_buf_name[node_name] = (
220
+ buf_name if parent_buf_name is None else parent_buf_name
221
+ )
222
+
223
+
224
+ def get_node_name_to_buf_meta(node_name_to_buf_name: Dict[str, str]):
225
+ buf_name_to_n_node = {}
226
+ for node_name, buf_name in node_name_to_buf_name.items():
227
+ if buf_name not in buf_name_to_n_node:
228
+ buf_name_to_n_node[buf_name] = {node_name}
229
+ else:
230
+ buf_name_to_n_node[buf_name].add(node_name)
231
+
232
+ node_name_to_buf_meta = {}
233
+ for node_name, buf_name in node_name_to_buf_name.items():
234
+ n_node = len(buf_name_to_n_node[buf_name])
235
+ node_name_to_buf_meta[node_name] = BufMeta(buf_name, n_node)
236
+ return node_name_to_buf_meta
237
+
238
+
239
+ def annotate_orig_fx_with_snodes(
240
+ gm: torch.fx.GraphModule, snodes: SchedulerNodeList
241
+ ) -> None:
242
+ """
243
+ Creates a FX Graph from a list of SchedulerNode objects.
244
+ """
245
+ node_name_to_buf_name: Dict[str, str] = {}
246
+ update_orig_fx_node_name_to_buf_name(snodes, node_name_to_buf_name)
247
+ if node_name_to_buf_name is None:
248
+ return
249
+ node_name_to_buf_meta = get_node_name_to_buf_meta(node_name_to_buf_name)
250
+ for node in gm.graph.nodes:
251
+ if node.name in node_name_to_buf_meta:
252
+ node.meta["buf_meta"] = node_name_to_buf_meta.get(node.name)
253
+
254
+
255
+ @contextlib.contextmanager
256
+ def enable_aot_logging():
257
+ compile_debug = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1"
258
+
259
+ import torch._functorch.aot_autograd
260
+
261
+ log = logging.getLogger(torch._functorch.aot_autograd.__name__)
262
+
263
+ stack = contextlib.ExitStack()
264
+ if not compile_debug:
265
+ try:
266
+ yield
267
+ finally:
268
+ stack.close()
269
+ return
270
+
271
+ # Enable all graphs to be logged to a file by setting the flags to True
272
+ # and the log level of the file logger to DEBUG
273
+ stack.enter_context(patch("functorch.compile.config.debug_partitioner", True))
274
+
275
+ path = os.path.join(get_debug_dir(), "torchinductor")
276
+ os.makedirs(path, exist_ok=True)
277
+
278
+ fh = logging.FileHandler(
279
+ os.path.join(
280
+ path,
281
+ f"aot_{get_aot_graph_name()}_debug.log",
282
+ )
283
+ )
284
+ fh.setLevel(logging.DEBUG)
285
+ fh.setFormatter(
286
+ logging.Formatter("[%(filename)s:%(lineno)d %(levelname)s] %(message)s")
287
+ )
288
+ log.addHandler(fh)
289
+ try:
290
+ yield
291
+ finally:
292
+ log.removeHandler(fh)
293
+ stack.close()
294
+
295
+
296
+ class DebugContext:
297
+ _counter = itertools.count()
298
+
299
+ @staticmethod
300
+ def wrap(fn):
301
+ @functools.wraps(fn)
302
+ def inner(*args, **kwargs):
303
+ with DebugContext():
304
+ return fn(*args, **kwargs)
305
+
306
+ return wrap_compiler_debug(inner, compiler_name="inductor")
307
+
308
+ @staticmethod
309
+ def create_debug_dir(folder_name: str) -> Optional[str]:
310
+ debug_dir = config.trace.debug_dir or get_debug_dir()
311
+ for n in DebugContext._counter:
312
+ dirname = os.path.join(
313
+ debug_dir,
314
+ "torchinductor",
315
+ f"{folder_name}.{n}",
316
+ )
317
+ if not os.path.exists(dirname):
318
+ os.makedirs(dirname)
319
+ return dirname
320
+ return None
321
+
322
+ def __init__(self):
323
+ self._prof = None
324
+ self._path = None
325
+ self._stack = contextlib.ExitStack()
326
+
327
+ def copy(self, new_path: str):
328
+ if not self._path:
329
+ return
330
+ assert new_path.endswith(".debug"), new_path
331
+ if os.path.exists(new_path):
332
+ shutil.rmtree(new_path)
333
+ try:
334
+ shutil.copytree(self._path, new_path)
335
+ self._path = new_path
336
+ except OSError:
337
+ log.warning(
338
+ "Failed to copy debug files from %s to %s", self._path, new_path
339
+ )
340
+ pass
341
+
342
+ def fopen(self, filename: str, write_mode: str = "w", *args, **kwargs):
343
+ assert self._path
344
+ return open(os.path.join(self._path, filename), write_mode, *args, **kwargs)
345
+
346
+ @contextlib.contextmanager
347
+ def fopen_context(self, filename: str, write_mode: str = "w", *args, **kwargs):
348
+ assert self._path
349
+ with open(os.path.join(self._path, filename), write_mode, *args, **kwargs) as f:
350
+ yield f
351
+
352
+ def filename(self, suffix: str):
353
+ assert self._path
354
+ return os.path.join(self._path, suffix)
355
+
356
+ def upload_tar(self):
357
+ if config.trace.upload_tar is not None:
358
+ import tarfile
359
+
360
+ assert self._path
361
+ tar_file = os.path.join(
362
+ self._path, f"{os.path.basename(self._path)}.tar.gz"
363
+ )
364
+ with tarfile.open(tar_file, "w:gz") as tar:
365
+ tar.add(self._path, arcname=os.path.basename(self._path))
366
+ config.trace.upload_tar(tar_file)
367
+
368
+ def __enter__(self):
369
+ if config.debug:
370
+ log = logging.getLogger("torch._dynamo")
371
+ prev_level = log.level
372
+ log.setLevel(logging.DEBUG)
373
+
374
+ def reset_log_level(level):
375
+ log.setLevel(level)
376
+
377
+ self._stack.callback(reset_log_level, prev_level)
378
+
379
+ self._stack.enter_context(V.set_debug_handler(self))
380
+
381
+ if not config.trace.enabled:
382
+ return
383
+
384
+ self._path = self.create_debug_dir(get_aot_graph_name())
385
+
386
+ if config.trace.debug_log:
387
+ self._setup_log_capture("debug.log", logging.DEBUG)
388
+ if config.trace.info_log:
389
+ self._setup_log_capture("info.log", logging.INFO)
390
+ if config.trace.compile_profile:
391
+ self._prof = cProfile.Profile()
392
+ self._prof.enable()
393
+
394
+ def _setup_log_capture(self, filename: str, level: int):
395
+ log = logging.getLogger("torch._inductor")
396
+ fd = self._stack.enter_context(self.fopen(filename))
397
+ ch = logging.StreamHandler(fd)
398
+ ch.setLevel(level)
399
+ ch.setFormatter(
400
+ logging.Formatter("[%(filename)s:%(lineno)d %(levelname)s] %(message)s")
401
+ )
402
+ log.addHandler(ch)
403
+ log.setLevel(min(log.level, level))
404
+ self._stack.callback(log.removeHandler, ch)
405
+
406
+ def __exit__(self, exc_type, exc_val, exc_tb):
407
+ if self._prof:
408
+ self._prof.disable()
409
+ self._save_profile_data()
410
+
411
+ if self._path:
412
+ self.upload_tar()
413
+ log.warning("%s debug trace: %s", get_graph_being_compiled(), self._path)
414
+ self._stack.close()
415
+
416
+ def _save_profile_data(self):
417
+ assert self._prof
418
+ self._prof.dump_stats(self.filename("compile.prof"))
419
+ with self.fopen("compile.stats") as fd:
420
+ stats = pstats.Stats(self._prof, stream=fd)
421
+ stats.strip_dirs()
422
+ stats.sort_stats("cumtime")
423
+ stats.print_stats(100)
424
+ stats.sort_stats("tottime")
425
+ stats.print_stats(100)
426
+
427
+ def __getattr__(self, name):
428
+ if config.trace.enabled and getattr(config.trace, name):
429
+ try:
430
+ return getattr(DebugFormatter(self), name)
431
+ except Exception:
432
+ log.warning("Ignoring exception in debug code", exc_info=True)
433
+ else:
434
+
435
+ def ignored(*args, **kwargs):
436
+ pass
437
+
438
+ return ignored
439
+
440
+
441
+ class DebugFormatter:
442
+ def __init__(self, handler):
443
+ self.fopen = handler.fopen
444
+ self.fopen_context = handler.fopen_context
445
+ self.filename = handler.filename
446
+ self.handler = handler
447
+
448
+ def fx_graph(self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor]):
449
+ with self.fopen("fx_graph_runnable.py") as fd:
450
+ save_graph_repro(fd, gm, inputs, "inductor")
451
+
452
+ with self.fopen("fx_graph_readable.py") as fd:
453
+ fd.write(gm.print_readable(print_output=False))
454
+
455
+ def fx_graph_transformed(
456
+ self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor]
457
+ ):
458
+ with self.fopen("fx_graph_transformed.py") as fd:
459
+ fd.write(gm.print_readable(print_output=False))
460
+
461
+ def ir_pre_fusion(self, nodes: SchedulerNodeList):
462
+ self._write_ir("ir_pre_fusion.txt", nodes)
463
+
464
+ def ir_post_fusion(self, nodes: SchedulerNodeList):
465
+ self._write_ir("ir_post_fusion.txt", nodes)
466
+
467
+ def _write_ir(self, filename: str, nodes: SchedulerNodeList):
468
+ with self.fopen(filename) as fd:
469
+ log.info("Writing debug ir to %s", fd.name)
470
+ for node in nodes:
471
+ fd.write(node.debug_str())
472
+ fd.write("\n\n\n")
473
+
474
+ def graph_diagram(self, nodes: SchedulerNodeList):
475
+ draw_buffers(nodes, fname=self.filename("graph_diagram.svg"))
476
+
477
+ def draw_orig_fx_graph(self, gm: torch.fx.GraphModule, nodes: SchedulerNodeList):
478
+ annotate_orig_fx_with_snodes(gm, nodes)
479
+ draw_graph(
480
+ gm,
481
+ fname=self.filename("orig_fx_graph_diagram.svg"),
482
+ clear_meta=False,
483
+ prog=GRAPHVIZ_COMMAND_SCALABLE,
484
+ parse_stack_trace=True,
485
+ dot_graph_shape=config.trace.dot_graph_shape,
486
+ )
487
+
488
+ def output_code(self, filename):
489
+ shutil.copy(filename, self.filename("output_code.py"))
490
+
491
+ def log_autotuning_results(
492
+ self,
493
+ name: str,
494
+ input_nodes: List[ir.IRNode],
495
+ timings: Dict["ChoiceCaller", float], # type: ignore[name-defined] # noqa: F821
496
+ elapse: float,
497
+ ):
498
+ import json
499
+
500
+ from .ir import FixedLayout
501
+
502
+ def build_node_info(node: ir.IRNode):
503
+ if hasattr(node, "name"):
504
+ node_name = node.name
505
+ else:
506
+ node_name = ""
507
+ node_info = {
508
+ "name": node_name,
509
+ "type": type(node).__name__,
510
+ }
511
+ try:
512
+ layout = node.get_layout()
513
+ if isinstance(layout, FixedLayout):
514
+ offset = 0
515
+ try:
516
+ offset = int(layout.offset)
517
+ except Exception:
518
+ try:
519
+ offset = V.graph.sizevars.size_hint(
520
+ layout.offset, fallback=0
521
+ )
522
+ except Exception:
523
+ pass
524
+ static_layout = FixedLayout(
525
+ layout.device,
526
+ dtype=layout.dtype,
527
+ size=list(V.graph.sizevars.size_hints(layout.size)),
528
+ stride=list(V.graph.sizevars.size_hints(layout.stride)),
529
+ offset=offset,
530
+ )
531
+ node_info["layout"] = str(static_layout)
532
+ else:
533
+ node_info["layout"] = str(node.get_layout())
534
+ except Exception as e:
535
+ pass
536
+ try:
537
+ node_info["dtype"] = str(node.get_dtype())
538
+ except Exception as e:
539
+ pass
540
+ try:
541
+ node_info["device"] = str(node.get_device())
542
+ except Exception as e:
543
+ pass
544
+ try:
545
+ node_info["stride"] = str(
546
+ V.graph.sizevars.size_hints(node.get_stride())
547
+ )
548
+ except Exception as e:
549
+ pass
550
+ try:
551
+ node_info["size"] = str(V.graph.sizevars.size_hints(node.get_size()))
552
+ except Exception as e:
553
+ pass
554
+ try:
555
+ node_info["numel"] = str(V.graph.sizevars.size_hint(node.get_numel()))
556
+ except Exception as e:
557
+ pass
558
+ if hasattr(node, "data") and isinstance(node.data, ir.IRNode):
559
+ node_info["data"] = build_node_info(node.data)
560
+ return node_info
561
+
562
+ general_properties = {
563
+ "op_name": name,
564
+ "cuda_device_name": torch.cuda.get_device_name(),
565
+ "cuda_device_count": torch.cuda.device_count(),
566
+ "input_nodes": [build_node_info(node) for node in input_nodes],
567
+ "autotuning_time": elapse,
568
+ }
569
+ with self.fopen_context(
570
+ "autotuning_result_json_list.txt", "at", encoding="utf-8"
571
+ ) as fd:
572
+ for caller, time in timings.items():
573
+ info_dict = dict(caller.info_dict())
574
+ info_dict.update(general_properties)
575
+ info_dict["benchmark_result"] = time
576
+ json.dump(info_dict, fd)
577
+ fd.write("\n")
578
+
579
+
580
+ @dataclasses.dataclass
581
+ class TensorMetadataHolder:
582
+ tensor_metadata: TensorMetadata
583
+ device: torch.device
584
+
585
+
586
+ save_args_cnt = itertools.count()
587
+
588
+
589
+ def save_args_for_compile_fx_inner(*args, **kwargs):
590
+ """
591
+ This function is used to save arguments for a compile_fx_inner function call
592
+ to the file system. Later on one can replay the compile_fx_inner call
593
+ with the saved arguments using load_args_and_run_compile_fx_inner.
594
+ """
595
+
596
+ folder = "/tmp/inductor_saved_args"
597
+ if not os.path.exists(folder):
598
+ os.mkdir(folder)
599
+
600
+ def handle_tensor(x):
601
+ """
602
+ Pickle FakeTensor will result in error:
603
+ AttributeError: Can't pickle local object 'WeakValueDictionary.__init__.<locals>.remove'
604
+
605
+ Convert all Tensor to metadata. This may also makes pickle faster.
606
+ """
607
+ if isinstance(x, torch.Tensor):
608
+ return TensorMetadataHolder(_extract_tensor_metadata(x), x.device)
609
+ else:
610
+ return x
611
+
612
+ args_to_save, kwargs_to_save = tree_map(handle_tensor, (args, kwargs))
613
+
614
+ fn_name = "compile_fx_inner"
615
+ path = f"{folder}/{fn_name}_{next(save_args_cnt)}.pkl"
616
+ with open(path, "wb") as f:
617
+ pickle.dump((args_to_save, kwargs_to_save), f)
618
+
619
+ if log.isEnabledFor(logging.DEBUG):
620
+ message = f"""
621
+ Arguments for a compile_fx_inner call is saved to {path}. To replay the call,
622
+ run the following:
623
+
624
+ from torch._inductor.debug import load_args_and_run_compile_fx_inner
625
+ load_args_and_run_compile_fx_inner({path!r})
626
+ """
627
+ # call print rather than log.debug. log.debug will print message
628
+ # prefix for each line which makes the code snippet harder to be
629
+ # copied.
630
+ # Not a big deal since the code is already been guarded by checking
631
+ # the log level.
632
+ print(message)
633
+
634
+
635
+ def load_args_and_run_compile_fx_inner(path: str):
636
+ from torch._inductor.compile_fx import compile_fx_inner
637
+
638
+ with open(path, "rb") as f:
639
+ args, kwargs = pickle.load(f)
640
+
641
+ def handle_tensor(x):
642
+ if isinstance(x, TensorMetadataHolder):
643
+ return torch._dynamo.testing.rand_strided(
644
+ x.tensor_metadata.shape,
645
+ x.tensor_metadata.stride,
646
+ x.tensor_metadata.dtype,
647
+ x.device,
648
+ )
649
+ else:
650
+ return x
651
+
652
+ fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
653
+ with fake_mode, config.patch("save_args", False):
654
+ args, kwargs = tree_map(handle_tensor, (args, kwargs))
655
+ return compile_fx_inner(*args, **kwargs)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/decomposition.py ADDED
@@ -0,0 +1,678 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import logging
3
+ import math
4
+ import sys
5
+ import typing
6
+ from typing import Optional
7
+
8
+ import torch
9
+ import torch._decomp as decomp
10
+ import torch._prims_common as utils
11
+ import torch.ao.quantization.fx._decomposed
12
+ from torch._decomp import (
13
+ core_aten_decompositions,
14
+ get_decompositions,
15
+ remove_decompositions,
16
+ )
17
+ from torch._decomp.decompositions import (
18
+ _grid_sampler_2d as decomp_grid_sampler_2d,
19
+ pw_cast_for_opmath,
20
+ )
21
+ from torch._decomp.decompositions_for_rng import extra_random_decomps
22
+ from torch._higher_order_ops.out_dtype import out_dtype
23
+ from torch._prims_common import (
24
+ elementwise_dtypes,
25
+ ELEMENTWISE_TYPE_PROMOTION_KIND,
26
+ type_to_dtype,
27
+ )
28
+
29
+ from . import config, inductor_prims
30
+
31
+ log = logging.getLogger(__name__)
32
+ aten = torch.ops.aten
33
+ prims = torch.ops.prims
34
+ quantized_decomposed = torch.ops.quantized_decomposed
35
+
36
+ inductor_decompositions = get_decompositions(
37
+ [
38
+ aten._adaptive_avg_pool2d_backward,
39
+ aten.arange,
40
+ aten.bitwise_and_,
41
+ aten.bitwise_or_,
42
+ aten.clamp_min_,
43
+ aten.dist,
44
+ aten.empty_like,
45
+ aten.flip,
46
+ aten.gelu,
47
+ aten.hardtanh,
48
+ aten.index_select,
49
+ aten.lcm,
50
+ aten.leaky_relu,
51
+ aten.linalg_vector_norm,
52
+ aten._log_softmax,
53
+ aten.max_pool2d_with_indices_backward,
54
+ aten._native_batch_norm_legit,
55
+ aten._native_batch_norm_legit_functional,
56
+ aten._native_batch_norm_legit_no_training,
57
+ aten.native_batch_norm,
58
+ aten.native_group_norm,
59
+ aten.native_layer_norm,
60
+ aten.nll_loss2d_backward,
61
+ aten._softmax,
62
+ aten.sin_,
63
+ aten.sqrt_,
64
+ out_dtype,
65
+ aten._to_copy,
66
+ aten.tril_indices,
67
+ aten.triu_indices,
68
+ aten.upsample_bilinear2d.vec,
69
+ ]
70
+ )
71
+ decompositions = {**core_aten_decompositions(), **inductor_decompositions}
72
+
73
+ # Remove unwanted decompositions included via the core ATen decompositions from
74
+ # the Inductor decomp table.
75
+ decomps_to_exclude = [
76
+ aten._unsafe_index,
77
+ aten._scaled_dot_product_flash_attention_for_cpu.default, # See comments in torch/_decomp/decompositions.py
78
+ aten.clamp_max,
79
+ aten.clamp_min,
80
+ aten.glu, # inductor lowers this directly
81
+ aten.split.Tensor, # inductor lowers this directly
82
+ aten.squeeze, # inductor lowers this directly
83
+ aten.sum, # inductor lowers this directly
84
+ aten.unbind, # inductor lowers this directly
85
+ ]
86
+
87
+ remove_decompositions(decompositions, decomps_to_exclude)
88
+
89
+
90
def register_decomposition(ops):
    """Register a decomposition into Inductor's decomp table.

    Accepts either a single op overload (callable) or an iterable of them.
    Warns when an op is already present, so accidental double registration
    shows up in logs, then defers to ``torch._decomp``.
    """
    for op in [ops] if callable(ops) else ops:
        if op in decompositions:
            # Log the specific colliding op (the original logged the whole
            # `ops` argument, which hides which entry was duplicated when a
            # list is passed).
            log.warning("duplicate decomp: %s", op)
    return decomp.register_decomposition(ops, decompositions)
95
+
96
+
97
+ # TODO: for now, inductor doesn't handle asserts
98
+ # because the condition is symbool -> tensor in the graph.
99
+ @register_decomposition([aten._assert_async.msg])
100
+ def assert_async_msg_decomp(tensor, msg):
101
+ return
102
+
103
+
104
+ # Following `assert_async_msg_decomp` and implement as non-op.
105
+ @register_decomposition([aten._functional_assert_async.msg])
106
+ def functional_assert_async_msg_decomp(tensor, msg):
107
+ return
108
+
109
+
110
+ @register_decomposition([aten.sym_constrain_range_for_size.default])
111
+ def sym_constrain_range_for_size(symbol, *, min=None, max=None):
112
+ return
113
+
114
+
115
@register_decomposition([aten.clamp])
@pw_cast_for_opmath
def clamp(x, min=None, max=None):
    """Decompose clamp into independent clamp_min/clamp_max steps so each
    bound lowers to its own pointwise op."""
    result = x
    if min is not None:
        result = result.clamp_min(min)
    if max is not None:
        result = result.clamp_max(max)
    return result
123
+
124
+
125
@register_decomposition([aten.full])
def full(size, fill_value, **kwargs):
    """When no dtype was given, infer one from the Python type of
    ``fill_value`` and re-dispatch; otherwise decline the decomposition."""
    if kwargs.get("dtype") is None:
        kwargs["dtype"] = type_to_dtype(type(fill_value))
        return aten.full(size, fill_value, **kwargs)
    return NotImplemented
132
+
133
+
134
+ # Not really sure how to put this into the main library. PrimTorch wants
135
+ # empty_permuted to go to the prim, and typically users don't really want
136
+ # to decompose to empty_strided (but inductor is OK with it, because we are
137
+ # cool with strides and everything goes to empty_strided)
138
@register_decomposition([aten.empty_permuted.default])
def empty_permuted(size, physical_layout, **kwargs):
    """Allocate in physical order, then permute back to logical order.

    ``inverse`` is the inverse permutation of ``physical_layout``: it maps
    each logical dimension back to its physical position.
    """
    inverse = [0] * len(size)
    for physical_dim, logical_dim in enumerate(physical_layout):
        inverse[logical_dim] = physical_dim
    physical_size = [size[d] for d in physical_layout]
    return torch.empty(physical_size, **kwargs).permute(inverse)
144
+
145
+
146
@register_decomposition([aten.convolution_backward])
def convolution_backward(
    grad_output,
    input,
    weight,
    bias_sizes,
    stride,
    padding,
    dilation,
    transposed,
    output_padding,
    groups,
    output_mask,
):
    """Peel the bias gradient off CUDA convolution backward.

    The bias grad is a plain sum over every non-channel dimension, which
    Inductor can fuse; the remaining input/weight grads are re-dispatched
    with the bias entry of ``output_mask`` cleared. Declines on non-CUDA
    devices or when no bias grad is requested.
    """
    if grad_output.device.type != "cuda" or not output_mask[2]:
        return NotImplemented
    reduce_dims = [0, *range(2, grad_output.dim())]
    grad_bias = aten.sum(grad_output, reduce_dims)
    grad_inp, grad_weight, _ = aten.convolution_backward(
        grad_output,
        input,
        weight,
        bias_sizes,
        stride,
        padding,
        dilation,
        transposed,
        output_padding,
        groups,
        [output_mask[0], output_mask[1], False],
    )
    return (grad_inp, grad_weight, grad_bias)
177
+
178
+
179
@register_decomposition([aten.log2])
def log2(x):
    # log2(x) == ln(x) / ln(2); multiply by the reciprocal so the constant
    # division is folded at trace time.
    inv_ln2 = 1.0 / math.log(2.0)
    return torch.log(x) * inv_ln2
182
+
183
+
184
@register_decomposition([aten.round.decimals])
def round_dec(x, decimals=0):
    # Scale up by 10**decimals, round to integers, then scale back down.
    scale = 10.0**decimals
    return aten.round(x * scale) * (1.0 / scale)
188
+
189
+
190
@register_decomposition([aten.bmm])
@pw_cast_for_opmath
def bmm(self, batch2):
    """Lower degenerate batched matmuls (matrix-vector-like shapes) to a
    broadcast multiply + reduction; otherwise keep the op as-is."""
    if config.coordinate_descent_tuning and (
        self.shape[1] == 1 or batch2.shape[2] == 1
    ):
        return (self.unsqueeze(-1) * batch2.unsqueeze(1)).sum(dim=2)
    if self.device.type == "cpu" and self.size(1) == 1 and batch2.size(-1) == 1:
        # (b, 1, k) @ (b, k, 1): a batch of dot products.
        prod = self.squeeze(1) * batch2.squeeze(-1)
        return torch.sum(prod, dim=1, keepdim=True).unsqueeze(1)
    return NotImplemented
203
+
204
+
205
@register_decomposition([aten.addmm])
@pw_cast_for_opmath
def addmm(self, mat1, mat2, beta=1, alpha=1):
    """Decompose tiny CPU addmm shapes into pointwise ops Inductor fuses well."""
    if self.device.type != "cpu":
        return NotImplemented
    if mat1.size(0) == 1 and mat2.size(-1) == 1:
        # (1, k) @ (k, 1): a single dot product.
        dot = torch.sum(mat1.squeeze(0) * mat2.squeeze(-1), dim=0, keepdim=True)
        return alpha * dot.unsqueeze(0) + beta * self
    if mat1.size(0) == 1 and mat2.size(0) <= 16 and mat2.size(1) <= 16:
        # (1, k) @ small (k, n): broadcast-multiply and reduce over k.
        out = (mat1.T * mat2).sum(dim=0, keepdim=True)
        return alpha * out + beta * self
    return NotImplemented
218
+
219
+
220
@register_decomposition([aten.mm])
@pw_cast_for_opmath
def mm(self, input2):
    """Decompose matrix-vector-like mm shapes where a pointwise lowering wins."""
    from torch.fx.experimental.symbolic_shapes import (
        definitely_true,
        guard_size_oblivious,
    )

    # Our matrix vector multiplies only achieve peak bandwidth with
    # coordinate descent tuning; see the original TODO about investigating
    # and fixing that.
    if config.coordinate_descent_tuning and (
        self.shape[0] == 1 or input2.shape[1] == 1
    ):
        return (self.unsqueeze(2) * input2.unsqueeze(0)).sum(dim=1)
    if self.device.type == "cpu":
        # (m, 1) @ (1, n) for small total element counts: outer product as
        # a concat of scaled rows.
        is_small_outer = (
            guard_size_oblivious(self.size(-1) == 1)
            and guard_size_oblivious(self.size(0) > 0)
            and guard_size_oblivious(input2.size(0) == 1)
            and (self.dtype == input2.dtype)
            and definitely_true((torch.numel(self) + torch.numel(input2)) <= 32)
        )
        if is_small_outer:
            return torch.cat([self[i, :] * input2 for i in range(self.size(0))])
        # (1, k) @ (k, 1): a single dot product.
        if guard_size_oblivious(self.size(0) == 1) and guard_size_oblivious(
            input2.size(-1) == 1
        ):
            return torch.sum(
                self.squeeze(0) * input2.squeeze(-1), dim=0, keepdim=True
            ).unsqueeze(0)
    return NotImplemented
249
+
250
+
251
+ # This pass does two things:
252
+ # - Eliminate cat when there is only one tensor input
253
+ # - Normalize cat calls, so that legacy empty 1-D tensors are removed (NB: we
254
+ # don't remove ALL empty tensors, only the naughty ones)
255
@register_decomposition([aten.cat.default])
def cat(tensors, dim=0):
    """Normalize cat calls.

    - Eliminate cat when only one (surviving) tensor input remains.
    - Drop legacy empty 1-D tensor inputs (NB: only those naughty ones, not
      ALL empty tensors), re-dispatching recursively on the cleaned list.
    """
    from torch.fx.experimental.symbolic_shapes import guard_size_oblivious

    def keep(t):
        # For better or worse this is a valid cat:
        #
        #   torch.cat([torch.randn(2, 2, 4), torch.randn(0), torch.randn(3, 2, 4)])
        #
        # Drop such 1-D empties (guarding that survivors are non-zero) so
        # downstream passes like split_cat see a clean call.
        #
        # Size-obliviousness is fine here: a 1-D unbacked size could only
        # have reached this point if a deferred runtime assert already
        # pinned it to zero; any surviving unbacked size is known-nonzero.
        return len(t.shape) != 1 or guard_size_oblivious(t.shape[0] > 0)

    kept = [t for t in tensors if keep(t)]

    if len(kept) == 1:
        return kept[0].clone()
    if 1 < len(kept) < len(tensors):
        # Something was filtered on this call: redispatch recursively.
        return aten.cat.default(kept, dim)
    # Nothing was filtered -> decline, preventing infinite recursion.
    return NotImplemented
287
+
288
+
289
+ @register_decomposition([aten.angle])
290
+ def angle(x):
291
+ if x.is_complex():
292
+ return torch.where(
293
+ torch.isnan(x.real), float("nan"), torch.atan2(x.imag, x.real)
294
+ )
295
+
296
+ # when x is real number
297
+ # if x >= 0, return 0
298
+ # if x < 0, return pi
299
+ # if x is nan, return nan
300
+ _, dtype = elementwise_dtypes(
301
+ x,
302
+ type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
303
+ )
304
+ pi = torch.scalar_tensor(math.pi, dtype=dtype, device=x.device)
305
+ ret = torch.where(x < 0, pi, 0.0)
306
+ return torch.where(torch.isnan(x), float("nan"), ret)
307
+
308
+
309
@register_decomposition([aten.add])
def add(x, y, *, alpha=None):
    """Complex-tensor add, implemented by viewing both operands as their
    underlying real dtypes; declines unless both inputs are complex tensors."""
    both_complex = (
        torch.is_tensor(x)
        and x.is_complex()
        and torch.is_tensor(y)
        and y.is_complex()
    )
    if not both_complex:
        return NotImplemented
    scaled_y = y if alpha is None else alpha * y
    out_dtype = torch.promote_types(x.dtype, y.dtype)
    return (x.view(x.real.dtype) + scaled_y.view(y.real.dtype)).view(out_dtype)
320
+
321
+
322
+ @register_decomposition([aten.conj_physical])
323
+ def conj_physical(self):
324
+ assert not self.is_complex(), "TODO: implement this"
325
+ return self
326
+
327
+
328
+ @register_decomposition([aten.lift, aten.detach_])
329
+ def lift(self):
330
+ return self
331
+
332
+
333
+ @register_decomposition([aten.bernoulli.default])
334
+ def bernoulli(self, *, generator=None):
335
+ assert generator is None
336
+ return (torch.rand_like(self, dtype=torch.float32) < self).to(self.dtype)
337
+
338
+
339
@register_decomposition([aten.fmin, prims.fmin])
def fmin(self, other):
    # NaN-ignoring min: keep `self` whenever `other` is NaN or larger.
    keep_self = torch.isnan(other) | (other > self)
    return torch.where(keep_self, self, other)
342
+
343
+
344
@register_decomposition([aten.fmax, prims.fmax])
def fmax(self, other):
    # NaN-ignoring max: keep `self` whenever `other` is NaN or smaller.
    keep_self = torch.isnan(other) | (other < self)
    return torch.where(keep_self, self, other)
347
+
348
+
349
@register_decomposition(aten.amax)
def amax(self, dim=None, keepdim=False):
    # Boolean amax is logical `any`; other dtypes keep the default lowering.
    if self.dtype != torch.bool:
        return NotImplemented
    return torch.any(self, dim=dim, keepdim=keepdim)
354
+
355
+
356
@register_decomposition(aten.amin)
def amin(self, dim=None, keepdim=False):
    # Boolean amin is logical `all`; other dtypes keep the default lowering.
    if self.dtype != torch.bool:
        return NotImplemented
    return torch.all(self, dim=dim, keepdim=keepdim)
361
+
362
+
363
+ @register_decomposition([aten.narrow_copy])
364
+ def narrow_copy(self, dim, start, length):
365
+ return torch.narrow(self, dim, start, length).clone()
366
+
367
+
368
+ @register_decomposition([aten.expand_copy])
369
+ def expand_copy(self, size, *, implicit=False):
370
+ return aten.expand(self, size, implicit=implicit).clone()
371
+
372
+
373
+ @register_decomposition([aten.view_copy.default])
374
+ def view_copy_default(self, size):
375
+ return aten.view(self, size).clone()
376
+
377
+
378
+ @register_decomposition([aten.view_copy.dtype])
379
+ def view_copy_dtype(self, dtype):
380
+ return self.to(dtype).clone()
381
+
382
+
383
def get_like_layout(
    tensor: torch.Tensor, memory_format: Optional[torch.memory_format]
) -> torch.memory_format:
    """Resolve the effective memory format for a ``*_like`` op.

    ``preserve_format`` (or None) means "match the input", approximated via
    ``suggest_memory_format``; any other format is taken as-is.
    """
    # TODO: _to_copy tensor to stride permutation
    if memory_format is None or memory_format is torch.preserve_format:
        return utils.suggest_memory_format(tensor)
    return memory_format
391
+
392
+
393
+ @register_decomposition(aten.rand_like)
394
+ def rand_like(self, *, dtype=None, device=None, memory_format=None, **kwargs):
395
+ return torch.rand(
396
+ [*self.size()],
397
+ dtype=dtype or self.dtype,
398
+ device=device or self.device,
399
+ **kwargs,
400
+ ).to(memory_format=get_like_layout(self, memory_format))
401
+
402
+
403
+ @register_decomposition(aten.randn_like)
404
+ def randn_like(self, *, dtype=None, device=None, memory_format=None, **kwargs):
405
+ return torch.randn(
406
+ [*self.size()],
407
+ dtype=dtype or self.dtype,
408
+ device=device or self.device,
409
+ **kwargs,
410
+ ).to(memory_format=get_like_layout(self, memory_format))
411
+
412
+
413
+ @register_decomposition(aten.full_like)
414
+ def full_like(
415
+ self,
416
+ fill_value,
417
+ *,
418
+ dtype=None,
419
+ layout=None,
420
+ device=None,
421
+ pin_memory=False,
422
+ requires_grad=False,
423
+ memory_format=torch.preserve_format,
424
+ ):
425
+ return torch.full(
426
+ [*self.size()],
427
+ fill_value,
428
+ dtype=dtype or self.dtype,
429
+ layout=layout or self.layout,
430
+ device=device or self.device,
431
+ requires_grad=requires_grad,
432
+ ).to(memory_format=get_like_layout(self, memory_format))
433
+
434
+
435
+ @register_decomposition(aten.randint_like.default)
436
+ def randint_like(self, high, *, dtype=None, device=None, memory_format=None, **kwargs):
437
+ return aten.randint.low(
438
+ 0,
439
+ high,
440
+ [*self.size()],
441
+ dtype=dtype or self.dtype,
442
+ device=device or self.device,
443
+ **kwargs,
444
+ ).to(memory_format=get_like_layout(self, memory_format))
445
+
446
+
447
+ @register_decomposition(aten.randint_like.low_dtype)
448
+ def randint_like_low(
449
+ self, low, high, *, dtype=None, device=None, memory_format=None, **kwargs
450
+ ):
451
+ return aten.randint.low(
452
+ low,
453
+ high,
454
+ [*self.size()],
455
+ dtype=dtype or self.dtype,
456
+ device=device or self.device,
457
+ **kwargs,
458
+ ).to(memory_format=get_like_layout(self, memory_format))
459
+
460
+
461
+ @register_decomposition(aten.randint.default)
462
+ def randint(high, size, **kwargs):
463
+ return aten.randint.low(0, high, size, **kwargs)
464
+
465
+
466
+ # The difference between quantize_per_tensor.default and quantize_per_tensor.tensor is
467
+ # scale and zero_point is scalar or scalar tensor
468
+ @register_decomposition(quantized_decomposed.quantize_per_tensor.default)
469
+ def quantize_per_tensor_default_decomp_impl(
470
+ input: torch.Tensor,
471
+ scale: float,
472
+ zero_point: int,
473
+ quant_min: int,
474
+ quant_max: int,
475
+ dtype: torch.dtype,
476
+ ) -> torch.Tensor:
477
+ if input.dtype == torch.bfloat16:
478
+ input = input.to(torch.float32)
479
+ inv_scale = 1.0 / scale
480
+ return torch.clamp(
481
+ torch.round(input * inv_scale) + zero_point, quant_min, quant_max
482
+ ).to(dtype)
483
+
484
+
485
+ # The difference between dequantize_per_tensor.default and dequantize_per_tensor.tensor is
486
+ # scale and zero_point is scalar or scalar tensor
487
+ @register_decomposition(quantized_decomposed.dequantize_per_tensor.default)
488
+ def dequantize_per_tensor_default_decomp_impl(
489
+ input: torch.Tensor,
490
+ scale: float,
491
+ zero_point: int,
492
+ quant_min: int,
493
+ quant_max: int,
494
+ dtype: torch.dtype,
495
+ ) -> torch.Tensor:
496
+ return (input.to(torch.float32) - zero_point) * scale
497
+
498
+
499
+ @register_decomposition(quantized_decomposed.quantize_per_tensor.tensor)
500
+ def quantize_per_tensor_tensor_decomp_impl(
501
+ input: torch.Tensor,
502
+ scale: torch.Tensor,
503
+ zero_point: torch.Tensor,
504
+ quant_min: int,
505
+ quant_max: int,
506
+ dtype: torch.dtype,
507
+ ) -> torch.Tensor:
508
+ if input.dtype == torch.bfloat16:
509
+ input = input.to(torch.float32)
510
+ inv_scale = 1.0 / scale
511
+ return torch.clamp(
512
+ torch.round(input * inv_scale) + zero_point, quant_min, quant_max
513
+ ).to(dtype)
514
+
515
+
516
+ @register_decomposition(quantized_decomposed.dequantize_per_tensor.tensor)
517
+ def dequantize_per_tensor_tensor_decomp_impl(
518
+ input: torch.Tensor,
519
+ scale: torch.Tensor,
520
+ zero_point: torch.Tensor,
521
+ quant_min: int,
522
+ quant_max: int,
523
+ dtype: torch.dtype,
524
+ ) -> torch.Tensor:
525
+ return (input.to(torch.float32) - zero_point.to(torch.int32)) * scale.to(
526
+ torch.float32
527
+ )
528
+
529
+
530
+ @register_decomposition(torch.ops.quantized.embedding_bag_byte_unpack)
531
+ def q_embedding_bag_byte_unpack_decomp(packed):
532
+ def bitcast_u8_to_f32(u8):
533
+ x, y, z, w = (u8[..., n].to(torch.int32) for n in (0, 1, 2, 3))
534
+ if sys.byteorder == "little":
535
+ return (x + (y << 8) + (z << 16) + (w << 24)).view(torch.float32)[..., None]
536
+ else:
537
+ return ((x << 24) + (y << 16) + (z << 8) + w).view(torch.float32)[..., None]
538
+
539
+ scales = bitcast_u8_to_f32(packed[..., -8:-4])
540
+ offsets = bitcast_u8_to_f32(packed[..., -4:])
541
+ return packed[..., :-8].to(torch.float32) * scales + offsets
542
+
543
+
544
+ @register_decomposition([aten.grid_sampler_2d])
545
+ @pw_cast_for_opmath
546
+ def grid_sampler_2d(
547
+ a: torch.Tensor,
548
+ grid: torch.Tensor,
549
+ interpolation_mode: int = 0,
550
+ padding_mode: int = 0,
551
+ align_corners: bool = False,
552
+ ) -> torch.Tensor:
553
+ # We do not expand the grid (_expand_grid=False) on cpu for performance reasons
554
+ # Experimenting locally it was found that compiled CUDA code is accelerated by ~5x
555
+ # and CPU code by ~2x on bicubic mode, if we expand the grid from (N, H, W, 2) into (N, C, H, W, 2)
556
+ # However, this leads to a slowdown around ~0.8x on CPU bilinear mode, channels first.
557
+ # Thus we apply this hack to not expand the grid for this case.
558
+ _expand_grid = not (
559
+ a.device == torch.device("cpu")
560
+ and interpolation_mode == 0
561
+ and a.is_contiguous(memory_format=torch.contiguous_format)
562
+ )
563
+
564
+ output = decomp_grid_sampler_2d(
565
+ a,
566
+ grid=grid,
567
+ interpolation_mode=interpolation_mode,
568
+ padding_mode=padding_mode,
569
+ align_corners=align_corners,
570
+ _expand_grid=_expand_grid,
571
+ )
572
+ return output
573
+
574
+
575
+ @register_decomposition(aten._foreach_addcmul.Scalar)
576
+ def _foreach_addcmul_scalar(self, left_tensors, right_tensors, scalar=1):
577
+ return aten._foreach_add.List(
578
+ self, aten._foreach_mul.List(left_tensors, right_tensors), alpha=scalar
579
+ )
580
+
581
+
582
+ @register_decomposition(aten._foreach_addcdiv.Scalar)
583
+ def _foreach_addcdiv_scalar(self, left_tensors, right_tensors, scalar=1):
584
+ return aten._foreach_add.List(
585
+ self, aten._foreach_div.List(left_tensors, right_tensors), alpha=scalar
586
+ )
587
+
588
+
589
+ @register_decomposition(aten._foreach_lerp.Scalar)
590
+ def _foreach_lerp_scalar(start_tensors, end_tensors, weight):
591
+ return aten._foreach_add.List(
592
+ start_tensors,
593
+ aten._foreach_mul.Scalar(
594
+ aten._foreach_sub.List(end_tensors, start_tensors), weight
595
+ ),
596
+ )
597
+
598
+
599
@aten.miopen_batch_norm.default.py_impl(torch._C.DispatchKey.Autograd)
@register_decomposition(aten.miopen_batch_norm)
def miopen_batch_norm(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: typing.Optional[torch.Tensor],
    running_mean: typing.Optional[torch.Tensor],
    running_var: typing.Optional[torch.Tensor],
    training: bool,
    exponential_average_factor: float,
    epsilon: float,
):
    """Lower miopen_batch_norm to native_batch_norm.

    In inference mode the saved mean/invstd outputs are not meaningful, so
    empty placeholder tensors are returned in their place.
    """
    out, save_mean, save_invstd = aten.native_batch_norm(
        input,
        weight,
        bias,
        running_mean,
        running_var,
        training,
        exponential_average_factor,
        epsilon,
    )

    if training:
        return (out, save_mean, save_invstd)
    return (out, weight.new_zeros((0,)), weight.new_zeros((0,)))
629
+
630
+
631
@functools.lru_cache(None)
def fast_random_decomps():
    """Decomp table with Inductor's fast-RNG decomps layered on top (cached)."""
    table = dict(decompositions)
    table.update(extra_random_decomps)
    return table
634
+
635
+
636
def select_decomp_table():
    """Decomps can change based on config: with fallback_random the plain
    table is used, otherwise the fast-RNG-augmented one."""
    return decompositions if config.fallback_random else fast_random_decomps()
641
+
642
+
643
@register_decomposition(aten.masked_scatter)
def masked_scatter(self, mask, source):
    """CUDA-only two-step masked_scatter, matching eager CUDA; eager CPU
    uses a one-shot serial iteration, so non-CUDA declines."""
    if self.device.type != "cuda":
        return NotImplemented
    self, mask = aten.broadcast_tensors([self, mask])
    # Position of each selected element within `source`, via inclusive
    # cumsum shifted down by one.
    source_idx = mask.reshape(-1).cumsum(0) - 1
    return inductor_prims.masked_scatter_with_index(self, mask, source_idx, source)
652
+
653
+
654
@register_decomposition(quantized_decomposed.choose_qparams.tensor)
def choose_qparams_tensor(
    input: torch.Tensor, quant_min: int, quant_max: int, eps: float, dtype: torch.dtype
):
    """Compute (scale, zero_point) for affine quantization of ``input``."""
    lo, hi = torch.aminmax(input)
    raw_scale = (hi - lo) / float(quant_max - quant_min)
    # Clamp the scale away from zero.
    # NOTE(review): torch.Tensor([eps]) lives on the default (CPU) device;
    # presumably fine as traced, but confirm behavior for CUDA inputs.
    scale = torch.max(raw_scale, torch.Tensor([eps]))
    zero_point = quant_min - torch.round(lo / scale).to(torch.int)
    zero_point = torch.clamp(zero_point, quant_min, quant_max)
    return scale.to(torch.float64), zero_point.to(torch.int64)
664
+
665
+
666
@register_decomposition(aten.put)
def put(self, index, source, accumulate=False):
    """Out-of-place put: scatter ``source`` into a flattened copy of
    ``self`` at ``index``, then restore the original shape."""
    flat = torch.index_put(
        self.flatten(), [index], source.reshape(index.shape), accumulate
    )
    return flat.reshape(self.shape)
673
+
674
+
675
@register_decomposition(aten.put_)
def put_(self, index, source, accumulate=False):
    """In-place put, implemented via the out-of-place decomposition plus copy_."""
    return self.copy_(aten.put(self, index, source, accumulate=accumulate))
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/test_operators.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.library
2
+ from torch import Tensor
3
+ from torch.autograd import Function
4
+
5
+ _test_lib_def = torch.library.Library("_inductor_test", "DEF")
6
+ _test_lib_def.define("realize(Tensor self) -> Tensor", tags=torch.Tag.pt2_compliant_tag)
7
+
8
+ _test_lib_impl = torch.library.Library("_inductor_test", "IMPL")
9
+ for dispatch_key in ("CPU", "CUDA", "Meta"):
10
+ _test_lib_impl.impl("realize", lambda x: x.clone(), dispatch_key)
11
+
12
+
13
class Realize(Function):
    """Autograd wrapper for the test-only ``_inductor_test.realize`` op.

    Forward routes through the custom op; backward is the identity, since
    realize itself is an identity (clone) operation.
    """

    @staticmethod
    def forward(ctx, x):
        return torch.ops._inductor_test.realize(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Identity op: gradient passes straight through.
        return grad_output
21
+
22
+
23
def realize(x: Tensor) -> Tensor:
    """Force ``x`` through the autograd-aware test realize op."""
    return Realize.apply(x)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/virtualized.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file provides a number of "global" variables/handlers that are actually
3
+ thread local and dynamically scoped, with Inductor patching them to various
4
+ implementations depending on the situation.
5
+
6
+ These handlers are interacted with in a fairly stylized way. Typically,
7
+ we will import V from this module::
8
+
9
+ from .virtualized import V
10
+
11
+ Various handlers are accessible as attributes on this module; for example,
12
+ you might access ``V.graph.sizevars.size_hint`` to resolve a size hint associated with
13
+ a number.
14
+
15
+ There are a few distinct usage patterns for virtualized global variables:
16
+
17
+ 1. Implicit argument passing. Examples: ``V.current_node``, ``V.aot_compilation``.
18
+ Use ``V.set_current_node`` to change what the current node is while we're
19
+ executing some region of code, so code inside that region can query ``V.current_node``
20
+ to find out what it is. This is often more convenient than manually threading
21
+ the current node as an argument through all call stacks.
22
+
23
+ 2. Per-compilation global state. Examples: ``V.fake_mode``, ``V.graph``. For a
24
+ given ``compile_fx`` invocation, these typically don't change, but they are
25
+ associated with some internal state so they cannot just be global functions.
26
+ We install these objects at the beginning of compilation and then you can
27
+ conveniently access them without having to pass them around.
28
+
29
+ 3. Alternate define-by-run interpretations. Examples: ``V.ops``, ``V.kernel``.
30
+ A commonly used IR in Inductor is define-by-run: instead of maintaining
31
+ explicit syntax data structures, we instead represent loop bodies as
32
+ callable functions, which internally invoke operations defined on
33
+ ``V.ops``. To perform semantic analysis, print or code generate these
34
+ operations, we dynamically patch ``V.ops`` with an alternate handler with
35
+ the intended semantics and then run the callable function. For example, to
36
+ extract out a traditional (FX) graph representation of the define-by-run
37
+ IR, simply install a handler that records each ``ops`` call to a graph.
38
+
39
+ TODO: Define a parent class / protocol that defines all of the operations
40
+ V.ops is expected to support.
41
+
42
+ It is typically an error to access a virtualized global without having installed
43
+ an appropriate handler (you will get a NullHandler), although in some cases we
44
+ provide a default implementation.
45
+
46
+ One last thing: although most virtualized globals are accessed via ``V``, ``ops`` is
47
+ ubiquitous enough to have its own top level variable, so you will typically see
48
+ ``ops.constant(...)`` rather than ``V.ops.constant(...)``. In fact, these are not
49
+ equivalent; the former interface supports arithmetic overloads like ``x + y``
50
+ instead of forcing ``ops.add(x, y)``, so it should be preferred.
51
+
52
+ Some operators are seemingly unused, but they are implicitly used by ops_wrapper.
53
+ In particular, we typically have an operator for every basic pointwise PyTorch operation
54
+ supported.
55
+ """
56
+
57
+ from __future__ import annotations
58
+
59
+ from contextlib import AbstractContextManager, contextmanager
60
+ from threading import local
61
+ from typing import Any, Callable, Generic, List, Type, TYPE_CHECKING, TypeVar, Union
62
+
63
+ from .ops_handler import ( # noqa: F401
64
+ KernelFormatterHandler,
65
+ MockHandler,
66
+ OpsHandler,
67
+ ReductionType,
68
+ StoreMode,
69
+ WrapperHandler,
70
+ )
71
+
72
+ if TYPE_CHECKING:
73
+ import torch
74
+ from torch._inductor.debug import DebugContext
75
+ from torch._inductor.graph import GraphLowering
76
+ from torch._inductor.ir import InterpreterShim
77
+ from torch._subclasses import FakeTensorMode
78
+
79
+ threadlocal = local()
80
+
81
+ T = TypeVar("T")
82
+
83
+
84
+ class NullHandler:
85
+ """
86
+ Sentinel indicating that a global variable is unset ala None. Typically,
87
+ attempting to access the global variable before it's set is an error, but with
88
+ NullHandler it won't fail until you try to access an attribute on it.
89
+ """
90
+
91
+ pass
92
+
93
+
94
class Virtualized(Generic[T]):
    """
    Implements a global variable that redirects via thread local variable
    (NB: construct this class to create the global variable; this is not
    a singleton class!)

    This allows us to swap in different op implementations in codegen.

    NB: Despite the fact that we typically call these "handlers" (e.g.,
    NullHandler is the default value of the variable), we sometimes use
    these variables to store other things, like booleans.
    """

    def __init__(self, vname: str, default: Union[Callable[[], T], Type[NullHandler]]):
        # Attribute name used on the thread-local storage object.
        self._key: str = f"__torchinductor_{vname}"
        self._default = default

    def _set_handler(self, value: T) -> AbstractContextManager[None]:
        """Install ``value`` for this thread; the returned context manager
        restores the previously installed handler on exit."""
        prior = self._get_handler()
        setattr(threadlocal, self._key, value)

        @contextmanager
        def restore_prior():
            try:
                yield
            finally:
                self._set_handler(prior)

        return restore_prior()

    def _get_handler(self) -> T:
        """Return the handler installed on this thread, or the default."""
        try:
            return getattr(threadlocal, self._key)
        except AttributeError:
            # TODO: To be honest, I feel we probably should just error in
            # this case, instead of making a null handler that will
            # probably error when you getattr on it
            return self._default()  # type: ignore[return-value]

    def __getattr__(self, name: str) -> Any:
        # Forward attribute access to whatever handler is current.
        return getattr(self._get_handler(), name)
135
+
136
+
137
+ class NullKernelHandler(NullHandler):
138
+ """
139
+ We need access `V.kernel.removed_buffers` in DeferredLine class when there
140
+ is no kernel in the context. This happens when codegening the wrapper.
141
+ Initialize `removed_buffers` and `inplaced_to_remove` explicitly so we don't
142
+ need call 'getattr' with default value which is error prone to typo in
143
+ attribute name.
144
+ """
145
+
146
+ def __init__(self):
147
+ super().__init__()
148
+ self.removed_buffers = set()
149
+ self.inplaced_to_remove = set()
150
+ self.index_dtype = "tl.int64"
151
+
152
+
153
+ _ops: Virtualized[OpsHandler[Any]] = Virtualized("ops", MockHandler)
154
+ _graph: Virtualized[GraphLowering] = Virtualized("graph", NullHandler)
155
+ _real_inputs: Virtualized[List[torch.Tensor]] = Virtualized("real_inputs", NullHandler)
156
+ _fake_mode: Virtualized[FakeTensorMode] = Virtualized("fake_mode", NullHandler)
157
+ _kernel: Virtualized[NullKernelHandler] = Virtualized(
158
+ "kernel", NullKernelHandler
159
+ ) # TODO: improve type
160
+ _debug: Virtualized[DebugContext] = Virtualized("debug", NullHandler)
161
+ _interpreter: Virtualized[InterpreterShim] = Virtualized("interpreter", NullHandler)
162
+ _aot_compilation: Virtualized[bool] = Virtualized("aot_compilation", NullHandler)
163
+ _current_node: Virtualized[torch.fx.Node] = Virtualized("current_node", NullHandler)
164
+
165
+
166
+ class OpsValue:
167
+ """The return type of most ops calls.
168
+
169
+ This exists so we can overload magic methods, and write mathematical
170
+ expressions much more fluently. So instead of
171
+
172
+ ops.add(ops.mul(ops.mul(ops.sub(ops.mul(_Ap2, x), _Ap3), x), x), _1)
173
+
174
+ we can write
175
+
176
+ (_Ap2 * x - _Ap3) * x * x + _1
177
+
178
+ """
179
+
180
+ value: Any
181
+
182
+ def __init__(self, value):
183
+ self.value = value
184
+
185
+ def __str__(self):
186
+ return str(self.value)
187
+
188
+ def __repr__(self):
189
+ return f"OpsValue({self.value!r})"
190
+
191
+ def __add__(self, other):
192
+ return ops.add(self, other)
193
+
194
+ def __mul__(self, other):
195
+ return ops.mul(self, other)
196
+
197
+ def __sub__(self, other):
198
+ return ops.sub(self, other)
199
+
200
+ def __neg__(self):
201
+ return ops.neg(self)
202
+
203
+ def __truediv__(self, other):
204
+ return ops.truediv(self, other)
205
+
206
+ def __floordiv__(self, other):
207
+ return ops.floordiv(self, other)
208
+
209
+ def __mod__(self, other):
210
+ return ops.mod(self, other)
211
+
212
+ def __pow__(self, other):
213
+ return ops.pow(self, other)
214
+
215
+ def __lt__(self, other):
216
+ return ops.lt(self, other)
217
+
218
+ def __le__(self, other):
219
+ return ops.le(self, other)
220
+
221
+ def __eq__(self, other):
222
+ return ops.eq(self, other)
223
+
224
+ def __ne__(self, other):
225
+ return ops.ne(self, other)
226
+
227
+ def __gt__(self, other):
228
+ return ops.gt(self, other)
229
+
230
+ def __ge__(self, other):
231
+ return ops.ge(self, other)
232
+
233
+ def __and__(self, other):
234
+ return ops.bitwise_and(self, other)
235
+
236
+ def __or__(self, other):
237
+ return ops.bitwise_or(self, other)
238
+
239
+ def __xor__(self, other):
240
+ return ops.bitwise_xor(self, other)
241
+
242
+ def __invert__(self):
243
+ return ops.bitwise_not(self)
244
+
245
+ def __rshfit__(self, n):
246
+ return ops.bitwise_right_shift(self, n)
247
+
248
+ def __lshift__(self, n):
249
+ return ops.bitwise_left_shift(self, n)
250
+
251
+
252
class OpsWrapper:
    """Wraps IR values returned from ``_ops`` calls in ``OpsValue`` instances
    so mathematical expressions can be written with overloaded magic methods."""

    def __getattr__(self, name):
        def dispatch(*args, **kwargs):
            unwrapped_args = [OpsWrapper._unwrap(a) for a in args]
            unwrapped_kwargs = {
                k: OpsWrapper._unwrap(v) for k, v in kwargs.items()
            }
            raw = getattr(_ops, name)(*unwrapped_args, **unwrapped_kwargs)
            return OpsWrapper._wrap(raw)

        return dispatch

    @staticmethod
    def _unwrap(x):
        if isinstance(x, (list, tuple)):
            return tuple(map(OpsWrapper._unwrap, x))
        return x.value if isinstance(x, OpsValue) else x

    @staticmethod
    def _wrap(x):
        if isinstance(x, (list, tuple)):
            return tuple(OpsValue(v) for v in x)
        return OpsValue(x)

    @staticmethod
    def indirect_indexing(index, size, check=True):
        # Returns a sympy value, not an IR value, so the result stays
        # unwrapped.
        return _ops.indirect_indexing(OpsWrapper._unwrap(index), size, check)
284
+
285
+
286
+ ops = OpsWrapper()
287
+
288
+
289
+ class _V:
290
+ MockHandler = MockHandler
291
+ KernelFormatterHandler = KernelFormatterHandler
292
+ WrapperHandler = WrapperHandler
293
+
294
+ set_ops_handler: Callable[[Any], Any] = _ops._set_handler
295
+ get_ops_handler: Callable[[], Any] = _ops._get_handler
296
+ set_graph_handler: Callable[[GraphLowering], Any] = _graph._set_handler
297
+ set_real_inputs: Callable[[Any], Any] = _real_inputs._set_handler
298
+ get_real_inputs: Callable[[], Any] = _real_inputs._get_handler
299
+ set_fake_mode: Callable[[Any], Any] = _fake_mode._set_handler
300
+ get_fake_mode: Callable[[], Any] = _fake_mode._get_handler
301
+ set_kernel_handler: Callable[[Any], Any] = _kernel._set_handler
302
+ set_debug_handler: Callable[[Any], Any] = _debug._set_handler
303
+ set_interpreter_handler: Callable[[Any], Any] = _interpreter._set_handler
304
+ set_aot_compilation: Callable[[bool], Any] = _aot_compilation._set_handler
305
+ get_aot_compilation: Callable[[], Any] = _aot_compilation._get_handler
306
+ set_current_node: Callable[[Any], Any] = _current_node._set_handler
307
+ get_current_node: Callable[[], Any] = _current_node._get_handler
308
+
309
+ @property
310
+ def ops(self) -> OpsHandler[Any]:
311
+ """The operator handler specific to the current codegen task"""
312
+ return _ops._get_handler()
313
+
314
+ @property
315
+ def graph(self) -> GraphLowering:
316
+ """The graph currently being generated"""
317
+ return _graph._get_handler()
318
+
319
+ @property
320
+ def real_inputs(self):
321
+ """non-fake example inputs"""
322
+ return _real_inputs._get_handler()
323
+
324
+ @property
325
+ def fake_mode(self):
326
+ """The graph currently being generated"""
327
+ return _fake_mode._get_handler()
328
+
329
+ @property
330
+ def kernel(self):
331
+ """The kernel currently being generated"""
332
+ return _kernel._get_handler()
333
+
334
+ @property
335
+ def debug(self):
336
+ return _debug._get_handler()
337
+
338
+ @property
339
+ def interpreter(self):
340
+ return _interpreter._get_handler()
341
+
342
+ @property
343
+ def aot_compilation(self):
344
+ return _aot_compilation._get_handler()
345
+
346
+ @property
347
+ def current_node(self):
348
+ return _current_node._get_handler()
349
+
350
+
351
+ V = _V()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Activation.h ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+ #include <c10/util/Exception.h>
5
+ #include <c10/util/string_view.h>
6
+
7
+ namespace c10 {
8
+ class Scalar;
9
+ }
10
+
11
+ namespace at {
12
+ struct TensorIterator;
13
+ struct TensorIteratorBase;
14
+ class TensorBase;
15
+ }
16
+
17
+ namespace at::native {
18
+
19
+ // These constants control the approximation behavior of gelu function.
20
+ enum class GeluType {
21
+ None, // Baseline Gelu
22
+ Tanh, // Tahn Gelu Approximation
23
+ END
24
+ };
25
+
26
+ static GeluType get_gelutype_enum(const c10::string_view approximate) {
27
+ if (approximate == "none") {
28
+ return GeluType::None;
29
+ } else if (approximate == "tanh") {
30
+ return GeluType::Tanh;
31
+ } else {
32
+ TORCH_CHECK(false, "approximate argument must be either none or tanh.");
33
+ }
34
+ }
35
+
36
+ static std::string gelutype_to_string(const GeluType type) {
37
+ switch(type) {
38
+ case GeluType::None: return "none";
39
+ case GeluType::Tanh: return "tanh";
40
+ default: TORCH_CHECK(false, "unknown GELU type: ", static_cast<int>(type));
41
+ }
42
+ }
43
+
44
+ using structured_activation_fn = void (*)(TensorIteratorBase&);
45
+ using structured_activation_backward_fn = void (*)(TensorIteratorBase&);
46
+
47
+ using activation_fn = void (*)(TensorIterator&);
48
+ using activation_backward_fn = void (*)(TensorIterator&);
49
+ using softplus_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&);
50
+ using softplus_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&);
51
+ using threshold_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&);
52
+ using hardtanh_backward_fn = void (*)(TensorIterator&, const c10::Scalar&, const c10::Scalar&);
53
+ using hardsigmoid_fn = void(*)(TensorIteratorBase&);
54
+ using hardsigmoid_backward_fn = void(*)(TensorIteratorBase&);
55
+ using hardswish_fn = void(*)(TensorIterator&);
56
+ using hardswish_backward_fn = void(*)(TensorIterator&);
57
+ using shrink_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
58
+ using softshrink_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
59
+ using shrink_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
60
+ using elu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&, const c10::Scalar&);
61
+ using elu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&, const c10::Scalar&, bool);
62
+ using leaky_relu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
63
+ using leaky_relu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
64
+ using log_sigmoid_cpu_fn = void (*)(TensorBase&, TensorBase&, const TensorBase&);
65
+ using gelu_fn = void (*)(TensorIteratorBase&, GeluType);
66
+ using gelu_backward_fn = void (*)(TensorIteratorBase&, GeluType);
67
+ using glu_jvp_fn = void (*)(TensorIteratorBase&);
68
+
69
+ DECLARE_DISPATCH(elu_fn, elu_stub);
70
+ DECLARE_DISPATCH(elu_backward_fn, elu_backward_stub);
71
+ DECLARE_DISPATCH(softplus_fn, softplus_stub);
72
+ DECLARE_DISPATCH(softplus_backward_fn, softplus_backward_stub);
73
+ DECLARE_DISPATCH(log_sigmoid_cpu_fn, log_sigmoid_cpu_stub);
74
+ DECLARE_DISPATCH(activation_backward_fn, log_sigmoid_backward_stub);
75
+ DECLARE_DISPATCH(threshold_fn, threshold_stub);
76
+ DECLARE_DISPATCH(gelu_fn, GeluKernel);
77
+ DECLARE_DISPATCH(gelu_backward_fn, GeluBackwardKernel);
78
+ DECLARE_DISPATCH(hardtanh_backward_fn, hardtanh_backward_stub);
79
+ DECLARE_DISPATCH(hardsigmoid_fn, hardsigmoid_stub);
80
+ DECLARE_DISPATCH(hardsigmoid_backward_fn, hardsigmoid_backward_stub);
81
+ DECLARE_DISPATCH(hardswish_fn, hardswish_stub);
82
+ DECLARE_DISPATCH(hardswish_backward_fn, hardswish_backward_stub);
83
+ DECLARE_DISPATCH(shrink_fn, hardshrink_stub);
84
+ DECLARE_DISPATCH(softshrink_fn, softshrink_stub);
85
+ DECLARE_DISPATCH(shrink_backward_fn, shrink_backward_stub);
86
+ DECLARE_DISPATCH(leaky_relu_fn, leaky_relu_stub);
87
+ DECLARE_DISPATCH(leaky_relu_backward_fn, leaky_relu_backward_stub);
88
+ DECLARE_DISPATCH(structured_activation_fn, glu_stub);
89
+ DECLARE_DISPATCH(activation_backward_fn, glu_backward_stub);
90
+ DECLARE_DISPATCH(glu_jvp_fn, glu_jvp_stub);
91
+ DECLARE_DISPATCH(structured_activation_fn, silu_stub);
92
+ DECLARE_DISPATCH(structured_activation_backward_fn, silu_backward_stub);
93
+ DECLARE_DISPATCH(structured_activation_fn, mish_stub);
94
+ DECLARE_DISPATCH(activation_backward_fn, mish_backward_stub);
95
+ DECLARE_DISPATCH(activation_fn, prelu_stub);
96
+ DECLARE_DISPATCH(activation_backward_fn, prelu_backward_stub);
97
+
98
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AdaptivePooling.h ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+ #include <c10/util/ArrayRef.h>
6
+ #include <c10/util/irange.h>
7
+ #include <cmath>
8
+
9
+ namespace at::native {
10
+
11
+ using adaptive_avg_pooling_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size);
12
+ using adaptive_avg_pooling_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output);
13
+ DECLARE_DISPATCH(adaptive_avg_pooling_fn, adaptive_avg_pool2d_kernel);
14
+ DECLARE_DISPATCH(adaptive_avg_pooling_backward_fn, adaptive_avg_pool2d_backward_kernel);
15
+
16
+ using adaptive_max_pooling_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size);
17
+ using adaptive_max_pooling_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);
18
+ DECLARE_DISPATCH(adaptive_max_pooling_fn, adaptive_max_pool2d_kernel);
19
+ DECLARE_DISPATCH(adaptive_max_pooling_backward_fn, adaptive_max_pool2d_backward_kernel);
20
+
21
+ static inline int64_t start_index(int64_t a, int64_t b, int64_t c) {
22
+ return (a / b) * c + ((a % b) * c) / b;
23
+ }
24
+
25
+ static inline int64_t end_index(int64_t a, int64_t b, int64_t c) {
26
+ return 1 + ((a + 1) * c - 1) / b;
27
+ }
28
+
29
+ static inline void adaptive_pool_empty_output_check(const Tensor& gradOutput_, const char* arg_name) {
30
+ int64_t ndim = gradOutput_.ndimension();
31
+ for (const auto i : c10::irange(1, ndim)) {
32
+ TORCH_CHECK(gradOutput_.size(i) > 0,
33
+ arg_name, "(): Expected grad_output to have non-zero size for non-batch dimensions, "
34
+ "but grad_output has sizes ", gradOutput_.sizes(), " with dimension ", i,
35
+ " being empty");
36
+ }
37
+ }
38
+
39
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/native/TypeProperties.h>
5
+ #include <ATen/ScalarOps.h>
6
+
7
+ #ifndef AT_PER_OPERATOR_HEADERS
8
+ #include <ATen/NativeFunctions.h>
9
+ #else
10
+ #include <ATen/ops/result_type.h>
11
+ #endif
12
+
13
+ namespace at::native {
14
+
15
+ // original values given by raw_*. If an original value is not contiguous, will make a contiguous copy to
16
+ // the corresponding trimmed_* value. Additionally, if the dtypes of the boundary and input tensor do not
17
+ // match, will change them to be a common super type so comparisons are done between the same types.
18
+ // For any trimmed_* tensor, if its outgoing value matches what it was incoming (typically null), then the
19
+ // corresponding raw_* version should be used since it was already contiguous of the right type.
20
+ inline void searchsorted_maybe_trim_input_tensors(
21
+ Tensor& trimmed_input,
22
+ Tensor& trimmed_boundaries,
23
+ Tensor& trimmed_sorter,
24
+ const Tensor& raw_input,
25
+ const Tensor& raw_boundaries,
26
+ const Tensor& raw_sorter) {
27
+ bool in_is_contiguous = raw_input.is_contiguous();
28
+ bool bd_is_contiguous = raw_boundaries.is_contiguous();
29
+ bool sort_is_contiguous = raw_sorter.is_contiguous();
30
+
31
+ if (!in_is_contiguous) {
32
+ TORCH_WARN_ONCE("torch.searchsorted(): input value tensor is non-contiguous, this will lower the performance due "
33
+ "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous input value "
34
+ "tensor if possible. This message will only appear once per program.");
35
+ trimmed_input = raw_input.contiguous();
36
+ }
37
+ if (!bd_is_contiguous) {
38
+ TORCH_WARN_ONCE("torch.searchsorted(): boundary tensor is non-contiguous, this will lower the performance due "
39
+ "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous boundary "
40
+ "tensor if possible. This message will only appear once per program.");
41
+ trimmed_boundaries = raw_boundaries.contiguous();
42
+ }
43
+ if (!sort_is_contiguous) {
44
+ TORCH_WARN_ONCE("torch.searchsorted(): sorter tensor is non-contiguous, this will lower the performance due "
45
+ "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous sorter "
46
+ "tensor if possible. This message will only appear once per program.");
47
+ trimmed_sorter = raw_sorter.contiguous();
48
+ }
49
+ if (raw_input.dtype() != raw_boundaries.dtype()) {
50
+ at::native::ResultTypeState state = {};
51
+ state = at::native::update_result_type_state(raw_boundaries, state);
52
+ state = at::native::update_result_type_state(raw_input, state);
53
+ ScalarType common_stype = at::native::result_type(state);
54
+
55
+ TORCH_INTERNAL_ASSERT(common_stype != ScalarType::Undefined);
56
+ if (common_stype != raw_input.scalar_type()) {
57
+ trimmed_input = in_is_contiguous ? raw_input.to(common_stype) : trimmed_input.to(common_stype);
58
+ }
59
+ if (common_stype != raw_boundaries.scalar_type()) {
60
+ trimmed_boundaries = bd_is_contiguous ? raw_boundaries.to(common_stype) : trimmed_boundaries.to(common_stype);
61
+ }
62
+ }
63
+ }
64
+
65
+ /* unused but needed for internal jagged tensor class */
66
+ inline void searchsorted_maybe_trim_input_tensors(
67
+ Tensor& trimmed_input,
68
+ Tensor& trimmed_boundaries,
69
+ const Tensor& raw_input,
70
+ const Tensor& raw_boundaries) {
71
+ Tensor trimmed_sorter;
72
+ Tensor raw_sorter;
73
+ return searchsorted_maybe_trim_input_tensors(
74
+ trimmed_input,
75
+ trimmed_boundaries,
76
+ trimmed_sorter,
77
+ raw_input,
78
+ raw_boundaries,
79
+ raw_sorter);
80
+ }
81
+
82
+ inline bool searchsorted_dims_matched_before_last_dim(const Tensor& boundaries, const Tensor& input) {
83
+ if (boundaries.dim() != input.dim()) {
84
+ return false;
85
+ }
86
+ const auto& dims_bd = boundaries.sizes();
87
+ const auto& dims_in = input.sizes();
88
+ for (int64_t dim = 0; dim + 1 < boundaries.dim(); ++dim) {
89
+ if (dims_bd[dim] != dims_in[dim]) {
90
+ return false;
91
+ }
92
+ }
93
+ return true;
94
+ }
95
+
96
+ inline Tensor searchsorted_scalar_tensor(const Scalar& scalar, const c10::Device& device) {
97
+ auto tensor = c10::scalar_to_tensor(scalar, device);
98
+ // This is to adopt the scalar promotion rules defined in native/TypeProperties.h
99
+ // So we have the same type promotion rules as binary operations.
100
+ tensor.unsafeGetTensorImpl()->set_wrapped_number(true);
101
+ return tensor;
102
+ }
103
+
104
+ inline void searchsorted_pre_check(
105
+ const Tensor& boundaries,
106
+ const Tensor& input,
107
+ const Tensor& output,
108
+ const bool out_int32,
109
+ const bool right,
110
+ const c10::optional<c10::string_view> side_opt,
111
+ const Tensor& sorter) {
112
+ if (side_opt) {
113
+ const c10::string_view side = *side_opt;
114
+ TORCH_CHECK(side == "left" || side == "right", "torch.searchsorted(): side can only be 'left' or 'right' but ",
115
+ "got ", side);
116
+
117
+ // assume the user has not explicitly set (right=False, side="right")
118
+ TORCH_CHECK(!right || side == "right", "torch.searchsorted(): side and right can't be set to opposites, got side "
119
+ "of ", side, " while right was True");
120
+ }
121
+
122
+ TORCH_CHECK(boundaries.device() == input.device(), "torch.searchsorted(): boundaries and input value tensors ",
123
+ "should have same device type, but got boundaries tensor device type ", boundaries.device(), " and input value ",
124
+ "tensor device type ", input.device());
125
+
126
+ if (sorter.defined()) {
127
+ TORCH_CHECK(sorter.device() == boundaries.device(), "torch.searchsorted(): sorter and boundary tensors should ",
128
+ "have same device type, but got sorter tensor device type ", sorter.device(), " and input value tensor ",
129
+ "device type ", boundaries.device());
130
+
131
+ TORCH_CHECK(sorter.sizes() == boundaries.sizes(), "torch.searchsorted(): boundary and sorter must have the same "
132
+ "size, but got boundary tensor ", boundaries.sizes(), "and got sorter tensor ", sorter.sizes());
133
+
134
+ TORCH_CHECK(sorter.scalar_type() == ScalarType::Long, "torch.searchsorted(): sorter must be a tensor of long ",
135
+ "dtype but got dtype ", sorter.scalar_type());
136
+
137
+ if (sorter.numel() > 0) {
138
+ auto minmax = sorter.aminmax();
139
+ int64_t vmin = std::get<0>(minmax).item().toLong();
140
+ int64_t vmax = std::get<1>(minmax).item().toLong();
141
+ TORCH_CHECK(vmin >= 0 && vmax < sorter.sizes().back(), "torch.searchsorted(): sorter index out of range");
142
+ }
143
+ }
144
+
145
+ TORCH_CHECK(input.dim() > 0 || (input.dim() == 0 && input.numel() == 1 && boundaries.dim() == 1),
146
+ "torch.searchsorted(): input value can be a scalar only when boundaries tensor dimension is 1, but we got ",
147
+ "boundaries tensor dim(", boundaries.dim(), ") and input value's dim(", input.dim(), ") numel(",
148
+ input.numel(), ")");
149
+
150
+ TORCH_CHECK(boundaries.dim() != 0, "torch.searchsorted(): boundaries tensor should have positive dimension, but ",
151
+ "got 0 dimension");
152
+
153
+ TORCH_CHECK(boundaries.dim() == 1 || searchsorted_dims_matched_before_last_dim(boundaries, input),
154
+ "torch.searchsorted(): boundaries tensor should be 1 dimension or the first N-1 dimensions of boundaries tensor ",
155
+ "and input value tensor must match, but we got boundaries tensor ", boundaries.sizes(), " and input value tensor ",
156
+ input.sizes());
157
+
158
+ ScalarType output_dtype = output.scalar_type();
159
+ TORCH_CHECK(
160
+ (output_dtype == ScalarType::Long && !out_int32) ||
161
+ (output_dtype == ScalarType::Int && out_int32),
162
+ "torch.searchsorted(): output tensor's dtype is wrong, it can only be Int(int32) or Long(int64) depending on ",
163
+ "whether out_int32 flag is True, but we got output tensor's dtype ", output_dtype,
164
+ " and out_int32 flag is ", (out_int32 ? "True" : "False"));
165
+
166
+ if (out_int32) {
167
+ TORCH_CHECK(boundaries.sizes().back() < INT_MAX,
168
+ "torch.searchsorted(): the size of boundaries' last dimension should be less than ", INT_MAX, ", but we got ",
169
+ boundaries.sizes().back());
170
+ }
171
+ }
172
+
173
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+ #include <ATen/TensorUtils.h>
4
+ #include <ATen/detail/CUDAHooksInterface.h>
5
+ #include <ATen/native/DispatchStub.h>
6
+ #include <c10/util/env.h>
7
+ #include <c10/util/irange.h>
8
+
9
+ namespace at::native {
10
+
11
+ using conv_depthwise2d_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)(
12
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
13
+ at::IntArrayRef, at::IntArrayRef, std::array<bool, 2>);
14
+ DECLARE_DISPATCH(conv_depthwise2d_backward_fn, conv_depthwise2d_backward_stub);
15
+ using conv_depthwise3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
16
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
17
+ at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
18
+ DECLARE_DISPATCH(conv_depthwise3d_backward_fn, conv_depthwise3d_backward_stub);
19
+ using cudnn_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)(
20
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
21
+ at::IntArrayRef, int64_t, bool, bool, bool, std::array<bool,2>);
22
+ DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub);
23
+ using mps_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
24
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
25
+ at::IntArrayRef, int64_t, std::array<bool,3>);
26
+ DECLARE_DISPATCH(mps_convolution_backward_fn, mps_convolution_backward_stub);
27
+ using cudnn_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)(
28
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
29
+ at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array<bool,2>);
30
+ DECLARE_DISPATCH(cudnn_convolution_transpose_backward_fn, cudnn_convolution_transpose_backward_stub);
31
+ using miopen_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
32
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
33
+ at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>);
34
+ DECLARE_DISPATCH(miopen_convolution_backward_fn, miopen_convolution_backward_stub);
35
+ using miopen_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
36
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
37
+ at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>);
38
+ DECLARE_DISPATCH(miopen_convolution_transpose_backward_fn, miopen_convolution_transpose_backward_stub);
39
+ using miopen_depthwise_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
40
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
41
+ at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>);
42
+ DECLARE_DISPATCH(miopen_depthwise_convolution_backward_fn, miopen_depthwise_convolution_backward_stub);
43
+ using mkldnn_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
44
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
45
+ at::IntArrayRef, int64_t, std::array<bool,3>);
46
+ DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub);
47
+ using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const c10::optional<Tensor>&,
48
+ IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t);
49
+ DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub);
50
+ using mkldnn_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
51
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
52
+ at::IntArrayRef, at::IntArrayRef, int64_t, std::array<bool,3>);
53
+ DECLARE_DISPATCH(mkldnn_convolution_transpose_backward_fn, mkldnn_convolution_transpose_backward_stub);
54
+ using slow_conv_dilated2d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
55
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
56
+ at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
57
+ DECLARE_DISPATCH(slow_conv_dilated2d_backward_fn, slow_conv_dilated2d_backward_stub);
58
+ using slow_conv_dilated3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
59
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
60
+ at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
61
+ DECLARE_DISPATCH(slow_conv_dilated3d_backward_fn, slow_conv_dilated3d_backward_stub);
62
+ using slow_conv_transpose2d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
63
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
64
+ at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array<bool,3>);
65
+ DECLARE_DISPATCH(slow_conv_transpose2d_backward_fn, slow_conv_transpose2d_backward_stub);
66
+ using slow_conv_transpose3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
67
+ const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
68
+ at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array<bool,3>);
69
+ DECLARE_DISPATCH(slow_conv_transpose3d_backward_fn, slow_conv_transpose3d_backward_stub);
70
+
71
+ namespace {
72
+ static bool cudnnv8_heuristic_mode_b = c10::utils::check_env("TORCH_CUDNN_USE_HEURISTIC_MODE_B") == true;
73
+ }
74
+
75
+ static inline bool cudnnv8_enabled_check_debug() {
76
+ static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_DISABLED") != true;
77
+ static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true;
78
+ static uint8_t cudnnv8_debugcount = 0;
79
+ if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) {
80
+ TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8 ON: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", cudnnv8_heuristic_mode_b);
81
+ cudnnv8_debugcount++;
82
+ }
83
+ return cudnnv8_flag == 1;
84
+ }
85
+
86
+ static inline bool cudnnv8_use_heur_mode_b() {
87
+ return cudnnv8_heuristic_mode_b;
88
+ }
89
+
90
+ // Keep in sync with py::enum_ in Module.cpp
91
+ enum class ConvBackend {
92
+ CudaDepthwise2d,
93
+ CudaDepthwise3d,
94
+ Cudnn,
95
+ CudnnTranspose,
96
+ Empty,
97
+ Miopen,
98
+ MiopenDepthwise,
99
+ MiopenTranspose,
100
+ Mkldnn,
101
+ MkldnnTranspose,
102
+ MkldnnEmpty,
103
+ NnpackSpatial,
104
+ Overrideable,
105
+ Slow2d,
106
+ Slow3d,
107
+ SlowDilated2d,
108
+ SlowDilated3d,
109
+ SlowTranspose2d,
110
+ SlowTranspose3d,
111
+ Winograd3x3Depthwise,
112
+ Xnnpack2d,
113
+ Mps,
114
+ MpsTranspose,
115
+ };
116
+
117
+ // Overload for selecting the convolution backend from the full set of convolution inputs.
118
+ // This overload is exposed to python for testing, etc.
119
+ TORCH_API ConvBackend select_conv_backend(
120
+ const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt,
121
+ SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation,
122
+ bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt);
123
+
124
+ TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input,
125
+ const Tensor& weight,
126
+ const ConvBackend backend);
127
+
128
+ // ---------------------------------------------------------------------
129
+ //
130
+ // Math
131
+ //
132
+ // ---------------------------------------------------------------------
133
+
134
+ constexpr int input_batch_size_dim = 0; // also grad_input
135
+ constexpr int input_channels_dim = 1;
136
+ constexpr int output_batch_size_dim = 0; // also grad_output
137
+ constexpr int output_channels_dim = 1;
138
+ constexpr int weight_output_channels_dim = 0;
139
+ constexpr int weight_input_channels_dim = 1;
140
+
141
+ // Often written as 2 + max_dim (extra dims for batch size and channels)
142
+ constexpr int max_dim = 3;
143
+
144
+ // ---------------------------------------------------------------------
145
+ //
146
+ // Checking
147
+ //
148
+ // ---------------------------------------------------------------------
149
+
150
+ // Used on pad, stride and dilation
151
+ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name)
152
+ {
153
+ TORCH_CHECK(args.size() <= expected_size,
154
+ "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ",
155
+ expected_size, " (while checking arguments for ", c, ")");
156
+ TORCH_CHECK(args.size() >= expected_size,
157
+ "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ",
158
+ expected_size, " (while checking arguments for ", c, ")");
159
+
160
+ auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;});
161
+ if (num_negative_values > 0){
162
+ std::stringstream ss;
163
+ ss << arg_name << " should be greater than zero but got (";
164
+ std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
165
+ ss << args.back() << ")" << " (while checking arguments for " << c << ")";
166
+ AT_ERROR(ss.str());
167
+ }
168
+ }
169
+
170
+
171
+ // NOTE [ Convolution checks ]
172
+ //
173
+ // NB: For many call sites, it is not strictly necessary to check all of
174
+ // these relationships (for example, for forward convolution, we compute
175
+ // the size of output ourselves, so we don't actually need to check
176
+ // output. However, writing a single function that does everything
177
+ // means we get to reuse it for both forwards and all backwards
178
+ // variants, even when the set of "real" inputs varies. The magic of
179
+ // relational computing!
180
+ //
181
+ // (There is one downside, which is that it is slightly harder to write
182
+ // error messages which are able to distinguish between real inputs
183
+ // (which the user can change) and computed inputs (which the user can
184
+ // only indirectly affect). It would be an interesting exercise to
185
+ // come up with a general framework to handle such situations.)
186
+ static void convolution_shape_check(
187
+ CheckedFrom c,
188
+ const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output,
189
+ IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups)
190
+ {
191
+ check_args(c, padding, input->dim() - 2, "padding");
192
+ check_args(c, stride, padding.size(), "stride");
193
+ check_args(c, dilation, padding.size(), "dilation");
194
+
195
+ // Input
196
+ checkDimRange(c, input, 3, 6 /* exclusive */);
197
+ checkSize_symint(c, input, input_channels_dim, weight->size(1) * groups);
198
+
199
+ // Weight
200
+ checkSameDim(c, input, weight);
201
+
202
+ // TODO: check that output->size() matches output_sizes
203
+ // TODO: check that weight matches output->sizes()
204
+ checkSameDim(c, input, output);
205
+ }
206
+
207
+ // NB: conv_output_size and conv_input_size are not bijections,
208
+ // as conv_output_size loses information; this is why conv_input_size
209
+ // takes an extra output_padding argument to resolve the ambiguity.
210
+
211
+ template <typename T>
212
+ static inline std::vector<T> _conv_output_size(
213
+ ArrayRef<T> input_size, ArrayRef<T> weight_size,
214
+ ArrayRef<T> padding, ArrayRef<T> stride, ArrayRef<T> dilation = ArrayRef<T>()
215
+ ) {
216
+ // ASSERT(input_size.size() > 2)
217
+ // ASSERT(input_size.size() == weight_size.size())
218
+ bool has_dilation = !dilation.empty();
219
+ auto dim = input_size.size();
220
+ std::vector<T> output_size(dim);
221
+ output_size[0] = input_size[input_batch_size_dim];
222
+ output_size[1] = weight_size[weight_output_channels_dim];
223
+ for (const auto d : c10::irange(2, dim)) {
224
+ auto dilation_ = has_dilation ? dilation[d - 2] : 1;
225
+ auto kernel = dilation_ * (weight_size[d] - 1) + 1;
226
+ output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1;
227
+ }
228
+ return output_size;
229
+ }
230
+
231
+ static inline std::vector<int64_t> conv_output_size(
232
+ IntArrayRef input_size, IntArrayRef weight_size,
233
+ IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
234
+ ) {
235
+ return _conv_output_size(input_size, weight_size, padding, stride, dilation);
236
+ }
237
+
238
+ static inline std::vector<c10::SymInt> conv_output_size(
239
+ SymIntArrayRef input_size, SymIntArrayRef weight_size,
240
+ SymIntArrayRef padding, SymIntArrayRef stride, SymIntArrayRef dilation = SymIntArrayRef()
241
+ ) {
242
+ return _conv_output_size(input_size, weight_size, padding, stride, dilation);
243
+ }
244
+
245
+ template <typename T>
246
+ std::vector<T> _conv_input_size(
247
+ ArrayRef<T> output_size, ArrayRef<T> weight_size,
248
+ ArrayRef<T> padding, ArrayRef<T> output_padding, ArrayRef<T> stride, ArrayRef<T> dilation, T groups
249
+ ) {
250
+ // ASSERT(output_size.size() > 2)
251
+ // ASSERT(output_size.size() == weight_size.size())
252
+ auto dim = output_size.size();
253
+ std::vector<T> input_size(dim);
254
+ input_size[0] = output_size[output_batch_size_dim];
255
+ input_size[1] = weight_size[weight_input_channels_dim] * groups;
256
+ for (const auto d : c10::irange(2, dim)) {
257
+ auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1;
258
+ input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) +
259
+ kernel + output_padding[d - 2];
260
+ }
261
+ return input_size;
262
+ }
263
+
264
+ static inline std::vector<c10::SymInt> conv_input_size(
265
+ SymIntArrayRef output_size, SymIntArrayRef weight_size,
266
+ SymIntArrayRef padding, SymIntArrayRef output_padding, SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups
267
+ ) {
268
+ return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
269
+ }
270
+
271
+ static inline std::vector<int64_t> conv_input_size(
272
+ IntArrayRef output_size, IntArrayRef weight_size,
273
+ IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
274
+ ) {
275
+ return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
276
+ }
277
+
278
+ template <typename T>
279
+ std::vector<T> _conv_weight_size(
280
+ ArrayRef<T> input_size, ArrayRef<T> output_size,
281
+ ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
282
+ ) {
283
+ auto dim = input_size.size();
284
+ std::vector<T> weight_size(dim);
285
+ weight_size[0] = output_size[1];
286
+ weight_size[1] = input_size[1] / groups;
287
+ for (const auto d : c10::irange(2, dim)) {
288
+ auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
289
+ + padding[d - 2] * 2 - output_padding[d - 2];
290
+ weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
291
+ }
292
+ return weight_size;
293
+ }
294
+
295
+ static inline std::vector<c10::SymInt> conv_weight_size(
296
+ SymIntArrayRef input_size, SymIntArrayRef output_size,
297
+ SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
298
+ ) {
299
+ return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
300
+ }
301
+
302
+ static inline std::vector<int64_t> conv_weight_size(
303
+ IntArrayRef input_size, IntArrayRef output_size,
304
+ IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
305
+ ) {
306
+ return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
307
+ }
308
+
309
+ static inline Tensor reshape_bias(int64_t dim, const Tensor& bias) {
310
+ std::vector<int64_t> shape(dim, 1);
311
+ shape[1] = -1;
312
+ return bias.reshape(shape);
313
+ }
314
+
315
+ static inline at::MemoryFormat cudnn_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) {
316
+ // disable NHWC for float64 input.
317
+ if (!at::detail::getCUDAHooks().compiledWithCuDNN() ||
318
+ input.scalar_type() == at::kDouble ||
319
+ weight.scalar_type() == at::kDouble) {
320
+ return at::MemoryFormat::Contiguous;
321
+ }
322
+ long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
323
+ auto input_memory_format = input.suggest_memory_format();
324
+ auto weight_memory_format = weight.suggest_memory_format();
325
+ auto weight_ndim = weight.ndimension();
326
+
327
+ bool can_use_cudnn_channels_last_2d = (cudnn_version >= 7603) && (weight_ndim == 4) && (
328
+ (input_memory_format == at::MemoryFormat::ChannelsLast) ||
329
+ (weight_memory_format == at::MemoryFormat::ChannelsLast)
330
+ );
331
+ if (can_use_cudnn_channels_last_2d) {
332
+ return at::MemoryFormat::ChannelsLast;
333
+ }
334
+
335
+ bool can_use_cudnn_channels_last_3d = (cudnn_version >= 8005) && (weight_ndim == 5) && (
336
+ (input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
337
+ (weight_memory_format == at::MemoryFormat::ChannelsLast3d)
338
+ );
339
+ if (can_use_cudnn_channels_last_3d) {
340
+ return at::MemoryFormat::ChannelsLast3d;
341
+ }
342
+
343
+ return at::MemoryFormat::Contiguous;
344
+ }
345
+
346
+ // controls whether emptyCache will be called following cudnn conv benchmarking
347
+ TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable);
348
+ TORCH_API bool _cudnn_get_conv_benchmark_empty_cache();
349
+
350
+
351
+ static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
352
+
353
+ // disable NHWC for float64 input.
354
+ if (!at::detail::getCUDAHooks().compiledWithMIOpen() ||
355
+ input.scalar_type() == at::kDouble ||
356
+ weight.scalar_type() == at::kDouble) {
357
+ return false;
358
+ }
359
+
360
+ bool can_use_miopen_channels_last_2d = false;
361
+ #if defined(USE_ROCM) && (ROCM_VERSION >= 40300)
362
+ // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
363
+ // See #64427
364
+ static c10::optional<bool> PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");
365
+
366
+ auto input_memory_format = input.suggest_memory_format();
367
+ auto weight_memory_format = weight.suggest_memory_format();
368
+
369
+ can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC && (
370
+ ( (input_memory_format == at::MemoryFormat::ChannelsLast) ||
371
+ (weight_memory_format == at::MemoryFormat::ChannelsLast) )
372
+ );
373
+ #endif
374
+
375
+ bool can_use_miopen_channels_last_3d = false;
376
+
377
+ return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d;
378
+ }
379
+
380
+ static inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
381
+
382
+ // disable NHWC for float64 input.
383
+ if (input.scalar_type() == at::kDouble ||
384
+ weight.scalar_type() == at::kDouble) {
385
+ return false;
386
+ }
387
+
388
+ // disable NHWC for MkldnnCPU tensor.
389
+ if (input.is_mkldnn() || weight.is_mkldnn()) {
390
+ return false;
391
+ }
392
+
393
+ auto input_memory_format = input.suggest_memory_format();
394
+ auto weight_memory_format = weight.suggest_memory_format();
395
+
396
+ bool can_use_mkldnn_channels_last_2d =
397
+ (input_memory_format == at::MemoryFormat::ChannelsLast) ||
398
+ (weight_memory_format == at::MemoryFormat::ChannelsLast);
399
+
400
+ bool can_use_mkldnn_channels_last_3d =
401
+ (input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
402
+ (weight_memory_format == at::MemoryFormat::ChannelsLast3d);
403
+
404
+ return can_use_mkldnn_channels_last_2d || can_use_mkldnn_channels_last_3d;
405
+ }
406
+
407
+ static inline bool thnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
408
+
409
+ auto input_memory_format = input.suggest_memory_format();
410
+ auto weight_memory_format = weight.suggest_memory_format();
411
+
412
+ bool can_use_thnn_channels_last_2d = input.device().is_cpu() && (
413
+ (input_memory_format == at::MemoryFormat::ChannelsLast) || (
414
+ weight_memory_format == at::MemoryFormat::ChannelsLast));
415
+
416
+ return can_use_thnn_channels_last_2d;
417
+ }
418
+
419
+ static inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
420
+
421
+ // check layout only for xpu tensor.
422
+ if (!input.is_xpu() || !weight.is_xpu()) {
423
+ return false;
424
+ }
425
+
426
+ // disable NHWC for float64 input.
427
+ if (input.scalar_type() == at::kDouble ||
428
+ weight.scalar_type() == at::kDouble) {
429
+ return false;
430
+ }
431
+
432
+ auto input_memory_format = input.suggest_memory_format();
433
+ auto weight_memory_format = weight.suggest_memory_format();
434
+
435
+ bool can_use_xpu_channels_last_2d =
436
+ (input_memory_format == at::MemoryFormat::ChannelsLast) ||
437
+ (weight_memory_format == at::MemoryFormat::ChannelsLast);
438
+
439
+ bool can_use_xpu_channels_last_3d =
440
+ (input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
441
+ (weight_memory_format == at::MemoryFormat::ChannelsLast3d);
442
+
443
+ return can_use_xpu_channels_last_2d || can_use_xpu_channels_last_3d;
444
+ }
445
+
446
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+
5
+ namespace at {
6
+ class Tensor;
7
+
8
+ namespace native {
9
+
10
+ using cross_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const int64_t d);
11
+
12
+ DECLARE_DISPATCH(cross_fn, cross_stub);
13
+
14
+ }} // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/Dispatch.h>
5
+ #include <ATen/Dispatch_v2.h>
6
+ #include <ATen/Generator.h>
7
+ #include <ATen/ExpandUtils.h>
8
+ #include <ATen/Tensor.h>
9
+ #include <ATen/MemoryOverlap.h>
10
+ #include <ATen/NamedTensorUtils.h>
11
+ #include <ATen/native/Resize.h>
12
+ #include <ATen/native/TensorIterator.h>
13
+ #include <c10/util/Optional.h>
14
+ #include <limits>
15
+ #include <cmath>
16
+
17
+ #ifndef AT_PER_OPERATOR_HEADERS
18
+ #include <ATen/Functions.h>
19
+ #else
20
+ #include <ATen/ops/empty_like.h>
21
+ #include <ATen/ops/empty.h>
22
+ #include <ATen/ops/full.h>
23
+ #include <ATen/ops/view_as_real.h>
24
+ #endif
25
+
26
+ namespace at::native::templates {
27
+
28
+ // ==================================================== Random ========================================================
29
+
30
+ // The purpose of `update_from` and `update_to` is to find the closest valid int64_t number that can be used as actual `from`.
31
+ // The current implementation of `random_` uses uint64_t arithmetics and casts the result to the target dtype(scalar_t).
32
+ // This casting can result in generating numbers that happen to be greater or equal to `to` value. For instance:
33
+ //
34
+ // auto actual = torch::empty({3, 3}, torch::half);
35
+ // actual.random_(0, 65504);
36
+ //
37
+ // If random's uint64_t arithmetics produces 65503 as a random value after casting to torch::half it becomes 65504
38
+ // and violates the requirement that random value must be less than `to`. To resolve this issue `update_from` and `update_to`
39
+ // moves `from` to the right and `to` to the left to the next closest value that won't go outside [from, to) after casting to
40
+ // the target dtype. For `to` = 65504 it moves left for (1 << (log2(to) - 11 + 1)) = 32 and becomes 65472, which is previous
41
+ // available number for torch::half dtype.
42
+ template<typename scalar_t>
43
+ int64_t update_from(int64_t from) {
44
+ static_assert(
45
+ std::is_floating_point<scalar_t>::value ||
46
+ std::is_same<scalar_t, at::Half>::value ||
47
+ std::is_same<scalar_t, at::BFloat16>::value, "scalar_t must be floating-point type");
48
+ const auto from_plus_1 = static_cast<int64_t>(static_cast<scalar_t>(from + 1));
49
+ if (from_plus_1 < from) {
50
+ int64_t from_ = std::abs(from + 1);
51
+ int n = 0;
52
+ while (from_ >>= 1) ++n;
53
+ // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult)
54
+ from = from_plus_1 + (1LL << (n - std::numeric_limits<scalar_t>::digits + 1));
55
+ }
56
+ return from;
57
+ }
58
+
59
+ template<typename scalar_t>
60
+ int64_t update_to(int64_t to) {
61
+ static_assert(
62
+ std::is_floating_point<scalar_t>::value ||
63
+ std::is_same<scalar_t, at::Half>::value ||
64
+ std::is_same<scalar_t, at::BFloat16>::value, "scalar_t must be floating-point type");
65
+ const auto to_minus_1 = static_cast<int64_t>(static_cast<scalar_t>(to - 1));
66
+ if (to_minus_1 >= to) {
67
+ int64_t to_ = std::abs(to - 1);
68
+ int n = 0;
69
+ while (to_ >>= 1) ++n;
70
+ // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult)
71
+ to = to_minus_1 - (1LL << (n - std::numeric_limits<scalar_t>::digits + 1));
72
+ }
73
+ return to;
74
+ }
75
+
76
+ // Return earlier for not invoking kernel.
77
+ // See https://github.com/pytorch/pytorch/issues/103418 for more details
78
+ #define CHECK_EMPTY_AND_RETURN(tensor) \
79
+ if (tensor.numel() == 0) { \
80
+ return tensor; \
81
+ }
82
+
83
+ template<template<typename> class random_kernel, typename RNG>
84
+ at::Tensor& random_impl(at::Tensor& self, c10::optional<Generator> generator) {
85
+ CHECK_EMPTY_AND_RETURN(self);
86
+ auto iter = at::TensorIterator::borrowing_nullary_op(self);
87
+ random_kernel<RNG>()(iter, generator);
88
+ return self;
89
+ }
90
+
91
+ #define CHECK_OUT_OF_BOUNDS(var, name, min, max, dtype) \
92
+ TORCH_CHECK(var >= min && var <= max, name , " is out of bounds for ", dtype); \
93
+
94
+ #define WARN_OUT_OF_BOUNDS(var, name, digits, dtype) \
95
+ if (var < -(1LL << digits) || var > (1LL << digits)) { \
96
+ TORCH_WARN(name , " is out of bounds [-(2^", digits, "), 2^", digits, "]. ", \
97
+ "Due to precision limitations ", dtype, " can support discrete uniform distribution only within this range. ", \
98
+ "This warning will become an error in version 1.7 release, please fix the code in advance"); \
99
+ }
100
+
101
+ static void check_from_to_in_range(int64_t from, int64_t to_inc, caffe2::TypeMeta dtype) {
102
+ const auto scalar_type = typeMetaToScalarType(dtype);
103
+ if (isFloatingType(scalar_type)) {
104
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "check_random_fp_bounds", [&] {
105
+ const auto min = static_cast<double>(std::numeric_limits<scalar_t>::lowest());
106
+ const auto max = static_cast<double>(std::numeric_limits<scalar_t>::max());
107
+ CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype);
108
+ CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype);
109
+
110
+ constexpr auto digits = std::numeric_limits<scalar_t>::digits;
111
+ WARN_OUT_OF_BOUNDS(from, "from", digits, dtype);
112
+ WARN_OUT_OF_BOUNDS(to_inc, "to - 1", digits, dtype);
113
+ });
114
+ } else if (scalar_type == kUInt64) {
115
+ // When you do a comparison between int64_t and uint64_t, the usual
116
+ // arithmetic conversions say that the int64_t value is promoted to
117
+ // unsigned. But this conversion wraps around: if I had -1 as my int64_t,
118
+ // then it will promote to 0xFFFFFFFFFFFFFFFF in uint64_t. This is never
119
+ // the right thing to do.
120
+ CHECK_OUT_OF_BOUNDS(from, "from", 0, INT64_MAX, dtype);
121
+ CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", 0, INT64_MAX, dtype);
122
+ } else if (isIntegralType(scalar_type, /*includeBool=*/true)) {
123
+ AT_DISPATCH_V2(scalar_type, "check_random_integral_bounds", AT_WRAP([&]() {
124
+ const auto min = static_cast<int64_t>(std::numeric_limits<scalar_t>::lowest());
125
+ const auto max = static_cast<int64_t>(std::numeric_limits<scalar_t>::max());
126
+ CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype);
127
+ CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype);
128
+ }), AT_EXPAND(AT_INTEGRAL_TYPES), kUInt16, kUInt32, kBool);
129
+ } else {
130
+ TORCH_CHECK(false, "check_random_bounds handles only integral, floating-point and boolean types");
131
+ }
132
+ }
133
+
134
+ template<template<typename> class random_from_to_kernel, typename RNG>
135
+ at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional<int64_t> to_opt, c10::optional<Generator> generator) {
136
+ uint64_t range = 0;
137
+ auto iter = at::TensorIterator::borrowing_nullary_op(self);
138
+ if (to_opt.has_value()) {
139
+ // [from, to)
140
+ int64_t to = *to_opt;
141
+ TORCH_CHECK(from < to, "random_ expects 'from' to be less than 'to', but got from=", from, " >= to=", to);
142
+ if (isFloatingType(iter.dtype())) {
143
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_update_from_to", [&] {
144
+ from = update_from<scalar_t>(from);
145
+ to = update_to<scalar_t>(to);
146
+ TORCH_CHECK(from < to, "random_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", from, " >= to=", to);
147
+ });
148
+ }
149
+ check_from_to_in_range(from, to - 1, self.dtype());
150
+ CHECK_EMPTY_AND_RETURN(self);
151
+ range = static_cast<uint64_t>(to) - static_cast<uint64_t>(from);
152
+ random_from_to_kernel<RNG>()(iter, range, from, generator);
153
+ } else if (from != std::numeric_limits<int64_t>::lowest()) {
154
+ // [from, std::numeric_limits<int64_t>::max()]
155
+ int64_t to_inc = 0;
156
+ if (isFloatingType(iter.dtype())) {
157
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_from_to_range_calc", [&] {
158
+ constexpr int64_t scalar_t_max = static_cast<int64_t>(1) << std::numeric_limits<scalar_t>::digits;
159
+ to_inc = scalar_t_max > std::numeric_limits<int64_t>::max() ? std::numeric_limits<int64_t>::max() : static_cast<int64_t>(scalar_t_max);
160
+ from = update_from<scalar_t>(from);
161
+ TORCH_CHECK(from < to_inc, "random_ expects 'from' casted to dtype to be less than or equal to 'to_inc' casted to dtype, but got from=", from, " > to_inc=", to_inc);
162
+ });
163
+ } else if (isIntegralType(iter.dtype(), /*includeBool=*/true)) {
164
+ AT_DISPATCH_V2(self.scalar_type(), "random_from_to_range_calc", AT_WRAP([&] {
165
+ if constexpr (std::is_same_v<scalar_t, bool>) {
166
+ to_inc = static_cast<int64_t>(true);
167
+ } else {
168
+ to_inc = static_cast<int64_t>(std::numeric_limits<scalar_t>::max());
169
+ }
170
+ }), AT_EXPAND(AT_INTEGRAL_TYPES_V2), kBool);
171
+ } else {
172
+ TORCH_CHECK(false, "random_from_to_impl handles only integral, floating-point and boolean types");
173
+ }
174
+ check_from_to_in_range(from, to_inc, self.dtype());
175
+ CHECK_EMPTY_AND_RETURN(self);
176
+ range = static_cast<uint64_t>(to_inc) - static_cast<uint64_t>(from) + 1;
177
+ random_from_to_kernel<RNG>()(iter, range, from, generator);
178
+ } else {
179
+ // [std::numeric_limits<int64_t>::lowest(), std::numeric_limits<int64_t>::max()]
180
+ // range = 2^64
181
+ CHECK_EMPTY_AND_RETURN(self);
182
+ random_from_to_kernel<RNG>()(iter, generator);
183
+ }
184
+ return self;
185
+ }
186
+
187
+ // ==================================================== Normal ========================================================
188
+
189
+ #define CHECK_NORMAL_TENSOR_STD(std) \
190
+ do { \
191
+ TORCH_CHECK( \
192
+ !std.is_complex(), \
193
+ "normal expects standard deviation to be non-complex"); \
194
+ TORCH_CHECK( \
195
+ std.numel() == 0 || std.is_meta() || std.min().ge(0).item<bool>(), \
196
+ "normal expects all elements of std >= 0.0"); \
197
+ } while (0)
198
+
199
+ #define CHECK_NORMAL_STD(std) \
200
+ TORCH_CHECK(std >= 0.0, "normal expects std >= 0.0, but found std ", std);
201
+
202
+ template<template<typename> class normal_kernel, typename RNG>
203
+ Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
204
+ CHECK_NORMAL_STD(std);
205
+ CHECK_EMPTY_AND_RETURN(self);
206
+
207
+ if (self.is_complex()) {
208
+ auto float_tensor = at::view_as_real(self);
209
+ // variance for normal distribution of the real and imaginary values
210
+ // is half of the input variance
211
+ normal_kernel<RNG>()(float_tensor, mean, std/(std::sqrt(2)), gen);
212
+ } else {
213
+ normal_kernel<RNG>()(self, mean, std, gen);
214
+ }
215
+ return self;
216
+ }
217
+
218
+ template<template<typename> class normal_kernel, typename RNG>
219
+ Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::optional<Generator> gen) {
220
+ CHECK_NORMAL_STD(std);
221
+ auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous);
222
+ auto shape = at::infer_size(mean.sizes(), std_tensor.sizes());
223
+ at::native::resize_output(output, shape);
224
+ normal_impl_<normal_kernel, RNG>(output, 0, std, gen);
225
+ output.add_(mean);
226
+ return output;
227
+ }
228
+
229
+ template<template<typename> class normal_kernel, typename RNG>
230
+ Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::optional<Generator> gen) {
231
+ CHECK_NORMAL_TENSOR_STD(std);
232
+ auto mean_tensor = at::full({}, mean, output.options());
233
+ auto shape = at::infer_size(mean_tensor.sizes(), std.sizes());
234
+ at::native::resize_output(output, shape);
235
+ normal_impl_<normal_kernel, RNG>(output, 0, 1, gen);
236
+ // CUDA NB: addcmul_out copies the tensor to be added into the output.
237
+ // The previous function here was addcmul_out(output, mean_tensor, output, std, 1);
238
+ // The third argument is not a constant reference and hence the samples in output are overwritten.
239
+ // Consequently, the computation performed is mean_tensor + mean_tensor * std instead of mean_tensor + output * std
240
+ output.mul_(std).add_(mean_tensor);
241
+ return output;
242
+ }
243
+
244
+ template<template<typename> class normal_kernel, typename RNG>
245
+ Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
246
+ CHECK_NORMAL_TENSOR_STD(std);
247
+ auto shape = at::infer_size(mean.sizes(), std.sizes());
248
+ at::native::resize_output(output, shape);
249
+ normal_impl_<normal_kernel, RNG>(output, 0, 1, gen);
250
+ // CUDA NB: addcmul_out copies the tensor to be added into the output.
251
+ // The previous function here was addcmul_out(output, mean, output, std, 1);
252
+ // The third argument is not a constant reference and hence the samples in output are overwritten.
253
+ // Consequently, the computation performed is mean + mean * std instead of mean + output * std
254
+ output.mul_(std).add_(mean);
255
+ return output;
256
+ }
257
+
258
+ template<template<typename> class normal_kernel, typename RNG>
259
+ Tensor normal_impl(const Tensor& mean, double std, c10::optional<Generator> gen) {
260
+ CHECK_NORMAL_STD(std);
261
+ Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous);
262
+ normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
263
+ return ret;
264
+ }
265
+
266
+ template<template<typename> class normal_kernel, typename RNG>
267
+ Tensor normal_impl(double mean, const Tensor& std, c10::optional<Generator> gen) {
268
+ CHECK_NORMAL_TENSOR_STD(std);
269
+ Tensor ret = at::empty_like(std, MemoryFormat::Contiguous);
270
+ normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
271
+ return ret;
272
+ }
273
+
274
+ template<template<typename> class normal_kernel, typename RNG>
275
+ Tensor normal_impl(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
276
+ CHECK_NORMAL_TENSOR_STD(std);
277
+ auto shape = at::infer_size(mean.sizes(), std.sizes());
278
+ Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous);
279
+ normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
280
+ return ret;
281
+ }
282
+
283
+ // ==================================================== Uniform =======================================================
284
+
285
+ template<template<typename> class uniform_kernel, typename RNG>
286
+ at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, c10::optional<Generator> generator) {
287
+ if (self.is_complex()) {
288
+ CHECK_EMPTY_AND_RETURN(self);
289
+ auto float_tensor = at::view_as_real(self);
290
+ uniform_impl_<uniform_kernel, RNG>(float_tensor, from, to, generator);
291
+ } else {
292
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "check_uniform_bounds", [&] {
293
+ const auto dtype = self.dtype();
294
+ const auto min = static_cast<double>(std::numeric_limits<scalar_t>::lowest());
295
+ const auto max = static_cast<double>(std::numeric_limits<scalar_t>::max());
296
+ CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype);
297
+ CHECK_OUT_OF_BOUNDS(to, "to", min, max, dtype);
298
+ TORCH_CHECK(from <= to, "uniform_ expects to return a [from, to) range, but found from=", from, " > to=", to);
299
+ TORCH_CHECK((to - from) <= std::numeric_limits<scalar_t>::max(),
300
+ "uniform_ expects to-from <= std::numeric_limits<", toString(self.scalar_type()),
301
+ ">::max(), but found to=", to, " and from=", from,
302
+ " which result in to-from to exceed the limit");
303
+ from = std::min(std::max(from, min), max);
304
+ to = std::max(std::min(to, max), min);
305
+ });
306
+ CHECK_EMPTY_AND_RETURN(self);
307
+ auto iter = at::TensorIterator::borrowing_nullary_op(self);
308
+ uniform_kernel<RNG>()(iter, from, to, generator);
309
+ }
310
+ return self;
311
+ }
312
+
313
+ // ================================================== LogNormal =======================================================
314
+
315
+ template<template<typename> class log_normal_kernel, typename RNG>
316
+ at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, c10::optional<Generator> gen) {
317
+ TORCH_CHECK(std > 0.0, "log_normal_ expects std > 0.0, but found std=", std);
318
+ CHECK_EMPTY_AND_RETURN(self);
319
+ auto iter = TensorIterator::borrowing_nullary_op(self);
320
+ log_normal_kernel<RNG>()(iter, mean, std, gen);
321
+ return self;
322
+ }
323
+
324
+ // =================================================== Geometric ======================================================
325
+
326
+ template<template<typename> class geometric_kernel, typename RNG>
327
+ Tensor& geometric_impl_(Tensor& self, double p, c10::optional<Generator> gen) {
328
+ TORCH_CHECK(0 < p && p < 1, "geometric_ expects p to be in (0, 1), but got p=", p);
329
+ CHECK_EMPTY_AND_RETURN(self);
330
+ auto iter = TensorIterator::borrowing_nullary_op(self);
331
+ geometric_kernel<RNG>()(iter, p, gen);
332
+ return self;
333
+ }
334
+
335
+ // ================================================== Exponential =====================================================
336
+
337
+ template<template<typename> class exponential_kernel, typename RNG>
338
+ Tensor& exponential_impl_(Tensor& self, double lambda, c10::optional<Generator> gen) {
339
+ TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda);
340
+ CHECK_EMPTY_AND_RETURN(self);
341
+ auto iter = TensorIterator::borrowing_nullary_op(self);
342
+ exponential_kernel<RNG>()(iter, lambda, gen);
343
+ return self;
344
+ }
345
+
346
+ // ==================================================== Cauchy ========================================================
347
+
348
+ template<template<typename> class cauchy_kernel, typename RNG>
349
+ Tensor& cauchy_impl_(Tensor& self, double median, double sigma, c10::optional<Generator> gen) {
350
+ // TODO: instead of variable name 'sigma', use 'gamma' or 'scale'
351
+ // the variance, squared sigma, is undefined for cauchy distribution
352
+ TORCH_CHECK(sigma > 0.0, "cauchy_ expects sigma > 0.0, but found sigma=", sigma);
353
+ TORCH_CHECK(at::isFloatingType(self.scalar_type()), "Cauchy distribution is a continuous probability distribution. dtype must be a floating point but you specified ", self.dtype());
354
+ CHECK_EMPTY_AND_RETURN(self);
355
+ auto iter = TensorIterator::borrowing_nullary_op(self);
356
+ cauchy_kernel<RNG>()(iter, median, sigma, gen);
357
+ return self;
358
+ }
359
+
360
+ // ==================================================== Bernoulli =====================================================
361
+
362
+ template<template<typename> class bernoulli_tensor_kernel, typename RNG>
363
+ Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
364
+ CHECK_EMPTY_AND_RETURN(self);
365
+ NoNamesGuard guard;
366
+ at::assert_no_internal_overlap(self);
367
+ bernoulli_tensor_kernel<RNG>()(self, p_, gen);
368
+ return self;
369
+ }
370
+
371
+ template<template<typename> class bernoulli_scalar_kernel, typename RNG>
372
+ Tensor& bernoulli_impl_(Tensor& self, double p, c10::optional<Generator> gen) {
373
+ TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p);
374
+ CHECK_EMPTY_AND_RETURN(self);
375
+ at::assert_no_internal_overlap(self);
376
+ bernoulli_scalar_kernel<RNG>()(self, p, gen);
377
+ return self;
378
+ }
379
+
380
+ template<template<typename> class bernoulli_tensor_kernel, typename RNG>
381
+ Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, c10::optional<Generator> gen) {
382
+ // result.resize_as_(self) requires self to have same dtype as result, so we
383
+ // use resize_ instead.
384
+ // TODO: Fix resize_as_. See pytorch/pytorch#11665.
385
+ result.resize_(self.sizes());
386
+ bernoulli_impl_<bernoulli_tensor_kernel, RNG>(result, self, gen);
387
+ namedinference::propagate_names(result, self);
388
+ return result;
389
+ }
390
+
391
+ #undef CHECK_OUT_OF_BOUNDS
392
+ #undef WARN_OUT_OF_BOUNDS
393
+
394
+ } // namespace at::native::templates
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+
6
+ namespace at::native {
7
+
8
+ using histogramdd_fn = void(*)(const Tensor&, const c10::optional<Tensor>&, bool, Tensor&, const TensorList&);
9
+ using histogramdd_linear_fn = void(*)(const Tensor&, const c10::optional<Tensor>&, bool, Tensor&, const TensorList&, bool);
10
+ using histogram_select_outer_bin_edges_fn = void(*)(const Tensor& input, const int64_t N, std::vector<double> &leftmost_edges, std::vector<double> &rightmost_edges);
11
+
12
+ DECLARE_DISPATCH(histogramdd_fn, histogramdd_stub);
13
+ DECLARE_DISPATCH(histogramdd_linear_fn, histogramdd_linear_stub);
14
+ DECLARE_DISPATCH(histogram_select_outer_bin_edges_fn, histogram_select_outer_bin_edges_stub);
15
+
16
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/native/DispatchStub.h>
3
+ #include <c10/util/ArrayRef.h>
4
+
5
+ namespace at {
6
+ class Tensor;
7
+ class TensorBase;
8
+ struct TensorIterator;
9
+ struct TensorIteratorBase;
10
+ }
11
+
12
+ namespace c10 {
13
+ class Scalar;
14
+ }
15
+
16
+ namespace at::native {
17
+
18
+ using index_fn = void(*)(TensorIteratorBase &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides);
19
+ using index_fill_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride, const Scalar& source);
20
+ using index_copy_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride);
21
+ using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate);
22
+ using put_fn = void(*)(TensorIterator & iter, const TensorBase& self, const bool accumulate);
23
+ using take_fn = void(*)(TensorIterator & iter, const TensorBase& input);
24
+ using flip_fn = void(*)(TensorIterator &, const bool);
25
+ using masked_fill_fn = void(*)(TensorIterator &, const Scalar& scalar);
26
+ using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride);
27
+ using masked_scatter_fn = void(*)(TensorIterator &, const TensorBase &);
28
+
29
+ DECLARE_DISPATCH(index_fn, index_stub);
30
+ DECLARE_DISPATCH(index_fill_fn, index_fill_stub);
31
+ DECLARE_DISPATCH(index_copy_fn, index_copy_stub);
32
+ DECLARE_DISPATCH(index_put_fn, index_put_stub);
33
+ DECLARE_DISPATCH(put_fn, put_stub);
34
+ DECLARE_DISPATCH(take_fn, take_stub);
35
+ DECLARE_DISPATCH(flip_fn, flip_stub);
36
+ DECLARE_DISPATCH(masked_fill_fn, masked_fill_stub);
37
+ DECLARE_DISPATCH(masked_select_fn, masked_select_serial_stub);
38
+ DECLARE_DISPATCH(masked_select_fn, masked_select_stub);
39
+ DECLARE_DISPATCH(masked_scatter_fn, masked_scatter_stub);
40
+
41
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/ExpandUtils.h>
3
+ #include <ATen/native/CanUse32BitIndexMath.h>
4
+ #include <ATen/native/TensorIterator.h>
5
+ #include <ATen/core/IListRef.h>
6
+ #include <c10/util/irange.h>
7
+
8
+ namespace at::native {
9
+
10
// Reports (via TORCH_CHECK_INDEX, which throws) that a boolean/byte mask's
// shape does not match the indexed tensor at the given dimension.
// `idx` is the dimension in `self`, `maskIdx` the dimension in `mask`.
[[noreturn]]
static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, int64_t maskIdx) {
  TORCH_CHECK_INDEX(false, "The shape of the mask ", mask.sizes(), " at index ", maskIdx,
  " does not match the shape of the indexed tensor ", self.sizes(), " at index ", idx);
}
15
+
16
+
17
+ static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, IOptTensorListRef indices) {
18
+ // If indices come in as ByteTensor or BoolTensor (masks), expand them into the equivalent indexing by LongTensors
19
+ std::vector<Tensor> result;
20
+ for (const auto& index_opt : indices) {
21
+ if (!index_opt.has_value()) {
22
+ result.emplace_back();
23
+ } else {
24
+ const auto& index = *index_opt;
25
+ if (index.scalar_type() == kByte || index.scalar_type() == kBool) {
26
+ if (index.scalar_type() == kByte) {
27
+ TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \
28
+ " please use a dtype torch.bool instead.");
29
+ }
30
+ // The sizes of the ByteTensor mask or bool tensor must match the sizes of the
31
+ // corresponding dimensions in self
32
+ for (const auto j : c10::irange(index.dim())) {
33
+ int64_t srcIdx = static_cast<int64_t>(result.size() + j);
34
+ if (index.size(j) != self.size(srcIdx)) {
35
+ invalid_mask(self, srcIdx, index, j);
36
+ }
37
+ }
38
+ // Replace with nonzeros
39
+ auto nonzero = index.nonzero();
40
+ for (const auto j : c10::irange(index.dim())) {
41
+ result.emplace_back(nonzero.select(1, j));
42
+ }
43
+ } else {
44
+ result.emplace_back(index);
45
+ }
46
+ }
47
+ }
48
+ return result;
49
+ }
50
+
51
+ static C10_UNUSED void checkIndexTensorTypes(IOptTensorListRef indices, bool allow_int=false) {
52
+ for (const auto& tensor : indices) {
53
+ if (tensor.has_value() && tensor->defined()) {
54
+ auto scalarType = tensor->scalar_type();
55
+ if (allow_int) {
56
+ if (scalarType != kLong && scalarType != kByte && scalarType != kBool && scalarType != kInt) {
57
+ TORCH_CHECK_INDEX(false, "tensors used as indices must be long, int, byte or bool tensors");
58
+ }
59
+ } else {
60
+ if (scalarType != kLong && scalarType != kByte && scalarType != kBool) {
61
+ TORCH_CHECK_INDEX(false, "tensors used as indices must be long, byte or bool tensors");
62
+ }
63
+ }
64
+ }
65
+ }
66
+ }
67
+
68
+ inline torch::List<c10::optional<Tensor>> toListOfOptionalTensors(ArrayRef<Tensor> list) {
69
+ torch::List<c10::optional<Tensor>> result;
70
+ result.reserve(list.size());
71
+ for (const Tensor& a : list) {
72
+ result.push_back(a);
73
+ }
74
+ return result;
75
+ }
76
+
77
+ inline torch::List<c10::optional<Tensor>> toListOfOptionalTensors(ArrayRef<IValue> list) {
78
+ torch::List<c10::optional<Tensor>> result;
79
+ result.reserve(list.size());
80
+ for (const IValue& a : list) {
81
+ result.push_back(a.isTensor() ? c10::optional<Tensor>(a.toTensor()) : c10::optional<Tensor>());
82
+ }
83
+ return result;
84
+ }
85
+
86
// Returns true if all the defined (non-null) tensors in `tl` are adjacent,
// i.e. no undefined entry sits between the first and last defined ones.
static C10_UNUSED bool hasContiguousSubspace(TensorList tl) {
  // true if all the non-null tensors are adjacent
  auto isDefined = [](const Tensor & tensor){ return tensor.defined(); };
  auto isNull = [](const Tensor & tensor){ return !tensor.defined(); };
  // `start` points at the first defined tensor; `stop.base()` is one past the
  // last defined tensor (reverse_iterator::base converts back to forward).
  auto start = std::find_if(tl.begin(), tl.end(), isDefined);
  auto stop = std::find_if(tl.rbegin(), tl.rend(), isDefined);
  // Any null entry found strictly inside [start, stop.base()) breaks adjacency.
  auto it = std::find_if(start, stop.base(), isNull);
  return it == stop.base();
}
95
+
96
+
97
+ // Transposes the tensor and indices together so that all the non-null indices
98
+ // index the first k dimensions of the tensor. Returns the transposed tensor
99
+ // and the reordered indices. For example:
100
+ // transposeToFront(tensor, {nullptr, a, nullptr, b})
101
+ // returns
102
+ // tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr}
103
+ static C10_UNUSED std::tuple<Tensor, std::vector<Tensor>>
104
+ transposeToFront(const Tensor& self, TensorList indices) {
105
+ std::vector<int64_t> dims;
106
+ std::vector<Tensor> transposedIndices;
107
+ dims.reserve(self.dim());
108
+ for (const auto i : c10::irange(self.dim())) {
109
+ if (indices[i].defined()) {
110
+ dims.push_back(i);
111
+ transposedIndices.emplace_back(indices[i]);
112
+ }
113
+ }
114
+ for (const auto i : c10::irange(self.dim())) {
115
+ if (!indices[i].defined()) {
116
+ dims.push_back(i);
117
+ transposedIndices.emplace_back();
118
+ }
119
+ }
120
+ return std::make_tuple(self.permute(dims), std::move(transposedIndices));
121
+ }
122
+
123
+ inline std::tuple<Tensor, std::vector<Tensor>, std::vector<int64_t>>
124
+ transposeToFrontAndInvPerm(const Tensor& self, TensorList indices) {
125
+ std::vector<int64_t> dims;
126
+ std::vector<int64_t> invPerm;
127
+ std::vector<Tensor> transposedIndices;
128
+ dims.reserve(self.dim());
129
+ invPerm.resize(self.dim());
130
+ for (const auto i : c10::irange(self.dim())) {
131
+ if (indices[i].defined()) {
132
+ dims.push_back(i);
133
+ transposedIndices.emplace_back(indices[i]);
134
+ }
135
+ }
136
+ for (const auto i : c10::irange(self.dim())) {
137
+ if (!indices[i].defined()) {
138
+ dims.push_back(i);
139
+ transposedIndices.emplace_back();
140
+ }
141
+ }
142
+ for (const auto i : c10::irange(self.dim())) {
143
+ invPerm[dims[i]] = i;
144
+ }
145
+ return std::make_tuple(self.permute(dims), std::move(transposedIndices), std::move(invPerm));
146
+ }
147
+
148
// Precomputed state for an advanced-indexing operation, consumed by the
// TensorIterator-based index kernels. The constructor is defined in the
// corresponding .cpp (not visible here).
struct AdvancedIndex {
  AdvancedIndex(const Tensor& src, TensorList indices);

  Tensor src;                    // the tensor being indexed
  std::vector<Tensor> indices;   // one index tensor per indexed dimension
  DimVector indexed_sizes;       // sizes of the indexed dimensions of src
  DimVector indexed_strides;     // strides of the indexed dimensions of src
  int64_t dims_before;           // number of src dims before the indexed block
  int64_t dims_after;            // number of src dims after the indexed block
};
158
+
159
+
160
+ } //namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/core/Tensor.h>
2
+ #include <ATen/core/dispatch/Dispatcher.h>
3
+ #include <ATen/core/op_registration/op_registration.h>
4
+ #include <ATen/native/UnaryOps.h>
5
+ #include <ATen/native/Resize.h>
6
+ #include <c10/util/irange.h>
7
+ #include <torch/library.h>
8
+
9
+ #ifndef AT_PER_OPERATOR_HEADERS
10
+ #include <ATen/Functions.h>
11
+ #else
12
+ #include <ATen/ops/clone.h>
13
+
14
+ #include <utility>
15
+ #endif
16
+
17
+ namespace at::native {
18
+ // This fallback should only be used for operations that are self inverse and have a corresponding tensor
19
+ // bit (internally implemented using DispatchKey) to maintain the state on tensor using tensor bit.
20
+ // Currently there are two tensor bits that trigger this fallback: conjugate bit and negative bit.
21
+ // Conjugate bit is set on a tensor when `.conj()` is called and neg bit is set on a tensor when `.conj().imag` is called.
22
+
23
+ // NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit.
24
// Boxed-dispatch fallback shared by the conjugate and negative tensor bits.
// Subclasses supply `is_bit_set` for their particular bit; `fallback_impl`
// materializes (clones) flagged inputs before redispatching below `key`.
struct MathOpFallback {
  MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(std::move(op_name_)) {}
  // Returns whether this fallback's math bit is set on the given tensor.
  virtual bool is_bit_set(const Tensor&) = 0;
  void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
    /*
      Situations to handle:
        1. Out-of-place operation. Easy: materialize all inputs and call it a day.
        2. Inplace operation. Desugar x.add_(2) into x.conj_().add_(2).conj_().
           Materialize other inputs as in (1).
        3. out= operation. Desugar add(x, 2, out=y) into y.copy_(add(x, 2))
           Materialize other inputs as in (1).

      It is important to be able to tell if we READ from an argument and if we
      WRITE to an argument. Conservative approach is to assume that we always
      READ from an argument, but in out= operations you can skip
      conjugating inputs on entry that never get used. In the current schema we
      can't easily tell if the operation is in in-place or out= operation.

      Note:
      1. Mutable tensorlists containing tensors whose math bit set to true are disallowed.
      2. Mutable tensors with math bit set to true are unconditionally cloned to ensure
         correct behavior in the case when the mutable tensor shares memory with non mutable arguments.

         If we were to in-place resolve the math bit for mutable inputs, then the non-mutable inputs sharing partial or full memory
         with these mutable inputs would read into wrong values in the following cases:
         1. Non mutable inputs have their math bit set to false.
         2. Math bit for mutable input(s) is resolved before the non mutable inputs (with bit set to true and sharing memory
            with one or more mutable arg(s)) are cloned.
         At the end, the final value of the mutable arguments from the stack are copied into the original input mutable tensor inputs.
    */
    const auto& arguments = op.schema().arguments();
    const auto num_arguments = arguments.size();
    // Arguments occupy the top `num_arguments` slots of the stack.
    const auto stack_start = stack->size() - num_arguments;

    c10::optional<bool> is_write;
    for (const auto i : c10::irange(num_arguments)) {
      // Three possible states:
      // 1. alias_info has no value --> out-of-place operation
      // 2. alias_info does have a value, alias_info->is_write=True --> in-place or out= operation
      // 3. alias_info does have a value, alias_info->is_write=False --> view operation
      const AliasInfo* alias_info = arguments[i].alias_info();
      if (alias_info != nullptr) {
        if (is_write.has_value()) {
          TORCH_CHECK(*is_write == alias_info->isWrite(),
            "Unsupported operator for ", op_name, " fallback: ", op.schema().name(),
            op_name, " fallback doesn't work for operators with a mix "
            "mutable and non-mutable inputs that alias with outputs, "
            "this must be implemented manually.  "
            "If you got this error on a core op, please report a bug to PyTorch.");
        } else {
          is_write = alias_info->isWrite();
        }
      }
    }

    if (is_write.has_value() && !*is_write) {
      // We assume that view operators automatically handle the math bit
      // correctly by propagating the dispatch key in key_set.
      // This is not necessarily always right, so you should test these cases.
      op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack);
      return;
    }

    // Mutable inputs with math bit set to True and their clones
    std::vector<std::pair<Tensor, Tensor>> mutable_inputs_with_their_clones;
    for (const auto i : c10::irange(num_arguments)) {
      auto& ivalue = (*stack)[stack_start + i];
      if (!(ivalue.isTensor() || ivalue.isTensorList())) {
        continue;
      }
      const auto& argument = arguments[i];
      bool mut_arg = false;
      if (argument.alias_info()) {
        // Was already tested by is_write loop above
        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(argument.alias_info()->isWrite());
        mut_arg = true;
      }
      if (ivalue.isTensor()) {
        if (!is_bit_set(ivalue.toTensor())) {
          continue;
        }
        // Materialize the bit: clone resolves it, and the clone replaces the
        // original on the stack.
        auto tensor = std::move(ivalue).toTensor();
        auto resolved_tensor = at::clone(tensor);
        if (mut_arg) {
          TORCH_CHECK(mutable_inputs_with_their_clones.empty(), op_name, " fallback does not support operators with more than one mutable tensors with ",
            op_name, "bit set to true.");
          mutable_inputs_with_their_clones.emplace_back(std::move(tensor), resolved_tensor);
        }
        (*stack)[stack_start + i] = std::move(resolved_tensor);
      } else if (ivalue.isTensorList()) {
        auto tensors = std::move(ivalue).toTensorList();
        for(const auto j : c10::irange(tensors.size())) {
          const auto& tensor = tensors[j];
          if (!is_bit_set(tensor)) {
            continue;
          }
          TORCH_CHECK(!mut_arg, " fallback doesn't currently support mutable TensorLists with ",
              op_name, " inputs. Please materialize all the ", op_name, " input tensor(s) in the mutable TensorList inputs before calling ",
              op.schema().name());
          tensors[j] = at::clone(tensor);
        }
        (*stack)[stack_start + i] = std::move(tensors);
      }
    }

    op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack);

    TORCH_INTERNAL_ASSERT(mutable_inputs_with_their_clones.size() <= 1);

    // At most one entry (enforced above); the op's return now sits at
    // stack_start after the boxed call popped the arguments.
    for (std::pair<Tensor, Tensor> mut_tensors: mutable_inputs_with_their_clones) {
      auto& mutable_input = mut_tensors.first;
      auto& cloned_mutable_input = mut_tensors.second;
      auto& ivalue = (*stack)[stack_start];
      auto returned_output = std::move(ivalue).toTensor();

      // sanity check to ensure that the tensor in stack aliases the cloned_mutable_input
      TORCH_INTERNAL_ASSERT(cloned_mutable_input.is_same(returned_output));

      // necessary for out= arg
      at::native::resize_output(mutable_input, returned_output.sizes());

      // Copy the computed result back into the caller's original tensor and
      // hand that tensor back on the stack.
      mutable_input.copy_(returned_output);
      (*stack)[stack_start] = std::move(mutable_input);
    }
  }

  virtual ~MathOpFallback() = default;

  DispatchKey key;     // the tensor-bit dispatch key this fallback services
  string op_name;      // human-readable bit name used in error messages
};
156
+
157
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/Parallel.h>
5
+ #include <ATen/native/DispatchStub.h>
6
+ #include <ATen/native/Pool.h>
7
+
8
+ namespace at::native {
9
+
10
// Validates all max_pool1d arguments and the resulting output width.
// Throws (TORCH_CHECK) with a user-facing message on the first violated
// constraint; check order determines which error the user sees, so it is
// intentionally kept as-is.
static void check_max_pool1d(
    const Tensor& self,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    bool ceil_mode) {

  TORCH_CHECK(
      self.dim() == 2 || self.dim() == 3,
      "max_pool1d() Expected 2D or 3D input tensor, but got ", self.sym_sizes());
  TORCH_CHECK(
      kernel_size.size() == 1,
      "max_pool1d() kernel_size must be an int, list of ints or tuple of ints of size 1 but got size ",
      kernel_size.size());
  TORCH_CHECK(
      stride.empty() || stride.size() == 1,
      "max_pool1d() stride must be None, an int, list of ints, or tuple of ints of size 1 but got size ",
      stride.size());
  TORCH_CHECK(
      padding.size() == 1,
      "max_pool1d() padding must be an int, list of ints, or tuple of ints of size 1 but got size ",
      padding.size());
  TORCH_CHECK(
      dilation.size() == 1,
      "max_pool1d() dilation must be an int, list of ints or tuple of ints of size 1 but got size ",
      dilation.size());

  // If stride=None then set it to kernel_size
  if (stride.empty()) {
    stride = kernel_size;
  }

  TORCH_CHECK(
      kernel_size[0] > 0,
      "max_pool1d() kernel_size must be greater than zero, but got ",
      kernel_size[0]);
  TORCH_CHECK(
      stride[0] > 0, "max_pool1d() stride must be greater than zero, but got ", stride[0]);
  TORCH_CHECK(
      padding[0] >= 0, "max_pool1d() padding must be non-negative, but got ", padding[0]);
  TORCH_CHECK(
      padding[0] <= kernel_size[0] / 2,
      "max_pool1d() padding should be at most half of kernel size, but got padding=",
      padding[0],
      " and kernel_size=",
      kernel_size[0]);
  TORCH_CHECK(
      dilation[0] > 0, "max_pool1d() dilation must be greater than zero, but got ", dilation[0]);

  // guard_int: symbolic sizes must be concrete here to compute the output width.
  const int64_t OW = pooling_output_shape(self.sym_size(-1).guard_int(__FILE__, __LINE__), kernel_size[0], padding[0], stride[0], dilation[0], ceil_mode);
  TORCH_CHECK(OW > 0, "max_pool1d() Invalid computed output size: ", OW);
}
63
+
64
+ // TODO(Heitor) Template by dimension
65
+ struct PoolingParams1D {
66
+ int64_t NB; // Number of batches
67
+ int64_t NC; // Number of channels
68
+ int64_t IW; // Input width
69
+ int64_t OW; // Output width
70
+ int64_t KW; // Kernel width
71
+ int64_t SJ; // Column stride
72
+ int64_t PJ; // Column padding
73
+ int64_t DJ; // Column dilation
74
+
75
+ // Return index of input element for the given kernel and output index
76
+ inline int64_t index(int64_t kj, int64_t oj) const {
77
+ return oj * SJ + kj * DJ - PJ;
78
+ }
79
+
80
+ // Return index of first output within bounds for this kernel index
81
+ inline int64_t valid_output_start(int64_t kj) const {
82
+ int64_t ij = index(kj, 0);;
83
+ return ij < 0 ? at::divup(-ij, SJ) : 0;
84
+ }
85
+
86
+ // Return index one past last output within bounds for this kernel index
87
+ inline int64_t valid_output_end(int64_t kj) const {
88
+ int64_t ij = index(kj, OW - 1);
89
+ return ij >= IW ? OW - at::divup(ij - (IW - 1), SJ) : OW;
90
+ }
91
+ };
92
+
93
// Kernel signature for max_pool1d: (output, input, pooling geometry).
using pooling_fn = void (*)(Tensor&, const Tensor&, const PoolingParams1D&);

// Per-backend implementation registered via REGISTER_DISPATCH.
DECLARE_DISPATCH(pooling_fn, max_pool1d_stub);
96
+
97
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonEmptyUtils.h ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/core/TensorBase.h>
2
+ #include <algorithm>
3
+ #include <vector>
4
+
5
+ namespace at::native {
6
+
7
// Treat a 0-d (scalar) tensor as having a single dimension: clamps the
// dimension count to at least 1.
inline int64_t ensure_nonempty_dim(int64_t dim) {
  return dim < 1 ? 1 : dim;
}
10
+
11
+ inline int64_t ensure_nonempty_size(const TensorBase &t, int64_t dim) {
12
+ return t.dim() == 0 ? 1 : t.size(dim);
13
+ }
14
+
15
+ inline int64_t ensure_nonempty_stride(const TensorBase &t, int64_t dim) {
16
+ return t.dim() == 0 ? 1 : t.stride(dim);
17
+ }
18
+
19
using IdxVec = std::vector<int64_t>;
// Make an empty size/stride vector usable as a 1-d shape by giving it a
// single element of 1; non-empty vectors pass through unchanged.
inline IdxVec ensure_nonempty_vec(IdxVec vec) {
  if (!vec.empty()) {
    return vec;
  }
  vec.push_back(1);
  return vec;
}
26
+
27
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Padding.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+
6
+ namespace at::native {
7
+
8
// Kernel signature shared by all padding ops: (output, input, padding).
// The same signature serves forward and backward (grad_input, grad_output, padding).
using padding_fn = void (*)(const Tensor&, const Tensor&, IntArrayRef);

// reflection padding
DECLARE_DISPATCH(padding_fn, reflection_pad1d_kernel);
DECLARE_DISPATCH(padding_fn, reflection_pad1d_backward_kernel);
DECLARE_DISPATCH(padding_fn, reflection_pad2d_kernel);
DECLARE_DISPATCH(padding_fn, reflection_pad2d_backward_kernel);
DECLARE_DISPATCH(padding_fn, reflection_pad3d_kernel);
DECLARE_DISPATCH(padding_fn, reflection_pad3d_backward_kernel);

// replication padding
DECLARE_DISPATCH(padding_fn, replication_pad1d_kernel);
DECLARE_DISPATCH(padding_fn, replication_pad1d_backward_kernel);
DECLARE_DISPATCH(padding_fn, replication_pad2d_kernel);
DECLARE_DISPATCH(padding_fn, replication_pad2d_backward_kernel);
DECLARE_DISPATCH(padding_fn, replication_pad3d_kernel);
DECLARE_DISPATCH(padding_fn, replication_pad3d_backward_kernel);
25
+
26
+ namespace padding {
27
+
28
+ template <int dim>
29
+ static inline void check_valid_input(const Tensor& input, IntArrayRef padding) {
30
+
31
+ TORCH_CHECK(padding.size() == 2 * dim,
32
+ "padding size is expected to be ", 2 * dim,
33
+ ", but got: ", padding.size());
34
+
35
+ int input_dim = input.dim();
36
+
37
+ bool is_batch_mode = input_dim == (dim + 2);
38
+
39
+ bool valid_batch_mode = is_batch_mode;
40
+ bool valid_non_batch_mode = !is_batch_mode;
41
+
42
+ if (is_batch_mode) {
43
+ // allow batch size of 0-dim.
44
+ for (const auto d : c10::irange(1, input_dim)) {
45
+ valid_batch_mode = valid_batch_mode && input.size(d) != 0;
46
+ }
47
+ } else {
48
+ for (const auto d : c10::irange(0, input_dim)) {
49
+ valid_non_batch_mode = valid_non_batch_mode && input.size(d) != 0;
50
+ }
51
+ }
52
+
53
+ // allow empty batch size but not other dimensions.
54
+ TORCH_CHECK(valid_batch_mode || valid_non_batch_mode,
55
+ "Expected ", dim + 1, "D or ", dim + 2,
56
+ "D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ",
57
+ input.sizes());
58
+ }
59
+
60
+ } // namespace padding
61
+
62
+ } // at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/PointwiseOps.h ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Ternary and higher-order pointwise operations
2
+ #pragma once
3
+
4
+ #include <ATen/native/DispatchStub.h>
5
+
6
+ namespace c10 {
7
+ class Scalar;
8
+ }
9
+
10
+ namespace at {
11
+
12
+ struct TensorIterator;
13
+ struct TensorIteratorBase;
14
+
15
+ namespace native {
16
+
17
+ using pointwise_fn = void (*)(TensorIterator&, const Scalar& scalar);
18
+ using structured_pointwise_fn = void (*)(TensorIteratorBase&, const Scalar& scalar);
19
+ using pointwise_fn_double = void (*)(TensorIterator&, const Scalar&, double);
20
+
21
+ DECLARE_DISPATCH(structured_pointwise_fn, addcmul_stub);
22
+ DECLARE_DISPATCH(structured_pointwise_fn, addcdiv_stub);
23
+ DECLARE_DISPATCH(pointwise_fn_double, smooth_l1_backward_stub);
24
+ DECLARE_DISPATCH(pointwise_fn_double, huber_backward_stub);
25
+ DECLARE_DISPATCH(pointwise_fn, mse_backward_stub);
26
+
27
+ } // namespace native
28
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pool.h ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/core/Tensor.h>
2
+ #include <ATen/div_rtn.h>
3
+ #include <ATen/TensorUtils.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+ #include <c10/util/irange.h>
6
+
7
+ #include <utility>
8
+
9
+ #pragma once
10
+
11
+ namespace at::native {
12
+
13
// Kernel signatures for 2-d/3-d max and average pooling; each stub is filled
// in per backend via REGISTER_DISPATCH.
using max_pool2d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input,
    int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH);
using max_pool2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);

DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel);
DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel);

// average pooling has same signature for forward and backward
using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH,
    int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, c10::optional<int64_t> divisor_override);
using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH,
    int dW, int dH, int padW, int padH, bool count_include_pad, c10::optional<int64_t> divisor_override);

DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel);
DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel);

using max_pool3d_fn = void(*)(Tensor& output, Tensor& indices, const Tensor& input,
    int kW, int kH, int kD, int dW, int dH, int dD, int pW, int pH, int pD, int dilationW, int dilationH, int dilationD);
using max_pool3d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);

DECLARE_DISPATCH(max_pool3d_fn, max_pool3d_kernel);
DECLARE_DISPATCH(max_pool3d_backward_fn, max_pool3d_backward_kernel);
35
+ namespace {
36
+
37
// Narrowing cast that throws (TORCH_CHECK) if `v` does not fit in dest_t.
template <typename dest_t, typename src_t>
static inline dest_t safe_downcast(src_t v)
{
  // Reject values outside the representable range of the destination type.
  constexpr auto lowest = std::numeric_limits<dest_t>::min();
  constexpr auto highest = std::numeric_limits<dest_t>::max();
  TORCH_CHECK(lowest <= v && v <= highest,
              "integer out of range");

  return static_cast<dest_t>(v);
}
46
+
47
+ template<typename T>
48
+ static inline T pooling_output_shape_pad_lr(
49
+ T inputSize, T kernelSize, T pad_l, T pad_r, T stride, T dilation,
50
+ bool ceil_mode) {
51
+ T outputSize = div_rtn<T>(
52
+ inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 +
53
+ (ceil_mode ? stride - 1 : 0), stride) + 1;
54
+ if (ceil_mode) {
55
+ // ensure that the last pooling starts inside the image
56
+ // needed to avoid problems in ceil mode
57
+ if ((outputSize - 1) * stride >= inputSize + pad_l) {
58
+ --outputSize;
59
+ }
60
+ }
61
+ return outputSize;
62
+ }
63
+
64
// Output length for symmetric padding; validates stride and padding before
// delegating to pooling_output_shape_pad_lr.
template<typename T>
static inline T pooling_output_shape(
      T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) {
    TORCH_CHECK(stride != 0, "stride should not be zero");
    TORCH_CHECK(pad >= 0,
                "pad must be non-negative, but got pad: ", pad);
    // Effective kernel size accounts for dilation.
    TORCH_CHECK(pad <= ((kernelSize - 1) * dilation + 1) / 2,
                "pad should be at most half of effective kernel size, but got pad=",
                pad, ", kernel_size=", kernelSize, " and dilation=", dilation)
    return pooling_output_shape_pad_lr(
        inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
}
76
+
77
// Computes (left, right) padding for "same"-mode pooling so the output length
// is ceil(inputSize / stride). Prefers a symmetric split when possible.
template <typename T>
std::pair<T, T> _pooling_same_mode_padding_lr(
    T inputSize, T kernelSize, T stride, T dilation) {
  // NOTE: with strides, the output shape is ceil(inputSize/stride)
  auto pad_total = T(dilation) * (kernelSize - 1);

  // If the total is odd but the floor in the output-size formula leaves some
  // slack, drop one unit so the padding splits evenly.
  if (stride > 2 && (pad_total % 2 == 1)) {
    auto slack = inputSize % stride - 1;
    if (slack > 0) {
      pad_total = pad_total - 1;
    }
  }

  auto pad_left = pad_total / 2;
  return {pad_left, pad_total - pad_left};
}
95
+
96
// Concrete int64_t entry point for "same"-mode padding computation.
inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
    int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) {
  return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation);
}
100
+
101
// Symbolic-shape (SymInt) entry point; moves arguments since SymInt is not
// trivially copyable.
inline std::pair<c10::SymInt, c10::SymInt> pooling_same_mode_padding_lr(
    c10::SymInt inputSize, c10::SymInt kernelSize, c10::SymInt stride, c10::SymInt dilation) {
  return _pooling_same_mode_padding_lr(std::move(inputSize), std::move(kernelSize), std::move(stride), std::move(dilation));
}
105
+
106
// AveragePool2d/DilatedMaxPool2d (forward)
// Validates kernel/stride/padding/dilation values, input rank and layout, and
// the computed output geometry; throws TORCH_CHECK errors on violation.
static inline void
pool2d_shape_check(
  const Tensor& input,
  int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW,
  int64_t nInputPlane,
  int64_t inputHeight, int64_t inputWidth,
  int64_t outputHeight, int64_t outputWidth, MemoryFormat memory_format)
{
  const int64_t ndim = input.ndimension();
  const int64_t nOutputPlane = nInputPlane;

  TORCH_CHECK(kW > 0 && kH > 0,
              "kernel size should be greater than zero, but got ",
              "kH: ", kH, " kW: ", kW);
  TORCH_CHECK(dW > 0 && dH > 0,
              "stride should be greater than zero, but got "
              "dH: ", dH, " dW: ", dW);
  TORCH_CHECK(dilationH > 0 && dilationW > 0,
              "dilation should be greater than zero, but got ",
              "dilationH: ", dilationH, " dilationW: ", dilationW);

  bool valid_dims = input.size(1) != 0 && input.size(2) != 0;
  if (memory_format == at::MemoryFormat::ChannelsLast){
    // Expect tensor in NHWC format and allow 0-dim only for N.
    TORCH_CHECK((ndim == 4 && valid_dims && input.size(3) != 0),
                "Expected 4D (batch mode) tensor expected for input with channels_last layout"
                " with optional 0 dim batch size for input, but got: ", input.sizes());
  } else {
    TORCH_CHECK((ndim == 3 && input.size(0) != 0 && valid_dims) ||
                (ndim == 4 && valid_dims && input.size(3) != 0),
                "Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got:",
                input.sizes());
  }

  TORCH_CHECK(kW/2 >= padW && kH/2 >= padH,
              "pad should be smaller than or equal to half of kernel size, but got ",
              "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH);

  TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
              "Given input size: (",
              nInputPlane, "x", inputHeight, "x", inputWidth, "). ",
              "Calculated output size: (",
              nOutputPlane, "x", outputHeight, "x", outputWidth, "). ",
              "Output size is too small");
}
152
+
153
// DilatedMaxPool2d (backward)
// Re-runs the forward shape check, then verifies grad_output and indices
// match the expected output geometry in their last three dimensions.
static inline void
max_pool2d_backward_shape_check(
  const Tensor& input,
  const Tensor& gradOutput,
  const Tensor& indices,
  int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW,
  int64_t nInputPlane,
  int64_t inputHeight, int64_t inputWidth,
  int64_t outputHeight, int64_t outputWidth, MemoryFormat memory_format)
{
  pool2d_shape_check(
    input,
    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
    nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format);

  const int64_t ndim = input.ndimension();
  const int64_t nOutputPlane = nInputPlane;

  check_dim_size(gradOutput, ndim, ndim-3, nOutputPlane);
  check_dim_size(gradOutput, ndim, ndim-2, outputHeight);
  check_dim_size(gradOutput, ndim, ndim-1, outputWidth);

  check_dim_size(indices, ndim, ndim-3, nOutputPlane);
  check_dim_size(indices, ndim, ndim-2, outputHeight);
  check_dim_size(indices, ndim, ndim-1, outputWidth);
}
180
+
181
// AveragePool2d (backward)
// Forward shape check with dilation fixed to 1 (avg pooling has none), then
// validates grad_output against the expected output geometry.
static inline void
avg_pool2d_backward_shape_check(
  const Tensor& input,
  const Tensor& gradOutput,
  int64_t /*nbatch*/,
  int kH, int kW, int dH, int dW, int padH, int padW,
  int64_t nInputPlane,
  int64_t inputHeight, int64_t inputWidth,
  int64_t outputHeight, int64_t outputWidth,
  MemoryFormat memory_format)
{
  pool2d_shape_check(
    input,
    kH, kW, dH, dW, padH, padW, 1, 1,
    nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
    memory_format);

  const int64_t ndim = input.ndimension();
  const int64_t nOutputPlane = nInputPlane;

  check_dim_size(gradOutput, ndim, ndim-3, nOutputPlane);
  check_dim_size(gradOutput, ndim, ndim-2, outputHeight);
  check_dim_size(gradOutput, ndim, ndim-1, outputWidth);
}
206
+
207
+ // AveragePool3d/DilatedMaxPool3d (forward)
208
+ static inline void
209
+ pool3d_shape_check(
210
+ const Tensor& input,
211
+ int64_t nslices,
212
+ int kT, int kH, int kW,
213
+ int dT, int dH, int dW,
214
+ int pT, int pH, int pW,
215
+ int dilationT, int dilationH, int dilationW,
216
+ int64_t itime, int64_t iheight, int64_t iwidth,
217
+ int64_t otime, int64_t oheight, int64_t owidth,
218
+ const char *fn_name,
219
+ bool check_input_size=false)
220
+ {
221
+ const int64_t ndim = input.ndimension();
222
+
223
+ TORCH_CHECK(kT > 0 && kW > 0 && kH > 0,
224
+ "kernel size should be greater than zero, but got ",
225
+ "kT: ", kT, " kH: ", kH, " kW: ", kW);
226
+ TORCH_CHECK(dT > 0 && dW > 0 && dH > 0,
227
+ "stride should be greater than zero, but got ",
228
+ "dT: ", dT, " dH: ", dH, " dW: ", dW);
229
+ TORCH_CHECK(dilationT > 0 && dilationW > 0 && dilationH > 0,
230
+ "dilation should be greater than zero, but got ",
231
+ "dilationT: ", dilationT, " dilationH: ", dilationH, " dilationW: ", dilationW);
232
+
233
+ TORCH_CHECK(ndim == 4 || ndim == 5,
234
+ fn_name, ": Expected 4D or 5D tensor for input, but got: ", input.sizes());
235
+
236
+ for (const auto i : c10::irange(ndim)) {
237
+ if (ndim == 5 && i == 0) {
238
+ // size of batch-dim can be 0.
239
+ continue;
240
+ }
241
+ TORCH_CHECK(
242
+ input.size(i) > 0,
243
+ fn_name,
244
+ ": Expected input's non-batch dimensions to have positive length,"
245
+ " but input has a shape of ",
246
+ input.sizes(),
247
+ " and non-batch dimension ",
248
+ input.size(i),
249
+ " has length zero!")
250
+ }
251
+
252
+ if (check_input_size) { // AveragePool3d
253
+ TORCH_CHECK(itime >= kT && iheight >= kH && iwidth >= kW,
254
+ "input image ", "(T: ", itime, " H: ", iheight, " W: ", iwidth, ") smaller than ",
255
+ "kernel size ", "(kT: ", kT, " kH: ", kH, " kW: ", kW, ")");
256
+ }
257
+
258
+ TORCH_CHECK(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH,
259
+ "pad should be smaller than or equal to half of kernel size, but got "
260
+ "kT: ", kT, " kW: ", kW, " kH: ", kH, " padT: ", pT, " padW: ", pW, " padH: ", pH);
261
+
262
+ TORCH_CHECK(otime >= 1 && owidth >= 1 && oheight >= 1,
263
+ "Given input size: (",
264
+ nslices,"x", itime, "x", iheight, "x", iwidth, "). ",
265
+ "Calculated output size: (",
266
+ nslices, "x", otime, "x", oheight, "x", owidth, "). ",
267
+ "Output size is too small");
268
+ }
269
+
270
+ static inline void
271
+ max_pool3d_backward_shape_check(
272
+ const Tensor& input,
273
+ const Tensor& gradOutput,
274
+ const Tensor& indices,
275
+ int64_t nslices,
276
+ int kT, int kH, int kW,
277
+ int dT, int dH, int dW,
278
+ int pT, int pH, int pW,
279
+ int dilationT, int dilationH, int dilationW,
280
+ int64_t itime, int64_t iheight, int64_t iwidth,
281
+ int64_t otime, int64_t oheight, int64_t owidth,
282
+ const char* fn_name)
283
+ {
284
+ const int64_t ndim = input.ndimension();
285
+
286
+ pool3d_shape_check(
287
+ input,
288
+ nslices,
289
+ kT, kH, kW,
290
+ dT, dH, dW,
291
+ pT, pH, pW,
292
+ dilationT, dilationH, dilationW,
293
+ itime, iheight, iwidth,
294
+ otime, oheight, owidth, fn_name);
295
+
296
+ check_dim_size(gradOutput, ndim, ndim-4, nslices);
297
+ check_dim_size(gradOutput, ndim, ndim-3, otime);
298
+ check_dim_size(gradOutput, ndim, ndim-2, oheight);
299
+ check_dim_size(gradOutput, ndim, ndim-1, owidth);
300
+
301
+ check_dim_size(indices, ndim, ndim-4, nslices);
302
+ check_dim_size(indices, ndim, ndim-3, otime);
303
+ check_dim_size(indices, ndim, ndim-2, oheight);
304
+ check_dim_size(indices, ndim, ndim-1, owidth);
305
+ }
306
+
307
+ static inline void
308
+ avg_pool3d_backward_shape_check(
309
+ const Tensor& input,
310
+ const Tensor& gradOutput,
311
+ int64_t nslices,
312
+ int kT, int kH, int kW,
313
+ int dT, int dH, int dW,
314
+ int pT, int pH, int pW,
315
+ int64_t itime, int64_t iheight, int64_t iwidth,
316
+ int64_t otime, int64_t oheight, int64_t owidth,
317
+ const char *fn_name)
318
+ {
319
+ const int64_t ndim = input.ndimension();
320
+
321
+ pool3d_shape_check(
322
+ input,
323
+ nslices,
324
+ kT, kH, kW,
325
+ dT, dH, dW,
326
+ pT, pH, pW,
327
+ 1, 1, 1,
328
+ itime, iheight, iwidth,
329
+ otime, oheight, owidth,
330
+ fn_name, true);
331
+
332
+ check_dim_size(gradOutput, ndim, ndim-4, nslices);
333
+ check_dim_size(gradOutput, ndim, ndim-3, otime);
334
+ check_dim_size(gradOutput, ndim, ndim-2, oheight);
335
+ check_dim_size(gradOutput, ndim, ndim-1, owidth);
336
+ }
337
+
338
+ } // anonymous namespace
339
+
340
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RNN.h ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+
6
+ namespace at::native {
7
+
8
+ using lstm_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool, bool);
9
+ using rnn_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool, bool);
10
+ using lstm_packed_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool);
11
+ using rnn_packed_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool);
12
+
13
+ DECLARE_DISPATCH(lstm_fn, lstm_cudnn_stub);
14
+ DECLARE_DISPATCH(lstm_fn, lstm_miopen_stub);
15
+ DECLARE_DISPATCH(lstm_fn, lstm_mkldnn_stub);
16
+ DECLARE_DISPATCH(rnn_fn, gru_cudnn_stub);
17
+ DECLARE_DISPATCH(rnn_fn, gru_miopen_stub);
18
+ DECLARE_DISPATCH(rnn_fn, rnn_tanh_cudnn_stub);
19
+ DECLARE_DISPATCH(rnn_fn, rnn_tanh_miopen_stub);
20
+ DECLARE_DISPATCH(rnn_fn, rnn_relu_cudnn_stub);
21
+ DECLARE_DISPATCH(rnn_fn, rnn_relu_miopen_stub);
22
+ DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_cudnn_stub);
23
+ DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_miopen_stub);
24
+ DECLARE_DISPATCH(rnn_packed_fn, gru_packed_cudnn_stub);
25
+ DECLARE_DISPATCH(rnn_packed_fn, gru_packed_miopen_stub);
26
+ DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_cudnn_stub);
27
+ DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_miopen_stub);
28
+ DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_cudnn_stub);
29
+ DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_miopen_stub);
30
+
31
+ inline void check_attributes(const Tensor& input, const TensorList& params, const TensorList& hiddens, bool check_dtype=false) {
32
+ auto input_device = input.device();
33
+ auto input_dtype = input.scalar_type();
34
+
35
+ auto check_tensors = [&](const std::string& name, const Tensor& t) {
36
+ if (!t.defined()) return;
37
+ auto t_device = t.device();
38
+ TORCH_CHECK(input_device == t_device,
39
+ "Input and ", name, " tensors are not at the same device, found input tensor at ",
40
+ input_device, " and ", name, " tensor at ", t_device);
41
+ if (check_dtype) {
42
+ auto t_dtype = t.scalar_type();
43
+ TORCH_CHECK(input_dtype == t_dtype,
44
+ "Input and ", name, " tensors are not the same dtype, found input tensor with ",
45
+ input_dtype, " and ", name, " tensor with ", t_dtype);
46
+ }
47
+ };
48
+
49
+ for (const auto& h : hiddens) check_tensors("hidden", h);
50
+ for (const auto& p : params) check_tensors("parameter", p);
51
+ }
52
+
53
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/TensorOperators.h>
5
+
6
+ #ifndef AT_PER_OPERATOR_HEADERS
7
+ #include <ATen/Functions.h>
8
+ #else
9
+ #include <ATen/ops/empty.h>
10
+ #include <ATen/ops/empty_like.h>
11
+ #endif
12
+
13
+ namespace at::native {
14
+
15
+ template <
16
+ typename index_t,
17
+ void compute(index_t*, int64_t*, index_t*, int64_t, int64_t)>
18
+ static inline Tensor repeat_interleave_common(
19
+ const Tensor& repeats,
20
+ c10::optional<int64_t> output_size) {
21
+ TORCH_CHECK(
22
+ repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
23
+ TORCH_CHECK(
24
+ repeats.scalar_type() == at::kLong || repeats.scalar_type() == at::kInt,
25
+ "repeats has to be Long or Int tensor");
26
+ if (repeats.size(0) == 0) {
27
+ return at::empty_like(repeats, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
28
+ }
29
+ Tensor repeats_ = repeats.contiguous();
30
+ Tensor cumsum = repeats.cumsum(0);
31
+ int64_t total;
32
+ if (output_size.has_value()) {
33
+ total = output_size.value();
34
+ } else {
35
+ total = cumsum[-1].item<int64_t>();
36
+ TORCH_CHECK(
37
+ (repeats >= 0).all().item<uint8_t>(), "repeats can not be negative");
38
+ }
39
+
40
+ Tensor result = at::empty({total}, repeats.options());
41
+ index_t* repeat_ptr = repeats_.data_ptr<index_t>();
42
+ int64_t* cumsum_ptr = cumsum.data_ptr<int64_t>();
43
+ index_t* result_ptr = result.data_ptr<index_t>();
44
+ compute(repeat_ptr, cumsum_ptr, result_ptr, repeats.size(0), total);
45
+ return result;
46
+ }
47
+
48
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/native/ResizeCommon.h>
5
+ #include <ATen/EmptyTensor.h>
6
+ #include <ATen/TensorUtils.h>
7
+
8
+ #include <c10/core/CPUAllocator.h>
9
+
10
+ #include <utility>
11
+
12
+
13
+ namespace at::native {
14
+
15
+ // TODO: make all operations that resize given outputs use this function
16
+ // for consistency and maintainability.
17
+ // Some operations like `cat` might not be able to make the use of
18
+ // resize_output directly. For more details to understand how it works in `cat`,
19
+ // see https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362
20
+ // Resizes outputs
21
+ // Functions accepting output tensors, like with the "out" kwarg, should
22
+ // call this function to handle resizing their output tensor.
23
+ // Issues a warning if the output tensor has one or more elements and
24
+ // needs resizing
25
+ // NOTE: In the future the warning will become an error
26
+ // Returns a bool saying whether or not the resize actually happened or not
27
+ TORCH_API bool resize_output(const Tensor& output, IntArrayRef shape);
28
+ // WARNING: Do NOT call this directly. If you are resizing an output and want
29
+ // to support dynamic shapes call at::resize__symint and resize_output_check_symint.
30
+ // For more details, see: https://github.com/pytorch/pytorch/pull/111530/files#r1365845272
31
+ TORCH_API bool resize_output_symint(const Tensor& output, SymIntArrayRef shape);
32
+
33
+ // Utility for resize_output
34
+ // Returns a bool saying resize should happen or not and
35
+ // raises a warning if resizing for one or more elements
36
+ TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape);
37
+ TORCH_API bool resize_output_check_symint(const Tensor& output, SymIntArrayRef shape);
38
+
39
+ TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes);
40
+ TORCH_API void resize_bytes_meta(StorageImpl* storage, c10::SymInt size_bytes);
41
+ TORCH_API void resize_bytes_nocuda(const Storage& storage, c10::SymInt size_bytes);
42
+
43
+ static inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) {
44
+ // It does not make sense to try to resize a storage
45
+ // to hold 0 elements, and this can break
46
+ // if storage_offset is positive but
47
+ // new_size is 0, so just bail in that case
48
+ // (same comment is in cuda/Resize.h)
49
+ if (self->numel() == 0) {
50
+ return;
51
+ }
52
+
53
+ const Storage& storage = self->unsafe_storage();
54
+ if (!storage) {
55
+ auto new_storage = c10::make_intrusive<StorageImpl>(
56
+ StorageImpl::use_byte_size_t(),
57
+ new_size_bytes,
58
+ c10::GetCPUAllocator(),
59
+ true);
60
+ self->set_storage_keep_dtype(std::move(new_storage));
61
+ } else if (new_size_bytes > storage.nbytes()) {
62
+ resize_bytes_cpu(storage.unsafeGetStorageImpl(), new_size_bytes);
63
+ }
64
+ }
65
+
66
+ TORCH_API TensorImpl* resize_impl_cpu_(
67
+ TensorImpl* self,
68
+ IntArrayRef size,
69
+ at::OptionalIntArrayRef stride,
70
+ bool resize_storage = true);
71
+
72
+ template <typename T>
73
+ T maybe_convert_symint(c10::SymInt) = delete;
74
+
75
+ template <>
76
+ inline c10::SymInt maybe_convert_symint(c10::SymInt x) { return x; }
77
+
78
+ template <>
79
+ inline int64_t maybe_convert_symint(c10::SymInt x) { return x.guard_int(__FILE__, __LINE__); }
80
+
81
+ template <typename T>
82
+ static inline void checkInBoundsForStorage(
83
+ ArrayRef<T> size,
84
+ ArrayRef<T> stride,
85
+ T storage_offset,
86
+ const caffe2::TypeMeta& data_type,
87
+ const Storage& new_storage) {
88
+ T storage_size_bytes =
89
+ at::detail::computeStorageNbytes(size, stride, data_type.itemsize());
90
+ T storage_offset_bytes = storage_offset * data_type.itemsize();
91
+ if (storage_size_bytes == 0) {
92
+ // NB: (a tensor with arbitrary 0 dims)'s storage can have any numel.
93
+ return;
94
+ }
95
+ T new_storage_size_bytes = maybe_convert_symint<T>(new_storage.sym_nbytes());
96
+ TORCH_CHECK(
97
+ storage_size_bytes + storage_offset_bytes <= new_storage_size_bytes,
98
+ "setStorage: sizes ",
99
+ size,
100
+ ", strides ",
101
+ stride,
102
+ ","
103
+ " storage offset ",
104
+ storage_offset,
105
+ ", and itemsize ",
106
+ data_type.itemsize(),
107
+ " requiring a storage size of ",
108
+ storage_size_bytes + storage_offset_bytes,
109
+ " are out of bounds for storage of size ",
110
+ new_storage_size_bytes);
111
+ }
112
+
113
+ template <typename T>
114
+ static inline void checkSetStorage(Tensor& result, Storage storage, T storage_offset,
115
+ ArrayRef<T> size, ArrayRef<T> stride) {
116
+ // FIXME: stride should be optional
117
+ if (stride.data()) {
118
+ TORCH_CHECK(size.size() == stride.size(), "unequal size length (", size.size(),
119
+ ") and stride length (", stride.size(), ")");
120
+ }
121
+
122
+ #ifdef DEBUG
123
+ TORCH_CHECK(size.size() <= INT_MAX, "size length (", size.size(), ") greater than INT_MAX");
124
+ #endif
125
+
126
+ // storage: note this can't be replaced with result.set_(storage) as the semantics of that
127
+ // function is to set the tensor size to be equal to the size of the storage.
128
+ if (!result.storage().is_alias_of(storage)) {
129
+ // Caffe2 might have tensors whose storages are null, but we
130
+ // don't allow it in PyTorch.
131
+ TORCH_INTERNAL_ASSERT(storage);
132
+ TORCH_INTERNAL_ASSERT(result.storage());
133
+
134
+ // We used to allow this, but this breaks device caching.
135
+ // Let's put an actual error message for this one.
136
+ TORCH_CHECK(result.storage().device() == storage.device(),
137
+ "Attempted to set the storage of a tensor on device \"", result.storage().device(),
138
+ "\" to a storage on different device \"", storage.device(),
139
+ "\". This is no longer allowed; the devices must match.");
140
+ result.unsafeGetTensorImpl()->set_storage_keep_dtype(std::move(storage));
141
+ }
142
+
143
+ // storageOffset
144
+ TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset);
145
+ }
146
+
147
+ /**
148
+ * Set self's sizes, strides, and storage_offset.
149
+ * (size, stride, storage_offset) must be in bounds for self's storage.
150
+ */
151
+ template <typename T>
152
+ inline void setStrided(
153
+ const Tensor& self,
154
+ ArrayRef<T> size,
155
+ ArrayRef<T> stride,
156
+ T storage_offset) {
157
+ TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape");
158
+ for (const auto& val : stride) {
159
+ TORCH_CHECK(val >= 0,
160
+ "as_strided: Negative strides are not supported at the moment, "
161
+ "got strides: ", stride);
162
+ }
163
+
164
+ auto* self_ = self.unsafeGetTensorImpl();
165
+ checkInBoundsForStorage(
166
+ size, stride, storage_offset, self_->dtype(), self_->storage());
167
+
168
+ /* storage offset */
169
+ TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset);
170
+ self_->set_sizes_and_strides(size, stride, c10::make_optional(storage_offset));
171
+ }
172
+
173
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/native/TensorFactories.h>
5
+ #include <ATen/NamedTensorUtils.h>
6
+ #include <c10/util/irange.h>
7
+
8
+ #ifndef AT_PER_OPERATOR_HEADERS
9
+ #include <ATen/NativeFunctions.h>
10
+ #else
11
+ #include <ATen/ops/empty.h>
12
+ #endif
13
+
14
+ namespace at::native {
15
+
16
+ template <typename T>
17
+ inline T storage_size_for(ArrayRef<T> size, ArrayRef<T> stride) {
18
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY(size.size() == stride.size(),
19
+ "storage_size_for(size, stride) requires that size and stride ",
20
+ "have the same size as a precondition.");
21
+ T storage_size = 1;
22
+ for (const auto dim : c10::irange(size.size())) {
23
+ if (size[dim] == 0) {
24
+ storage_size = 0;
25
+ break;
26
+ }
27
+ storage_size += (size[dim] - 1) * stride[dim];
28
+ }
29
+ return storage_size;
30
+ }
31
+
32
+ inline const Tensor& resize_named_tensor_(
33
+ const Tensor& self,
34
+ IntArrayRef size,
35
+ c10::optional<MemoryFormat> optional_memory_format) {
36
+ TORCH_INTERNAL_ASSERT(self.has_names());
37
+ TORCH_CHECK(
38
+ self.sizes() == size,
39
+ "Cannot resize named tensor with resize_ or resize_as_ (tried to resize "
40
+ "Tensor",
41
+ self.names(),
42
+ " with size ",
43
+ self.sizes(),
44
+ " to ",
45
+ size,
46
+ "). This may be caused by passing a named tensor ",
47
+ "as an `out=` argument; please ensure that the sizes are the same. ");
48
+ TORCH_CHECK(
49
+ !optional_memory_format.has_value(),
50
+ "Unsupported memory format for named tensor resize ",
51
+ optional_memory_format.value());
52
+ return self;
53
+ }
54
+
55
+ // For deterministic output, fill new elements that were added after a storage
56
+ // resize with NaN or MAX_INT. `old_storage_nbytes` is the size of the storage
57
+ // before the resize happened.
58
+ inline const Tensor& fill_resize_deterministic_(const Tensor& tensor, int64_t old_storage_nbytes) {
59
+ const at::Storage& storage = tensor.unsafeGetTensorImpl()->unsafe_storage();
60
+ int64_t new_storage_nbytes = storage.nbytes();
61
+ int64_t old_storage_numel = old_storage_nbytes / tensor.itemsize();
62
+ int64_t new_storage_numel = new_storage_nbytes / tensor.itemsize();
63
+ if (new_storage_numel > old_storage_numel) {
64
+ at::Tensor tensor_view = at::empty({}, at::TensorOptions().dtype(tensor.scalar_type()).device(tensor.device()));
65
+ tensor_view.set_(
66
+ storage,
67
+ /*storage_offset=*/old_storage_numel,
68
+ /*size=*/{new_storage_numel - old_storage_numel},
69
+ /*stride=*/{1});
70
+ at::native::fill_empty_deterministic_(tensor_view);
71
+ }
72
+ return tensor;
73
+ }
74
+
75
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h ADDED
@@ -0,0 +1,544 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // Please note that this file is
3
+ // used across both CPU and GPU.
4
+
5
+ #include <type_traits>
6
+ #include <complex>
7
+ #include <c10/macros/Macros.h>
8
+ #include <ATen/detail/FunctionTraits.h>
9
+ #include <ATen/NumericUtils.h>
10
+ #if defined(__CUDACC__)
11
+ #include <ATen/cuda/DeviceUtils.cuh>
12
+ #include <ATen/native/cuda/DeviceSqrt.cuh>
13
+ #elif defined(__HIPCC__)
14
+ #include <ATen/hip/DeviceUtils.cuh>
15
+ #include <ATen/native/hip/DeviceSqrt.cuh>
16
+ #endif
17
+ #if defined(__CUDACC__) || defined(__HIPCC__)
18
+ #include <thrust/pair.h>
19
+ #else
20
+ #include <cmath>
21
+ #define device_sqrt std::sqrt
22
+ #endif
23
+ #if defined(__CUDACC__) || defined(__HIPCC__)
24
+ template <typename scalar_t>
25
+ inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) {
26
+ #if defined(__HIPCC__)
27
+ // TODO: remove this special case for HIP when issue is fixed:
28
+ // https://github.com/ROCm-Developer-Tools/HIP/issues/2209
29
+ scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b));
30
+ #else
31
+ scalar_t max = at::_isnan(b) ? b : std::max(a, b);
32
+ #endif
33
+ return max;
34
+ }
35
+ template <typename scalar_t>
36
+ inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) {
37
+ #if defined(__HIPCC__)
38
+ // TODO: remove this special case for HIP when issue is fixed:
39
+ // https://github.com/ROCm-Developer-Tools/HIP/issues/2209
40
+ scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b));
41
+ #else
42
+ scalar_t min = at::_isnan(b) ? b : std::min(a, b);
43
+ #endif
44
+ return min;
45
+ }
46
+ #define MAX(X, Y) max_propagate_nan(X,Y)
47
+ #define MIN(X, Y) min_propagate_nan(X,Y)
48
+ #else
49
+ #include <ATen/native/cpu/zmath.h>
50
+ #define MAX(X, Y) max_impl(X,Y)
51
+ #define MIN(X, Y) min_impl(X,Y)
52
+ #endif
53
+
54
+ // ROCM hcc doesn't work well with using std:: in kernel functions
55
+ #if defined(__CUDA_ARCH__)
56
+ #include <c10/cuda/CUDAMathCompat.h>
57
+ #define compat_pow c10::cuda::compat::pow
58
+ #elif defined(__HIPCC__)
59
+ #include <c10/hip/HIPMathCompat.h>
60
+ #define compat_pow c10::hip::compat::pow
61
+ #else
62
+ #define compat_pow std::pow
63
+ #endif
64
+
65
+ namespace at { namespace native {
66
+
67
+ namespace detail {
68
+
69
+ #if defined(__CUDACC__) || defined(__HIPCC__)
70
+ template <typename T1, typename T2> using pair = thrust::pair<T1, T2>;
71
+ #else
72
+ template <typename T1, typename T2> using pair = std::pair<T1, T2>;
73
+ #endif
74
+
75
+ } // namespace detail
76
+
77
+ template <typename scalar_t, typename index_t>
78
+ struct WelfordData {
79
+ scalar_t mean;
80
+ scalar_t m2;
81
+ index_t n;
82
+ scalar_t nf;
83
+
84
+ C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {}
85
+
86
+ C10_HOST_DEVICE WelfordData(
87
+ scalar_t mean,
88
+ scalar_t m2,
89
+ index_t n,
90
+ scalar_t nf)
91
+ : mean(mean), m2(m2), n(n), nf(nf) {}
92
+ };
93
+
94
+
95
+ template <typename scalar_t, typename acc_scalar_t, typename index_t, typename res_t>
96
+ struct WelfordOps {
97
+ acc_scalar_t correction;
98
+ bool take_sqrt;
99
+ public:
100
+ using acc_t = WelfordData<acc_scalar_t, index_t>;
101
+ inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const {
102
+ // We accumulate n in index_t to avoid cumulative rounding error, but still
103
+ // need nf for use in combine where int32 may overflow.
104
+ index_t new_n = acc.n + 1;
105
+ acc_scalar_t new_nf = static_cast<acc_scalar_t>(new_n);
106
+ acc_scalar_t delta = data - acc.mean;
107
+ acc_scalar_t new_mean = acc.mean + delta / new_nf;
108
+ acc_scalar_t new_delta = data - new_mean;
109
+ return {
110
+ new_mean,
111
+ acc.m2 + delta * new_delta,
112
+ new_n,
113
+ new_nf,
114
+ };
115
+ }
116
+ inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
117
+ if (a.nf == 0) {
118
+ return b;
119
+ }
120
+ if (b.nf == 0) {
121
+ return a;
122
+ }
123
+ acc_scalar_t delta = b.mean - a.mean;
124
+ acc_scalar_t new_count = a.nf + b.nf;
125
+ acc_scalar_t nb_over_n = b.nf / new_count;
126
+ return {
127
+ a.mean + delta * nb_over_n,
128
+ a.m2 + b.m2 + delta * delta * a.nf * nb_over_n,
129
+ // setting acc.n as -1 since acc.n might not be able to represent the count
130
+ // correctly within its range, setting it to -1 to avoid confusion
131
+ -1,
132
+ new_count
133
+ };
134
+ }
135
+ inline C10_DEVICE res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ {
136
+ const auto mean = static_cast<scalar_t>(acc.mean);
137
+ const auto divisor = acc.nf > correction ? acc.nf - correction : 0;
138
+ const auto var = acc.m2 / divisor;
139
+ res_t results(take_sqrt ? device_sqrt(var) : var, mean);
140
+ return results;
141
+ }
142
+
143
+ static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
144
+ return acc;
145
+ }
146
+
147
+ #if defined(__CUDACC__) || defined(__HIPCC__)
148
+ inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const {
149
+ return {
150
+ WARP_SHFL_DOWN(acc.mean, offset)
151
+ , WARP_SHFL_DOWN(acc.m2, offset)
152
+ , WARP_SHFL_DOWN(acc.n, offset)
153
+ , WARP_SHFL_DOWN(acc.nf, offset)
154
+ };
155
+ }
156
+ #endif
157
+ C10_HOST_DEVICE WelfordOps(acc_scalar_t correction, bool take_sqrt)
158
+ : correction(correction), take_sqrt(take_sqrt) {}
159
+ };
160
+
161
+ template <typename scalar_t, typename acc_t=scalar_t, typename factor_t=acc_t, typename out_t = acc_t>
162
+ struct MeanOps {
163
+ factor_t factor;
164
+
165
+ inline C10_DEVICE acc_t reduce(acc_t a, scalar_t b, int64_t /*idx*/) const {
166
+ return combine(a, static_cast<acc_t>(b));
167
+ }
168
+
169
+ inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
170
+ return a + b;
171
+ }
172
+
173
+ inline C10_DEVICE out_t project(acc_t a) const {
174
+ return a * factor;
175
+ }
176
+
177
+ static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
178
+ return acc;
179
+ }
180
+
181
+ #if defined(__CUDACC__) || defined(__HIPCC__)
182
+ inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const {
183
+ return WARP_SHFL_DOWN(data, offset);
184
+ }
185
+ #endif
186
+
187
+ MeanOps(factor_t factor): factor(factor) {
188
+ }
189
+ };
190
+
191
+ // This accumulator template is used to calculate the minimum absolute value of
192
+ // a set of numbers.
193
+ // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
194
+ // value. These types differ for complex number input support.
195
+ template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
196
+ struct AbsMinOps {
197
+
198
+ inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
199
+ return MIN(acc, static_cast<acc_t>(std::abs(data)));
200
+ }
201
+
202
+ inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
203
+ return MIN(a, b);
204
+ }
205
+
206
+ inline C10_DEVICE out_t project(acc_t a) const {
207
+ return a;
208
+ }
209
+
210
+ static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
211
+ return acc;
212
+ }
213
+
214
+ #if defined(__CUDACC__) || defined(__HIPCC__)
215
+ inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
216
+ return WARP_SHFL_DOWN(acc, offset);
217
+ }
218
+ #endif
219
+ };
220
+
221
+ // This accumulator template is used to calculate the maximum absolute value of
222
+ // a set of numbers.
223
+ // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
224
+ // value. These types differ for complex number input support.
225
+ template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
226
+ struct AbsMaxOps {
227
+ inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
228
+ return MAX(acc, static_cast<acc_t>(std::abs(data)));
229
+ }
230
+
231
+ inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
232
+ return MAX(a, b);
233
+ }
234
+
235
+ inline C10_DEVICE out_t project(acc_t a) const {
236
+ return a;
237
+ }
238
+
239
+ static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
240
+ return acc;
241
+ }
242
+
243
+ #if defined(__CUDACC__) || defined(__HIPCC__)
244
+ inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
245
+ return WARP_SHFL_DOWN(acc, offset);
246
+ }
247
+ #endif
248
+ };
249
+
250
+ // This accumulator template is used to calculate the norm of the absolute value
251
+ // of a set of numbers.
252
+ // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
253
+ // value. These types differ for complex number input support.
254
+ template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
255
+ struct NormOps {
256
+ acc_t norm_;
257
+
258
+ inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
259
+ return acc + compat_pow(static_cast<acc_t>(std::abs(data)), norm_);
260
+ }
261
+
262
+ inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
263
+ return a + b;
264
+ }
265
+
266
+ inline C10_DEVICE out_t project(acc_t a) const {
267
+ return compat_pow(a, static_cast<acc_t>(1.0) / norm_);
268
+ }
269
+
270
+ static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
271
+ return acc;
272
+ }
273
+
274
+ #if defined(__CUDACC__) || defined(__HIPCC__)
275
+ inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
276
+ return WARP_SHFL_DOWN(acc, offset);
277
+ }
278
+ #endif
279
+
280
+ NormOps(acc_t norm_): norm_(norm_) {
281
+ }
282
+ };
283
+
284
+ // This accumulator template is used to calculate the order zero norm of the
285
+ // absolute value of a set of numbers.
286
+ // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
287
+ // value. These types differ for complex number input support.
288
+ template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
289
+ struct NormZeroOps {
290
+ inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
291
+ return acc + (data == static_cast<scalar_t>(0) ? static_cast<acc_t>(0) : static_cast<acc_t>(1));
292
+ }
293
+
294
+ inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
295
+ return a + b;
296
+ }
297
+
298
+ inline C10_DEVICE out_t project(acc_t a) const {
299
+ return a;
300
+ }
301
+
302
+ static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
303
+ return acc;
304
+ }
305
+
306
+
307
+ #if defined(__CUDACC__) || defined(__HIPCC__)
308
+ inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
309
+ return WARP_SHFL_DOWN(acc, offset);
310
+ }
311
+ #endif
312
+ };
313
+
314
+ // This accumulator template is used to calculate the order one norm of the
315
+ // absolute value of a set of numbers.
316
+ // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
317
+ // value. These types differ for complex number input support.
318
+ template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
319
+ struct NormOneOps {
320
+ inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
321
+ return acc + static_cast<acc_t>(std::abs(data));
322
+ }
323
+
324
+ inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
325
+ return a + b;
326
+ }
327
+
328
+ inline C10_DEVICE out_t project(acc_t a) const {
329
+ return a;
330
+ }
331
+
332
+ static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
333
+ return acc;
334
+ }
335
+
336
+ #if defined(__CUDACC__) || defined(__HIPCC__)
337
+ inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
338
+ return WARP_SHFL_DOWN(acc, offset);
339
+ }
340
+ #endif
341
+ };
342
+
343
+
344
// Tag type carrying the desired accumulator type `acc_t` into the
// abs_if_complex overloads below; it exists purely for overload selection.
template<typename acc_t>
struct AbsSwitch {};

// Real input: no absolute value needed, only a cast to the accumulator type.
template<typename scalar_t, typename acc_t>
inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch<acc_t>) {
  return static_cast<acc_t>(data);
}

// std::complex input: reduce to its (real-valued) magnitude before casting.
template<typename scalar_t, typename acc_t>
inline C10_DEVICE acc_t abs_if_complex(std::complex<scalar_t> data, AbsSwitch<acc_t>) {
  return static_cast<acc_t>(std::abs(data));
}

// c10::complex input: reduce to its (real-valued) magnitude before casting.
template<typename scalar_t, typename acc_t>
inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<acc_t>) {
  return static_cast<acc_t>(std::abs(data));
}
361
+
362
+ // This accumulator template is used to calculate the order two norm of the
363
+ // absolute value of a set of numbers.
364
+ // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
365
+ // value. These types differ for complex number input support.
366
+ template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
367
+ struct NormTwoOps {
368
+ inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
369
+ acc_t data_ = abs_if_complex(data, AbsSwitch<acc_t>());
370
+ return acc + data_ * data_;
371
+ }
372
+
373
+ inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
374
+ return a + b;
375
+ }
376
+
377
+ inline C10_DEVICE out_t project(acc_t a) const {
378
+ return device_sqrt(a);
379
+ }
380
+
381
+ static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
382
+ return acc;
383
+ }
384
+
385
+ #if defined(__CUDACC__) || defined(__HIPCC__)
386
+ inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
387
+ return WARP_SHFL_DOWN(acc, offset);
388
+ }
389
+ #endif
390
+ };
391
+
392
+ template <typename acc_t, typename data_t>
393
+ struct NanSumOps {
394
+ inline C10_DEVICE acc_t reduce(acc_t a, data_t b, int64_t /*idx*/) const {
395
+ return a + (at::_isnan(b) ? acc_t{0.} : acc_t{b});
396
+ }
397
+
398
+ inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
399
+ return a + b;
400
+ }
401
+
402
+ inline C10_DEVICE data_t project(acc_t a) const {
403
+ return data_t{a};
404
+ }
405
+
406
+ static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
407
+ return acc;
408
+ }
409
+
410
+ #if defined(__CUDACC__) || defined(__HIPCC__)
411
+ inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const {
412
+ return WARP_SHFL_DOWN(data, offset);
413
+ }
414
+ #endif
415
+ };
416
+
417
+ namespace detail {
418
+
419
+ template <typename scalar_t>
420
+ struct LessOrNan {
421
+ C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const {
422
+ // If (a == b), then choose the one with lower idx, else min(a, b)
423
+ if (at::_isnan(a)) {
424
+ if (at::_isnan(b)) {
425
+ return idx_a < idx_b;
426
+ }
427
+ return true;
428
+ }
429
+ return (a == b) ? idx_a < idx_b : (a < b);
430
+ }
431
+ };
432
+
433
+ template <typename scalar_t>
434
+ struct GreaterOrNan {
435
+ C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const {
436
+ // If (a == b), then choose the one with lower idx, else max(a, b)
437
+ if (at::_isnan(a)) {
438
+ if (at::_isnan(b)) {
439
+ return idx_a < idx_b;
440
+ }
441
+ return true;
442
+ }
443
+ return (a == b) ? idx_a < idx_b : (a > b);
444
+ }
445
+ };
446
+
447
// Reduction ops tracking a (value, index) pair, parameterized by a comparator
// (LessOrNan / GreaterOrNan) that decides which pair wins. Backs min/max
// reductions that also report the winning element's index.
template <typename comp_t>
struct MinMaxReductionOps {
  using scalar_t = typename binary_function_traits<comp_t>::arg1_t;
  using index_t = int64_t;
  using arg_t = detail::pair<scalar_t, index_t>;

  // The full (value, index) pair is the result.
  static C10_DEVICE arg_t project(arg_t arg) {
    return arg;
  }

  // Keep the current pair if it beats (val, idx) under comp_t, else replace.
  static C10_DEVICE arg_t reduce(arg_t arg, scalar_t val, int64_t idx) {
    return comp_t{}(arg.first, val, arg.second, idx) ? arg : arg_t(val, idx);
  }

  // Merge two partial results by keeping the winning pair.
  static C10_DEVICE arg_t combine(arg_t a, arg_t b) {
    return comp_t{}(a.first, b.first, a.second, b.second) ? a : b;
  }

  // Shift a chunk-local index into the global index space.
  static C10_DEVICE arg_t translate_idx(arg_t a, int64_t base_idx) {
    return {a.first, a.second + base_idx};
  }

#if defined(__CUDACC__) || defined(__HIPCC__)
  static C10_DEVICE arg_t warp_shfl_down(arg_t arg, int offset) {
    return arg_t(WARP_SHFL_DOWN(arg.first, offset),
                 WARP_SHFL_DOWN(arg.second, offset));
  }
#endif
};
476
+
477
// Same as MinMaxReductionOps, but the final result is only the index of the
// winning element (argmin/argmax) rather than the (value, index) pair.
template <typename comp_t>
struct ArgReductionOps : public MinMaxReductionOps<comp_t> {
  using typename MinMaxReductionOps<comp_t>::scalar_t;
  using typename MinMaxReductionOps<comp_t>::index_t;
  using typename MinMaxReductionOps<comp_t>::arg_t;

  // Discard the value; only the index is reported.
  static C10_DEVICE index_t project(arg_t arg) {
    return arg.second;
  }
};
487
+
488
+ } // namespace detail
489
+
490
// Argmax: index of the largest element (NaN wins; lower index breaks ties).
template <typename scalar_t>
struct ArgMaxOps :
  public detail::ArgReductionOps<detail::GreaterOrNan<scalar_t>> {
};

// Argmin: index of the smallest element (NaN wins; lower index breaks ties).
template <typename scalar_t>
struct ArgMinOps :
  public detail::ArgReductionOps<detail::LessOrNan<scalar_t>> {
};

// Min: (value, index) pair of the smallest element, NaN-propagating.
template <typename scalar_t>
struct MinOps :
  public detail::MinMaxReductionOps<detail::LessOrNan<scalar_t>> {
};

// Max: (value, index) pair of the largest element, NaN-propagating.
template <typename scalar_t>
struct MaxOps :
  public detail::MinMaxReductionOps<detail::GreaterOrNan<scalar_t>> {
};
509
+
510
// Reduction ops computing the minimum and maximum simultaneously in a single
// pass. The accumulator is a (min, max) pair; NaN on either side of a
// comparison propagates into the result.
template <typename scalar_t, typename acc_scalar_t, typename index_t>
struct MinMaxOps {
  using acc_t = detail::pair<acc_scalar_t, acc_scalar_t>;
  // Fold one element in by treating it as a degenerate (data, data) pair.
  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const {
    return combine(acc, {data, data});
  }

  // Merge two partial (min, max) pairs. If either operand of a comparison is
  // NaN, the NaN is selected, so NaN propagates.
  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
    auto min_val = (at::_isnan(a.first) || a.first < b.first) ? a.first : b.first;
    auto max_val = (at::_isnan(a.second) || a.second > b.second) ? a.second : b.second;

    return {min_val, max_val};
  }

  // The accumulated (min, max) pair is the final result.
  inline C10_DEVICE acc_t project(acc_t acc) const {
    return acc;
  }

  // Element indices do not participate in this reduction.
  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
    return acc;
  }

#if defined(__CUDACC__) || defined(__HIPCC__)
  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
    return {
      WARP_SHFL_DOWN(acc.first, offset), WARP_SHFL_DOWN(acc.second, offset)
    };
  }
#endif
};
540
+
541
+ }} // namespace at::native
542
+
543
+ #undef MAX
544
+ #undef MIN
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/Parallel.h>
4
+ #include <ATen/SparseTensorImpl.h>
5
+ #include <ATen/core/Tensor.h>
6
+
7
+ #ifndef AT_PER_OPERATOR_HEADERS
8
+ #include <ATen/Functions.h>
9
+ #else
10
+ #include <ATen/ops/empty.h>
11
+ #include <ATen/ops/tensor.h>
12
+ #endif
13
+
14
+ namespace at::sparse {
15
+
16
+ // Just for documentary purposes
17
+ using SparseTensor = Tensor;
18
+ using SparseType = Type;
19
+
20
// This is an internal utility function for getting at the SparseTensorImpl,
// so that we can write sparse tensor specific accessors for special fields
// in SparseTensor. You should only use this for writing low level
// setters/getters for SparseTensorImpl fields; otherwise, you should use
// the low level setters/getters that were implemented using this.
//
// This may be called repeatedly, so make sure it's pretty cheap.
inline SparseTensorImpl* get_sparse_impl(const SparseTensor& self) {
  // Guard against misuse on dense tensors before the unchecked downcast.
  TORCH_INTERNAL_ASSERT(
      self.is_sparse(), "_internal_get_SparseTensorImpl: not a sparse tensor");
  return static_cast<SparseTensorImpl*>(self.unsafeGetTensorImpl());
}
32
+
33
// Takes indices and values and directly puts them into the sparse tensor, no
// copy. This used to be called THSTensor_(_move)
inline void alias_into_sparse(
    const SparseTensor& self,
    const Tensor& indices,
    const Tensor& values) {
  // "unsafe": no consistency checks are performed on the aliased tensors.
  get_sparse_impl(self)->set_indices_and_values_unsafe(indices, values);
}
41
+
42
// Take indices and values and makes a (data) copy of them to put into the
// sparse indices/values. This used to be called THSTensor_(_set)
inline void copy_into_sparse(
    const SparseTensor& self,
    const Tensor& indices,
    const Tensor& values,
    bool non_blocking) {
  // copy=true forces fresh allocations, so `self` never aliases the inputs.
  alias_into_sparse(
      self,
      indices.to(self._indices().options(), non_blocking, /*copy=*/true),
      values.to(self._values().options(), non_blocking, /*copy=*/true));
}
54
+
55
// TODO: put this into the public API
// True iff both tensors share the exact same TensorImpl (object identity,
// not element-wise equality).
inline bool is_same_tensor(const Tensor& lhs, const Tensor& rhs) {
  return lhs.unsafeGetTensorImpl() == rhs.unsafeGetTensorImpl();
}
59
+
60
// True iff both sparse tensors split their dimensions into sparse and dense
// parts in the same way.
inline bool is_same_density(const SparseTensor& self, const SparseTensor& src) {
  return self.sparse_dim() == src.sparse_dim() &&
      self.dense_dim() == src.dense_dim();
}
64
+
65
+ // Give us a new values tensor, with the same dimensionality
66
+ // as 'values' but with a new number of non-zero elements.
67
+ // TODO: Expose this for real in ATen, some day?
68
+ // NB: Doesn't preserve data.
69
+ inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) {
70
+ std::vector<int64_t> size = values.sizes().vec();
71
+ size[0] = nnz;
72
+ return at::empty(size, values.options());
73
+ }
74
+
75
+ // NOTE [ Flatten Sparse Indices ]
76
+ // This helper function flattens a sparse indices tensor (a Tensor) into a 1D
77
+ // indices tensor. E.g.,
78
+ // input = [[2, 4, 0],
79
+ // [3, 1, 10]]
80
+ // full_size = [2, 12]
81
+ // output = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 10 ] = [27, 49, 10]
82
+ //
83
+ // In other words, assuming that each `indices[i, :]` is a valid index to a
84
+ // tensor `t` of shape `full_size`. This returns the corresponding indices to
85
+ // the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`.
86
// If force_clone is true, the result is forced to be a clone of self.
88
+ TORCH_API Tensor flatten_indices(
89
+ const Tensor& indices,
90
+ IntArrayRef full_size,
91
+ bool force_clone = false);
92
+
93
+ // Flatten sparse tensor's indices from nD to 1D, similar to NOTE [ Flatten
94
+ // Sparse Indices ], except this one allows partial flatten: only flatten on
95
+ // specified dims. Note that the flatten indices might be uncoalesced if
96
+ // dims_to_flatten.size() < sparse_dim. Also if input indices is already
97
+ // coalesced, the flattened indices will also be sorted.
98
+ //
99
+ // args:
100
+ // indices: sparse tensor indices
101
+ // sizes: sparse tensor sizes
102
+ // dims_to_flatten: a list of dim index to flatten
103
+ //
104
+ // Ex1:
105
+ // indices = [[2, 4, 0],
106
+ // [3, 1, 3]]
107
+ // sizes = [2, 12]
108
+ // dims_to_flatten = [0, 1]
109
+ // new_indices = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 3 ] = [27, 49, 3]
110
+ //
111
+ // Ex2:
112
+ // dims_to_flatten = [1]
113
+ // new_indices = [ 3, 1, 3 ] # uncoalesced
114
+ TORCH_API Tensor flatten_indices_by_dims(
115
+ const Tensor& indices,
116
+ const IntArrayRef& sizes,
117
+ const IntArrayRef& dims_to_flatten);
118
+
119
+ // Find the CSR representation for a row `indices` from the COO format
120
+ TORCH_API Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz);
121
+
122
+ TORCH_API Tensor zeros_like_with_indices(const Tensor& t);
123
+
124
// Holds a tensor's sizes and strides in fixed-capacity stack arrays
// (capacity static_shape_max_len). Only the first `sizes.size()` entries of
// each array are written; the remaining entries are left uninitialized.
template <size_t static_shape_max_len>
class TensorGeometryHolder {
  using geometry_holder_t = std::array<int64_t, static_shape_max_len>;

 public:
  // `options` is unused in this primary template; it is accepted only to
  // keep the signature in sync with the TensorGeometryHolder<0>
  // specialization below.
  explicit TensorGeometryHolder(
      IntArrayRef sizes,
      IntArrayRef strides,
      TensorOptions options = {}) {
    std::copy(sizes.begin(), sizes.end(), t_sizes.begin());
    std::copy(strides.begin(), strides.end(), t_strides.begin());
  }

  explicit TensorGeometryHolder(const Tensor& t)
      : TensorGeometryHolder(t.sizes(), t.strides()) {}

  // Returns the (sizes, strides) arrays by value as a tuple.
  auto operator*() const {
    return std::make_tuple(t_sizes, t_strides);
  }

 private:
  geometry_holder_t t_sizes;
  geometry_holder_t t_strides;
};
148
+
149
// Specialization for when the dimensionality is not statically bounded:
// sizes and strides live in a tensor on the target device instead of stack
// arrays, and dereferencing yields raw int64_t pointers into that storage.
template <>
class TensorGeometryHolder<0> {
  using geometry_holder_t = Tensor;

 public:
  explicit TensorGeometryHolder(
      IntArrayRef sizes,
      IntArrayRef strides,
      TensorOptions options) {
    const int64_t t_ndims = sizes.size();
    // Build a 2 x ndims CPU tensor (row 0: sizes, row 1: strides) first,
    // then move it to the target device in a single transfer.
    const auto cpu_options = TensorOptions(options).dtype(kLong).device(kCPU);
    Tensor t_sizes_and_strides_cpu = at::empty({2, t_ndims}, cpu_options);
    t_sizes_and_strides_cpu.select(0, 0).copy_(at::tensor(sizes, cpu_options));
    t_sizes_and_strides_cpu.select(0, 1).copy_(
        at::tensor(strides, cpu_options));
    const Tensor t_sizes_and_strides =
        t_sizes_and_strides_cpu.to(options.device());
    // Views into the device tensor; they keep the buffer alive.
    t_sizes = t_sizes_and_strides.select(0, 0);
    t_strides = t_sizes_and_strides.select(0, 1);
  }

  explicit TensorGeometryHolder(const Tensor& t)
      : TensorGeometryHolder(t.sizes(), t.strides(), t.options()) {}

  // Returns raw (sizes_ptr, strides_ptr) into the device-side buffers.
  auto operator*() const {
    return std::make_tuple(
        t_sizes.template data_ptr<int64_t>(),
        t_strides.template data_ptr<int64_t>());
  }

 private:
  geometry_holder_t t_sizes;
  geometry_holder_t t_strides;
};
183
+
184
+ // Return all indices of a tensor with the given shape.
185
+ //
186
+ // full_coo_indices(shape) is equivalent to
187
+ // torch.ones(shape).nonzero().transpose(-2, -1) but much faster.
188
+ TORCH_API Tensor full_coo_indices(IntArrayRef sizes, TensorOptions options);
189
+
190
+ } // namespace at::sparse
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/StridedRandomAccessor.h ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ namespace at::native {
4
+
5
+ // (Const)StridedRandomAccessor is a
6
+ // (const) random access iterator defined over
7
+ // a strided array.
8
+
9
+ // The traits below are to introduce __restrict__
10
+ // modifier on different platforms.
11
+
12
// Default pointer traits: a plain, unannotated pointer type.
template <typename T>
struct DefaultPtrTraits {
  using PtrType = T*;
};
16
+
17
+ #if (defined(_WIN32) || defined(_WIN64))
18
+ #define RESTRICT __restrict
19
+ #else
20
+ #define RESTRICT __restrict__
21
+ #endif
22
+
23
// Pointer traits adding the restrict qualifier, telling the compiler the
// pointer does not alias any other pointer in scope.
template <typename T>
struct RestrictPtrTraits {
  using PtrType = T* RESTRICT;
};
27
+
28
// A const random-access iterator over a strided array: element i is read
// from ptr[i * stride]. Usable from both host and device code.
template <
  typename T,
  typename index_t = int64_t,
  template <typename U> class PtrTraits = DefaultPtrTraits
>
class ConstStridedRandomAccessor {
public:
  using difference_type = index_t;
  using value_type = const T;
  using pointer = const typename PtrTraits<T>::PtrType;
  using reference = const value_type&;
  using iterator_category = std::random_access_iterator_tag;

  using PtrType = typename PtrTraits<T>::PtrType;
  using index_type = index_t;

  // Constructors {
  C10_HOST_DEVICE
  ConstStridedRandomAccessor(PtrType ptr, index_t stride)
    : ptr{ptr}, stride{stride}
  {}

  // Contiguous data: stride defaults to 1.
  C10_HOST_DEVICE
  explicit ConstStridedRandomAccessor(PtrType ptr)
    : ptr{ptr}, stride{static_cast<index_t>(1)}
  {}

  // Default-constructed: null pointer, unit stride.
  C10_HOST_DEVICE
  ConstStridedRandomAccessor()
    : ptr{nullptr}, stride{static_cast<index_t>(1)}
  {}
  // }

  // Pointer-like operations {
  C10_HOST_DEVICE
  reference operator*() const {
    return *ptr;
  }

  C10_HOST_DEVICE
  const value_type* operator->() const {
    return reinterpret_cast<const value_type*>(ptr);
  }

  // Subscript honors the stride: idx maps to ptr[idx * stride].
  C10_HOST_DEVICE
  reference operator[](index_t idx) const {
    return ptr[idx * stride];
  }
  // }

  // Prefix/postfix increment/decrement {
  C10_HOST_DEVICE
  ConstStridedRandomAccessor& operator++() {
    ptr += stride;
    return *this;
  }

  C10_HOST_DEVICE
  ConstStridedRandomAccessor operator++(int) {
    ConstStridedRandomAccessor copy(*this);
    ++*this;
    return copy;
  }

  C10_HOST_DEVICE
  ConstStridedRandomAccessor& operator--() {
    ptr -= stride;
    return *this;
  }

  C10_HOST_DEVICE
  ConstStridedRandomAccessor operator--(int) {
    ConstStridedRandomAccessor copy(*this);
    --*this;
    return copy;
  }
  // }

  // Arithmetic operations {
  C10_HOST_DEVICE
  ConstStridedRandomAccessor& operator+=(index_t offset) {
    ptr += offset * stride;
    return *this;
  }

  C10_HOST_DEVICE
  ConstStridedRandomAccessor operator+(index_t offset) const {
    return ConstStridedRandomAccessor(ptr + offset * stride, stride);
  }

  C10_HOST_DEVICE
  friend ConstStridedRandomAccessor operator+(
    index_t offset,
    const ConstStridedRandomAccessor& accessor
  ) {
    return accessor + offset;
  }

  C10_HOST_DEVICE
  ConstStridedRandomAccessor& operator-=(index_t offset) {
    ptr -= offset * stride;
    return *this;
  }

  C10_HOST_DEVICE
  ConstStridedRandomAccessor operator-(index_t offset) const {
    return ConstStridedRandomAccessor(ptr - offset * stride, stride);
  }

  // Note that this operator is well-defined when `this` and `other`
  // represent the same sequences, i.e. when
  // 1. this.stride == other.stride,
  // 2. |other - this| / this.stride is an Integer.
  C10_HOST_DEVICE
  difference_type operator-(const ConstStridedRandomAccessor& other) const {
    return (ptr - other.ptr) / stride;
  }
  // }

  // Comparison operators {
  C10_HOST_DEVICE
  bool operator==(const ConstStridedRandomAccessor& other) const {
    return (ptr == other.ptr) && (stride == other.stride);
  }

  C10_HOST_DEVICE
  bool operator!=(const ConstStridedRandomAccessor& other) const {
    return !(*this == other);
  }

  // Ordering compares raw pointers only; it is meaningful when both
  // accessors walk the same underlying sequence.
  C10_HOST_DEVICE
  bool operator<(const ConstStridedRandomAccessor& other) const {
    return ptr < other.ptr;
  }

  C10_HOST_DEVICE
  bool operator<=(const ConstStridedRandomAccessor& other) const {
    return (*this < other) || (*this == other);
  }

  C10_HOST_DEVICE
  bool operator>(const ConstStridedRandomAccessor& other) const {
    return !(*this <= other);
  }

  C10_HOST_DEVICE
  bool operator>=(const ConstStridedRandomAccessor& other) const {
    return !(*this < other);
  }
  // }

protected:
  PtrType ptr;
  index_t stride;
};
183
+
184
// Mutable counterpart of ConstStridedRandomAccessor: identical strided
// iteration, but dereferencing yields non-const references. Operators are
// re-declared so they return StridedRandomAccessor rather than the base.
template <
  typename T,
  typename index_t = int64_t,
  template <typename U> class PtrTraits = DefaultPtrTraits
>
class StridedRandomAccessor
  : public ConstStridedRandomAccessor<T, index_t, PtrTraits> {
public:
  using difference_type = index_t;
  using value_type = T;
  using pointer = typename PtrTraits<T>::PtrType;
  using reference = value_type&;

  using BaseType = ConstStridedRandomAccessor<T, index_t, PtrTraits>;
  using PtrType = typename PtrTraits<T>::PtrType;

  // Constructors {
  C10_HOST_DEVICE
  StridedRandomAccessor(PtrType ptr, index_t stride)
    : BaseType(ptr, stride)
  {}

  // Contiguous data: stride defaults to 1 (via the base constructor).
  C10_HOST_DEVICE
  explicit StridedRandomAccessor(PtrType ptr)
    : BaseType(ptr)
  {}

  C10_HOST_DEVICE
  StridedRandomAccessor()
    : BaseType()
  {}
  // }

  // Pointer-like operations {
  C10_HOST_DEVICE
  reference operator*() const {
    return *this->ptr;
  }

  C10_HOST_DEVICE
  value_type* operator->() const {
    return reinterpret_cast<value_type*>(this->ptr);
  }

  // Subscript honors the stride: idx maps to ptr[idx * stride].
  C10_HOST_DEVICE
  reference operator[](index_t idx) const {
    return this->ptr[idx * this->stride];
  }
  // }

  // Prefix/postfix increment/decrement {
  C10_HOST_DEVICE
  StridedRandomAccessor& operator++() {
    this->ptr += this->stride;
    return *this;
  }

  C10_HOST_DEVICE
  StridedRandomAccessor operator++(int) {
    StridedRandomAccessor copy(*this);
    ++*this;
    return copy;
  }

  C10_HOST_DEVICE
  StridedRandomAccessor& operator--() {
    this->ptr -= this->stride;
    return *this;
  }

  C10_HOST_DEVICE
  StridedRandomAccessor operator--(int) {
    StridedRandomAccessor copy(*this);
    --*this;
    return copy;
  }
  // }

  // Arithmetic operations {
  C10_HOST_DEVICE
  StridedRandomAccessor& operator+=(index_t offset) {
    this->ptr += offset * this->stride;
    return *this;
  }

  C10_HOST_DEVICE
  StridedRandomAccessor operator+(index_t offset) const {
    return StridedRandomAccessor(this->ptr + offset * this->stride, this->stride);
  }

  C10_HOST_DEVICE
  friend StridedRandomAccessor operator+(
    index_t offset,
    const StridedRandomAccessor& accessor
  ) {
    return accessor + offset;
  }

  C10_HOST_DEVICE
  StridedRandomAccessor& operator-=(index_t offset) {
    this->ptr -= offset * this->stride;
    return *this;
  }

  C10_HOST_DEVICE
  StridedRandomAccessor operator-(index_t offset) const {
    return StridedRandomAccessor(this->ptr - offset * this->stride, this->stride);
  }

  // Note that here we call BaseType::operator- version
  C10_HOST_DEVICE
  difference_type operator-(const BaseType& other) const {
    return (static_cast<const BaseType&>(*this) - other);
  }
  // }
};
300
+
301
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+ #include <ATen/native/IndexingUtils.h>
4
+ #include <ATen/native/TensorIterator.h>
5
+
6
+ namespace at::native {
7
+ namespace {
8
+ static std::string shapes_as_str(TensorList tensors) {
9
+ std::ostringstream os;
10
+ bool first = true;
11
+ for (auto& tensor : tensors) {
12
+ if (tensor.defined()) {
13
+ if (!first) {
14
+ os << ", ";
15
+ }
16
+ os << tensor.sizes();
17
+ first = false;
18
+ }
19
+ }
20
+ return os.str();
21
+ }
22
+ } // anonymous namespace
23
+
24
// Decide whether an index_put_(indices, value) call can be lowered to the
// much cheaper masked_fill_. This is only possible when `value` is a
// single-element CPU tensor and the indices consist of exactly one
// boolean/byte mask (all other entries null). Returns (true, mask unsqueezed
// to broadcast over the trailing dims of `self`) on success, and
// (false, undefined Tensor) otherwise.
static std::tuple<bool, Tensor> canDispatchToMaskedFill(const Tensor& self, const torch::List<c10::optional<at::Tensor>>& indices,
const Tensor& value){
  if (!(value.numel() ==1 && value.device().is_cpu())){
    return std::make_tuple(false,Tensor());
  }
  // num_ind counts how many leading dimensions of `self` the indices seen so
  // far cover (1 per null entry, ndim per mask).
  int64_t num_ind = 0;
  Tensor mask;
  auto self_device = self.device();
  for (const c10::optional<Tensor>& i: indices) {
    if (!i.has_value() || !(*i).defined()){
      // Null entry: spans one dimension of `self` untouched.
      num_ind++;
    } else {
      const Tensor &index = *i;
      // Bail out on integer indices, cross-device masks, or a second mask.
      if ((index.scalar_type() != kByte && index.scalar_type() != kBool) ||
          index.device() != self_device || mask.defined()){
        return std::make_tuple(false, Tensor());
      } else {
        mask = index;
        // The mask's shape must line up with the dimensions of `self` it
        // covers, starting at position num_ind.
        for (const auto j : c10::irange(index.dim())) {
          int64_t srcIdx = num_ind + j;
          TORCH_CHECK_INDEX(index.size(j) == self.size(srcIdx), "The shape of the mask ", index.sizes(), " at index ", j,
            " does not match the shape of the indexed tensor ", self.sizes(), " at index ", srcIdx);
        }
        num_ind += mask.ndimension();
      }
    }
  }
  // Append singleton dims so the mask broadcasts over the remaining
  // (uncovered) trailing dimensions of `self`.
  for (C10_UNUSED const auto i : c10::irange(num_ind, self.ndimension())) {
    mask = mask.unsqueeze(-1);
  }
  return std::make_tuple(true, mask);
}
56
+
57
// Normalize user-supplied indices in `orig` into the canonical form expected
// by the advanced-indexing kernels (long dtype, broadcast, padded to
// self.dim(), contiguous subspace, on self's device), and wrap them together
// with the (possibly transposed) `self` in an AdvancedIndex.
static AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) {
  checkIndexTensorTypes(orig, /*allow_int*/ true);
  // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors
  auto indices = expandTensors(self, orig);
  // next broadcast all index tensors together
  try {
    indices = expand_outplace(indices);
  } catch (std::exception& e) {
    TORCH_CHECK_INDEX(false, "shape mismatch: indexing tensors could not be broadcast together"
                      " with shapes ", shapes_as_str(indices));
  }
  // add missing null Tensors so that it matches self.dim()
  while (indices.size() < (size_t)self.dim()) {
    indices.emplace_back();
  }
  // if the non-null indices are not all adjacent, transpose self and indices
  // together so that they're adjacent at the front
  if (!hasContiguousSubspace(indices)) {
    std::tie(self, indices) = transposeToFront(self, indices);
  }
  // Ensure indices are on the same device as self
  for (auto & indice : indices) {
    if (indice.defined() && indice.device() != self.device()) {
      indice = indice.to(self.device());
    }
  }
  // Kernels operate on int64 indices; widen any int32 index tensors.
  for (auto & indice : indices) {
    if (indice.defined() && indice.dtype() == at::kInt) {
      indice = indice.to(at::kLong);
    }
  }

  return AdvancedIndex(self, indices);
}
91
+
92
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+ #include <c10/util/irange.h>
4
+
5
+ namespace at::native {
6
//input tensors are non-zero dim and non-empty
// Invoke `func` once for every 1-D slice of self/values/indices taken along
// dimension `dim`. `func` receives raw element pointers positioned at the
// start of the current slice, plus the slice length and per-tensor strides.
// All three tensors are walked in lockstep with an odometer over every
// dimension other than `dim`; `counter` holds the current multi-dim position.
template<typename T1, typename T2, typename Function>
void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim, Function func) {
  int ndims = self.dim();
  int tensor_dim_apply_has_finished = 0;
  std::vector<int64_t> counter(ndims, 0);
  const T1* self_data = self.const_data_ptr<T1>();
  T1* values_data = values.data_ptr<T1>();
  T2* indices_data = indices.data_ptr<T2>();
  int64_t self_stride = self.stride(dim);
  int64_t values_stride = values.stride(dim);
  int64_t indices_stride = indices.stride(dim);
  // NOTE(review): slice length is narrowed to int — assumes self.size(dim)
  // fits in int; confirm for very large tensors.
  int self_dim_size = self.size(dim);

  while (!tensor_dim_apply_has_finished) {
    func(self_data, values_data, indices_data, self_dim_size, self_stride, values_stride, indices_stride);
    if (ndims == 1) {
      // Only the reduced dimension exists, so there was exactly one slice.
      break;
    }
    // Advance the odometer: bump the first non-`dim` dimension that does not
    // wrap; on wrap, rewind it and carry into the next dimension.
    for (const auto dim_i : c10::irange(ndims)) {
      if (dim_i == dim) {
        if (dim_i == (ndims - 1)) {
          // `dim` is the last dimension: nothing left to carry into.
          tensor_dim_apply_has_finished = 1;
          break;
        }
        continue;
      }
      counter[dim_i]++;
      self_data += self.stride(dim_i);
      values_data += values.stride(dim_i);
      indices_data += indices.stride(dim_i);

      if (counter[dim_i] == self.size(dim_i)) {
        if (dim_i == ndims-1) {
          // The most significant dimension wrapped: traversal complete.
          tensor_dim_apply_has_finished = 1;
          break;
        } else {
          // Rewind this dimension's pointer offsets and carry onward.
          self_data -= counter[dim_i]*self.stride(dim_i);
          values_data -= counter[dim_i]*values.stride(dim_i);
          indices_data -= counter[dim_i]*indices.stride(dim_i);
          counter[dim_i] = 0;
        }
      } else {
        break;
      }
    }
  }
}
55
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/EmptyTensor.h>
5
+ #include <ATen/TensorIterator.h>
6
+ #include <ATen/Dispatch.h>
7
+ #include <ATen/Dispatch_v2.h>
8
+ #include <ATen/native/DispatchStub.h>
9
+
10
+ #ifndef AT_PER_OPERATOR_HEADERS
11
+ #include <ATen/Functions.h>
12
+ #else
13
+ #include <ATen/ops/scalar_tensor.h>
14
+ #endif
15
+
16
+ namespace at::native {
17
// Number of elements in the lower triangle ("tril") of a row x col matrix
// with diagonal offset `offset`. Two geometric cases arise:
//
// Case 1 - Trapezoid (triangle as a special case): row + offset <= col.
//   Every occupied row is part of one trapezoid, e.g. (offset > 0)
//     1 1 0 0 0
//     1 1 1 0 0
//     1 1 1 1 0
//   or (offset <= 0)
//     0 0 0
//     1 0 0
//     1 1 0
//   The size follows from the first and last occupied row lengths.
//
// Case 2 - Trapezoid + rectangle: row + offset > col. The bottom rows are
//   completely full, e.g.
//     1 1 0
//     1 1 1
//     1 1 1
//   so the full rows are counted separately as a rectangle.
inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) {
  // An empty matrix has an empty tril.
  if (row == 0 || col == 0) {
    return 0;
  }
  // Length of the first row of the tril: clamped by col for positive
  // offsets, otherwise 0 or 1 depending on whether the band reaches row 0.
  const int64_t first_row_len = offset > 0
      ? std::min<int64_t>(col, 1 + offset)
      : static_cast<int64_t>(row + offset > 0);
  // Length of the last row of the tril, clamped to [0, col].
  const int64_t last_row_len =
      std::max<int64_t>(0, std::min<int64_t>(col, row + offset));
  // Number of rows containing at least one element, clamped to [0, row].
  const int64_t occupied_rows =
      std::max<int64_t>(0, std::min<int64_t>(row, row + offset));
  const int64_t trapezoid_rows = last_row_len - first_row_len + 1;

  // Arithmetic-series sum for the top trapezoid.
  int64_t total = ((first_row_len + last_row_len) * trapezoid_rows) >> 1;

  // Any remaining occupied rows below the trapezoid are completely full.
  const int64_t rect_rows = occupied_rows - trapezoid_rows;
  if (rect_rows > 0) {
    total += rect_rows * col;
  }

  return total;
}
64
+
65
+ inline void check_args(
66
+ int64_t row, int64_t col, c10::optional<Layout> layout_opt) {
67
+ TORCH_CHECK(row >= 0, "row must be non-negative, got", row);
68
+ TORCH_CHECK(col >= 0, "col must be non-negative, got", col);
69
+ if (layout_opt.has_value()) {
70
+ TORCH_CHECK(
71
+ *layout_opt == at::kStrided,
72
+ "only support layout=torch.strided, got",
73
+ *layout_opt)
74
+ }
75
+ }
76
+
77
+ using at::check_size_nonnegative;
78
+
79
// assumes maximum value in created tensor is n-1 (e.g., torch.randperm(n))
// Verifies that every value in [0, n) is exactly representable in `tensor`'s
// dtype: first that n-1 fits into the type at all, then — for floating-point
// types — that the contiguous-integer limit (2^mantissa_bits + 1) is not
// exceeded.
inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tensor) {
  // match defined() to behavior of checks below
  TORCH_CHECK(at::scalar_tensor(n>0?n-1:n, tensor.options()).defined(),
              "n is too large for result tensor type: '", tensor.toString(), "'");

  // Ensure sufficient precision for floating point representation.
  switch (tensor.scalar_type()) {
    case at::ScalarType::Half:
      TORCH_CHECK(n <= (int64_t(1) << 11) + 1, "n cannot be greater than 2049 for Half type.");
      break;
    case at::ScalarType::Float:
      TORCH_CHECK(n <= (int64_t(1) << 24) + 1, "n cannot be greater than 2^24+1 for Float type.");
      break;
    case at::ScalarType::Double:  // Unlikely to happen, but doesn't hurt to check
      TORCH_CHECK(n <= (int64_t(1) << 53) + 1, "n cannot be greater than 2^53+1 for Double type.");
      break;
    default:
      break;
  }
}
100
+
101
// Called by `empty*` functions when deterministic algorithms are enabled to
// fill the tensor with NaN if it is floating point or complex type, or fill
// with max value if it is integer type
inline Tensor& fill_empty_deterministic_(Tensor& tensor) {
  if (tensor.is_floating_point() || tensor.is_complex()) {
    // Poison float/complex memory with quiet NaN so any read of "empty"
    // memory is deterministic and conspicuous.
    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
        kBFloat16, kHalf, tensor.scalar_type(), "fill_empty_deterministic_", [&]() {
          tensor.fill_(std::numeric_limits<scalar_t>::quiet_NaN());
        });
  } else {
    // Integer (and bool) tensors get the dtype's maximum value instead.
    AT_DISPATCH_V2(
        tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {
          tensor.fill_(std::numeric_limits<scalar_t>::max());
        }), kBool, AT_EXPAND(AT_INTEGRAL_TYPES_V2));
  }
  return tensor;
}
118
+
119
// The ZeroTensor allocator ignores whatever allocation is requested and always
// gives you nullptr
struct ZeroTensorAllocator final : public at::Allocator {
  ZeroTensorAllocator(at::Device device) : device_(device) {};
  ~ZeroTensorAllocator() override = default;
  // Nothing was ever allocated, so there is nothing to free; just sanity-check.
  static void deleter(void* const pointer) {
    TORCH_INTERNAL_ASSERT(!pointer);
  }
  // Hand out a null DataPtr tagged with the target device.
  DataPtr allocate(const size_t /*nbytes*/) override {
    return {nullptr, nullptr, &deleter, device_};
  }
  DeleterFnPtr raw_deleter() const override {
    return deleter;
  }
  // No backing data exists, so copies are no-ops.
  void copy_data(void* dest, const void* src, std::size_t count) const final {}
  at::Device device_;
};
136
+
137
+ using binary_fn = void (*)(TensorIterator&);
138
+
139
+ DECLARE_DISPATCH(binary_fn, complex_stub);
140
+ DECLARE_DISPATCH(binary_fn, polar_stub);
141
+
142
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <complex>
4
+ #include <type_traits>
5
+ #include <c10/core/ScalarType.h>
6
+ #include <ATen/detail/FunctionTraits.h>
7
+ #include <ATen/native/TensorIterator.h>
8
+
9
+
10
+ // This file includes utilities for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h.
11
+
12
+ // dynamic_casting handles when the types expected by the iterator do not match the types of the arguments
13
+ // to the function that is being called.
14
+ // On CUDA, the cast is currently pushed down into the kernel (for performance reasons).
15
+ // On CPU, there is currently an internal assert that a dynamic_cast is not needed.
16
+
17
+ namespace at::native {
18
+
19
// `needs_dynamic_casting` compares the types expected by iterator
// (i.e. dtypes of the operands) with the actual type of the arguments
// (and returns) of func_t
//
// Recursive case: compare input operand `nargs - 1` against the functor's
// corresponding parameter type, then recurse toward the base case below.
template<typename func_t, int nargs=function_traits<func_t>::arity>
struct needs_dynamic_casting {
  static bool check(TensorIteratorBase& iter) {
    using traits = function_traits<func_t>;
    using cpp_type = typename traits::template arg<nargs - 1>::type;
    using cpp_map = c10::CppTypeToScalarType<cpp_type>;

    // Any mismatch between the operand's dtype and the functor's C++
    // parameter type means a dynamic cast is required.
    if (iter.input_dtype(nargs-1) != cpp_map::value) {
      return true;
    }
    return needs_dynamic_casting<func_t, nargs - 1>::check(iter);
  }
};

// Base case (all inputs checked): compare the functor's result type with the
// dtype of output 0. A void result means there is no return value to compare.
template<typename func_t>
struct needs_dynamic_casting<func_t, 0> {
  static bool check(TensorIteratorBase& iter) {
    using traits = function_traits<func_t>;
    using cpp_type = typename traits::result_type;

    // we could assert output numbers are correct here, but checks
    // (including arity) are currently pushed outside of this struct.
    if constexpr (std::is_void_v<cpp_type>) {
      return false;
    } else {
      return iter.dtype(0) != c10::CppTypeToScalarType<cpp_type>::value;
    }
  }
};
51
+
52
+ } //namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorProperties.h ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // See NOTE: [Tensor vs. TensorBase]
4
+ namespace at {
5
+ class TensorBase;
6
+ }
7
+
8
+ namespace at::native {
9
+
10
+ TORCH_API bool cudnn_is_acceptable(const TensorBase& self);
11
+
12
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+ #include <c10/util/irange.h>
4
+ #include <ATen/core/IListRef.h>
5
+
6
+ namespace at::native {
7
+
8
+ TORCH_API at::Tensor clone_preserve_strides(const at::Tensor& self);
9
+
10
// True for tensors that `cat` silently ignores: 1-D tensors with zero
// elements (e.g. the default `torch.tensor([])`), kept for backward
// compatibility.
inline bool cat_should_skip_tensor(const Tensor& t) {
  return t.numel() == 0 && t.dim() == 1;
}
13
+
14
// Check to see if the shape of tensors is compatible
// for being concatenated along a given dimension.
// `index` is only used to name the offending tensor in the error message.
inline void check_cat_shape_except_dim(const Tensor & first, const Tensor & second, int64_t dimension, int64_t index) {
  int64_t first_dims = first.dim();
  int64_t second_dims = second.dim();
  TORCH_CHECK(first_dims == second_dims, "Tensors must have same number of dimensions: got ",
      first_dims, " and ", second_dims);
  for (const auto dim : c10::irange(first_dims)) {
    if (dim == dimension) {
      // Sizes may legitimately differ along the concatenation dimension.
      continue;
    }
    int64_t first_dim_size = first.sizes()[dim];
    int64_t second_dim_size = second.sizes()[dim];
    TORCH_CHECK(first_dim_size == second_dim_size, "Sizes of tensors must match except in dimension ",
        dimension, ". Expected size ", static_cast<long long>(first_dim_size), " but got size ", static_cast<long long>(second_dim_size), " for tensor number ", index, " in the list.");
  }
}
31
+
32
+ inline void check_cat_no_zero_dim(const MaterializedITensorListRef& tensors) {
33
+ int64_t i = 0;
34
+ for(const Tensor& t : tensors) {
35
+ TORCH_CHECK(t.dim() > 0,
36
+ "zero-dimensional tensor (at position ", i, ") cannot be concatenated");
37
+ i++;
38
+ }
39
+ }
40
+
41
// Computes how many chunks `split` produces along `dim` for a given
// `split_size`. Raises for 0-d tensors, negative split_size, or
// split_size == 0 when the dimension is non-empty.
inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t dim) {
  TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor");
  TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size);
  int64_t dim_size = self.size(dim);
  TORCH_CHECK(split_size > 0 || dim_size == 0,
      "split_size can only be 0 if dimension size is 0, "
      "but got dimension size of ", dim_size);
  // if split_size is 0 and dimension size is 0, there is 1 split.
  int64_t num_splits = 1;
  if (split_size != 0) {
    // ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size
    // (returns a single split). We might want to error here, but keep it for BC.
    num_splits = std::max<int64_t>((dim_size + split_size - 1) / split_size, 1);
  }
  return num_splits;
}
57
+
58
+ inline bool have_same_ndims(TensorList tensors) {
59
+ auto ndim = tensors[0].dim();
60
+ for (const auto tensor_idx : c10::irange(tensors.size())) {
61
+ if(tensors[tensor_idx].dim() != ndim) {
62
+ return false;
63
+ }
64
+ }
65
+ return true;
66
+ }
67
+
68
+ inline void leading_dimension_matches(TensorList tensors, int64_t dim) {
69
+ auto tensor_zero_size = tensors[0].sizes();
70
+ std::vector<c10::SymInt> leading_dim_sizes(tensor_zero_size.begin(), tensor_zero_size.begin() + dim);
71
+ for (const auto i : c10::irange(tensors.size())) {
72
+ at::Tensor tensor = tensors[i];
73
+ for(const auto j : c10::irange(dim)) {
74
+ TORCH_CHECK(
75
+ tensor.size(j) == leading_dim_sizes[j],
76
+ "_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors"
77
+ );
78
+ }
79
+ }
80
+ }
81
+
82
// Validates and normalizes the inputs of _chunk_cat: all tensors must be
// non-empty and share dtype/device, and their leading dimensions must match.
// Returns `dim` wrapped to a non-negative index.
inline int64_t preprocess_chunk_cat_inputs(TensorList tensors, int64_t dim, int64_t num_chunks) {
  TORCH_CHECK(num_chunks >= 1, "_chunk_cat expects positive num_chunks");
  TORCH_CHECK(!tensors.empty(),
      "_chunk_cat expects a non-empty input tensor list");
  auto expected_dtype = tensors[0].dtype();
  auto expected_device = tensors[0].device();
  for(const auto i : c10::irange(tensors.size())) {
    TORCH_CHECK(tensors[i].numel() > 0, "_chunk_cat expects non-empty tensor");
    TORCH_CHECK(tensors[i].dtype() == expected_dtype, "_chunk_cat expects all input tensors with the same dtype");
    TORCH_CHECK(tensors[i].device() == expected_device, "_chunk_cat expects all inputs tensors on the same device");
  }
  if (have_same_ndims(tensors)) {
    dim = maybe_wrap_dim(dim, tensors[0].dim());
  } else {
    // With mixed ndims a negative dim is ambiguous, so it must be a
    // non-negative index that is valid for every tensor.
    TORCH_CHECK(dim >= 0, "_chunk_cat expects non-negative dim when input tensors have different ndims")
    for(const auto i : c10::irange(tensors.size())) {
      TORCH_CHECK(dim < tensors[i].ndimension(), "_chunk_cat expects dim < ndim for all input tensors");
    }
  }
  leading_dimension_matches(tensors, dim);
  return dim;
}
104
+
105
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TopKImpl.h ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/TensorAccessor.h>
3
+ #include <ATen/NumericUtils.h>
4
+
5
+ namespace at::native {
6
+
7
+ #ifdef CPU_CAPABILITY
8
+ inline namespace CPU_CAPABILITY {
9
+ #else
10
+ inline namespace DEFAULT {
11
+ #endif
12
+
13
// Core topk loop, shared between CPU and QuantizedCPU
//
// For each of the `n` outer slices, selects the top (largest=true) or bottom
// (largest=false) `k` elements of a `dim_size`-long strided slice of
// data[2] (tmp_values), writing values into data[0] (mode_values) and their
// original positions into data[1] (mode_indices). NaNs are treated as the
// largest values for numpy compatibility.
template <typename scalar_t, typename accscalar_t>
void topk_impl_loop(
    const int64_t mode_values_stride,
    const int64_t mode_indices_stride,
    const int64_t tmp_values_stride,
    const int64_t k,
    const int64_t dim_size,
    const bool largest,
    const bool sorted,
    char** data, const int64_t* strides, const int64_t n) {

  // If k is zero, then output values and indices are empty tensors
  // So iterating over other dims is pointless
  if (k == 0) {
    return;
  }
  // (value, original index) pairs for one slice along the reduced dim;
  // the buffer is reused across all `n` slices.
  using elem_t = std::pair<accscalar_t, int64_t>;
  std::vector<elem_t> queue(dim_size);
  for (const auto i : c10::irange(n)) {
    TensorAccessor<scalar_t, 1> mode_values(
        reinterpret_cast<scalar_t*>(data[0] + i * strides[0]),
        &k, &mode_values_stride);
    TensorAccessor<int64_t, 1> mode_indices(
        reinterpret_cast<int64_t*>(data[1] + i * strides[1]),
        &k, &mode_indices_stride);
    TensorAccessor<const scalar_t, 1> tmp_values(
        reinterpret_cast<scalar_t*>(data[2] + i * strides[2]),
        &dim_size, &tmp_values_stride);

    auto n_2 = dim_size;
    // Heuristic: partial_sort only pays off when k is much smaller than the
    // slice; otherwise nth_element (plus an optional prefix sort) is used.
    auto use_partial_sort = k * 64 <= n_2;

    for (const auto j : c10::irange(n_2)) {
      queue[j].first = tmp_values[j];
      queue[j].second = j;
    }

    // we want nan to be sorted as top for numpy compatibility
    if (use_partial_sort) {
      if (largest) {
        std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
            [](const elem_t& x, const elem_t& y) -> bool {
              return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
            });
      } else {
        std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
            [](const elem_t& x, const elem_t& y) -> bool {
              return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
            });
      }
    } else {
      if (largest) {
        // nth_element places the element at position k-1 in its final sorted
        // position, so when `sorted` is requested only [0, k-1) still needs
        // sorting afterwards.
        std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(),
            [](const elem_t& x, const elem_t& y) -> bool {
              return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
            });
        if (sorted) {
          std::sort(queue.begin(), queue.begin() + k - 1,
              [](const elem_t& x, const elem_t& y) -> bool {
                return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
              });
        }
      } else {
        std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(),
            [](const elem_t& x, const elem_t& y) -> bool {
              return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
            });
        if (sorted) {
          std::sort(queue.begin(), queue.begin() + k -1,
              [](const elem_t& x, const elem_t& y) -> bool {
                return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
              });
        }
      }
    }

    // Emit the selected k values together with their original indices.
    for (const auto j : c10::irange(k)) {
      mode_values[j] = queue[j].first;
      mode_indices[j] = queue[j].second;
    }
  }
}
97
+ } // namespace CPU_CAPABILITY
98
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TransposeType.h ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <c10/util/Exception.h>
3
+
4
+ namespace at::native {
5
+
6
// Used as an interface between the different BLAS-like libraries
enum class TransposeType {
  NoTranspose,
  Transpose,
  ConjTranspose,
};

// Transforms TransposeType into the BLAS / LAPACK format
// (the single-character 'N' / 'T' / 'C' codes).
static inline char to_blas(TransposeType trans) {
  switch (trans) {
    case TransposeType::Transpose: return 'T';
    case TransposeType::NoTranspose: return 'N';
    case TransposeType::ConjTranspose: return 'C';
  }
  // Unreachable for valid enum values; also silences "control reaches end of
  // non-void function" warnings.
  TORCH_INTERNAL_ASSERT(false, "Invalid transpose type");
}
22
+
23
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <c10/core/ScalarType.h>
4
+
5
+ namespace at::native {
6
+
7
+ void Unfold3dCopyCPU(
8
+ ScalarType dtype,
9
+ const void *src,
10
+ int64_t C,
11
+ int64_t X_D,
12
+ int64_t X_H,
13
+ int64_t X_W,
14
+ int64_t Y_D,
15
+ int64_t Y_H,
16
+ int64_t Y_W,
17
+ int64_t kernel_d,
18
+ int64_t kernel_h,
19
+ int64_t kernel_w,
20
+ int64_t stride_d,
21
+ int64_t stride_h,
22
+ int64_t stride_w,
23
+ int64_t pad_d,
24
+ int64_t pad_h,
25
+ int64_t pad_w,
26
+ void* dst);
27
+
28
+ void Unfold3dAccCPU(
29
+ ScalarType dtype,
30
+ const void *src,
31
+ int64_t C,
32
+ int64_t X_D,
33
+ int64_t X_H,
34
+ int64_t X_W,
35
+ int64_t Y_D,
36
+ int64_t Y_H,
37
+ int64_t Y_W,
38
+ int64_t kernel_d,
39
+ int64_t kernel_h,
40
+ int64_t kernel_w,
41
+ int64_t stride_d,
42
+ int64_t stride_h,
43
+ int64_t stride_w,
44
+ int64_t pad_d,
45
+ int64_t pad_h,
46
+ int64_t pad_w,
47
+ void *dst);
48
+
49
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/TensorIterator.h>
5
+ #include <ATen/native/DispatchStub.h>
6
+ #include <ATen/native/NonEmptyUtils.h>
7
+
8
+ #ifndef AT_PER_OPERATOR_HEADERS
9
+ #include <ATen/Functions.h>
10
+ #else
11
+ #include <ATen/ops/arange.h>
12
+ #endif
13
+
14
+ namespace at::native {
15
+
16
+ using unfold_backward_fn = void (*)(
17
+ Tensor& grad_in,
18
+ const Tensor& grad,
19
+ int64_t dim,
20
+ int64_t size,
21
+ int64_t step
22
+ );
23
+
24
+ DECLARE_DISPATCH(unfold_backward_fn, unfold_backward_stub);
25
+
26
+ namespace {
27
+
28
// Note on naming: it is unconventional.
// grad_in does not mean that it is a gradient wrt to input,
// grad_in/grad_out is just an input/output of unfold_backward kernel.
//
// Builds the TensorIterator used by the unfold_backward kernels: grad_out is
// restrided so iteration covers only `iter_dim_size` entries along `dim`;
// grad_in collapses `dim` (stride 0 / size 1 — that dimension is indexed
// manually inside the kernel); idx_dim broadcasts the running index along
// `dim` to every iterated element.
static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_out(
    Tensor& grad_out,
    const Tensor& grad_in,
    int64_t dim,
    int64_t size,
    int64_t step
) {
  dim = maybe_wrap_dim(dim, grad_out.dim());
  // last dim stores the folds

  auto grad_out_dim_size = ensure_nonempty_size(grad_out, dim);
  auto grad_in_dim_size = ensure_nonempty_size(grad_in, dim);
  // dictates the number of elements to iterate over
  // in dimension `dim`
  auto iter_dim_size = std::min(
    grad_out_dim_size,
    (grad_in_dim_size - 1) * step + size
  );

  /* prepare grad_out for TensorIterator { */
  auto grad_out_strides = ensure_nonempty_vec(grad_out.strides().vec());
  auto grad_out_sizes = ensure_nonempty_vec(grad_out.sizes().vec());
  grad_out_sizes[dim] = iter_dim_size;
  auto grad_out_restrided = grad_out.as_strided(
    grad_out_sizes, grad_out_strides
  );
  /* } */

  /* prepare grad_in for TensorIterator { */
  auto grad_in_strides = ensure_nonempty_vec(grad_in.strides().vec());
  auto grad_in_sizes = ensure_nonempty_vec(grad_in.sizes().vec());

  // set strides for dim to 0
  // and size to 1 because
  // this dimension is indexed inside the kernel
  grad_in_strides[dim] = 0;
  grad_in_sizes[dim] = 1;

  grad_in_strides.pop_back();
  grad_in_sizes.pop_back();

  auto grad_in_restrided = grad_in.squeeze(-1).as_strided(
    grad_in_sizes, grad_in_strides
  );
  /* } */

  // During the TensorIterator iteration we have to know
  // i_dim in grad_out[i_1,...,i_dim,...i_n],
  // idx_dim stores this information
  /* prepare idx_dim for TensorIterator { */
  auto idx_dim = at::arange(
    0, iter_dim_size, grad_in.options().dtype(at::kLong)
  );

  auto grad_out_dim = ensure_nonempty_dim(grad_out.dim());

  auto idx_dim_strides = std::vector<int64_t>(grad_out_dim, 0);
  auto idx_dim_sizes = std::vector<int64_t>(grad_out_dim, 1);

  idx_dim_strides[dim] = 1;
  idx_dim_sizes[dim] = iter_dim_size;

  // idx_dim size will broadcast over determined by grad_out sizes in TensorIterator
  auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides);
  /* } */

  // Overlap and same-dtype checks are disabled on purpose: the restrided
  // views alias their source tensors, and idx_dim is int64 while the grads
  // carry the gradient dtype.
  auto iter = TensorIteratorConfig()
    .set_check_mem_overlap(false)
    .check_all_same_dtype(false)
    .resize_outputs(false)
    .add_owned_output(grad_out_restrided)
    .add_owned_input(grad_in_restrided)
    .add_owned_input(idx_dim_restrided)
    .build();

  return iter;
}
109
+
110
+ }
111
+
112
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <math.h>
4
+
5
+ #include <ATen/OpMathType.h>
6
+ #include <ATen/TensorUtils.h>
7
+ #include <ATen/OpMathType.h>
8
+ #include <ATen/core/Tensor.h>
9
+ #include <ATen/cpu/vec/functional.h>
10
+ #include <ATen/cpu/vec/vec.h>
11
+ #include <ATen/native/DispatchStub.h>
12
+ #include <ATen/native/cpu/utils.h>
13
+
14
+ /**
15
+ * Note [compute_scales_value]
16
+ * Note [area_pixel_compute_scale]
17
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
18
+ * Interpolate with scale_factor can have different behaviors
19
+ * depending on the value of recompute_scale_factor:
20
+ *
21
+ * - With recompute_scale_factor = True (current default behavior):
22
+ * the scale_factor, when provided by the user, are used to calculate
23
+ * the output size. The input size and the computed output_size
24
+ * are then used to infer new values for the scales which are
25
+ * used in the interpolation. Because floating-point math is not exact,
26
+ * this may be a different value from the user-supplied scales.
27
+ *
28
+ * - With recompute_scale_factor = False (which will be the default
29
+ * behavior starting 1.5.0):
30
+ * the behavior follows opencv logic, and the scales provided by
31
+ * the user are the ones used in the interpolation calculations.
32
+ *
33
+ * If the scales are not provided or if they are provided but
34
+ * recompute_scale_factor is set to True (default behavior), the scales
35
+ * are computed from the input and the output size;
36
+ *
37
+ *
38
+ * When the scales are inferred from the input and output sizes,
39
+ * we view each pixel as an area, idx + 0.5 as its center index.
40
+ * Here is an example formula in 1D case.
41
+ * if align_corners: center of two corner pixel areas are preserved,
42
+ * (0.5, 0.5) -> (0.5, 0.5),
43
+ * (input_size - 0.5, 0.5) -> (output_size - 0.5)
44
+ * scale = (input_size - 0.5 - 0.5) / (output_size - 0.5 - 0.5)
45
+ * src_index + 0.5 - 0.5 = scale * (dst_index + 0.5 - 0.5)
46
+ * if not align_corners: the whole range is scaled accordingly
47
+ * scale = input_size / output_size
48
+ * src_idx + 0.5 = scale * (dst_index + 0.5)
49
+ */
50
+
51
+ namespace at::native {
52
+
53
+ namespace upsample {
54
+
55
+ TORCH_API c10::SmallVector<int64_t, 3> compute_output_size(
56
+ c10::IntArrayRef input_size, // Full input tensor size.
57
+ at::OptionalIntArrayRef output_size,
58
+ c10::optional<c10::ArrayRef<double>> scale_factors);
59
+
60
+ inline c10::optional<double> get_scale_value(c10::optional<c10::ArrayRef<double>> scales, int idx) {
61
+ if (!scales) {
62
+ return c10::nullopt;
63
+ }
64
+ return scales->at(idx);
65
+ }
66
+
67
+ } // namespace upsample
68
+
69
+ using scale_t = c10::optional<double>;
70
+ using upsampling_nearest1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w);
71
+ using _upsampling_nearest_exact1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w);
72
+ using upsampling_nearest2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w);
73
+ using _upsampling_nearest_exact2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w);
74
+ using upsampling_nearest3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w);
75
+ using _upsampling_nearest_exact3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w);
76
+ using upsampling_linear1d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_w);
77
+ using upsampling_bilinear2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
78
+ using _upsampling_bilinear2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
79
+ using upsampling_trilinear3d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_d, scale_t scales_h, scale_t scales_w);
80
+ using upsampling_bicubic2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
81
+ using _upsampling_bicubic2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
82
+ DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_kernel);
83
+ DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_kernel);
84
+ DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_kernel);
85
+ DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_kernel);
86
+ DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_kernel);
87
+ DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_kernel);
88
+ DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_backward_kernel);
89
+ DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_backward_kernel);
90
+ DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_backward_kernel);
91
+ DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_backward_kernel);
92
+ DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_backward_kernel);
93
+ DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_backward_kernel);
94
+ DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_kernel);
95
+ DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_kernel);
96
+ DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_kernel);
97
+ DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_kernel);
98
+ DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_backward_kernel);
99
+ DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_backward_kernel);
100
+ DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_backward_kernel);
101
+ DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_backward_kernel);
102
+ DECLARE_DISPATCH(upsampling_bicubic2d, upsample_bicubic2d_kernel);
103
+ DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_kernel);
104
+ DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_backward_kernel);
105
+
106
// Validates 1d upsampling sizes: input must be (N, C, W) and output_size must
// hold a single positive width. Returns the output shape {N, C, W_out}.
static C10_UNUSED std::array<int64_t, 3> upsample_1d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
  TORCH_CHECK(
      output_size.size() == 1,
      "It is expected output_size equals to 1, but got size ",
      output_size.size());

  TORCH_CHECK(
      input_size.size() == 3,
      "It is expected input_size equals to 3, but got size ",
      input_size.size());

  int64_t output_width = output_size[0];

  int64_t nbatch = input_size[0];
  int64_t channels = input_size[1];
  int64_t input_width = input_size[2];

  TORCH_CHECK(
      input_width > 0 && output_width > 0,
      "Input and output sizes should be greater than 0, but got input (W: ",
      input_width,
      ") and output (W: ",
      output_width,
      ")");

  return {nbatch, channels, output_width};
}
133
+
134
// Validates 2d upsampling sizes: input must be (N, C, H, W) and output_size
// must hold positive {H, W}. Returns the output shape {N, C, H_out, W_out}.
static C10_UNUSED std::array<int64_t, 4> upsample_2d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
  TORCH_CHECK(
      output_size.size() == 2,
      "It is expected output_size equals to 2, but got size ",
      output_size.size());

  TORCH_CHECK(
      input_size.size() == 4,
      "It is expected input_size equals to 4, but got size ",
      input_size.size());

  int64_t output_height = output_size[0];
  int64_t output_width = output_size[1];

  int64_t nbatch = input_size[0];
  int64_t channels = input_size[1];
  int64_t input_height = input_size[2];
  int64_t input_width = input_size[3];

  TORCH_CHECK(
      input_height > 0 && input_width > 0 && output_height > 0 &&
          output_width > 0,
      "Input and output sizes should be greater than 0,"
      " but got input (H: ",
      input_height,
      ", W: ",
      input_width,
      ") output (H: ",
      output_height,
      ", W: ",
      output_width,
      ")");

  return {nbatch, channels, output_height, output_width};
}
169
+
170
// Validates 3d upsampling sizes: input must be (N, C, D, H, W) and
// output_size must hold positive {D, H, W}. Returns the output shape
// {N, C, D_out, H_out, W_out}.
static C10_UNUSED
std::array<int64_t, 5> upsample_3d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
  TORCH_CHECK(
      output_size.size() == 3,
      "It is expected output_size equals to 3, but got size ",
      output_size.size());

  TORCH_CHECK(
      input_size.size() == 5,
      "It is expected input_size equals to 5, but got size ",
      input_size.size());

  int64_t output_depth = output_size[0];
  int64_t output_height = output_size[1];
  int64_t output_width = output_size[2];

  int64_t nbatch = input_size[0];
  int64_t channels = input_size[1];
  int64_t input_depth = input_size[2];
  int64_t input_height = input_size[3];
  int64_t input_width = input_size[4];

  TORCH_CHECK(
      input_depth > 0 && input_height > 0 && input_width > 0 &&
          output_depth > 0 && output_height > 0 && output_width > 0,
      "Input and output sizes should be greater than 0, but got input (D: ",
      input_depth,
      ", H: ",
      input_height,
      ", W: ",
      input_width,
      ") output (D: ",
      output_depth,
      ", H: ",
      output_height,
      ", W: ",
      output_width,
      ")");


  return {nbatch, channels, output_depth, output_height, output_width};
}
212
+
213
// Shape validation shared by the 2d upsampling kernels. Only one of
// `input` / `grad_output` is consulted: `input` when it is defined
// (forward pass — must be a non-empty 4D tensor, empty batch allowed),
// otherwise `grad_output` (backward pass — must be exactly
// (nbatch, nchannels, output_height, output_width)).
static inline void upsample_2d_shape_check(
    const Tensor& input,
    const Tensor& grad_output,
    int64_t nbatch,
    int64_t nchannels,
    int64_t input_height,
    int64_t input_width,
    int64_t output_height,
    int64_t output_width) {
  TORCH_CHECK(
      input_height > 0 && input_width > 0 && output_height > 0 &&
          output_width > 0,
      "Input and output sizes should be greater than 0,"
      " but got input (H: ",
      input_height,
      ", W: ",
      input_width,
      ") output (H: ",
      output_height,
      ", W: ",
      output_width,
      ")");

  if (input.defined()) {
    // Allow for empty batch size but not other dimensions
    TORCH_CHECK(
        (input.numel() != 0 ||
         (input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0)
        ) &&
        input.dim() == 4,
        "Non-empty 4D data tensor expected but got a tensor with sizes ",
        input.sizes());
  } else if (grad_output.defined()) {
    check_dim_size(grad_output, 4, 0, nbatch);
    check_dim_size(grad_output, 4, 1, nchannels);
    check_dim_size(grad_output, 4, 2, output_height);
    check_dim_size(grad_output, 4, 3, output_width);
  }
}
252
+
253
// Computes the reciprocal scale the interpolation kernels actually use:
// 1/scale when the user supplied a (positive) scale factor, otherwise
// input_size / output_size.
template <typename scalar_t>
static inline scalar_t compute_scales_value(
    const c10::optional<double> scale,
    int64_t input_size,
    int64_t output_size) {
  // see Note [compute_scales_value]
  // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults.
  return (scale.has_value() && scale.value() > 0.)
      ? static_cast<scalar_t>(1.0 / scale.value())
      : (static_cast<scalar_t>(input_size) / output_size);
}
264
+
265
// Scale used for source-index computation. With align_corners the ratio
// (input_size - 1) / (output_size - 1) is used (0 when the output has at most
// one element); otherwise it defers to compute_scales_value.
template <typename scalar_t>
static inline scalar_t area_pixel_compute_scale(
    int64_t input_size,
    int64_t output_size,
    bool align_corners,
    const c10::optional<double> scale) {
  // see Note [area_pixel_compute_scale]
  if(align_corners) {
    if(output_size > 1) {
      return static_cast<scalar_t>(input_size - 1) / (output_size - 1);
    } else {
      // A single output element: corner alignment degenerates; 0 makes the
      // source index land on pixel 0.
      return static_cast<scalar_t>(0);
    }
  } else {
    return compute_scales_value<scalar_t>(scale, input_size, output_size);
  }
}
282
+
283
// Maps a destination index to its (fractional) source index.
// With align_corners the corners map exactly: src = scale * dst. Otherwise
// the half-pixel (OpenCV resize) formula src = scale * (dst + 0.5) - 0.5 is
// used. A negative result is clamped to 0 for non-cubic modes only: linear
// modes read the same pixel either way (both taps of [-1, 0] resolve to
// pixel 0), while cubic interpolation needs the unclamped value because its
// 4-tap weights depend on it.
template <typename scalar_t>
static inline scalar_t area_pixel_compute_source_index(
    scalar_t scale,
    int64_t dst_index,
    bool align_corners,
    bool cubic) {
  if (align_corners) {
    return scale * dst_index;
  }
  const scalar_t half = static_cast<scalar_t>(0.5);
  const scalar_t src_idx = scale * (dst_index + half) - half;
  if (!cubic && src_idx < static_cast<scalar_t>(0)) {
    return scalar_t(0);
  }
  return src_idx;
}
310
+
311
// Nearest-neighbor source index matching OpenCV INTER_NEAREST: truncate
// dst_index * scale and clamp to the last valid input index. Known to be
// "buggy" (truncation rather than rounding) but kept for backward
// compatibility; see nearest_neighbor_exact_compute_source_index for the
// corrected variant.
static inline int64_t nearest_neighbor_compute_source_index(
    const float scale,
    int64_t dst_index,
    int64_t input_size) {
  const auto truncated = static_cast<int64_t>(floorf(dst_index * scale));
  return std::min(truncated, input_size - 1);
}
321
+
322
// "Exact" nearest-neighbor source index:
//   index_f32 = (output_index + 0.5) * scale - 0.5
//   input_index = round(index_f32)
// implemented as floor((dst_index + 0.5) * scale), clamped to the last valid
// input index. Same as Pillow and Scikit-Image/Scipy ndi.zoom.
static inline int64_t nearest_neighbor_exact_compute_source_index(
    const float scale,
    int64_t dst_index,
    int64_t input_size) {
  const float center = (dst_index + 0.5) * scale;
  return std::min(static_cast<int64_t>(floorf(center)), input_size - 1);
}
333
+
334
// Maps an output index to a nearest-neighbor input index.
// This method specifically treats cases: output_size == input_size or
// output_size == 2 * input_size, that we would like to get rid of
// We keep this method for BC and consider as deprecated.
// See nearest_exact_idx as replacement
static inline int64_t nearest_idx(
    int64_t output_index,
    int64_t input_size,
    int64_t output_size,
    c10::optional<double> scales) {
  if (output_size == input_size) {
    // scale_factor = 1, simply copy
    return output_index;
  } else if (output_size == 2 * input_size) {
    // scale_factor = 2, shift input index
    return output_index >> 1;
  } else {
    float scale = compute_scales_value<float>(scales, input_size, output_size);
    return nearest_neighbor_compute_source_index(scale, output_index, input_size);
  }
}
354
+
355
+ static inline int64_t nearest_exact_idx(
356
+ int64_t output_index,
357
+ int64_t input_size,
358
+ int64_t output_size,
359
+ c10::optional<double> scales) {
360
+ float scale = compute_scales_value<float>(scales, input_size, output_size);
361
+ return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size);
362
+ }
363
+
364
+ // Define a typedef to dispatch to nearest_idx or nearest_exact_idx
365
+ typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, c10::optional<double>);
366
+
367
// Read data[y][x] from a (height x width) row-major buffer, clamping both
// coordinates into range so out-of-bounds requests are safe (border
// replication).
template <typename scalar_t>
static scalar_t upsample_get_value_bounded(
    scalar_t* data,
    int64_t width,
    int64_t height,
    int64_t x,
    int64_t y) {
  const int64_t zero = static_cast<int64_t>(0);
  const int64_t cx = std::max(std::min(x, width - 1), zero);
  const int64_t cy = std::max(std::min(y, height - 1), zero);
  return data[cy * width + cx];
}
378
+
379
// Accumulate `value` into data[y][x] of a (height x width) row-major buffer,
// clamping both coordinates into range (border replication) so out-of-bounds
// writes are folded onto the nearest edge pixel.
template <typename scalar_t>
static void upsample_increment_value_bounded(
    scalar_t* data,
    int64_t width,
    int64_t height,
    int64_t x,
    int64_t y,
    scalar_t value) {
  const int64_t zero = static_cast<int64_t>(0);
  const int64_t cx = std::max(std::min(x, width - 1), zero);
  const int64_t cy = std::max(std::min(y, height - 1), zero);
  data[cy * width + cx] += value;
}
391
+
392
// Based on
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
//
// Keys cubic kernel for |t| <= 1: (a+2)|t|^3 - (a+3)|t|^2 + 1, written in
// Horner form. The expression shape is kept identical to preserve FP rounding.
template <typename scalar_t>
static inline scalar_t cubic_convolution1(scalar_t t, scalar_t a) {
  return ((a + 2) * t - (a + 3)) * t * t + 1;
}
398
+
399
// Keys cubic kernel for 1 < |t| <= 2: a|t|^3 - 5a|t|^2 + 8a|t| - 4a, in
// Horner form. The expression shape is kept identical to preserve FP rounding.
template <typename scalar_t>
static inline scalar_t cubic_convolution2(scalar_t t, scalar_t a) {
  return ((a * t - 5 * a) * t + 8 * a) * t - 4 * a;
}
403
+
404
+ template <typename scalar_t>
405
+ static inline void get_cubic_upsample_coefficients(
406
+ scalar_t coeffs[4],
407
+ scalar_t t) {
408
+ scalar_t A = -0.75;
409
+
410
+ scalar_t x1 = t;
411
+ coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
412
+ coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
413
+
414
+ // opposite coefficients
415
+ scalar_t x2 = 1.0 - t;
416
+ coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
417
+ coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
418
+ }
419
+
420
+ template <typename scalar_t>
421
+ static inline scalar_t cubic_interp1d(
422
+ scalar_t x0,
423
+ scalar_t x1,
424
+ scalar_t x2,
425
+ scalar_t x3,
426
+ scalar_t t) {
427
+ scalar_t coeffs[4];
428
+ get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
429
+
430
+ return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
431
+ }
432
+
433
// When `real_input_index` becomes larger than the range the floating point
// type can accurately represent, the cast to int64_t might exceed
// `input_size` and overflow downstream indexing. Guard by clamping the index
// to input_size - 1 and the interpolation weight (lambda) into [0, 1].
template<typename scalar_t, typename opmath_t>
static inline void guard_index_and_lambda(const opmath_t& real_input_index, const int64_t& input_size, int64_t& input_index, scalar_t& lambda) {
  input_index = std::min(static_cast<int64_t>(floorf(real_input_index)), input_size - 1);
  const opmath_t frac = real_input_index - input_index;
  lambda = std::min(std::max(frac, static_cast<opmath_t>(0)), static_cast<opmath_t>(1));
}
444
+
445
+ template<typename scalar_t, typename opmath_t>
446
+ static inline void compute_source_index_and_lambda(
447
+ int64_t& input_index0,
448
+ int64_t& input_index1,
449
+ scalar_t& lambda0,
450
+ scalar_t& lambda1,
451
+ opmath_t ratio,
452
+ int64_t output_index,
453
+ int64_t input_size,
454
+ int64_t output_size,
455
+ bool align_corners) {
456
+ if (output_size == input_size) {
457
+ // scale_factor = 1, simply copy
458
+ input_index0 = output_index;
459
+ input_index1 = output_index;
460
+ lambda0 = static_cast<scalar_t>(1);
461
+ lambda1 = static_cast<scalar_t>(0);
462
+ } else {
463
+ const auto real_input_index =
464
+ area_pixel_compute_source_index<opmath_t>(
465
+ ratio, output_index, align_corners, /*cubic=*/false);
466
+ guard_index_and_lambda(real_input_index, input_size, input_index0, lambda1);
467
+ int64_t offset = (input_index0 < input_size - 1) ? 1 : 0;
468
+ input_index1 = input_index0 + offset;
469
+ lambda0 = static_cast<scalar_t>(1.) - lambda1;
470
+ }
471
+ }
472
+
473
// It will not be used by data types other than BFloat16 and Half.
// Fallback overload selected when (scalar_in, scalar_out) is NOT
// (float, reduced-precision float). It is unreachable by construction of the
// dispatch and exists only so the overload set stays well-formed; if ever
// reached it raises a runtime error.
template <typename scalar_in, typename scalar_out,
          typename std::enable_if_t<!is_reduced_floating_point_v<scalar_out> || !std::is_same<scalar_in, float>::value, int> = 0>
void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) {
  TORCH_CHECK((is_reduced_floating_point_v<scalar_out>),
              "Upsample backward only support BFloat16 and Half in the lower precision data types on CPU.")
  TORCH_CHECK((std::is_same<scalar_in, float>::value),
              "Upsample backward should use float as acc buffer for BFloat16 and Half grad input on CPU.")
  return;
}
483
+
484
// Accumulate a float accumulation buffer into a reduced-precision
// (BFloat16/Half) grad-input tensor, zeroing the buffer as it goes so it can
// be reused. Selected when scalar_in == float and scalar_out is a reduced
// floating point type.
template <typename scalar_in, typename scalar_out,
          typename std::enable_if_t<is_reduced_floating_point_v<scalar_out> && std::is_same<scalar_in, float>::value, int> = 0>
void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) {
  using bVec = Vectorized<scalar_out>;
  using fVec = Vectorized<float>;
  int64_t d = 0;
  // Vectorized main loop: one bVec of grads corresponds to two fVec of floats.
  for (; d < size - (size % bVec::size()); d += bVec::size()) {
    bVec gin_bvec = bVec::loadu(gin + d);
    fVec gin_fvec0, gin_fvec1;
    // Widen the reduced-precision grads to float before accumulating.
    std::tie(gin_fvec0, gin_fvec1) = convert_to_float<scalar_out>(gin_bvec);
    gin_fvec0 += fVec::loadu(buffer_ptr + d);
    gin_fvec1 += fVec::loadu(buffer_ptr + d + fVec::size());
    // Reset the accumulation buffer for the next use.
    fVec(0).store(buffer_ptr + d);
    fVec(0).store(buffer_ptr + d + fVec::size());
    // Narrow back and write the updated grads.
    convert_from_float<scalar_out>(gin_fvec0, gin_fvec1).store(gin + d);
  }
  // Scalar tail for the remaining < bVec::size() elements.
  for (; d < size; d++) {
    gin[d] += buffer_ptr[d];
    buffer_ptr[d] = 0;
  }
}
505
+
506
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // DON'T include this except from Binary*.cu files. It should not leak into
2
+ // headers.
3
+ #pragma once
4
+ #define TORCH_ASSERT_NO_OPERATORS
5
+ #include <ATen/AccumulateType.h>
6
+ #include <ATen/Dispatch.h>
7
+ #include <ATen/native/BinaryOps.h>
8
+ #include <ATen/native/DispatchStub.h>
9
+ #include <ATen/native/TensorIterator.h>
10
+ #include <c10/cuda/CUDAGuard.h>
11
+ #include <c10/cuda/CUDAMathCompat.h>
12
+ #include <c10/util/TypeSafeSignMath.h>
13
+ #include <ATen/native/cuda/JitLoops.cuh>
14
+ #include <ATen/native/cuda/Loops.cuh>
15
+
16
+ #include <type_traits>
17
+
18
+ namespace at {
19
+ namespace native {
20
+ namespace binary_internal {
21
+
22
+ template <typename scalar_t>
23
+ struct DivFunctor {
24
+ __device__ scalar_t operator()(scalar_t a, scalar_t b) const {
25
+ return a / b;
26
+ }
27
+ };
28
+
29
+ template <typename T>
30
+ struct MulFunctor {
31
+ __device__ T operator()(T a, T b) const {
32
+ return a * b;
33
+ }
34
+ };
35
+
36
+ // Workaround for the error: '*' in boolean context, suggest '&&' instead
37
+ // [-Werror=int-in-bool-context]
38
+ template <>
39
+ struct MulFunctor<bool> {
40
+ __device__ bool operator()(bool a, bool b) const {
41
+ return a && b;
42
+ }
43
+ };
44
+ void div_true_kernel_cuda(TensorIteratorBase& iter);
45
+ void div_trunc_kernel_cuda(TensorIteratorBase& iter);
46
+ } // namespace binary_internal
47
+ } // namespace native
48
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/CompositeRandomAccessorCommon.h>
4
+ #include <thrust/tuple.h>
5
+
6
+ namespace at { namespace native {
7
+
8
// Tuple traits plugged into CompositeRandomAccessorCommon.h: selects
// thrust::tuple / thrust::tie as the tuple implementation.
// NOTE(review): named "CPU" although this header lives under cuda/ and uses
// thrust — the name appears to mirror the CPU counterpart; confirm against
// ATen/native/CompositeRandomAccessorCommon.h.
struct TupleInfoCPU {
  template <typename ...Types>
  using tuple = thrust::tuple<Types...>;

  template <typename ...Types>
  static constexpr auto tie(Types&... args) noexcept {
    return thrust::tie(args...);
  }
};
17
+
18
+ template <typename KeyAccessor, typename ValueAccessor>
19
+ using CompositeRandomAccessorCPU =
20
+ CompositeRandomAccessor<KeyAccessor, ValueAccessor, TupleInfoCPU>;
21
+
22
+ template <typename Values, typename References>
23
+ void swap(
24
+ references_holder<Values, References> rh1,
25
+ references_holder<Values, References> rh2
26
+ ) {
27
+ return thrust::swap(rh1.data(), rh2.data());
28
+ }
29
+
30
+ template <int N, typename Values, typename References>
31
+ auto get(references_holder<Values, References> rh) -> decltype(thrust::get<N>(rh.data())) {
32
+ return thrust::get<N>(rh.data());
33
+ }
34
+
35
+ }} // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/Config.h>
2
+ #include <ATen/core/DimVector.h>
3
+ #include <ATen/cuda/CUDAContext.h>
4
+ #include <ATen/native/cuda/CuFFTUtils.h>
5
+ #include <ATen/native/utils/ParamsHash.h>
6
+ #include <c10/util/accumulate.h>
7
+ #include <c10/util/irange.h>
8
+
9
+ #include <cufft.h>
10
+ #include <cufftXt.h>
11
+
12
+ #include <limits>
13
+ #include <list>
14
+ #include <sstream>
15
+ #include <stdexcept>
16
+ #include <string>
17
+ #include <unordered_map>
18
+
19
+ namespace at { namespace native { namespace detail {
20
+
21
// Enum representing the FFT type.
// Backed by int8_t — presumably to keep CuFFTParams (which embeds it and is
// hashed byte-wise) compact; confirm against ParamsHash usage.
enum class CuFFTTransformType : int8_t {
  C2C,  // Complex-to-complex
  R2C,  // Real-to-complex
  C2R,  // Complex-to-real
};
27
+
28
// This struct is used to let us easily compute hashes of the
// parameters. It will be the **key** to the plan cache.
// It must stay trivial (enforced by the static_assert below) because
// ParamsHash/ParamsEqual treat it as a raw byte blob.
struct CuFFTParams
{
  int64_t signal_ndim_; // between 1 and max_rank, i.e., 1 <= signal_ndim <= 3
  // These include additional batch dimension as well.
  int64_t sizes_[max_rank + 1];
  int64_t input_strides_[max_rank + 1];
  int64_t output_strides_[max_rank + 1];
  CuFFTTransformType fft_type_;
  ScalarType value_type_;

  CuFFTParams() = default;

  // Build a key from input/output strides and signal sizes (batch dim first).
  CuFFTParams(IntArrayRef in_strides, IntArrayRef out_strides,
      IntArrayRef signal_sizes, CuFFTTransformType fft_type, ScalarType value_type) {
    // Padding bits must be zeroed for hashing: equal params must be
    // byte-identical, so clear the whole object before filling fields.
    memset(this, 0, sizeof(*this));
    signal_ndim_ = signal_sizes.size() - 1;
    fft_type_ = fft_type;
    value_type_ = value_type;

    TORCH_INTERNAL_ASSERT(in_strides.size() == signal_sizes.size());
    TORCH_INTERNAL_ASSERT(out_strides.size() == signal_sizes.size());
    TORCH_INTERNAL_ASSERT(1 <= signal_ndim_ && signal_ndim_ <= max_rank);

    std::copy(signal_sizes.cbegin(), signal_sizes.cend(), sizes_);
    std::copy(in_strides.cbegin(), in_strides.cend(), input_strides_);
    std::copy(out_strides.cbegin(), out_strides.cend(), output_strides_);
  }
};
60
+
61
+ static_assert(std::is_trivial<CuFFTParams>::value, "");
62
+
63
+ // Returns true if the transform type has complex input
64
+ inline bool cufft_complex_input(CuFFTTransformType type) {
65
+ switch (type) {
66
+ case CuFFTTransformType::C2C:
67
+ case CuFFTTransformType::C2R:
68
+ return true;
69
+
70
+ case CuFFTTransformType::R2C:
71
+ return false;
72
+ }
73
+ TORCH_INTERNAL_ASSERT(false);
74
+ }
75
+
76
+ // Returns true if the transform type has complex output
77
+ inline bool cufft_complex_output(CuFFTTransformType type) {
78
+ switch (type) {
79
+ case CuFFTTransformType::C2C:
80
+ case CuFFTTransformType::R2C:
81
+ return true;
82
+
83
+ case CuFFTTransformType::C2R:
84
+ return false;
85
+ }
86
+ TORCH_INTERNAL_ASSERT(false);
87
+ }
88
+
89
+ // Create transform type enum from bools representing if input and output are complex
90
+ inline CuFFTTransformType GetCuFFTTransformType(bool complex_input, bool complex_output) {
91
+ if (complex_input && complex_output) {
92
+ return CuFFTTransformType::C2C;
93
+ } else if (complex_input && !complex_output) {
94
+ return CuFFTTransformType::C2R;
95
+ } else if (!complex_input && complex_output) {
96
+ return CuFFTTransformType::R2C;
97
+ }
98
+ TORCH_INTERNAL_ASSERT(false, "Real to real FFTs are not supported");
99
+ }
100
+
101
+
102
// RAII wrapper around a raw ::cufftHandle: creates the handle on
// construction and destroys it on destruction.
class CuFFTHandle {
  ::cufftHandle handle_;
public:

  CuFFTHandle() {
    CUFFT_CHECK(cufftCreate(&handle_));
  }

  ::cufftHandle & get() { return handle_; }
  const ::cufftHandle & get() const { return handle_; }

  ~CuFFTHandle() {
// Not using fftDestroy() for rocFFT to work around double freeing of handles
#if !defined(USE_ROCM)
    cufftDestroy(handle_);
#endif
  }
};
120
+
121
// True iff x has at most one bit set.
// NOTE(review): also returns true for x == 0; callers pass FFT dimension
// sizes, which are expected to be positive — confirm before relying on this
// for non-positive inputs.
__forceinline__
static bool is_pow_of_two(int64_t x) {
  return (x & (x - 1)) == 0;
}
125
+
126
+ using cufft_size_type = long long int;
127
+
128
+ using CuFFTDimVector = c10::SmallVector<cufft_size_type, at::kDimVectorStaticSize>;
129
+
130
+ // Struct representing a tensor in CuFFT's data layout for planning transforms
131
+ // See NOTE [ cuFFT Embedded Strides ].
132
+ struct CuFFTDataLayout {
133
+ CuFFTDimVector embed;
134
+ cufft_size_type stride, dist;
135
+ bool must_clone, simple;
136
+ };
137
+
138
+ // Returns a cufft embedding for a contiguous signal of the given size.
139
+ // e.g. if the input is cloned, this will be the resulting data layout
140
+ // See NOTE [ cuFFT Embedded Strides ].
141
+ inline CuFFTDataLayout cufft_simple_embed(IntArrayRef sizes, bool onesided) {
142
+ CuFFTDataLayout layout;
143
+ layout.simple = true;
144
+ layout.must_clone = false;
145
+ layout.embed.assign(sizes.cbegin() + 1, sizes.cend());
146
+ if (onesided) {
147
+ layout.embed.back() = sizes.back() / 2 + 1;
148
+ }
149
+ layout.stride = 1;
150
+ layout.dist = 1;
151
+ for (const auto& len : layout.embed) {
152
+ layout.dist *= len;
153
+ }
154
+ return layout;
155
+ }
156
+
157
// Convert strides to a CuFFT embedded representation.
// If strides cannot be embedded, returns a simple layout and sets must_clone flag
// See NOTE [ cuFFT Embedded Strides ].
inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bool onesided) {
  const auto signal_ndim = strides.size() - 1;
  CuFFTDataLayout layout;
  auto last_stride = strides[signal_ndim];
  // Non-positive innermost stride cannot be expressed to cuFFT.
  layout.must_clone = (last_stride <= 0);

  // For one-sided (R2C/C2R) transforms the last dim stores half + 1 entries.
  const auto last_dim_size = onesided ?
      sizes[signal_ndim] / 2 + 1 : sizes[signal_ndim];
  const auto signal_numel = c10::multiply_integers(sizes.slice(1, sizes.size() - 2)) * last_dim_size;

  // Zero strides are not allowed, even if the batch size is one.
  // If that happens just set a dummy case
  if (sizes[0] == 1) {
    layout.dist = signal_numel;
  } else if (strides[0] == 0) {
    layout.must_clone = true;
  } else {
    layout.dist = strides[0];
  }

  // Calculate the embedding shape, or set must_clone if the strides cannot be embedded.
  // Walks the signal dims from innermost to outermost; each stride must be a
  // positive multiple of the previous one to be representable as an embedding.
  layout.embed.resize(signal_ndim);
  for (auto i = signal_ndim - 1; !layout.must_clone && i > 0; i--) {
    auto stride = strides[i];
    if (sizes[i] == 1) {
      layout.embed[i] = 1;
    } else if (stride > 0 && stride % last_stride == 0) {
      layout.embed[i] = stride / last_stride;
      last_stride = stride;
    } else {
      layout.must_clone = true;
    }
  }

  if (layout.must_clone) {
    // If the input needs to be cloned, assume it will be contiguous
    layout = cufft_simple_embed(sizes, onesided);
    layout.must_clone = true;
  } else {
    // embed[0] is ignored by cuFFT but set for completeness.
    layout.embed[0] = sizes[1];
    layout.stride = strides[signal_ndim];
    // Determine if layout represents a simple embedding (contiguous data)
    layout.simple = [&] {
      for (const auto i : c10::irange(1, signal_ndim - 1)) {
        if (layout.embed[i] != sizes[i + 1]) {
          return false;
        }
      }

      return (layout.stride == 1 && layout.dist == signal_numel &&
          layout.embed.back() == last_dim_size);
    }();
  }
  return layout;
}
215
+
216
// This class contains all the information needed to execute a cuFFT plan:
//   1. the plan
//   2. whether to clone input before executing the plan
//   3. the workspace size needed
//
// This class will be the **value** in the plan cache.
// It owns the plan through the RAII CuFFTHandle member, so destroying a
// config destroys the plan.
class CuFFTConfig {
public:

  // Only move semantics is enough for this class. Although the plan is
  // already owned by an RAII handle, still remove copy constructor and
  // assignment op so we don't accidentally copy and take a perf hit.
  CuFFTConfig(const CuFFTConfig&) = delete;
  CuFFTConfig& operator=(CuFFTConfig const&) = delete;

  // Convenience constructor: unpack a cache key (CuFFTParams) into the
  // stride/size/type arguments of the main constructor below.
  explicit CuFFTConfig(const CuFFTParams& params):
      CuFFTConfig(
          IntArrayRef(params.input_strides_, params.signal_ndim_ + 1),
          IntArrayRef(params.output_strides_, params.signal_ndim_ + 1),
          IntArrayRef(params.sizes_, params.signal_ndim_ + 1),
          params.fft_type_,
          params.value_type_) {}

  // For complex types, strides are in units of 2 * element_size(dtype);
  // sizes are for the full signal, including batch size, and always two-sided.
  CuFFTConfig(IntArrayRef in_strides, IntArrayRef out_strides,
      IntArrayRef sizes, CuFFTTransformType fft_type, ScalarType dtype):
      fft_type_(fft_type), value_type_(dtype) {

    // signal sizes (excluding batch dim)
    CuFFTDimVector signal_sizes(sizes.begin() + 1, sizes.end());

    // input batch size
    const int64_t batch = sizes[0];
    const int64_t signal_ndim = sizes.size() - 1;

    // Since cuFFT has limited non-unit stride support and various constraints, we
    // use a flag to keep track throughout this function to see if we need to
    // input = input.clone();

#if defined(USE_ROCM)
    // clone input to avoid issues with hipfft clobering the input and failing tests
    clone_input = true;
#else
    clone_input = false;
#endif

    // For half, base strides on the real part of real-to-complex and
    // complex-to-real transforms are not supported. Since our output is always
    // contiguous, only need to check real-to-complex case.
    if (dtype == ScalarType::Half) {
      // cuFFT on half requires compute capability of at least SM_53
      auto dev_prop = at::cuda::getCurrentDeviceProperties();
      TORCH_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3),
                  "cuFFT doesn't support signals of half type with compute "
                  "capability less than SM_53, but the device containing input half "
                  "tensor only has SM_", dev_prop->major, dev_prop->minor);
      for (const auto i : c10::irange(signal_ndim)) {
        TORCH_CHECK(is_pow_of_two(sizes[i + 1]),
                    "cuFFT only supports dimensions whose sizes are powers of two when"
                    " computing in half precision, but got a signal size of",
                    sizes.slice(1));
      }
      clone_input |= in_strides.back() != 1;
    }

    // Derive the cuFFT "embedded stride" description of input and output.
    CuFFTDataLayout in_layout;
    if (clone_input) {
      in_layout = cufft_simple_embed(sizes, fft_type == CuFFTTransformType::C2R);
    } else {
      in_layout = as_cufft_embed(in_strides, sizes, fft_type == CuFFTTransformType::C2R);
    }
    auto out_layout = as_cufft_embed(out_strides, sizes, fft_type == CuFFTTransformType::R2C);
    TORCH_INTERNAL_ASSERT(!out_layout.must_clone, "Out strides cannot be represented as CuFFT embedding");
    clone_input |= in_layout.must_clone;

    // Check if we can take advantage of simple data layout.
    //
    // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu.

    const bool simple_layout = in_layout.simple && out_layout.simple;
    cudaDataType itype, otype, exec_type;
    const auto complex_input = cufft_complex_input(fft_type);
    const auto complex_output = cufft_complex_output(fft_type);
    if (dtype == ScalarType::Float) {
      itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
      otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
      exec_type = CUDA_C_32F;
    } else if (dtype == ScalarType::Double) {
      itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
      otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
      exec_type = CUDA_C_64F;
    } else if (dtype == ScalarType::Half) {
      itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
      otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
      exec_type = CUDA_C_16F;
    } else {
      TORCH_CHECK(false, "cuFFT doesn't support tensor of type: ", dtype);
    }

    // disable auto allocation of workspace to use THC allocator
    CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0));

    size_t ws_size_t;

    // make plan
    if (simple_layout) {
      // If with unit-stride, we tell cuFFT by setting inembed == onembed == NULL.
      // In such case, cuFFT ignores istride, ostride, idist, and odist
      // by assuming istride = ostride = 1.
      //
      // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu.
      CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
          /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
          /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
          batch, &ws_size_t, exec_type));
    } else {
      CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
          in_layout.embed.data(), in_layout.stride, in_layout.dist, itype,
          out_layout.embed.data(), out_layout.stride, out_layout.dist, otype,
          batch, &ws_size_t, exec_type));
    }
    ws_size = static_cast<int64_t>(ws_size_t);
  }

  const cufftHandle &plan() const { return plan_ptr.get(); }

  CuFFTTransformType transform_type() const { return fft_type_; }
  ScalarType data_type() const { return value_type_; }
  bool should_clone_input() const { return clone_input; }
  int64_t workspace_size() const { return ws_size; }

private:
  CuFFTHandle plan_ptr;
  bool clone_input;
  int64_t ws_size;
  CuFFTTransformType fft_type_;
  ScalarType value_type_;
};
356
+
357
#if defined(USE_ROCM)
// Note that the max plan number for CUDA version < 10 has to be 1023
// due to a bug that fails on the 1024th plan.
// NOTE(review): this branch compiles for ROCm; the comment refers to the
// historical CUDA<10 limit the same cap was inherited from — confirm.
constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023;
constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
#else
constexpr int64_t CUFFT_MAX_PLAN_NUM = std::numeric_limits<int64_t>::max();
// The default max cache size chosen for CUDA version > 10 is arbitrary.
// This number puts a limit on how big of a plan cache should we maintain by
// default. Users can always configure it via cufft_set_plan_cache_max_size.
constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
#endif
static_assert(0 <= CUFFT_MAX_PLAN_NUM && CUFFT_MAX_PLAN_NUM <= std::numeric_limits<int64_t>::max(),
              "CUFFT_MAX_PLAN_NUM not in size_t range");
static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
              "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
373
+
374
// This cache assumes that the mapping from key to value never changes.
// This is **NOT** thread-safe. Please use a mutex when using it **AND** the
// value returned from lookup().
// The contract of using this cache is that lookup() should only be
// used when the max_size is positive.
class CuFFTParamsLRUCache {
public:
  using kv_t = typename std::pair<CuFFTParams, CuFFTConfig>;
  // Map keys are reference_wrappers into the list nodes, so each CuFFTParams
  // is stored exactly once (inside the list) and the map never copies it.
  using map_t = typename std::unordered_map<std::reference_wrapper<CuFFTParams>,
                                            typename std::list<kv_t>::iterator,
                                            ParamsHash<CuFFTParams>,
                                            ParamsEqual<CuFFTParams>>;
  using map_kkv_iter_t = typename map_t::iterator;


  CuFFTParamsLRUCache() : CuFFTParamsLRUCache(CUFFT_DEFAULT_CACHE_SIZE) {}

  CuFFTParamsLRUCache(int64_t max_size) {
    _set_max_size(max_size);
  }

  CuFFTParamsLRUCache(CuFFTParamsLRUCache&& other) noexcept :
      _usage_list(std::move(other._usage_list)),
      _cache_map(std::move(other._cache_map)),
      _max_size(other._max_size) {}

  CuFFTParamsLRUCache& operator=(CuFFTParamsLRUCache&& other) noexcept {
    _usage_list = std::move(other._usage_list);
    _cache_map = std::move(other._cache_map);
    _max_size = other._max_size;
    return *this;
  }

  // If key is in this cache, return the cached config. Otherwise, emplace the
  // config in this cache and return it.
  // Return const reference because CuFFTConfig shouldn't be tampered with once
  // created.
  const CuFFTConfig &lookup(CuFFTParams params) {
    AT_ASSERT(_max_size > 0);

    map_kkv_iter_t map_it = _cache_map.find(params);
    // Hit: move the node to the list front (most-recently-used). splice moves
    // the node without invalidating the iterator stored in the map.
    if (map_it != _cache_map.end()) {
      _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
      return map_it->second->second;
    }

    // Miss: evict the least-recently-used entry (list tail) if we are full.
    if (_usage_list.size() >= _max_size) {
      auto last = _usage_list.end();
      last--;
      _cache_map.erase(last->first);
      _usage_list.pop_back();
    }

    // construct new plan at list front, then insert into _cache_map
    _usage_list.emplace_front(std::piecewise_construct,
                              std::forward_as_tuple(params),
                              std::forward_as_tuple(params));
    auto kv_it = _usage_list.begin();
    _cache_map.emplace(std::piecewise_construct,
                       std::forward_as_tuple(kv_it->first),
                       std::forward_as_tuple(kv_it));
    return kv_it->second;
  }

  void clear() {
    _cache_map.clear();
    _usage_list.clear();
  }

  // Shrink-or-grow the cache, evicting LRU entries if the new size is smaller.
  void resize(int64_t new_size) {
    _set_max_size(new_size);
    auto cur_size = _usage_list.size();
    if (cur_size > _max_size) {
      auto delete_it = _usage_list.end();
      for (size_t i = 0; i < cur_size - _max_size; i++) {
        delete_it--;
        _cache_map.erase(delete_it->first);
      }
      _usage_list.erase(delete_it, _usage_list.end());
    }
  }

  size_t size() const { return _cache_map.size(); }

  size_t max_size() const noexcept { return _max_size; }

  // Public on purpose: callers lock this around lookup() and any use of the
  // returned config.
  std::mutex mutex;

private:
  // Only sets size and does value check. Does not resize the data structures.
  void _set_max_size(int64_t new_size) {
    // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since
    // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check
    // first.
    TORCH_CHECK(new_size >= 0,
                "cuFFT plan cache size must be non-negative, but got ", new_size);
    TORCH_CHECK(new_size <= CUFFT_MAX_PLAN_NUM,
                "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size);
    _max_size = static_cast<size_t>(new_size);
  }

  std::list<kv_t> _usage_list;
  map_t _cache_map;
  size_t _max_size;
};
482
+
483
+ // Since ATen is separated into CPU build and CUDA build, we need a way to call
484
+ // these functions only when CUDA is loaded. We use CUDA hooks for this purpose
485
+ // (at cuda/detail/CUDAHooks.cpp), and call the hooked functions from the actual
486
+ // native function counterparts (at native/SpectralOps.cpp), i.e.,
487
+ // _cufft_get_plan_cache_max_size, _cufft_set_plan_cache_max_size
488
+ // _cufft_get_plan_cache_size, and _cufft_clear_plan_cache.
489
+ int64_t cufft_get_plan_cache_max_size_impl(DeviceIndex device_index);
490
+ void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_size);
491
+ int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index);
492
+ void cufft_clear_plan_cache_impl(DeviceIndex device_index);
493
+
494
+ }}} // namespace at::native::detail
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
namespace at { namespace native {
#if defined(USE_ROCM)
// take these out when ROCm implements std:: math functions
#include <math.h>
// Per-type sqrt for device code; explicit float/double specializations call
// the C library functions directly.
template <typename scalar_t>
static __forceinline__ __device__ scalar_t device_sqrt(scalar_t val);

template <>
__forceinline__ __device__ float device_sqrt(float val) {
  return ::sqrtf(val);
}

template <>
__forceinline__ __device__ double device_sqrt(double val) {
  return ::sqrt(val);
}
#else
// NOTE(review): this overload returns double regardless of scalar_t (the
// result is never narrowed back to the argument type) — callers appear to
// rely on implicit conversion at the use site; confirm before changing the
// return type.
template<typename scalar_t>
__forceinline__ __device__ double device_sqrt(scalar_t val) {
  return std::sqrt(val);
}
#endif
}}
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h ADDED
@@ -0,0 +1,672 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/AccumulateType.h>
4
+ #include <ATen/Dispatch.h>
5
+ #include <ATen/Dispatch_v2.h>
6
+ #include <ATen/ExpandBase.h>
7
+ #include <ATen/OpMathType.h>
8
+ #include <ATen/native/TensorIterator.h>
9
+ #include <ATen/native/cuda/Loops.cuh>
10
+ #include <c10/util/Half.h>
11
+ #include <ATen/cuda/CUDAApplyUtils.cuh>
12
+ #include <ATen/cuda/CUDAContext.h>
13
+ #include <ATen/cuda/detail/OffsetCalculator.cuh>
14
+ #include <ATen/cuda/CUDAGraphsUtils.cuh>
15
+ #include <ATen/detail/FunctionTraits.h>
16
+ #include <ATen/core/DistributionsHelper.h>
17
+
18
+ #include <curand.h>
19
+ #include <curand_kernel.h>
20
+ #include <curand_philox4x32_x.h>
21
+ #include <cstdint>
22
+ #include <limits>
23
+ #include <utility>
24
+ #include <mutex>
25
+ #include <tuple>
26
+ #include <type_traits>
27
+
28
+ namespace at {
29
+ namespace native {
30
+ namespace {
31
+
32
+ // launch bounds used for kernels utilizing TensorIterator
33
+ const uint32_t block_size_bound = 256;
34
+ const uint32_t grid_size_bound = 4;
35
+ // number of randoms given by distributions like curand_uniform4, curand_uniform2_double
36
+ // used in calculating philox offset.
37
+ const uint32_t curand4_engine_calls = 4;
38
+
39
+ // utility function that calculates proper philox_offset
40
+ // for distributions utilizing TensorIterator. For distributions using
41
+ // TensorIterator, we are using a grid-stride loop with each
42
+ // thread yielding one element per thread. For the edge of the grid-stride
43
+ // loop, if the tensor size is large, the unroll loop will kick in and the float4
44
+ // from curand4 will start getting utilized (for common tensor sizes, we end up
45
+ // using rand.x from each thread). Hence, the philox_offset is
46
+ // (number of elements per thread * number of engine calls), which makes
47
+ // sure that philox offset increment is not less than the number of randoms used
48
+ // in each thread.
49
+ std::tuple<uint64_t, dim3, dim3> calc_execution_policy(int64_t total_elements) {
50
+ const uint64_t numel = static_cast<uint64_t>(total_elements);
51
+ const uint32_t block_size = block_size_bound;
52
+ const uint32_t unroll = curand4_engine_calls;
53
+ dim3 dim_block(block_size);
54
+ dim3 grid((numel + block_size - 1) / block_size);
55
+ uint32_t blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size;
56
+ grid.x = std::min(
57
+ static_cast<uint32_t>(at::cuda::getCurrentDeviceProperties()->multiProcessorCount) * blocks_per_sm,
58
+ grid.x);
59
+ //number of times random will be generated per thread, to offset philox counter in thc random state
60
+ uint64_t counter_offset = ((numel - 1) / (block_size * grid.x * unroll) + 1)
61
+ * curand4_engine_calls;
62
+ return std::make_tuple(counter_offset, grid, dim_block);
63
+ }
64
+
65
// grid stride loop kernel for distributions
//
// Each thread initializes its own Philox state from (seed, thread id, offset)
// and then walks the flattened output in strides of
// blockDim.x * gridDim.x * unroll_factor. `dist_func` draws a vector of
// unroll_factor randoms per iteration; `transform_func` writes one output
// element per random. `rounded_size` rounds numel up to a whole number of
// strides so that every thread performs the same number of dist_func calls
// (out-of-range elements are simply not stored).
template<typename accscalar_t, int unroll_factor, typename dist_t, typename transform_t>
C10_LAUNCH_BOUNDS_2(block_size_bound, grid_size_bound)
__global__ void distribution_elementwise_grid_stride_kernel(int numel,
                                                            PhiloxCudaState philox_args,
                                                            const dist_t dist_func,
                                                            const transform_t transform_func) {
  auto seeds = at::cuda::philox::unpack(philox_args);
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  curandStatePhilox4_32_10_t state;
  curand_init(std::get<0>(seeds),
              idx,
              std::get<1>(seeds),
              &state);

  int rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) *
      blockDim.x * gridDim.x * unroll_factor;
  for(int linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) {
    auto rand = dist_func(&state);
    #pragma unroll
    for (int ii = 0; ii < unroll_factor; ii++) {
      // (&rand.x)[ii] indexes the ii-th component of the vector type
      // (float4 / ulonglong2 / ...) returned by dist_func.
      int li = linear_index + blockDim.x * gridDim.x * ii;
      if (li < numel) {
        transform_func(li, static_cast<accscalar_t>((&rand.x)[ii]));
      }
    }
    // NOTE(review): barrier between strides kept as in the original;
    // presumably to keep the block in lockstep across iterations — confirm
    // before removing.
    __syncthreads();
  }
}
94
+
95
/**
 * distribution_nullary_kernel is analogous to gpu_kernel in
 * ATen/native/cuda/Loops.cuh. Like gpu_kernel, it uses
 * TensorIterator to launch a kernel. However, the differences are
 *   - it launches a grid-stride loop based kernel. The kernel is not
 *     generic like elementwise_kernel in Loops.cuh and is specialized
 *     for the distribution kernels here.
 *   - For big size tensors, we can launch multiple kernels recursively
 *     (i.e. if (!iter.can_use_32bit_indexing())) and hence, the philox
 *     offset calculation is done in this function.
 *
 * @param iter            nullary TensorIterator describing the output tensor
 * @param gen             CUDA generator; its mutex is taken while reserving
 *                        the philox counter range for this launch
 * @param dist_func       device callable: curand state -> vector of randoms
 * @param transform_func  device callable mapping one random (as accscalar_t)
 *                        to the output scalar_t
 *
 * FIXME: Can we specialize elementwise_kernel and launch_kernel in Loops.cuh
 * to have grid-stride loop kernel and then use that to launch our distribution
 * kernels? Note that we need a grid-stride loop kernel because, we found by testing
 * that it achieves peak effective bandwidth.
 */
template<typename scalar_t,
         typename accscalar_t,
         int unroll_factor,
         typename RNG,
         typename dist_t,
         typename transform_t>
void distribution_nullary_kernel(at::TensorIteratorBase& iter,
                                 RNG gen,
                                 const dist_t& dist_func,
                                 const transform_t transform_func) {
  static_assert(unroll_factor >= 1, "unroll_factor must be >= 1.");
  int64_t numel = iter.numel();
  if (numel == 0) {
    return;
  }

  auto execution_policy = calc_execution_policy(numel);
  auto counter_offset = std::get<0>(execution_policy);
  auto grid = std::get<1>(execution_policy);
  auto block = std::get<2>(execution_policy);
  PhiloxCudaState rng_engine_inputs;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen->mutex_);
    rng_engine_inputs = gen->philox_cuda_state(counter_offset);
  }

  if (!iter.can_use_32bit_indexing()) {
    // Split into 32-bit-indexable sub-iterators and recurse; each recursive
    // call reserves its own philox range above.
    for (auto& sub_iter : iter.with_32bit_indexing()) {
      distribution_nullary_kernel<scalar_t, accscalar_t, unroll_factor>(sub_iter,
        gen, dist_func, transform_func);
    }
    return;
  }

  char* out_data = (char*)iter.data_ptr(0);

  auto stream = at::cuda::getCurrentCUDAStream();
  if (iter.is_trivial_1d()) {
    // Contiguous-ish fast path: a single stride suffices to address outputs.
    auto strides = iter.get_inner_strides();
    int stride0 = strides[0];
    distribution_elementwise_grid_stride_kernel<accscalar_t, unroll_factor><<<grid, block, 0, stream>>>(
      numel,
      rng_engine_inputs,
      dist_func,
      [=]__device__(int idx, accscalar_t rand) {
        scalar_t* out = (scalar_t*)&out_data[stride0 * idx];
        *out = transform_func(rand);
      }
    );
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    // General path: compute per-element byte offsets with an OffsetCalculator.
    auto offset_calc = make_offset_calculator<1>(iter);
    distribution_elementwise_grid_stride_kernel<accscalar_t, unroll_factor><<<grid, block, 0, stream>>>(
      numel,
      rng_engine_inputs,
      dist_func,
      [=]__device__(int idx, accscalar_t rand) {
        auto offsets = offset_calc.get(idx);
        scalar_t* out = (scalar_t*)&out_data[offsets[0]];
        *out = transform_func(rand);
      }
    );
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
}
177
+
178
// Binary kernel: applies f(curand_state, in1, in2) elementwise.
// Each block handles block_work_size() elements: inputs are first staged into
// per-thread register arrays, then the functor is applied and results stored,
// so each element's offsets are computed once per phase.
template <typename func_t, typename inp_offset_calc_t, typename out_offset_calc_t>
__global__ void distribution_binary_elementwise_kernel(
    int numel,
    func_t f,
    PhiloxCudaState philox_args,
    typename function_traits<func_t>::result_type *output_data,
    const typename function_traits<func_t>::template arg<1>::type *input_data_1,
    const typename function_traits<func_t>::template arg<2>::type *input_data_2,
    inp_offset_calc_t inp_calc,
    out_offset_calc_t out_calc) {
  auto seeds = at::cuda::philox::unpack(philox_args);

  using input_t_1 = typename function_traits<func_t>::template arg<1>::type;
  using input_t_2 = typename function_traits<func_t>::template arg<2>::type;

  input_t_1 inputs_1[thread_work_size()];
  input_t_2 inputs_2[thread_work_size()];

  int base_index = block_work_size() * blockIdx.x;
  // Last block may be partially filled.
  int remaining = std::min<int>(numel - base_index, block_work_size());

  curandStatePhilox4_32_10_t state;
  curand_init(std::get<0>(seeds),
              blockIdx.x * blockDim.x + threadIdx.x,
              std::get<1>(seeds),
              &state);

  // load data into registers
  int thread_idx = threadIdx.x;
  #pragma unroll
  for (int i = 0; i < thread_work_size(); i++) {
    if (thread_idx >= remaining) {
      break;
    }
    int input_idx = thread_idx + base_index;
    auto offsets = inp_calc.get(input_idx);
    inputs_1[i] = input_data_1[offsets[0]];
    inputs_2[i] = input_data_2[offsets[1]];

    thread_idx += num_threads();
  }

  // compute and store
  thread_idx = threadIdx.x;
  #pragma unroll
  for (int i = 0; i < thread_work_size(); i++) {
    if (thread_idx >= remaining) {
      break;
    }
    int input_idx = thread_idx + base_index;
    auto offsets = out_calc.get(input_idx);
    output_data[offsets[0]] = f(state, inputs_1[i], inputs_2[i]);
    thread_idx += num_threads();
  }
}
234
+
235
// Host-side launcher for distribution_binary_elementwise_kernel.
// `f` must take a curandStatePhilox4_32_10_t& as its first argument; its two
// remaining argument types determine the input dtypes and its result type the
// output dtype. Falls back to recursive 32-bit-indexable sub-iterators for
// large tensors. `philox_args` is reserved by the caller.
template <typename func_t>
void distribution_binary_kernel(TensorIteratorBase &iter, PhiloxCudaState philox_args, const func_t &f) {
  static_assert(std::is_same<typename function_traits<func_t>::template arg<0>::type, curandStatePhilox4_32_10_t&>::value, "the first argument of functor must be curandStatePhilox4_32_10_t");
  using input_t_1 = typename function_traits<func_t>::template arg<1>::type;
  using input_t_2 = typename function_traits<func_t>::template arg<2>::type;
  using output_t = typename function_traits<func_t>::result_type;

  if (!iter.can_use_32bit_indexing()) {
    for (auto& sub_iter : iter.with_32bit_indexing()) {
      distribution_binary_kernel(sub_iter, philox_args, f);
    }
    return;
  }

  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(iter.can_use_32bit_indexing());

  int64_t numel = iter.numel();
  if (numel == 0) {
    return;
  }

  output_t *output_data = static_cast<output_t *>(iter.data_ptr(0));
  const input_t_1 *input_data_1 = static_cast<const input_t_1 *>(iter.data_ptr(1));
  const input_t_2 *input_data_2 = static_cast<const input_t_2 *>(iter.data_ptr(2));

  int64_t grid = (numel + block_work_size() - 1) / block_work_size();
  auto stream = at::cuda::getCurrentCUDAStream();

  if (iter.is_contiguous()) {
    // Contiguous: identity offset calculators avoid per-element arithmetic.
    distribution_binary_elementwise_kernel<<<grid,num_threads(), 0, stream>>>(
        numel, f, philox_args, output_data, input_data_1, input_data_2,
        TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>());
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    distribution_binary_elementwise_kernel<<<grid, num_threads(), 0, stream>>>(
        numel, f, philox_args, output_data, input_data_1, input_data_2,
        make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter));
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
}
275
+
276
+ } // namespace
277
+ }} // namespace at::native
278
+
279
+
280
+ namespace at {
281
+ namespace native {
282
+ namespace templates {
283
+ namespace cuda {
284
+
285
+ // ==================================================== Random ========================================================
286
+
287
// Fills `iter` with integers uniformly drawn from [base, base + range).
// When the target type can represent values needing more than 32 random bits
// and range >= 2^32, two 32-bit curand outputs are packed into one 64-bit
// random (halving the unroll factor accordingly); otherwise a single 32-bit
// draw per element suffices.
template<typename RNG>
void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG gen) {
  AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cuda", AT_WRAP([&] {
    if ((
      std::is_same<scalar_t, int64_t>::value ||
      std::is_same<scalar_t, double>::value ||
      std::is_same<scalar_t, float>::value ||
      std::is_same<scalar_t, at::BFloat16>::value) && range >= 1ULL << 32)
    {
      // define lambda to mod with range and add base
      auto random_func = [range, base] __device__ (uint64_t rand) {
        return transformation::uniform_int_from_to<scalar_t>(rand, range, base);
      };
      distribution_nullary_kernel<scalar_t, uint64_t, curand4_engine_calls/2>(iter,
        gen,
        [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 {
          ulonglong2 ret;
          uint4 rand_val = curand4(state);
          // Pack pairs of 32-bit outputs into 64-bit randoms.
          ret.x = (static_cast<uint64_t>(rand_val.x) << 32) | rand_val.y;
          ret.y = (static_cast<uint64_t>(rand_val.z) << 32) | rand_val.w;
          return ret;
        },
        random_func);
    } else {
      auto random_func = [range, base] __device__ (uint32_t rand) {
        return transformation::uniform_int_from_to<scalar_t>(rand, range, base);
      };
      distribution_nullary_kernel<scalar_t, uint32_t, curand4_engine_calls>(iter,
        gen,
        [] __device__ (curandStatePhilox4_32_10_t* state) {
          return curand4(state);
        },
        random_func);
    }
  }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
}
323
+
324
// This is the special kernel to handle single specific case:
// from(inclusive) = std::numeric_limits<int64_t>::lowest()
// to(exclusive) = None (= std::numeric_limits<int64_t>::max() + 1)
// Only dtypes that can meaningfully consume 64 random bits are supported;
// anything else is rejected with TORCH_CHECK.
template<typename RNG>
void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG gen) {
  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cuda", [&] {
    if (std::is_same<scalar_t, int64_t>::value ||
        std::is_same<scalar_t, double>::value ||
        std::is_same<scalar_t, float>::value ||
        std::is_same<scalar_t, at::BFloat16>::value) {
      auto random_func = [] __device__ (uint64_t rand) {
        return transformation::uniform_int_full_range<scalar_t>(rand);
      };
      distribution_nullary_kernel<scalar_t, uint64_t, curand4_engine_calls/2>(iter,
        gen,
        [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 {
          ulonglong2 ret;
          uint4 rand_val = curand4(state);
          // Pack pairs of 32-bit outputs into 64-bit randoms.
          ret.x = (static_cast<uint64_t>(rand_val.x) << 32) | rand_val.y;
          ret.y = (static_cast<uint64_t>(rand_val.z) << 32) | rand_val.w;
          return ret;
        },
        random_func);
    } else {
      TORCH_CHECK(false, "random_full_64_bits_range_kernel_cuda handles only int64, double, float and bfloat16");
    }
  });
}

// Stub dispatched from native random_() implementations; validates and
// unwraps the optional Generator before delegating to the kernels above.
template<typename RNG>
struct RandomFromToKernel {
  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional<Generator> gen) {
    random_from_to_kernel(iter, range, base, check_generator<RNG>(gen));
  }
  void operator()(TensorIteratorBase& iter, c10::optional<Generator> gen) {
    random_full_64_bits_range_kernel(iter, check_generator<RNG>(gen));
  }
};
362
+
363
// Fills `iter` with uniformly random bits reinterpreted as the target dtype's
// full unsigned range (see transformation::uniform_int). 64-bit dtypes
// consume two packed 32-bit curand outputs per element; everything else uses
// one 32-bit output.
template<typename RNG>
void random_kernel(TensorIteratorBase& iter, RNG gen) {
  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cuda", [&] {
    if (std::is_same<scalar_t, double>::value || std::is_same<scalar_t, int64_t>::value) {
      auto random_func = [] __device__ (uint64_t rand) {
        return transformation::uniform_int<scalar_t>(rand);
      };
      distribution_nullary_kernel<scalar_t, uint64_t, curand4_engine_calls/2>(iter, gen,
        [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 {
          ulonglong2 ret;
          uint4 rand_val = curand4(state);
          // Pack pairs of 32-bit outputs into 64-bit randoms.
          ret.x = (static_cast<uint64_t>(rand_val.x) << 32) | rand_val.y;
          ret.y = (static_cast<uint64_t>(rand_val.z) << 32) | rand_val.w;
          return ret;
        },
        random_func);
    } else {
      auto random_func = [] __device__ (uint32_t rand) {
        return transformation::uniform_int<scalar_t>(rand);
      };
      distribution_nullary_kernel<scalar_t, uint32_t, curand4_engine_calls>(iter,
        gen,
        [] __device__ (curandStatePhilox4_32_10_t* state) {
          return curand4(state);
        },
        random_func);
    }
  });
}

// Dispatch stub wrapper for random_kernel.
template<typename RNG>
struct RandomKernel {
  void operator()(TensorIteratorBase& iter, RNG gen) {
    random_kernel(iter, gen);
  }
};
399
+
400
+ // ====================================================================================================================
401
+
402
// Draws uniform randoms and applies `transform` to each. Double uses the
// two-wide double generator (half the unroll factor); all other dtypes use
// the four-wide float generator.
template<typename scalar_t, typename accscalar_t, size_t curand4_engine_calls, typename RNG, typename transform_t>
void uniform_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) {
  if (std::is_same<scalar_t, double>::value) {
    distribution_nullary_kernel<scalar_t, accscalar_t, curand4_engine_calls/2>(iter,
      gen,
      [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_uniform2_double(state); },
      transform);
  } else {
    distribution_nullary_kernel<scalar_t, accscalar_t, curand4_engine_calls>(iter,
      gen,
      [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_uniform4(state); },
      transform);
  }
}

// Same structure as uniform_and_transform, but drawing standard-normal
// randoms instead of uniforms.
template<typename scalar_t, typename accscalar_t, size_t curand4_engine_calls, typename RNG, typename transform_t>
void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) {
  if (std::is_same<scalar_t, double>::value) {
    distribution_nullary_kernel<scalar_t, accscalar_t, curand4_engine_calls/2>(iter,
      gen,
      [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_normal2_double(state); },
      transform);
  } else {
    distribution_nullary_kernel<scalar_t, accscalar_t, curand4_engine_calls>(iter,
      gen,
      [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_normal4(state); },
      transform);
  }
}
431
+
432
+ // ==================================================== Normal ========================================================
433
+
434
// ==================================================== Normal ========================================================

// Fills `self` with samples from N(mean_, std_^2). The affine transform is
// applied in the accumulation type (float for half/bfloat16 inputs) before
// narrowing back to scalar_t.
template<typename RNG>
void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) {
  auto iter = TensorIterator::borrowing_nullary_op(self);
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_cuda", [&] {
    using accscalar_t = at::acc_type<scalar_t, true>;
    auto mean = static_cast<accscalar_t>(mean_);
    auto std = static_cast<accscalar_t>(std_);
    // define lambda to multiply std and add mean
    auto normal_func = [mean, std] __device__ (accscalar_t rand) {
      return static_cast<scalar_t>(transformation::normal<accscalar_t>(rand, mean, std));
    };
    normal_and_transform<scalar_t, accscalar_t, curand4_engine_calls>(iter, gen, normal_func);
  });
}

// Dispatch stub wrapper for normal_kernel.
template<typename RNG>
struct NormalKernel {
  void operator()(const TensorBase &self, double mean, double std, c10::optional<Generator> gen) {
    normal_kernel(self, mean, std, check_generator<RNG>(gen));
  }
};
455
+
456
+ // ==================================================== Uniform ========================================================
457
+
458
// Fills `iter` with samples uniform on [from_, to_). curand generates on
// (0, 1], so the transform below maps the upper endpoint back to `from` —
// see the linked issues before modifying anything here.
template<typename RNG>
void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "uniform_kernel_cuda", [&] {
    auto from = static_cast<scalar_t>(from_);
    auto to = static_cast<scalar_t>(to_);
    using opmath_t = at::opmath_type<scalar_t>;
    auto range = static_cast<opmath_t>(to-from);
    // define lambda to reverse bounds, multiply 'range' and add 'from_'
    auto uniform_func = [range, from, to] __device__ (opmath_t rand) {
      // Compute output value before reversing the bounds
      // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/96947
      auto value = static_cast<scalar_t>(rand * range + from);
      // reverse the bounds of curand4 from (0, 1] to [0, 1)
      // Note that this method is from legacy THCTensorRandom and is likely to give
      // you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
      // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
      // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
      auto reverse_bound_value = value == to ? from : value;
      return reverse_bound_value;
    };
    uniform_and_transform<scalar_t, opmath_t, curand4_engine_calls>(iter, gen, uniform_func);
  });
}

// Dispatch stub wrapper for uniform_kernel.
template<typename RNG>
struct UniformKernel {
  void operator()(TensorIteratorBase& iter, double from, double to, c10::optional<Generator> gen) {
    uniform_kernel(iter, from, to, check_generator<RNG>(gen));
  }
};
488
+
489
+ // ================================================== LogNormal =======================================================
490
+
491
// Fills `iter` with log-normal samples: exp of a normal draw with the given
// mean/std (composition happens in the per-element transform).
template<typename RNG>
void log_normal_kernel(TensorIteratorBase& iter, double mean_, double std_, RNG gen) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cuda", [&] {
    using accscalar_t = at::acc_type<scalar_t, true>;
    auto mean = static_cast<accscalar_t>(mean_);
    auto std = static_cast<accscalar_t>(std_);
    // define lambda for log_normal transformation
    auto log_normal_func = [mean, std] __device__ (accscalar_t rand) {
      return static_cast<scalar_t>(transformation::log_normal<accscalar_t>(transformation::normal<accscalar_t>(rand, mean, std)));
    };
    normal_and_transform<scalar_t, accscalar_t, curand4_engine_calls>(iter, gen, log_normal_func);
  });
}

// Dispatch stub wrapper for log_normal_kernel.
template<typename RNG>
struct LogNormalKernel {
  void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional<Generator> gen) {
    log_normal_kernel(iter, mean, std, check_generator<RNG>(gen));
  }
};
511
+
512
+ // =================================================== Geometric ======================================================
513
+
514
// Fills `iter` with geometric(p) samples derived from uniform draws.
template<typename RNG>
void geometric_kernel(TensorIteratorBase& iter, double p, RNG gen) {
  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cuda", [&] {
    using accscalar_t = at::DiscreteDistributionType<scalar_t>::type;
    // define lambda for geometric transformation
    auto geometric_func = [p] __device__ (accscalar_t rand) {
      return static_cast<scalar_t>(transformation::geometric<accscalar_t>(rand, p));
    };
    uniform_and_transform<scalar_t, accscalar_t, curand4_engine_calls>(iter, gen, geometric_func);
  });
}

// Dispatch stub wrapper for geometric_kernel.
template<typename RNG>
struct GeometricKernel {
  void operator()(TensorIteratorBase& iter, double p, c10::optional<Generator> gen) {
    geometric_kernel(iter, p, check_generator<RNG>(gen));
  }
};
532
+
533
+ // ================================================== Exponential =====================================================
534
+
535
// Fills `iter` with exponential(lambda) samples derived from uniform draws.
// Rejects non-floating dtypes up front with an explicit error.
template<typename RNG>
void exponential_kernel(TensorIteratorBase& iter, double lambda_, RNG gen) {
  TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype());
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cuda", [&] {
    using accscalar_t = at::acc_type<scalar_t, true>;
    auto lambda = static_cast<accscalar_t>(lambda_);
    // define lambda for exponential transformation
    auto exponential_func = [lambda] __device__ (accscalar_t rand) {
      return static_cast<scalar_t>(transformation::exponential<accscalar_t>(rand, lambda));
    };
    uniform_and_transform<scalar_t, accscalar_t, curand4_engine_calls>(iter, gen, exponential_func);
  });
}

// Dispatch stub wrapper for exponential_kernel.
template<typename RNG>
struct ExponentialKernel {
  void operator()(TensorIteratorBase& iter, double lambda, c10::optional<Generator> gen) {
    exponential_kernel(iter, lambda, check_generator<RNG>(gen));
  }
};
555
+
556
+ // ==================================================== Cauchy ========================================================
557
+
558
// Fills `iter` with Cauchy(median, sigma) samples derived from uniform draws.
template<typename RNG>
void cauchy_kernel(TensorIteratorBase& iter, double median_, double sigma_, RNG gen) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "cauchy_cuda", [&] {
    using accscalar_t = at::acc_type<scalar_t, true>;
    auto median = static_cast<accscalar_t>(median_);
    auto sigma = static_cast<accscalar_t>(sigma_);
    // define lambda for cauchy transformation
    auto cauchy_func = [median, sigma] __device__ (accscalar_t rand) {
      return static_cast<scalar_t>(transformation::cauchy<accscalar_t>(rand, median, sigma));
    };
    uniform_and_transform<scalar_t, accscalar_t, curand4_engine_calls>(iter, gen, cauchy_func);
  });
}

// Dispatch stub wrapper for cauchy_kernel.
template<typename RNG>
struct CauchyKernel {
  void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional<Generator> gen) {
    cauchy_kernel(iter, median, sigma, check_generator<RNG>(gen));
  }
};
578
+
579
+ // ==================================================== Bernoulli =====================================================
580
+
581
// Elementwise Bernoulli with a per-element probability tensor `p`.
// The functor processes up to four (value, probability) pairs at a time via
// CUDA_tensor_apply2; `n` is how many of the four slots are valid. The switch
// cases fall through deliberately so that case 4 also fills slots 3..1, etc.
template<typename scalar_t, typename prob_t>
void bernoulli_tensor_cuda_kernel(
    const TensorBase &ret, const at::TensorBase &p,
    PhiloxCudaState philox_args) {
  auto functor = [philox_args] __device__(
          int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4,
          const prob_t& p1, const prob_t& p2, const prob_t& p3, const prob_t& p4) {
        auto seeds = at::cuda::philox::unpack(philox_args);
        curandStatePhilox4_32_10_t state;
        curand_init(std::get<0>(seeds),
                    blockIdx.x * blockDim.x + threadIdx.x,
                    std::get<1>(seeds),
                    &state);

        // See Note [Register spilling in curand call for CUDA < 10]
        float4 rand = curand_uniform4(&state);
        switch (n) {
          case 4: {
            CUDA_KERNEL_ASSERT(0 <= p4 && p4 <= 1);
            v4 = static_cast<scalar_t>(rand.w <= p4);
            // fallthrough
          }
          case 3: {
            CUDA_KERNEL_ASSERT(0 <= p3 && p3 <= 1);
            v3 = static_cast<scalar_t>(rand.z <= p3);
            // fallthrough
          }
          case 2: {
            CUDA_KERNEL_ASSERT(0 <= p2 && p2 <= 1);
            v2 = static_cast<scalar_t>(rand.y <= p2);
            // fallthrough
          }
          case 1: {
            CUDA_KERNEL_ASSERT(0 <= p1 && p1 <= 1);
            v1 = static_cast<scalar_t>(rand.x <= p1);
          }
        }
      };
  // The template argument `4` below indicates that we want to operate on four
  // element at each time. See NOTE [ CUDA_tensor_applyN helpers ] for details.
  at::cuda::CUDA_tensor_apply2<scalar_t, prob_t, 4, decltype(functor),
                               /*max_threads_per_block=*/512,
                               /*min_blocks_per_sm==*/2>(ret, p, functor);
}
625
+
626
// Tensor-probability overload: self[i] ~ Bernoulli(p_[i]). The probability
// tensor is moved to self's device, cast (double for double self, float
// otherwise), and broadcast to self's shape before sampling.
template<typename RNG>
void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) {
  PhiloxCudaState rng_engine_inputs;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen->mutex_);
    // NOTE(review): the fixed offset of 10 is inherited from the original
    // code; it must cover the curand_uniform4 call per apply step — confirm
    // before changing.
    rng_engine_inputs = gen->philox_cuda_state(10);
  }
  TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type());
  // cast probabilities tensor to double for double `self` tensor, and to `float` for everything else
  const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat;
  auto p_cuda = p_.to(TensorOptions().device(self.device()).dtype(p_type));
  auto p = expand_inplace(self, p_cuda);
  AT_DISPATCH_ALL_TYPES_AND3(
    at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] {
      if (std::is_same<scalar_t, double>::value) {
        return bernoulli_tensor_cuda_kernel<double, double>(self, *p, rng_engine_inputs);
      } else {
        return bernoulli_tensor_cuda_kernel<scalar_t, float>(self, *p, rng_engine_inputs);
      }
   });
}

// Scalar-probability overload: every element ~ Bernoulli(p), derived from a
// uniform draw per element.
template<typename RNG>
void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) {
  AT_DISPATCH_ALL_TYPES_AND3(
    at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "bernoulli_scalar_cuda_", [&] {
      using accscalar_t = at::DiscreteDistributionType<scalar_t>::type;
      // define lambda for bernoulli transformation
      auto bernoulli_func = [p] __device__ (accscalar_t rand) {
        return static_cast<scalar_t>(transformation::bernoulli<accscalar_t>(rand, p));
      };
      uniform_and_transform<scalar_t, accscalar_t, curand4_engine_calls>(iter, gen, bernoulli_func);
   });
}

// Dispatch stub wrapper selecting between the scalar- and tensor-probability
// bernoulli_kernel overloads.
template<typename RNG>
struct BernoulliKernel {
  void operator()(TensorIteratorBase& iter, double p, c10::optional<Generator> gen) {
    bernoulli_kernel(iter, p, check_generator<RNG>(gen));
  }
  void operator()(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
    bernoulli_kernel(self, p_, check_generator<RNG>(gen));
  }
};
671
+
672
+ }}}}
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Distributions.h ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

namespace at {
struct CUDAGeneratorImpl;
struct TensorIteratorBase;
class TensorBase;

namespace native {

// Launch declarations for the CUDA distribution kernels implemented in
// Distributions.cu. Forward declarations above keep this header light
// (no ATen/core includes needed).

// Samples ret[i] ~ Poisson(lambda[i]).
void launch_poisson_cuda_kernel(
    const TensorBase &ret, const TensorBase &lambda, CUDAGeneratorImpl *gen);

// Samples ret[i] ~ Gamma(alpha[i]).
void launch_gamma_kernel(
    const TensorBase &ret, const TensorBase &alpha, CUDAGeneratorImpl *gen);

// Binomial sampling over a TensorIterator (output, count, prob operands).
void launch_binomial_cuda_kernel(
    TensorIteratorBase &iter, CUDAGeneratorImpl *gen);

// Dirichlet sampling / gradient helpers (no generator argument: these
// transform already-sampled gamma values).
void launch_dirichlet_kernel(TensorIteratorBase &iter);

void launch_standard_gamma_grad_kernel(TensorIteratorBase &iter);

void launch_dirichlet_grad_kernel(TensorIteratorBase &iter);

}} // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once
#include <ATen/core/Tensor.h>
#include <ATen/cuda/Atomic.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/TensorUtils.h>

namespace at {
namespace native {

// CUDA backward helper for embedding-style ops: scatters/accumulates rows of
// `grad` into a weight-gradient tensor of `num_weights` rows, driven by
// `sorted_indices` (with `orig_indices` giving pre-sort positions and `count`
// per-index multiplicities).
// NOTE(review): the defaulted trailing arguments (offset2bag, bag_size,
// per_sample_weights, mode_mean) suggest this is shared with the
// embedding_bag backward — confirm against the .cu implementation.
Tensor embedding_backward_cuda_kernel(
    const Tensor &grad,
    const Tensor &orig_indices,
    const Tensor &sorted_indices,
    const Tensor &count,
    int64_t num_weights,
    int padding_idx = -1,
    bool mode_mean = false,
    const Tensor &offset2bag = Tensor(),
    const Tensor &bag_size = Tensor(),
    const Tensor &per_sample_weights = Tensor());

}}
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh ADDED
@@ -0,0 +1,681 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/OpMathType.h>
3
+ #include <ATen/native/ForeachUtils.h>
4
+ #include <ATen/native/cuda/MultiTensorApply.cuh>
5
+ #include <ATen/native/cuda/Pow.cuh>
6
+
7
+ namespace at::native {
8
+
9
+ namespace {
10
+
11
+ // TODO(crcrpar): Handle version bump in codegen.
12
+ // rel:
13
+ // https://github.com/pytorch/pytorch/blob/9cf84347767c8abb8feba18a9a1baba321eeb8b9/tools/autograd/gen_inplace_or_view_type.py#L481-L482
14
+ inline void increment_version(TensorList tensors) {
15
+ for (const auto& t : tensors) {
16
+ t.unsafeGetTensorImpl()->bump_version();
17
+ }
18
+ }
19
+
20
+ // Initializes args and checks if all args are aligned
21
+ template <int depth, typename T>
22
+ __device__ bool init_args(
23
+ T** args,
24
+ TensorListMetadata<depth>& tl,
25
+ const int64_t chunk_idx,
26
+ const int64_t chunk_size,
27
+ const int64_t tensor_loc) {
28
+ bool all_aligned = true;
29
+ for (int i = 0; i < depth; i++) {
30
+ args[i] = (T*)tl.addresses[i][tensor_loc];
31
+ args[i] += chunk_idx * chunk_size;
32
+
33
+ if (!is_aligned(args[i])) {
34
+ all_aligned = false;
35
+ }
36
+ }
37
+ return all_aligned;
38
+ }
39
+
40
// Initializes args and checks if all args are aligned
// Overload for TensorListScalarListMetadata (tensor lists paired with a
// per-tensor scalar of type T2); pointer setup is identical to the
// TensorListMetadata overload above.
template <int depth, typename T, typename T2>
__device__ bool init_args(
    T** args,
    TensorListScalarListMetadata<T2, depth>& tl,
    const int64_t chunk_idx,
    const int64_t chunk_size,
    const int64_t tensor_loc) {
  bool all_aligned = true;
  for (int i = 0; i < depth; i++) {
    // Advance each operand's base pointer to this chunk.
    args[i] = (T*)tl.addresses[i][tensor_loc];
    args[i] += chunk_idx * chunk_size;

    if (!is_aligned(args[i])) {
      all_aligned = false;
    }
  }
  return all_aligned;
}
59
+
60
// Same as the overloads above, for FusedOptimizerTensorListMetadata:
// initializes per-operand chunk pointers and reports ILP-alignment.
template <int depth, typename T>
__device__ bool init_args(
    T** args,
    FusedOptimizerTensorListMetadata<depth>& tl,
    const int64_t chunk_idx,
    const int64_t chunk_size,
    const int64_t tensor_loc) {
  bool all_aligned = true;
  for (int i = 0; i < depth; i++) {
    // Advance each operand's base pointer to this chunk.
    args[i] = (T*)tl.addresses[i][tensor_loc];
    args[i] += chunk_idx * chunk_size;

    if (!is_aligned(args[i])) {
      all_aligned = false;
    }
  }
  return all_aligned;
}
78
+
79
// Loads up to kILP elements per thread from each of the `depth` operand
// pointers into the register buffer `r_args`, bounds-checked against both
// `n` (remaining elements) and `chunk_size`. Out-of-range lanes are
// zero-filled so downstream math can run unconditionally.
template <int depth, typename T>
__device__ void load_args(
    T r_args[][kILP],
    T** args,
    const int64_t i_start,
    const int64_t chunk_size,
    const int64_t n) {
#pragma unroll
  for (int ii = 0; ii < kILP; ii++) {
    const auto i = i_start + threadIdx.x + ii * blockDim.x;
    for (int r_index = 0; r_index < depth; r_index++) {
      // Default to 0 so lanes past the end hold a well-defined value.
      r_args[r_index][ii] = 0;
      if (i < n && i < chunk_size) {
        r_args[r_index][ii] = args[r_index][i];
      }
    }
  }
}
97
+
98
// Writes up to kILP register-buffer elements per thread from `src` back to
// global memory at `dst`, skipping lanes beyond `n` or `chunk_size`.
// Mirror of load_args for a single operand.
template <typename T>
__device__ void store_args(
    T* dst,
    T* src,
    const int64_t i_start,
    const int64_t chunk_size,
    const int64_t n) {
#pragma unroll
  for (int ii = 0; ii < kILP; ii++) {
    const int64_t i = i_start + threadIdx.x + ii * blockDim.x;
    if (i < n && i < chunk_size)
      dst[i] = src[ii];
  }
}
112
+
113
// Applies `op(x, scalar)` elementwise over one chunk: reads args[0], writes
// args[res_arg_index]. Values are widened to opmath_t for the computation
// and narrowed back to T on store.
template <int res_arg_index, typename Op, typename T, typename opmath_t>
__device__ __forceinline__ void binary_op_scalar(
    T r_args[][kILP],
    T** args,
    opmath_t scalar,
    const int64_t n,
    const int64_t chunk_size,
    const bool all_aligned,
    Op op) {
  // to make things simple, we put aligned case in a different code path
  if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
    // Fast path: whole chunk is a multiple of kILP and aligned, so the
    // vectorized load_store can be used without bounds checks.
    for (int64_t i_start = threadIdx.x;
         i_start * kILP < n && i_start * kILP < chunk_size;
         i_start += blockDim.x) {
      // load
      load_store(r_args[0], args[0], 0, i_start);
#pragma unroll
      for (int ii = 0; ii < kILP; ii++) {
        r_args[0][ii] = static_cast<T>(
            op(static_cast<opmath_t>(r_args[0][ii]),
               static_cast<opmath_t>(scalar)));
      }
      // store
      load_store(args[res_arg_index], r_args[0], i_start, 0);
    }
  } else {
    // Slow path: element-wise, bounds-checked loads/stores.
    for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
         i_start += blockDim.x * kILP) {
      // Regardless if depth is 1 (for inplace) or 2 (for out of place), r_args
      // has depth 1
      load_args<1>(r_args, args, i_start, chunk_size, n);
#pragma unroll
      for (int ii = 0; ii < kILP; ii++) {
        r_args[0][ii] = static_cast<T>(
            op(static_cast<opmath_t>(r_args[0][ii]),
               static_cast<opmath_t>(scalar)));
      }
      store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
    }
  }
}
154
+
155
// Pointwise fused update over one chunk:
//   out = args[0] + scalar * op(args[1], args[2])
// reading three operands and writing args[res_arg_index], with the math
// performed in opmath_t precision.
template <int res_arg_index, typename Op, typename T, typename opmath_t>
__device__ __forceinline__ void pointwise_op_scalar(
    T r_args[][kILP],
    T** args,
    opmath_t scalar,
    const int64_t n,
    const int64_t chunk_size,
    const bool all_aligned,
    Op op) {
  // to make things simple, we put aligned case in a different code path
  if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
    // Fast path: vectorized loads of all three operands.
    for (int64_t i_start = threadIdx.x;
         i_start * kILP < n && i_start * kILP < chunk_size;
         i_start += blockDim.x) {
      // load
      load_store(r_args[0], args[0], 0, i_start);
      load_store(r_args[1], args[1], 0, i_start);
      load_store(r_args[2], args[2], 0, i_start);
#pragma unroll
      for (int ii = 0; ii < kILP; ii++) {
        r_args[0][ii] = static_cast<T>(
            static_cast<opmath_t>(r_args[0][ii]) +
            scalar *
                op(static_cast<opmath_t>(r_args[1][ii]),
                   static_cast<opmath_t>(r_args[2][ii])));
      }
      // store
      load_store(args[res_arg_index], r_args[0], i_start, 0);
    }
  } else {
    // Slow path: bounds-checked loads/stores.
    for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
         i_start += blockDim.x * kILP) {
      // Regardless if depth is 3 (for inplace) or 4 (for out of place), r_args
      // has depth 3
      load_args<3>(r_args, args, i_start, chunk_size, n);
#pragma unroll
      for (int ii = 0; ii < kILP; ii++) {
        r_args[0][ii] = static_cast<T>(
            static_cast<opmath_t>(r_args[0][ii]) +
            scalar *
                op(static_cast<opmath_t>(r_args[1][ii]),
                   static_cast<opmath_t>(r_args[2][ii])));
      }
      store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
    }
  }
}
202
+
203
+ //
204
+ // Binary Functors
205
+ //
206
// multi_tensor_apply functor: out = op(tensor, scalar) with one scalar shared
// by every tensor in the list. Resolves this block's (tensor, chunk) from
// blockIdx.x and delegates the per-chunk loop to binary_op_scalar.
template <typename T, int depth, int r_args_depth, int res_arg_index>
struct BinaryOpScalarFunctor {
  using opmath_t = at::opmath_type<T>;
  template <typename Op>
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListMetadata<depth>& tl,
      Op op,
      opmath_t scalar) {
    const int tensor_loc = tl.block_to_tensor[blockIdx.x];
    const int chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    const bool all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    // Remaining elements from this chunk onward.
    n -= chunk_idx * chunk_size;
    T r_args[r_args_depth][kILP];

    binary_op_scalar<res_arg_index>(
        r_args, args, scalar, n, chunk_size, all_aligned, op);
  }
};
229
+
230
// multi_tensor_apply functor: out = op(tensor, scalar) where each tensor in
// the list has its own scalar, read from tl.scalar_vals.
template <typename T, int depth, int r_args_depth, int res_arg_index>
struct BinaryOpScalarListFunctor {
  using opmath_t = at::opmath_type<T>;
  template <typename Op>
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListScalarListMetadata<opmath_t, depth>& tl,
      Op op) {
    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    const bool all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    // Per-tensor scalar, unlike BinaryOpScalarFunctor's shared one.
    opmath_t scalar = tl.scalar_vals[tensor_loc];
    n -= chunk_idx * chunk_size;
    T r_args[r_args_depth][kILP];

    binary_op_scalar<res_arg_index>(
        r_args, args, scalar, n, chunk_size, all_aligned, op);
  }
};
253
+
254
// multi_tensor_apply functor for list-with-list binary ops with an alpha
// multiplier: out = op(args[0], alpha * args[1]), written to
// args[res_arg_index]. Math is done in opmath_t precision.
template <typename T, int depth, int r_args_depth, int res_arg_index>
struct BinaryOpListAlphaFunctor {
  using opmath_t = at::opmath_type<T>;
  template <typename Op>
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListMetadata<depth>& tl,
      Op op,
      opmath_t alpha) {
    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    const bool all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    n -= chunk_idx * chunk_size;
    T r_args[r_args_depth][kILP];

    // to make things simple, we put aligned case in a different code path
    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
      // Fast path: vectorized loads/stores, no bounds checks needed.
      for (int64_t i_start = threadIdx.x;
           i_start * kILP < n && i_start * kILP < chunk_size;
           i_start += blockDim.x) {
        // load
        load_store(r_args[0], args[0], 0, i_start);
        load_store(r_args[1], args[1], 0, i_start);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] = static_cast<T>(
              op(static_cast<opmath_t>(r_args[0][ii]),
                 alpha * static_cast<opmath_t>(r_args[1][ii])));
        }
        // store
        load_store(args[res_arg_index], r_args[0], i_start, 0);
      }
    } else {
      // Slow path: bounds-checked element access.
      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
           i_start += blockDim.x * kILP) {
        load_args<r_args_depth>(r_args, args, i_start, chunk_size, n);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] = static_cast<T>(
              op(static_cast<opmath_t>(r_args[0][ii]),
                 alpha * static_cast<opmath_t>(r_args[1][ii])));
        }
        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
      }
    }
  }
};
305
+
306
// multi_tensor_apply functor where the right-hand operand is a device-side
// 0-dim tensor: out = op(args[0], alpha * (*scalar)). `scalar` is a device
// pointer, so its value is read inside the kernel rather than captured on
// the host.
template <typename T, int depth, int r_args_depth, int res_arg_index>
struct BinaryOpScalarTensorFunctor {
  using opmath_t = at::opmath_type<T>;
  template <typename Op>
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListMetadata<depth>& tl,
      Op op,
      T* scalar,
      opmath_t alpha) {
    const int tensor_loc = tl.block_to_tensor[blockIdx.x];
    const int chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    const bool all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    n -= chunk_idx * chunk_size;
    T r_args[r_args_depth][kILP];

    // to make things simple, we put aligned case in a different code path
    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
      for (int64_t i_start = threadIdx.x;
           i_start * kILP < n && i_start * kILP < chunk_size;
           i_start += blockDim.x) {
        // load
        load_store(r_args[0], args[0], 0, i_start);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] = static_cast<T>(op(
              static_cast<opmath_t>(r_args[0][ii]),
              static_cast<opmath_t>(alpha) * static_cast<opmath_t>(*scalar)));
        }
        // store
        load_store(args[res_arg_index], r_args[0], i_start, 0);
      }
    } else {
      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
           i_start += blockDim.x * kILP) {
        // Regardless if depth is 1 (for inplace) or 2 (for out of place),
        // r_args has depth 1
        load_args<1>(r_args, args, i_start, chunk_size, n);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] = static_cast<T>(op(
              static_cast<opmath_t>(r_args[0][ii]),
              static_cast<opmath_t>(alpha) * static_cast<opmath_t>(*scalar)));
        }
        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
      }
    }
  }
};
359
+
360
+ //
361
+ // Unary Functors
362
+ //
363
+
364
// multi_tensor_apply functor that zero-fills every tensor in the list
// (metadata is hard-wired to depth 1: a single in/out operand per tensor).
template <typename T, int depth, int r_args_depth, int res_arg_index>
struct ZeroFunctor {
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListMetadata<1>& tl) {
    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    const auto all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    n -= chunk_idx * chunk_size;
    T r_args[r_args_depth][kILP];

    // to make things simple, we put aligned case in a different code path
    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
      for (int64_t i_start = threadIdx.x;
           i_start * kILP < n && i_start * kILP < chunk_size;
           i_start += blockDim.x) {
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] = 0;
        }
        // store
        // NOTE(review): this path writes args[0] while the fallback below
        // writes args[res_arg_index]; equivalent only when res_arg_index == 0
        // — confirm instantiations.
        load_store(args[0], r_args[0], i_start, 0);
      }
    } else {
      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
           i_start += blockDim.x * kILP) {
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] = 0;
        }
        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
      }
    }
  }
};
403
+
404
// multi_tensor_apply functor for unary ops: out = op(args[0]), written to
// args[res_arg_index], computed in opmath_t precision.
template <typename T, int depth, int r_args_depth, int res_arg_index>
struct UnaryOpFunctor {
  using opmath_t = at::opmath_type<T>;
  template <typename Op>
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListMetadata<depth>& tl,
      Op op) {
    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    bool all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    n -= chunk_idx * chunk_size;
    T r_args[r_args_depth][kILP];

    // to make things simple, we put aligned case in a different code path
    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
      // Fast path: vectorized loads/stores.
      for (int64_t i_start = threadIdx.x;
           i_start * kILP < n && i_start * kILP < chunk_size;
           i_start += blockDim.x) {
        // load
        load_store(r_args[0], args[0], 0, i_start);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] =
              static_cast<T>(op(static_cast<opmath_t>(r_args[0][ii])));
        }
        // store
        load_store(args[res_arg_index], r_args[0], i_start, 0);
      }
    } else {
      // Slow path: bounds-checked element access.
      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
           i_start += blockDim.x * kILP) {
        load_args<r_args_depth>(r_args, args, i_start, chunk_size, n);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] =
              static_cast<T>(op(static_cast<opmath_t>(r_args[0][ii])));
        }
        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
      }
    }
  }
};
451
+
452
+ //
453
+ // Pointwise Functors
454
+ //
455
+
456
// multi_tensor_apply functor for fused pointwise ops with one shared scalar:
// out = args[0] + scalar * op(args[1], args[2]); see pointwise_op_scalar.
template <typename T, int depth, int r_args_depth, int res_arg_index>
struct PointwiseOpScalarFunctor {
  using opmath_t = at::opmath_type<T>;
  template <typename Op>
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListMetadata<depth>& tl,
      Op op,
      opmath_t scalar) {
    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    const bool all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    n -= chunk_idx * chunk_size;
    T r_args[r_args_depth][kILP];

    pointwise_op_scalar<res_arg_index>(
        r_args, args, scalar, n, chunk_size, all_aligned, op);
  }
};
479
+
480
// Same as PointwiseOpScalarFunctor but each tensor carries its own scalar,
// read from tl.scalar_vals.
template <typename T, int depth, int r_args_depth, int res_arg_index>
struct PointwiseOpScalarListFunctor {
  using opmath_t = at::opmath_type<T>;
  template <typename Op>
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListScalarListMetadata<opmath_t, depth>& tl,
      Op op) {
    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    const bool all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    // Per-tensor scalar.
    opmath_t scalar = tl.scalar_vals[tensor_loc];
    n -= chunk_idx * chunk_size;
    T r_args[r_args_depth][kILP];

    pointwise_op_scalar<res_arg_index>(
        r_args, args, scalar, n, chunk_size, all_aligned, op);
  }
};
503
+
504
// multi_tensor_apply functor for list-with-list binary ops with a fixed
// output slot: args[2] = op(args[0], args[1]), computed in opmath_t.
template <typename T, int depth>
struct PointwiseOpListFunctor {
  using opmath_t = at::opmath_type<T>;
  template <typename Op>
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListMetadata<depth>& tl,
      Op op) {
    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    const bool all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    n -= chunk_idx * chunk_size;
    // Only the two inputs need register buffers; the result reuses r_args[0].
    T r_args[depth - 1][kILP];

    // to make things simple, we put aligned case in a different code path
    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
      for (int64_t i_start = threadIdx.x;
           i_start * kILP < n && i_start * kILP < chunk_size;
           i_start += blockDim.x) {
        // load
        load_store(r_args[0], args[0], 0, i_start);
        load_store(r_args[1], args[1], 0, i_start);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] = static_cast<T>(
              op(static_cast<opmath_t>(r_args[0][ii]),
                 static_cast<opmath_t>(r_args[1][ii])));
        }
        // store
        load_store(args[2], r_args[0], i_start, 0);
      }
    } else {
      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
           i_start += blockDim.x * kILP) {
        load_args<depth - 1>(r_args, args, i_start, chunk_size, n);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] = static_cast<T>(
              op(static_cast<opmath_t>(r_args[0][ii]),
                 static_cast<opmath_t>(r_args[1][ii])));
        }
        store_args(args[2], r_args[0], i_start, chunk_size, n);
      }
    }
  }
};
554
+
555
// multi_tensor_apply functor for ternary list ops:
// out = op(args[0], args[1], args[2]) written to args[res_arg_index].
// depth 3 = inplace, depth 4 = out-of-place (enforced by static_asserts).
template <typename T, int depth, int r_args_depth, int res_arg_index>
struct TernaryOpListFunctor {
  using opmath_t = at::opmath_type<T>;
  template <typename Op>
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListMetadata<depth>& tl,
      Op op) {
    static_assert(depth == 3 || depth == 4, "");
    static_assert(depth >= r_args_depth, "");
    static_assert(res_arg_index == depth - 1 || res_arg_index == 0, "");
    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    const bool all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    n -= chunk_idx * chunk_size;
    T r_args[r_args_depth][kILP];

    // Aligned fast path vs bounds-checked fallback, as elsewhere in this file.
    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
      for (int64_t i_start = threadIdx.x;
           i_start * kILP < n && i_start * kILP < chunk_size;
           i_start += blockDim.x) {
        load_store(r_args[0], args[0], 0, i_start);
        load_store(r_args[1], args[1], 0, i_start);
        load_store(r_args[2], args[2], 0, i_start);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] =
              op(static_cast<opmath_t>(r_args[0][ii]),
                 static_cast<opmath_t>(r_args[1][ii]),
                 static_cast<opmath_t>(r_args[2][ii]));
        }
        load_store(args[res_arg_index], r_args[0], i_start, 0);
      }
    } else {
      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
           i_start += blockDim.x * kILP) {
        load_args<r_args_depth>(r_args, args, i_start, chunk_size, n);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] =
              op(static_cast<opmath_t>(r_args[0][ii]),
                 static_cast<opmath_t>(r_args[1][ii]),
                 static_cast<opmath_t>(r_args[2][ii]));
        }
        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
      }
    }
  }
};
608
+
609
// multi_tensor_apply functor for ternary ops whose third argument is a
// shared scalar: out = op(args[0], args[1], alpha), written to
// args[res_arg_index]. depth 2 = inplace, depth 3 = out-of-place.
template <typename T, int depth, int r_args_depth, int res_arg_index>
struct TernaryOpScalarFunctor {
  using opmath_t = at::opmath_type<T>;
  template <typename Op>
  __device__ __forceinline__ void operator()(
      int chunk_size,
      TensorListMetadata<depth>& tl,
      Op op,
      opmath_t alpha) {
    static_assert(depth == 2 || depth == 3, "");
    static_assert(depth >= r_args_depth, "");
    static_assert(res_arg_index == depth - 1 || res_arg_index == 0, "");
    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
    auto n = tl.numel_for_tensor[tensor_loc];

    T* args[depth];
    const bool all_aligned =
        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc);
    n -= chunk_idx * chunk_size;
    T r_args[r_args_depth][kILP];

    // to make things simple, we put aligned case in a different code path
    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
      for (int64_t i_start = threadIdx.x;
           i_start * kILP < n && i_start * kILP < chunk_size;
           i_start += blockDim.x) {
        // load
        load_store(r_args[0], args[0], 0, i_start);
        load_store(r_args[1], args[1], 0, i_start);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] =
              op(static_cast<opmath_t>(r_args[0][ii]),
                 static_cast<opmath_t>(r_args[1][ii]),
                 alpha);
        }
        // store
        load_store(args[res_arg_index], r_args[0], i_start, 0);
      }
    } else {
      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
           i_start += blockDim.x * kILP) {
        load_args<r_args_depth>(r_args, args, i_start, chunk_size, n);
#pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_args[0][ii] =
              op(static_cast<opmath_t>(r_args[0][ii]),
                 static_cast<opmath_t>(r_args[1][ii]),
                 alpha);
        }
        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
      }
    }
  }
};
665
+
666
+ template <typename T>
667
+ struct power_functor {
668
+ C10_DEVICE T operator()(const T& a, const T& b) const {
669
+ return at::native::pow_(a, b);
670
+ }
671
+ };
672
+
673
+ template <typename T>
674
+ struct reverse_power_functor {
675
+ C10_DEVICE T operator()(const T& a, const T& b) const {
676
+ return at::native::pow_(b, a);
677
+ }
678
+ };
679
+
680
+ } // namespace
681
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/NumericUtils.h>
4
+
5
+ namespace at::native {
6
+
7
+ // std:: does not have clamp functors
8
+ template <typename T>
9
+ struct minimum {
10
+ __device__ T operator()(const T& a, const T& b) const {
11
+ return (_isnan(a) || a < b) ? a : b;
12
+ }
13
+ };
14
+
15
+ template <typename T>
16
+ struct maximum {
17
+ __device__ T operator()(const T& a, const T& b) const {
18
+ return (_isnan(a) || a > b) ? a : b;
19
+ }
20
+ };
21
+
22
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/native/cuda/KernelUtils.cuh>
3
+ #include <ATen/native/GridSamplerUtils.h>
4
+
5
+ namespace at { namespace native {
6
+
7
+ using detail::GridSamplerInterpolation;
8
+ using detail::GridSamplerPadding;
9
+
10
+ // Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
11
+ // where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
12
+ // if align_corners: -1 and +1 get sent to the centers of the corner pixels
13
+ // -1 --> 0
14
+ // +1 --> (size - 1)
15
+ // scale_factor = (size - 1) / 2
16
+ // if not align_corners: -1 and +1 get sent to the image edges
17
+ // -1 --> -0.5
18
+ // +1 --> (size - 1) + 0.5 == size - 0.5
19
+ // scale_factor = size / 2
20
+ template <typename scalar_t>
21
+ static __forceinline__ __device__
22
+ scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) {
23
+ if (align_corners) {
24
+ // unnormalize coord from [-1, 1] to [0, size - 1]
25
+ return ((coord + 1.f) / 2) * (size - 1);
26
+ } else {
27
+ // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
28
+ return ((coord + 1.f) * size - 1) / 2;
29
+ }
30
+ }
31
+
32
+ // grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize
33
+ // except that it also returns the `d output / d input` via pointer argument
34
+ // `grad_in`.
35
+ // This is useful in the backward pass of grid_sampler.
36
// Unnormalizes `coord` exactly like grid_sampler_unnormalize and additionally
// writes d(output)/d(coord) — a constant scale factor — into *grad_in for the
// backward pass.
template <typename scalar_t>
static __forceinline__ __device__
scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size,
                                           bool align_corners, scalar_t *grad_in) {
  if (align_corners) {
    // unnormalize coord from [-1, 1] to [0, size - 1]
    *grad_in = static_cast<scalar_t>(size - 1) / 2;
    return ((coord + 1.f) / 2) * (size - 1);
  } else {
    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
    *grad_in = static_cast<scalar_t>(size) / 2;
    return ((coord + 1.f) * size - 1) / 2;
  }
}
50
+
51
+ // Clips coordinates to between 0 and clip_limit - 1
52
+ template <typename scalar_t>
53
+ static __forceinline__ __device__
54
+ scalar_t clip_coordinates(scalar_t in, int clip_limit) {
55
+ return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0)));
56
+ }
57
+
58
+ // clip_coordinates_set_grad works similarly to clip_coordinates except that
59
+ // it also returns the `d output / d input` via pointer argument `grad_in`.
60
+ // This is useful in the backward pass of grid_sampler.
61
// Clamps `in` into [0, clip_limit - 1] and writes d(output)/d(input) into
// *grad_in: 0 when clamped (including exactly at a border), 1 otherwise.
// Comparison order also routes NaN inputs through the final else (grad 1).
template <typename scalar_t>
static __forceinline__ __device__
scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_in) {
  // Note that it is important for the gradient calculation that borders
  // are considered out of bounds.
  if (in <= static_cast<scalar_t>(0)) {
    *grad_in = static_cast<scalar_t>(0);
    return static_cast<scalar_t>(0);
  } else {
    scalar_t max = static_cast<scalar_t>(clip_limit - 1);
    if (in >= max) {
      *grad_in = static_cast<scalar_t>(0);
      return max;
    } else {
      *grad_in = static_cast<scalar_t>(1);
      return in;
    }
  }
}
80
+
81
+ // Reflects coordinates until they fall between low and high (inclusive).
82
+ // The bounds are passed as twice their value so that half-integer values
83
+ // can be represented as ints.
84
// Folds `in` back into [twice_low/2, twice_high/2] by repeated reflection at
// the interval borders. Bounds arrive doubled so half-integer borders can be
// expressed as ints (e.g. -0.5 as -1).
template <typename scalar_t>
static __forceinline__ __device__
scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) {
  if (twice_low == twice_high) {
    // Degenerate interval: everything reflects onto the single point 0.
    return static_cast<scalar_t>(0);
  }
  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
  in = ::fabs(in - min);
  // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
  scalar_t extra = ::fmod(in, span);
  int flips = static_cast<int>(::floor(in / span));
  // An even number of reflections lands in the forward direction,
  // an odd number in the mirrored direction.
  if (flips % 2 == 0) {
    return extra + min;
  } else {
    return span - extra + min;
  }
}
102
+
103
+ // reflect_coordinates_set_grad works similarly to reflect_coordinates except
104
+ // that it also returns the `d output / d input` via pointer argument
105
+ // `grad_in`.
106
+ // This is useful in the backward pass of grid_sampler.
107
// Same reflection as reflect_coordinates, but also writes d(output)/d(input)
// into *grad_in: ±1 depending on whether the net reflection (sign flip from
// being below `min`, times parity of border flips) preserves direction.
template <typename scalar_t>
static __forceinline__ __device__
scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high,
                                      scalar_t *grad_in) {
  if (twice_low == twice_high) {
    // Degenerate interval: constant output, zero gradient.
    *grad_in = static_cast<scalar_t>(0);
    return static_cast<scalar_t>(0);
  }
  int grad_in_mult_;
  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
  in = in - min;
  if (in < static_cast<scalar_t>(0)) {
    // Mirror manually (instead of fabs) so the sign of the derivative
    // can be tracked in grad_in_mult_.
    grad_in_mult_ = -1;
    in = -in;
  } else {
    grad_in_mult_ = 1;
  }
  // `fmod` returns same sign as `in`, which is positive after the `if` above.
  scalar_t extra = ::fmod(in, span);
  int flips = static_cast<int>(::floor(in / span));
  if (flips % 2 == 0) {
    *grad_in = static_cast<scalar_t>(grad_in_mult_);
    return extra + min;
  } else {
    *grad_in = static_cast<scalar_t>(-grad_in_mult_);
    return span - extra + min;
  }
}
136
+
137
+ template<typename scalar_t>
138
+ static __forceinline__ __device__
139
+ scalar_t safe_downgrade_to_int_range(scalar_t x){
140
+ // -100.0 does not have special meaning. This is just to make sure
141
+ // it's not within_bounds_2d or within_bounds_3d, and does not cause
142
+ // undefined behavior. See #35506.
143
+ if (x > INT_MAX-1 || x < INT_MIN || !::isfinite(static_cast<double>(x)))
144
+ return static_cast<scalar_t>(-100.0);
145
+ return x;
146
+ }
147
+
148
+ template<typename scalar_t>
149
+ static __forceinline__ __device__
150
+ scalar_t compute_coordinates(scalar_t coord, int size,
151
+ GridSamplerPadding padding_mode,
152
+ bool align_corners) {
153
+ if (padding_mode == GridSamplerPadding::Border) {
154
+ // clip coordinates to image borders
155
+ coord = clip_coordinates(coord, size);
156
+ } else if (padding_mode == GridSamplerPadding::Reflection) {
157
+ // reflect coordinates by image borders
158
+ if (align_corners) {
159
+ coord = reflect_coordinates(coord, 0, 2*(size - 1));
160
+ } else {
161
+ coord = reflect_coordinates(coord, -1, 2*size - 1);
162
+ }
163
+ // clip coordinates to image borders
164
+ coord = clip_coordinates(coord, size);
165
+ }
166
+
167
+ coord = safe_downgrade_to_int_range(coord);
168
+ return coord;
169
+ }
170
+
171
+ // Computes the pixel source index value for a grid coordinate
172
+ template <typename scalar_t>
173
+ static __forceinline__ __device__
174
+ scalar_t grid_sampler_compute_source_index(
175
+ scalar_t coord,
176
+ int size,
177
+ GridSamplerPadding padding_mode,
178
+ bool align_corners) {
179
+ coord = grid_sampler_unnormalize(coord, size, align_corners);
180
+ coord = compute_coordinates(coord, size, padding_mode, align_corners);
181
+ return coord;
182
+ }
183
+
184
+ // grid_sampler_compute_source_index_set_grad works similarly to
185
+ // grid_sampler_compute_source_index except that it also returns the
186
+ // `d output / d input` via pointer argument `grad_in`.
187
+ // This is useful in the backward pass of grid_sampler.
188
+ template <typename scalar_t>
189
+ static __forceinline__ __device__
190
+ scalar_t grid_sampler_compute_source_index_set_grad(
191
+ scalar_t coord,
192
+ int size,
193
+ GridSamplerPadding padding_mode,
194
+ bool align_corners,
195
+ scalar_t *grad_in) {
196
+ scalar_t grad_clip, grad_refl;
197
+ coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in);
198
+ if (padding_mode == GridSamplerPadding::Border) {
199
+ // clip coordinates to image borders
200
+ coord = clip_coordinates_set_grad(coord, size, &grad_clip);
201
+ *grad_in = (*grad_in) * grad_clip;
202
+ } else if (padding_mode == GridSamplerPadding::Reflection) {
203
+ // reflect coordinates by image borders
204
+ if (align_corners) {
205
+ coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl);
206
+ } else {
207
+ coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl);
208
+ }
209
+ // clip coordinates to image borders
210
+ coord = clip_coordinates_set_grad(coord, size, &grad_clip);
211
+ *grad_in = (*grad_in) * grad_refl * grad_clip;
212
+ }
213
+
214
+ coord = safe_downgrade_to_int_range(coord);
215
+ return coord;
216
+ }
217
+
218
+ static __forceinline__ __device__
219
+ bool within_bounds_2d(int h, int w, int H, int W) {
220
+ return h >= 0 && h < H && w >= 0 && w < W;
221
+ }
222
+
223
+ static __forceinline__ __device__
224
+ bool within_bounds_3d(int d, int h, int w, int D, int H, int W) {
225
+ return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
226
+ }
227
+
228
+ template<typename scalar_t>
229
+ static __forceinline__ __device__
230
+ scalar_t get_value_bounded(
231
+ scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH,
232
+ GridSamplerPadding padding_mode,
233
+ bool align_corners) {
234
+
235
+ x = compute_coordinates(x, W, padding_mode, align_corners);
236
+ y = compute_coordinates(y, H, padding_mode, align_corners);
237
+
238
+ int ix = static_cast<int>(x);
239
+ int iy = static_cast<int>(y);
240
+
241
+ if (within_bounds_2d(iy, ix, H, W)) {
242
+ return data[iy * sH + ix * sW];
243
+ }
244
+ return static_cast<scalar_t>(0);
245
+ }
246
+
247
+ template<typename scalar_t, typename index_t>
248
+ static __forceinline__ __device__
249
+ void safe_add_2d(scalar_t *data, int h, int w,
250
+ int sH, int sW, int H, int W,
251
+ scalar_t delta,
252
+ const index_t NC_offset,
253
+ const index_t memory_span) {
254
+ if (within_bounds_2d(h, w, H, W)) {
255
+ fastAtomicAdd(data,
256
+ NC_offset + h * sH + w * sW,
257
+ memory_span,
258
+ delta,
259
+ true);
260
+ }
261
+ }
262
+
263
+ template<typename scalar_t, typename index_t>
264
+ static __forceinline__ __device__
265
+ void safe_add_3d(scalar_t *data, int d, int h, int w,
266
+ int sD, int sH, int sW, int D, int H, int W,
267
+ scalar_t delta,
268
+ const index_t NC_offset,
269
+ const index_t memory_span) {
270
+ if (within_bounds_3d(d, h, w, D, H, W)) {
271
+ fastAtomicAdd(data,
272
+ NC_offset + d * sD + h * sH + w * sW,
273
+ memory_span,
274
+ delta,
275
+ true);
276
+ }
277
+ }
278
+
279
+ template<typename scalar_t, typename index_t>
280
+ static __forceinline__ __device__
281
+ void add_value_bounded(
282
+ scalar_t* data, scalar_t x, scalar_t y, int W, int H, int sW, int sH,
283
+ scalar_t delta,
284
+ GridSamplerPadding padding_mode,
285
+ bool align_corners,
286
+ const index_t NC_offset,
287
+ const index_t memory_span) {
288
+
289
+ x = compute_coordinates(x, W, padding_mode, align_corners);
290
+ y = compute_coordinates(y, H, padding_mode, align_corners);
291
+
292
+ int ix = static_cast<int>(x);
293
+ int iy = static_cast<int>(y);
294
+
295
+ safe_add_2d(data, iy, ix, sH, sW, H, W, delta, NC_offset, memory_span);
296
+ }
297
+
298
+ // Calculate the differential of the cubic convolution, i.e. `d coeff / d x`
299
+ template<typename scalar_t>
300
+ static __forceinline__ __device__
301
+ void get_cubic_coefficients_grad(
302
+ scalar_t coeffs[4],
303
+ scalar_t t) {
304
+
305
+ // Must be the same as forward calculation in
306
+ // aten/src/ATen/native/cuda/UpSample.cuh:get_cubic_upsample_coefficients
307
+ scalar_t A = -0.75;
308
+
309
+ scalar_t x;
310
+ x = -1 - t; // 1 < x = |-1 - tx| < 2
311
+ coeffs[0] = (-3 * A * x - 10 * A ) * x - 8 * A;
312
+ x = -t; // x = |0 - tx| <= 1
313
+ coeffs[1] = (-3 * (A + 2) * x - 2 * (A + 3)) * x;
314
+ x = 1 - t; // x = |1 - tx| <= 1
315
+ coeffs[2] = (3 * (A + 2) * x - 2 * (A + 3)) * x;
316
+ x = 2 - t; // 1 < x = |2 - tx| < 2
317
+ coeffs[3] = (3 * A * x - 10 * A) * x + 8 * A;
318
+ }
319
+
320
+
321
+ }} // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/jit_macros.h>
4
+
5
+ #if AT_USE_JITERATOR()
6
+
7
+ #include <ATen/cuda/CUDAConfig.h>
8
+
9
+ #include <ATen/OpMathType.h>
10
+ #include <ATen/TensorIterator.h>
11
+ #include <ATen/native/TensorIteratorDynamicCasting.h>
12
+
13
+ #include <ATen/native/cuda/MemoryAccess.cuh>
14
+
15
+ #include <ATen/native/cuda/CUDAJitLoops.cuh>
16
+
17
+ namespace at {
18
+ namespace native {
19
+
20
+ /* Note [Jiterator]
21
+ The "jiterator" simply just-in-time compiles the same kernels that
22
+ Loops.cuh (and CUDALoops.cuh) usually build. This reduces build time,
23
+ build size, and initial CUDA context size.
24
+
25
+ By default on non-Windows systems, it also caches compiled kernels in ~/.cache/torch/kernels.
26
+ This behavior is controlled with two environment variables:
27
+ - USE_PYTORCH_KERNEL_CACHE, if set to zero then this will disable all cache use
28
+ - PYTORCH_KERNEL_CACHE_PATH, if set specifies the folder to use for cached kernels
29
+
30
+ The jiterator currently has some limitations, however. It cannot:
31
+ - handle math on complex datatypes
32
+ - handle kernels with scalar parameters
33
+
34
+ These improvements will likely come soon.
35
+
36
+ For examples of how to use the jiterator see the i1 and gcd kernel
37
+ implementations, which pass jittable strings implementing their
38
+ operations instead of the typical CUDA functors.
39
+
40
+ To pass a runtime argument (similar to lambda captures in non-JIT kernels),
41
+ we need to pass the additional arguments to `jitted_gpu_kernel` by value.
42
+ Currently only primitive C++ types used for computation are valid.
43
+ The order of these extra arguments should be same as the order they appear
44
+ in kernel's function signature. (look at polygamma for example)
45
+
46
+ NOTE: One big restriction being that these arguments should be after the
47
+ arguments provided by TensorIterator. Eg. While capturing `n`, where
48
+ `scalar_t x` and `scalar_t y` are provided by TensorIterator,
49
+ * foo(scalar_t x, scalar_t y, int n) works!
50
+ * foo(int n, scalar_t x, scalar_t y) doesn't work
52
+ * foo(scalar_t x, int n, scalar_t y) doesn't work
52
+
53
+ */
54
+
55
+ // Entrypoint for jitted GPU kernels.
56
+ // Only handles elementwise unary and binary kernels with a
57
+ // common dtype and a single output.
58
+ // NOTE: this assumes the op's iterator has a common_dtype.
59
+ // NOTE: We use std::tuple instead of parameter pack
60
+ // for `extra_args` due to following
61
+ // bug on older versions of clang
62
+ // https://bugs.llvm.org/show_bug.cgi?id=23029
63
+ template <
64
+ char const* name,
65
+ typename return_type,
66
+ typename f_inputs_type,
67
+ int arity,
68
+ typename... Args>
69
+ void jitted_gpu_kernel(
70
+ TensorIteratorBase& iter,
71
+ const std::string& f,
72
+ at::cuda::jit::BinaryFuncVariant scalar_pos =
73
+ at::cuda::jit::BinaryFuncVariant::NoScalar,
74
+ at::opmath_type<f_inputs_type> scalar_val = 0,
75
+ std::tuple<Args...> extra_args = std::make_tuple()) {
76
+ // TODO: much of preamble is common to both jitted_gpu_kernel and gpu_kernel
77
+ // Maybe it could be refactored?
78
+ for (int arg = 0; arg < iter.ntensors(); arg++) {
79
+ TORCH_INTERNAL_ASSERT(
80
+ iter.device(arg).is_cuda(),
81
+ "argument ", arg, ": expected a CUDA device but found ", iter.device(arg));
82
+ }
83
+
84
+ if (iter.numel() == 0) {
85
+ return;
86
+ }
87
+
88
+ if (!iter.can_use_32bit_indexing()) {
89
+ for (auto& sub_iter : iter.with_32bit_indexing()) {
90
+ jitted_gpu_kernel<name, return_type, f_inputs_type, arity>(
91
+ sub_iter, f, scalar_pos, scalar_val, extra_args);
92
+ }
93
+
94
+ return;
95
+ }
96
+
97
+ // Computes if dynamic casting is needed
98
+ // Dynamic casting is needed if an input's dtype differs from the common dtype
99
+ // or if the result dtype differs from the output's dtype
100
+ // Note: this is intentionally divergent from calling needs_dynamic_casting,
101
+ // which is more general and inspects a lambda to determine if dynamic
102
+ // casting is needed.
103
+ bool needs_dynamic_casting = false;
104
+
105
+ // Checks output
106
+ const ScalarType return_scalar_type = c10::CppTypeToScalarType<return_type>::value;
107
+ const auto dtype0 = iter.dtype(0);
108
+ if (dtype0 != return_scalar_type) {
109
+ needs_dynamic_casting = true;
110
+ }
111
+
112
+ // Checks input(s)
113
+ const ScalarType inputs_scalar_type = c10::CppTypeToScalarType<f_inputs_type>::value;
114
+ for (auto i = decltype(arity){1}; i < (arity + 1); ++i) {
115
+ const auto dtypei = iter.dtype(i);
116
+ if (dtypei != inputs_scalar_type) {
117
+ needs_dynamic_casting = true;
118
+ break;
119
+ }
120
+ }
121
+ if (scalar_pos == at::cuda::jit::BinaryFuncVariant::NoScalar) {
122
+ // NOTE: With `scalar_pos=NoScalar`,`scalar_val` is not used
123
+ // for computation in the generated code and hence we pass a dummy
124
+ // value of `0`.
125
+ jitted_gpu_kernel_impl<
126
+ /*name*/ name,
127
+ /*return_type=*/return_type,
128
+ /*f_inputs_type=*/f_inputs_type,
129
+ arity,
130
+ at::cuda::jit::BinaryFuncVariant::NoScalar>(
131
+ iter, f, needs_dynamic_casting, /*scalar_val=*/scalar_val, extra_args);
132
+ } else if (scalar_pos == at::cuda::jit::BinaryFuncVariant::RhsScalar) {
133
+ jitted_gpu_kernel_impl<
134
+ /*name*/ name,
135
+ /*return_type=*/return_type,
136
+ /*f_inputs_type=*/f_inputs_type,
137
+ arity,
138
+ at::cuda::jit::BinaryFuncVariant::RhsScalar>(
139
+ iter,
140
+ f,
141
+ needs_dynamic_casting,
142
+ scalar_val,
143
+ extra_args);
144
+
145
+ } else {
146
+ jitted_gpu_kernel_impl<
147
+ /*name*/ name,
148
+ /*return_type=*/return_type,
149
+ /*f_inputs_type=*/f_inputs_type,
150
+ arity,
151
+ at::cuda::jit::BinaryFuncVariant::LhsScalar>(
152
+ iter,
153
+ f,
154
+ needs_dynamic_casting,
155
+ scalar_val,
156
+ extra_args);
157
+ }
158
+ }
159
+
160
+ // TODO: support runtime state capture similar to `jitted_gpu_kernel`.
161
+ template <char const *name, typename return_type, typename f_inputs_type>
162
+ void opmath_jitted_gpu_kernel_with_scalars(TensorIteratorBase& iter, const std::string& f) {
163
+ TORCH_INTERNAL_ASSERT(iter.ntensors() == 3);
164
+ //currently jiterator only handles binary functions where both inputs are of the same type (f_inputs_type)
165
+ using opmath_t = at::opmath_type<f_inputs_type>;
166
+ if (iter.is_cpu_scalar(1)) {
167
+ auto scalar_val = iter.scalar_value<opmath_t>(1);
168
+ iter.remove_operand(1);
169
+ // TODO: When all kernels that use gpu_kernel_with_scalars are
170
+ // ported to structured, this device guard can be deleted. This
171
+ // works around incorrect device guard generation for pre-structured
172
+ // kernels device guards, but structured kernels do it right and
173
+ // we can assume the device is already set correctly
174
+ const OptionalDeviceGuard device_guard(iter.device(1));
175
+ jitted_gpu_kernel<name, return_type, f_inputs_type, 1>(iter, f, at::cuda::jit::BinaryFuncVariant::LhsScalar, scalar_val);
176
+ } else if (iter.is_cpu_scalar(2)) {
177
+ auto scalar_val = iter.scalar_value<opmath_t>(2);
178
+ iter.remove_operand(2);
179
+ jitted_gpu_kernel<name, return_type, f_inputs_type, 1>(iter, f, at::cuda::jit::BinaryFuncVariant::RhsScalar, scalar_val);
180
+ } else {
181
+ jitted_gpu_kernel<name, return_type, f_inputs_type, 2>(iter, f);
182
+ }
183
+ }
184
+
185
+ }} // at::native
186
+
187
+ #endif // AT_USE_JITERATOR()