koichi12 commited on
Commit
445c885
·
verified ·
1 Parent(s): a8eed2c

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe +3 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-311.pyc +0 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codecache.py +0 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py +1451 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/cudagraph_utils.py +105 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/dependencies.py +506 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/exc.py +98 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_utils.py +220 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/index_propagation.py +277 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/metrics.py +419 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/triton_helpers.py +344 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h +401 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h +61 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h +100 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h +393 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h +321 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h +139 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Fill.h +21 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LossMulti.h +72 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Normalization.h +11 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pow.h +69 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceOps.h +56 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h +55 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorCompare.h +49 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIterator.h +2 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h +57 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IsContiguous.h +62 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h +28 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh +296 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h +32 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh +384 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh +379 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Resize.h +61 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h +17 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh +40 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh +38 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh +40 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh +38 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh +680 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/thread_constants.h +22 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/OperationUtils.h +394 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/TensorFactory.h +12 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h +103 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h +130 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h +62 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h +14 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h +147 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h +29 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h +527 -0
.gitattributes CHANGED
@@ -77,3 +77,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/_
77
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.11 filter=lfs diff=lfs merge=lfs -text
78
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
79
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn.so.8 filter=lfs diff=lfs merge=lfs -text
 
 
77
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.11 filter=lfs diff=lfs merge=lfs -text
78
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
79
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn.so.8 filter=lfs diff=lfs merge=lfs -text
80
+ tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebc4c06b7d95e74e315419ee7e88e1d0f71e9e9477538c00a93a9ff8c66a6cfc
3
+ size 182784
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-311.pyc ADDED
Binary file (13.1 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codecache.py ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py ADDED
@@ -0,0 +1,1451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ import functools
3
+ import logging
4
+ import os
5
+ import sys
6
+ import time
7
+ import warnings
8
+ from itertools import count
9
+
10
+ from typing import (
11
+ Any,
12
+ Callable,
13
+ Dict,
14
+ FrozenSet,
15
+ List,
16
+ Optional,
17
+ Sequence,
18
+ Tuple,
19
+ Union,
20
+ )
21
+ from unittest import mock
22
+
23
+ from functorch.compile import min_cut_rematerialization_partition
24
+
25
+ import torch.fx
26
+ import torch.utils._pytree as pytree
27
+ from torch._dynamo import (
28
+ compiled_autograd,
29
+ config as dynamo_config,
30
+ logging as dynamo_logging,
31
+ utils as dynamo_utils,
32
+ )
33
+ from torch._dynamo.utils import (
34
+ counters,
35
+ detect_fake_mode,
36
+ lazy_format_graph_code,
37
+ optimus_scuba_log,
38
+ )
39
+ from torch._functorch.aot_autograd import aot_export_module, make_boxed_func
40
+ from torch._inductor.codecache import code_hash, CompiledFxGraph, FxGraphCache
41
+ from torch._inductor.cudagraph_utils import BoxedDeviceIndex
42
+
43
+ from torch._inductor.debug import save_args_for_compile_fx_inner
44
+ from torch._inductor.utils import BoxedBool, count_tangents
45
+ from torch._logging import trace_structured
46
+ from torch._ops import OpOverload
47
+ from torch._subclasses.fake_tensor import FakeTensor
48
+ from torch._utils_internal import signpost_event
49
+ from torch.fx.passes.fake_tensor_prop import FakeTensorProp
50
+
51
+ from .._dynamo.backends.common import aot_autograd
52
+ from ..fx._lazy_graph_module import _use_lazy_graph_module # type: ignore[attr-defined]
53
+ from ..fx.graph import _PyTreeCodeGen
54
+ from . import config, metrics
55
+ from .debug import DebugContext
56
+ from .decomposition import select_decomp_table
57
+ from .fx_passes.joint_graph import joint_graph_passes
58
+ from .fx_passes.post_grad import post_grad_passes, view_to_reshape
59
+ from .fx_passes.pre_grad import pre_grad_passes
60
+ from .graph import GraphLowering
61
+ from .ir import ExternKernelNode
62
+ from .utils import get_dtype_size, has_incompatible_cudagraph_ops, output_node
63
+ from .virtualized import V
64
+
65
+ if config.is_fbcode():
66
+ from torch._inductor.fb.utils import time_and_log
67
+ else:
68
+ # no-op decorator
69
+ def time_and_log(attr: str, extra_loggings: Optional[Dict[str, str]] = None):
70
+ return dynamo_utils.identity
71
+
72
+
73
+ log = logging.getLogger(__name__)
74
+ perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints")
75
+ post_grad_graphs_log = torch._logging.getArtifactLogger(__name__, "post_grad_graphs")
76
+ ALIGNMENT = 16
77
+
78
+
79
+ # copy_ fails when trying to write to tensors with memory overlap,
80
+ # for expanded dimensions (a dimension which used to have size 1 -> ?)
81
+ # we can select one element from that dimension and write to it
82
+ # to achieve writing to all values of that dimension of the input tensor
83
+ def get_expanded_dims(t):
84
+ if not isinstance(t, torch.Tensor):
85
+ return None
86
+ return [i for i in range(t.ndim) if t.stride(i) == 0 and t.size(i) != 1]
87
+
88
+
89
+ def index_expanded_dims(t: torch.Tensor, expanded_dims: List[int]) -> torch.Tensor:
90
+ for expanded_dim in expanded_dims:
91
+ t = torch.ops.aten.slice(t, expanded_dim, 0, 1)
92
+ return t
93
+
94
+
95
+ def complex_memory_overlap(t: torch.Tensor) -> bool:
96
+ # if torch._debug_has_internal_overlap thinks this tensor potentially has
97
+ # memory overlap internally, let's dig deeper to find out whether it's true.
98
+ t = index_expanded_dims(t, get_expanded_dims(t))
99
+ if torch._debug_has_internal_overlap(t) != 0:
100
+ strides = t.stride()
101
+ sizes = t.shape
102
+ indices = list(range(len(strides)))
103
+ indices = [x for _, x in sorted(zip(strides, indices))]
104
+ for i in range(len(strides)):
105
+ prev_stride = 1 if i == 0 else strides[indices[i - 1]]
106
+ prev_size = 1 if i == 0 else sizes[indices[i - 1]]
107
+ if strides[indices[i]] < prev_stride * prev_size:
108
+ return True
109
+ return False
110
+
111
+
112
+ @functools.lru_cache(None)
113
+ def _step_logger():
114
+ return dynamo_logging.get_step_logger(log)
115
+
116
+
117
+ @functools.lru_cache(None)
118
+ def _warn_tf32_disabled():
119
+ if (
120
+ torch.cuda.is_available()
121
+ and not torch.backends.cuda.matmul.allow_tf32
122
+ and torch.cuda.get_device_capability() >= (8, 0)
123
+ ):
124
+ warnings.warn(
125
+ "TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. "
126
+ "Consider setting `torch.set_float32_matmul_precision('high')` for better performance."
127
+ )
128
+
129
+
130
+ def _unlift_graph(mod, gm, graph_signature):
131
+ from torch.export.unflatten import _assign_attr, _AttrKind
132
+
133
+ state_dict = {}
134
+ for name, param in mod.named_parameters(remove_duplicate=False):
135
+ state_dict[name] = param
136
+ _assign_attr(
137
+ param,
138
+ gm,
139
+ name,
140
+ attr_kind=_AttrKind.PARAMETER,
141
+ )
142
+ for name, buffer in mod.named_buffers(remove_duplicate=False):
143
+ state_dict[name] = buffer
144
+ _assign_attr(
145
+ buffer,
146
+ gm,
147
+ name,
148
+ attr_kind=_AttrKind.BUFFER,
149
+ )
150
+
151
+ placeholder_nodes = [node for node in gm.graph.nodes if node.op == "placeholder"]
152
+ lifted_inputs = []
153
+ for node in placeholder_nodes:
154
+ node_name = node.name
155
+ if node_name in graph_signature.inputs_to_parameters:
156
+ lifted_inputs.append(graph_signature.inputs_to_parameters[node_name])
157
+ elif node_name in graph_signature.inputs_to_buffers:
158
+ lifted_inputs.append(graph_signature.inputs_to_buffers[node_name])
159
+ else:
160
+ assert node_name in graph_signature.user_inputs
161
+ lifted_inputs.append(None)
162
+
163
+ from torch.export._unlift import _unlift
164
+
165
+ outputs = list(gm.graph.nodes)[-1].args[0]
166
+ mutated_outputs = []
167
+ for out in outputs:
168
+ if out in graph_signature.buffers_to_mutate:
169
+ mutated_outputs.append(graph_signature.buffers_to_mutate[out.name])
170
+ else:
171
+ mutated_outputs.append(None)
172
+
173
+ unlifted_gm = _unlift(
174
+ gm,
175
+ lifted_inputs,
176
+ mutated_outputs,
177
+ pytree.LeafSpec(),
178
+ None,
179
+ state_dict,
180
+ {},
181
+ )
182
+ return unlifted_gm
183
+
184
+
185
+ def _get_subgraph_names(gm):
186
+ for node in gm.graph.nodes:
187
+ if node.target == torch.ops.higher_order.cond:
188
+ true_subgraph_name = node.args[1].name
189
+ false_subgraph_name = node.args[2].name
190
+ yield true_subgraph_name
191
+ yield false_subgraph_name
192
+
193
+
194
+ def _recursive_pre_grad_passes(gm, example_inputs):
195
+ for subgraph_name in _get_subgraph_names(gm):
196
+ subgraph = getattr(gm, subgraph_name)
197
+ # as we don't have recursive example inputs, passing None here
198
+ new_subgraph = _recursive_pre_grad_passes(subgraph, example_inputs=None)
199
+ setattr(gm, subgraph_name, new_subgraph)
200
+ return pre_grad_passes(gm, example_inputs)
201
+
202
+
203
+ def _recursive_joint_graph_passes(gm):
204
+ for subgraph_name in _get_subgraph_names(gm):
205
+ subgraph = getattr(gm, subgraph_name)
206
+ _recursive_joint_graph_passes(subgraph)
207
+ joint_graph_passes(gm)
208
+
209
+
210
+ def _recursive_post_grad_passes(gm, is_inference: bool = False):
211
+ for subgraph_name in _get_subgraph_names(gm):
212
+ subgraph = getattr(gm, subgraph_name)
213
+ _recursive_post_grad_passes(subgraph, is_inference)
214
+ post_grad_passes(gm, is_inference)
215
+
216
+
217
+ def split_const_gm(
218
+ gm: torch.fx.GraphModule,
219
+ ) -> Tuple[torch.fx.GraphModule, Dict[str, int]]:
220
+ """
221
+ This function takes an GraphModule input "gm".
222
+ The gm will be split into 2 components,
223
+ 1) const_gm, which consists the subgraph of gm that can be constant folded.
224
+ 2) gm (being inplace modified,) which returns the graph after constant folding.
225
+
226
+ const_output_index is a mapping of corresponding node name from gm to the
227
+ output index of const_gm.
228
+ Returns (const_gm, const_output_index)
229
+ """
230
+ from torch._inductor.constant_folding import (
231
+ CONST_MODULE_TAG,
232
+ META_TAG,
233
+ MODULE_TAG,
234
+ replace_node_with_constant,
235
+ run_and_get_constant_graph,
236
+ )
237
+
238
+ const_gm = run_and_get_constant_graph(gm)
239
+ const_result = const_gm()
240
+
241
+ const_outputs = {
242
+ x.name: idx for idx, x in enumerate(tuple(const_gm.graph.nodes)[-1].args[0])
243
+ }
244
+
245
+ to_erase_node = []
246
+ to_replace_node = []
247
+ const_output_index = {}
248
+ for node in gm.graph.nodes:
249
+ if node.name in const_outputs:
250
+ to_replace_node.append(node)
251
+ elif node.meta[META_TAG] == CONST_MODULE_TAG:
252
+ to_erase_node.append(node)
253
+
254
+ for node in to_replace_node:
255
+ new_const_name = "_FOLDED_CONST_" + node.name
256
+ replace_node_with_constant(
257
+ gm,
258
+ node,
259
+ const_result[const_outputs[node.name]],
260
+ new_const_name,
261
+ )
262
+ const_output_index[new_const_name] = const_outputs[node.name]
263
+ for node in to_erase_node[::-1]:
264
+ if node.users:
265
+ for n in node.users:
266
+ assert n.meta[META_TAG] == MODULE_TAG, f"node: {node} user not empty."
267
+ else:
268
+ gm.graph.erase_node(node)
269
+ gm.recompile()
270
+
271
+ return const_gm, const_output_index
272
+
273
+
274
+ def is_tf32_warning_applicable(gm: torch.fx.GraphModule):
275
+ aten = torch.ops.aten
276
+ tf32_ops = {
277
+ aten.mm.default,
278
+ aten.addmm.default,
279
+ aten.bmm.default,
280
+ aten.baddbmm.default,
281
+ }
282
+ for node in gm.graph.nodes:
283
+ if (
284
+ node.op == "call_function"
285
+ and node.target in tf32_ops
286
+ and isinstance(node.meta.get("val", None), torch.Tensor)
287
+ and node.meta["val"].dtype == torch.float32
288
+ and node.meta["val"].device.type == "cuda"
289
+ ):
290
+ return True
291
+ return False
292
+
293
+
294
+ @DebugContext.wrap
295
+ def count_bytes_inner(
296
+ gm: torch.fx.GraphModule,
297
+ example_inputs: List[torch.Tensor],
298
+ num_fixed: int = 0,
299
+ **kwargs,
300
+ ):
301
+ shape_env = _shape_env_from_inputs(example_inputs)
302
+ fake_mode = fake_tensor_prop(gm, example_inputs)
303
+
304
+ with V.set_fake_mode(fake_mode):
305
+ _recursive_post_grad_passes(gm, False)
306
+
307
+ graph = GraphLowering(gm, shape_env=shape_env, num_static_inputs=num_fixed)
308
+ with V.set_graph_handler(graph), V.set_real_inputs(example_inputs):
309
+ graph.run(*example_inputs)
310
+ num_bytes, nodes_num_elem, node_runtimes = graph.count_bytes()
311
+ metrics.num_bytes_accessed += num_bytes
312
+ metrics.nodes_num_elem += nodes_num_elem
313
+ metrics.node_runtimes += node_runtimes
314
+ return make_boxed_func(gm.forward)
315
+
316
+
317
+ def fake_tensor_prop(
318
+ gm: torch.fx.GraphModule,
319
+ example_inputs: List[torch.Tensor],
320
+ force_allow_non_fake_inputs: bool = False,
321
+ ):
322
+ """
323
+ If we can not detect fake mode from the context of inputs, create one.
324
+
325
+ The created fake mode will be returned.
326
+ """
327
+ fake_mode = detect_fake_mode(example_inputs)
328
+ if not fake_mode:
329
+ fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
330
+ FakeTensorProp(gm, mode=fake_mode).propagate(*example_inputs)
331
+ else:
332
+ ctx = (
333
+ contextlib.nullcontext()
334
+ if not force_allow_non_fake_inputs
335
+ else mock.patch.object(fake_mode, "allow_non_fake_inputs", True)
336
+ )
337
+ with ctx: # type: ignore[attr-defined]
338
+ FakeTensorProp(gm, mode=fake_mode).propagate_dont_convert_inputs(
339
+ *example_inputs
340
+ )
341
+
342
+ return fake_mode
343
+
344
+
345
+ # pass config dict back to user
346
+ def get_patched_config_dict(config_patches=None) -> Dict[str, Any]:
347
+ with config.patch(config_patches):
348
+ return config.get_config_copy()
349
+
350
+
351
+ @DebugContext.wrap
352
+ @torch.utils._python_dispatch._disable_current_modes()
353
+ @time_and_log(
354
+ attr="compilation time (in seconds)",
355
+ extra_loggings={"config_dict": str(get_patched_config_dict())},
356
+ )
357
+ # Need this decorator for compile_fx_inner even if we already have one for
358
+ # compile_fx. The reason is the compilation for backward graph may happen after
359
+ # compile_fx return and we may want to use the _LazyGraphModule for compiling
360
+ # the backward graph as well.
361
+ @_use_lazy_graph_module(dynamo_config.use_lazy_graph_module)
362
+ @dynamo_utils.dynamo_timed(phase_name="inductor_compile")
363
+ def compile_fx_inner(
364
+ gm: torch.fx.GraphModule,
365
+ example_inputs: List[torch.Tensor],
366
+ cudagraphs: Optional[BoxedBool] = None,
367
+ num_fixed: int = 0,
368
+ is_backward: bool = False,
369
+ graph_id: Optional[int] = None,
370
+ cpp_wrapper: bool = False,
371
+ aot_mode: bool = False,
372
+ is_inference: bool = False,
373
+ boxed_forward_device_index: Optional[BoxedDeviceIndex] = None,
374
+ user_visible_outputs: FrozenSet[str] = frozenset(),
375
+ layout_opt: Optional[bool] = None,
376
+ extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None,
377
+ ) -> Union[CompiledFxGraph, str]:
378
+ """
379
+ Inductor API that compiles a single graph.
380
+
381
+ If you change the argument list for this function, make sure you
382
+ also update the call to save_args_for_compile_fx_inner below accordingly.
383
+ """
384
+ if dynamo_utils.count_calls(gm.graph) == 0 and not aot_mode:
385
+ # trigger the real recompilation for _LazyGraphModule before returning
386
+ # the forward method.
387
+ from torch.fx._lazy_graph_module import _LazyGraphModule
388
+
389
+ _LazyGraphModule.force_recompile(gm)
390
+ return make_boxed_func(gm.forward)
391
+
392
+ assert isinstance(
393
+ next(iter(reversed(gm.graph.nodes))).args[0], (tuple, list)
394
+ ), f"inductor can only compile FX graphs which return a tuple/list, but got {gm.graph}"
395
+
396
+ if config.save_args:
397
+ save_args_for_compile_fx_inner(
398
+ gm,
399
+ example_inputs,
400
+ cudagraphs=cudagraphs,
401
+ num_fixed=num_fixed,
402
+ is_backward=is_backward,
403
+ graph_id=graph_id,
404
+ cpp_wrapper=cpp_wrapper,
405
+ aot_mode=aot_mode,
406
+ is_inference=is_inference,
407
+ boxed_forward_device_index=boxed_forward_device_index,
408
+ user_visible_outputs=user_visible_outputs,
409
+ layout_opt=layout_opt,
410
+ )
411
+
412
+ if cudagraphs is None:
413
+ cudagraphs = BoxedBool(config.triton.cudagraphs)
414
+
415
+ # Inputs to fx_codegen_and_compile
416
+ # Anything that affects codegen should go here, so if the signature
417
+ # of fx_codegen_and_compile changes, the dict should be updated accordingly
418
+ graph_kwargs = {
419
+ "cudagraphs": cudagraphs,
420
+ "num_fixed": num_fixed,
421
+ "is_backward": is_backward,
422
+ "graph_id": graph_id,
423
+ "cpp_wrapper": cpp_wrapper,
424
+ "aot_mode": aot_mode,
425
+ "is_inference": is_inference,
426
+ "user_visible_outputs": user_visible_outputs,
427
+ "layout_opt": layout_opt,
428
+ "extern_node_serializer": extern_node_serializer,
429
+ }
430
+
431
+ start = time.time()
432
+
433
+ if config.fx_graph_cache and not aot_mode:
434
+ compiled_graph = FxGraphCache.load(
435
+ fx_codegen_and_compile, gm, example_inputs, graph_kwargs
436
+ )
437
+ else:
438
+ compiled_graph = fx_codegen_and_compile(
439
+ gm, example_inputs, **graph_kwargs # type: ignore[arg-type]
440
+ )
441
+
442
+ log.debug("FX codegen and compilation took %.3fs", time.time() - start)
443
+
444
+ # check cudagraph disabling reasons from inductor lowering
445
+ if cudagraphs and compiled_graph.disabled_cudagraphs_reason:
446
+ perf_hint_log.warning(
447
+ "skipping cudagraphs due to %s", compiled_graph.disabled_cudagraphs_reason
448
+ )
449
+ BoxedBool.disable(cudagraphs)
450
+
451
+ # Return the output strides to the caller via TracingContext
452
+ context = torch._guards.TracingContext.try_get()
453
+ if context is not None and context.output_strides is not None:
454
+ assert len(context.output_strides) == 0
455
+ context.output_strides.extend(compiled_graph.output_strides)
456
+
457
+ if aot_mode:
458
+ return compiled_graph
459
+
460
+ if cudagraphs:
461
+ # output args are tuple of first argument
462
+ output = output_node(gm)
463
+ assert len(output.args) == 1
464
+ stack_traces = [
465
+ (arg.stack_trace if isinstance(arg, torch.fx.node.Node) else None)
466
+ for arg in output.args[0]
467
+ ]
468
+
469
+ complex_memory_overlap_inputs = any(
470
+ complex_memory_overlap(t)
471
+ for t in example_inputs
472
+ if isinstance(t, torch.Tensor)
473
+ )
474
+
475
+ from torch._inductor.cudagraph_utils import check_for_mutation
476
+
477
+ has_mutation_str = check_for_mutation(gm, compiled_graph, num_fixed)
478
+ has_mutation = has_mutation_str is not None
479
+
480
+ if has_mutation:
481
+ compiled_graph.disabled_cudagraphs_reason = has_mutation_str
482
+
483
+ cudagraph_tests = [
484
+ (not has_mutation, "mutated inputs"),
485
+ (not has_incompatible_cudagraph_ops(gm), "incompatible ops"),
486
+ (not complex_memory_overlap_inputs, "complex memory overlap"),
487
+ (
488
+ all(
489
+ isinstance(t, (torch.Tensor, torch.SymInt)) for t in example_inputs
490
+ ),
491
+ "non-Tensor inputs",
492
+ ),
493
+ ]
494
+ cudagraph_fail_reasons = [s for b, s in cudagraph_tests if not b]
495
+
496
+ if not cudagraph_fail_reasons:
497
+ if not config.triton.cudagraph_trees:
498
+ # Force specialize all inputs so that CUDA graphs will work
499
+ for t in example_inputs:
500
+ if isinstance(t, torch.SymInt):
501
+ int(t) # guard
502
+
503
+ if (
504
+ boxed_forward_device_index is not None
505
+ and not is_inference
506
+ and not is_backward
507
+ ):
508
+ boxed_forward_device_index.set(next(iter(compiled_graph.device_idxs)))
509
+
510
+ compiled_graph.current_callable = cudagraphify(
511
+ compiled_graph.get_current_callable(),
512
+ example_inputs,
513
+ static_input_idxs=range(num_fixed),
514
+ device_index=next(iter(compiled_graph.device_idxs)),
515
+ stack_traces=stack_traces,
516
+ is_backward=is_backward,
517
+ is_inference=is_inference,
518
+ constants=tuple(compiled_graph.constants.values()),
519
+ )
520
+ else:
521
+ BoxedBool.disable(cudagraphs)
522
+
523
+ # See [Backward Generation Handling]
524
+ # if cudagraph'd the forward and set the device, we need to let the cudagraph manager
525
+ # know we are we running the backward even if we will not run it in cudagraphs
526
+ if is_backward and config.triton.cudagraph_trees:
527
+ assert boxed_forward_device_index is not None
528
+ assert boxed_forward_device_index.value is not None
529
+ compiled_graph_callable = compiled_graph.get_current_callable()
530
+
531
+ manager = torch._inductor.cudagraph_trees.get_manager(
532
+ boxed_forward_device_index.value, create_if_none_exists=False
533
+ )
534
+ # should already exist from forward
535
+ assert manager is not None
536
+
537
+ def compiled_artifact(new_inputs):
538
+ manager.set_to_running_backward()
539
+ return compiled_graph_callable(new_inputs)
540
+
541
+ compiled_graph.current_callable = compiled_artifact
542
+
543
+ if "cuda" in compiled_graph.device_types:
544
+ # prefer better disable_cudagraphs_reason bc stack trace
545
+ # TODO: migrate all disable reasons to stack trace, refactor
546
+ if compiled_graph.disabled_cudagraphs_reason:
547
+ perf_hint_log.warning(compiled_graph.disabled_cudagraphs_reason)
548
+ else:
549
+ perf_hint_log.warning(
550
+ "skipping cudagraphs due to %s", cudagraph_fail_reasons
551
+ )
552
+
553
+ # cudagraphs does its own aligning of inputs
554
+ if not cudagraphs:
555
+ new_callable = align_inputs(
556
+ compiled_graph.get_current_callable(), example_inputs, range(num_fixed)
557
+ )
558
+ if new_callable is not compiled_graph.get_current_callable():
559
+ compiled_graph.current_callable = new_callable
560
+
561
+ _step_logger()(
562
+ logging.INFO,
563
+ "torchinductor done compiling "
564
+ f"{'BACKWARDS' if is_backward else 'FORWARDS'} "
565
+ f"graph {graph_id}",
566
+ )
567
+
568
+ # aot autograd needs to know to pass in inputs as a list
569
+ compiled_graph._boxed_call = True
570
+ return compiled_graph
571
+
572
+
573
+ def fx_codegen_and_compile(
574
+ gm: torch.fx.GraphModule,
575
+ example_inputs: List[torch.Tensor],
576
+ cudagraphs: Optional[BoxedBool] = None,
577
+ num_fixed: int = 0,
578
+ is_backward: bool = False,
579
+ graph_id: Optional[int] = None,
580
+ cpp_wrapper: bool = False,
581
+ aot_mode: bool = False,
582
+ is_inference: bool = False,
583
+ user_visible_outputs: FrozenSet[str] = frozenset(),
584
+ layout_opt: Optional[bool] = None,
585
+ extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None,
586
+ ) -> Union[CompiledFxGraph, str]:
587
+ if is_tf32_warning_applicable(gm):
588
+ _warn_tf32_disabled()
589
+
590
+ # lift the maximum depth of the Python interpreter stack
591
+ # to adapt large/deep models
592
+ sys.setrecursionlimit(max(sys.getrecursionlimit(), 2000))
593
+
594
+ _step_logger()(
595
+ logging.INFO,
596
+ "torchinductor compiling "
597
+ f"{'BACKWARDS' if is_backward else 'FORWARDS'} "
598
+ f"graph {graph_id}",
599
+ )
600
+ V.debug.fx_graph(gm, example_inputs)
601
+ # TODO: Should we actually dump this? It should be redundant with the aot
602
+ # structured logs...
603
+ # trace_structured("inductor_input_graph", payload_fn=lambda: gm.print_readable(print_output=False))
604
+
605
+ shape_env = _shape_env_from_inputs(example_inputs)
606
+
607
+ # Convert view to reshape in the graph. This is necessary primarily for
608
+ # layout optimization. Do it unconditionally for uniformity.
609
+ #
610
+ # It's needed because when we do layout optimization, an contiguous tensor
611
+ # in eager mode may becomes a channels last tensor. A view op previously
612
+ # can be applied to the contiguous tensor may not be able to be applied
613
+ # on the channels tensor any more. An error like
614
+ # RuntimeError: view size is not compatible with input tensor's size and stride
615
+ # (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
616
+ # will be printed.
617
+ #
618
+ # Replace view op to reshape op in this case.
619
+ # As an example, timm_resnest/botnet26t_256/convnext_base etc. will fail if we don't do this.
620
+ #
621
+ # Also this has to be done before FakeTensorProp below to avoid the failed
622
+ # .view() call.
623
+ view_to_reshape(gm)
624
+
625
+ # It is safe to run FakeTensorProp under no_grad because by the time
626
+ # we're in inductor, we assume that AOTAutograd has already "taken care"
627
+ # of autograd, so there should be no more autograd-related API's in the
628
+ # graph.
629
+ with torch.no_grad():
630
+ fake_mode = fake_tensor_prop(gm, example_inputs)
631
+
632
+ # pattern matcher passes might not preserve striding information
633
+ # on node.meta["val"]. if in the future we rely on these being
634
+ # correct we will need to fix.
635
+
636
+ with V.set_fake_mode(fake_mode):
637
+ # has some issues with memory in training
638
+ _recursive_post_grad_passes(gm, is_inference=is_inference)
639
+ V.debug.fx_graph_transformed(gm, example_inputs)
640
+ post_grad_graphs_log.debug("%s", lazy_format_graph_code("AFTER POST GRAD", gm))
641
+ trace_structured(
642
+ "inductor_post_grad_graph",
643
+ payload_fn=lambda: gm.print_readable(print_output=False),
644
+ )
645
+ optimus_scuba_log["inductor_post_grad"] = counters["inductor"]
646
+ signpost_event(
647
+ "optimus",
648
+ "compile_fx.post_grad_passes",
649
+ optimus_scuba_log,
650
+ )
651
+
652
+ with V.set_fake_mode(fake_mode):
653
+ const_output_index = None
654
+ const_graph = None
655
+ const_code = None
656
+
657
+ if aot_mode and config.aot_inductor.use_runtime_constant_folding:
658
+ const_gm, const_output_index = split_const_gm(gm)
659
+
660
+ const_graph = GraphLowering(
661
+ const_gm,
662
+ example_inputs=[],
663
+ shape_env=shape_env,
664
+ num_static_inputs=num_fixed,
665
+ graph_id=graph_id,
666
+ cpp_wrapper=cpp_wrapper,
667
+ aot_mode=aot_mode,
668
+ user_visible_outputs=user_visible_outputs,
669
+ extern_node_serializer=extern_node_serializer,
670
+ is_inference=is_inference,
671
+ is_const_graph=True,
672
+ )
673
+ with V.set_graph_handler(const_graph):
674
+ assert cpp_wrapper, "AOT mode only supports C++ wrapper"
675
+ const_graph.run()
676
+
677
+ const_code, _ = const_graph.codegen_with_cpp_wrapper()
678
+
679
+ graph = GraphLowering(
680
+ gm,
681
+ # example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.
682
+ # For the forward pass, we have the real inputs to be used as example_inputs. For the backward pass,
683
+ # we currently use fake tensors and defake them later.
684
+ example_inputs=example_inputs,
685
+ shape_env=shape_env,
686
+ num_static_inputs=num_fixed,
687
+ graph_id=graph_id,
688
+ cpp_wrapper=cpp_wrapper,
689
+ aot_mode=aot_mode,
690
+ user_visible_outputs=user_visible_outputs,
691
+ extern_node_serializer=extern_node_serializer,
692
+ is_inference=is_inference,
693
+ const_output_index=const_output_index,
694
+ const_code=const_code,
695
+ const_module=const_graph,
696
+ )
697
+ with V.set_graph_handler(graph):
698
+ graph.run(*example_inputs)
699
+ output_strides: List[Optional[Tuple[int, ...]]] = []
700
+ if graph.graph_outputs is not None:
701
+ # We'll put the output strides in the compiled graph so we
702
+ # can later return them to the caller via TracingContext
703
+ for out in graph.graph_outputs:
704
+ if hasattr(out, "layout"):
705
+ output_strides.append(
706
+ tuple(
707
+ V.graph.sizevars.size_hint(s) for s in out.layout.stride
708
+ )
709
+ )
710
+ else:
711
+ output_strides.append(None)
712
+
713
+ metrics_helper = metrics.CachedMetricsHelper()
714
+ compiled_fn = graph.compile_to_fn()
715
+
716
+ if V.aot_compilation is True:
717
+ return compiled_fn
718
+
719
+ if cudagraphs and not V.graph.disable_cudagraphs_reason:
720
+ from torch._inductor.cudagraph_utils import (
721
+ check_lowering_disable_cudagraph,
722
+ )
723
+
724
+ V.graph.disable_cudagraphs_reason = check_lowering_disable_cudagraph(
725
+ V.graph.device_node_mapping
726
+ )
727
+
728
+ compiled_graph = CompiledFxGraph(
729
+ compiled_fn,
730
+ graph,
731
+ output_strides,
732
+ V.graph.disable_cudagraphs_reason,
733
+ metrics_helper.get_deltas(),
734
+ )
735
+
736
+ return compiled_graph
737
+
738
+
739
+ def clone_preserve_strides(x: torch.Tensor):
740
+ needed_size = (
741
+ sum((shape - 1) * stride for shape, stride in zip(x.size(), x.stride())) + 1
742
+ )
743
+ buffer = torch.as_strided(x, (needed_size,), (1,)).clone()
744
+ return torch.as_strided(buffer, x.size(), x.stride())
745
+
746
+
747
+ def copy_misaligned_inputs(
748
+ new_inputs: List[torch.Tensor], check_inputs_idxs: Sequence[int]
749
+ ) -> None:
750
+ for i in check_inputs_idxs:
751
+ if new_inputs[i].data_ptr() % ALIGNMENT:
752
+ new_inputs[i] = clone_preserve_strides(new_inputs[i])
753
+
754
+
755
+ def get_input_idxs_to_check(
756
+ inputs: Union[List[torch.Tensor], Sequence[int]],
757
+ static_input_idxs: Sequence[int],
758
+ ) -> Sequence[int]:
759
+ def is_aligned(storage_offset, dtype):
760
+ return (storage_offset * get_dtype_size(dtype)) % ALIGNMENT == 0
761
+
762
+ ids_to_check = []
763
+ for i, input in enumerate(inputs):
764
+ if (
765
+ isinstance(input, torch.Tensor)
766
+ and (
767
+ i not in static_input_idxs
768
+ or not is_aligned(input.storage_offset(), input.dtype)
769
+ )
770
+ and input.device.type == "cuda"
771
+ ):
772
+ ids_to_check.append(i)
773
+ return ids_to_check
774
+
775
+
776
+ def align_inputs_from_check_idxs(
777
+ model: Callable[[List[torch.Tensor]], Any], inputs_to_check: Sequence[int]
778
+ ):
779
+ if len(inputs_to_check) == 0:
780
+ return model
781
+
782
+ def run(new_inputs):
783
+ copy_misaligned_inputs(new_inputs, inputs_to_check)
784
+ return model(new_inputs)
785
+
786
+ return run
787
+
788
+
789
+ def align_inputs(
790
+ model: Callable[[List[torch.Tensor]], Any],
791
+ inputs: List[torch.Tensor],
792
+ static_input_idxs: Sequence[int] = (),
793
+ ):
794
+ inputs_to_check = get_input_idxs_to_check(inputs, static_input_idxs)
795
+ return align_inputs_from_check_idxs(model, inputs_to_check)
796
+
797
+
798
+ @dynamo_utils.dynamo_timed
799
+ def cudagraphify(
800
+ model: torch.fx.GraphModule,
801
+ inputs: List[torch.Tensor],
802
+ static_input_idxs: Sequence[int] = (),
803
+ *,
804
+ device_index: int,
805
+ stack_traces: List[Optional[str]],
806
+ is_backward: bool,
807
+ is_inference: bool,
808
+ constants: Tuple[torch.Tensor, ...] = (),
809
+ ):
810
+ from torch._inductor.cudagraph_trees import (
811
+ cudagraphify_impl as new_cudagraphify_impl,
812
+ )
813
+
814
+ cudagraphify_fn: Callable[..., Any]
815
+ if config.triton.cudagraph_trees:
816
+ cudagraphify_fn = functools.partial(
817
+ new_cudagraphify_impl,
818
+ device_index=device_index,
819
+ stack_traces=stack_traces,
820
+ is_backward=is_backward,
821
+ is_inference=is_inference,
822
+ constants=constants,
823
+ )
824
+ else:
825
+ cudagraphify_fn = cudagraphify_impl
826
+
827
+ # if using fake tensors, defer cudagraphs until we get real inputs at runtime
828
+ if not any(isinstance(inp, FakeTensor) for inp in inputs):
829
+ return cudagraphify_fn(model, inputs, static_input_idxs)
830
+
831
+ compiled_fn = None
832
+
833
+ def run(new_inputs):
834
+ nonlocal compiled_fn
835
+ if compiled_fn is None:
836
+ with dynamo_utils.preserve_rng_state():
837
+ compiled_fn = cudagraphify_fn(model, new_inputs, static_input_idxs)
838
+ return compiled_fn(new_inputs)
839
+
840
+ return run
841
+
842
+
843
+ def remove_unaligned_input_idxs(
844
+ inputs: Union[List[torch.Tensor], Sequence[int]],
845
+ static_input_idxs: Sequence[int],
846
+ ):
847
+ """
848
+ We require all inputs to be aligned, so introduce a copy for any
849
+ that aren't.
850
+ """
851
+ aligned_static_input_idxs = []
852
+ for idx, input in zip(static_input_idxs, inputs):
853
+ if isinstance(input, torch.Tensor) and (input.data_ptr() % ALIGNMENT) == 0:
854
+ aligned_static_input_idxs.append(idx)
855
+ if len(aligned_static_input_idxs) != len(static_input_idxs):
856
+ return aligned_static_input_idxs
857
+ return static_input_idxs
858
+
859
+
860
+ def static_input(x: torch.Tensor):
861
+ """
862
+ Copy and input while preserving strides
863
+ """
864
+ # TODO(jansel): figure out why this version doesn't work:
865
+ # return torch.empty_strided(x.size(), x.stride(), dtype=x.dtype, device=x.device)
866
+ needed_size = (
867
+ sum((shape - 1) * stride for shape, stride in zip(x.size(), x.stride())) + 1
868
+ )
869
+ buffer = torch.empty(needed_size, dtype=x.dtype, device=x.device)
870
+ return torch.as_strided(buffer, x.size(), x.stride())
871
+
872
+
873
+ def index_expanded_dims_and_copy_(
874
+ dst: torch.Tensor,
875
+ src: torch.Tensor,
876
+ expanded_dims: List[int],
877
+ ):
878
+ "Index into expanded dimensions of both dst and src then copy_"
879
+ dst = index_expanded_dims(dst, expanded_dims)
880
+ src = index_expanded_dims(src, expanded_dims)
881
+ dst.copy_(src)
882
+
883
+
884
+ def cudagraphify_impl(
885
+ model: torch.fx.GraphModule,
886
+ inputs: List[torch.Tensor],
887
+ static_input_idxs: Sequence[int] = (),
888
+ ):
889
+ """
890
+ Assumes inputs[static_input_idxs[i]] are always the same memory address
891
+ """
892
+ check_input_idxs = get_input_idxs_to_check(inputs, static_input_idxs)
893
+ static_input_idxs = remove_unaligned_input_idxs(inputs, static_input_idxs)
894
+ copy_misaligned_inputs(inputs, check_input_idxs)
895
+
896
+ assert isinstance(inputs, list)
897
+
898
+ inps_expanded_dims = [
899
+ get_expanded_dims(x) if idx not in static_input_idxs else []
900
+ for idx, x in enumerate(inputs)
901
+ ]
902
+
903
+ # allocate static tensor inputs
904
+ static_inputs = [
905
+ x
906
+ if not isinstance(x, torch.Tensor)
907
+ else static_input(x)
908
+ if idx not in static_input_idxs
909
+ else x.detach()
910
+ for idx, x in enumerate(inputs)
911
+ ]
912
+
913
+ # copy over input values for fresh allocations
914
+ for idx, (x, expanded_dims) in enumerate(zip(inputs, inps_expanded_dims)):
915
+ if isinstance(x, torch.Tensor) and idx not in static_input_idxs:
916
+ index_expanded_dims_and_copy_(static_inputs[idx], x, expanded_dims)
917
+
918
+ # warmup
919
+ torch.cuda.synchronize()
920
+ stream = torch.cuda.Stream()
921
+ stream.wait_stream(torch.cuda.current_stream())
922
+ # copy static_inputs because it will be cleared in model
923
+ with torch.cuda.stream(stream):
924
+ model(list(static_inputs))
925
+ stream.synchronize()
926
+ torch.cuda.current_stream().wait_stream(stream)
927
+ torch.cuda.synchronize()
928
+
929
+ # record
930
+ graph = torch.cuda.CUDAGraph()
931
+ with torch.cuda.graph(graph, stream=stream, capture_error_mode="thread_local"):
932
+ static_outputs = model(list(static_inputs))
933
+ if not isinstance(static_outputs, (list, tuple)):
934
+ static_outputs = (static_outputs,)
935
+
936
+ if config.size_asserts:
937
+
938
+ def run(new_inputs):
939
+ assert len(static_inputs) == len(new_inputs)
940
+ for idx, (dst, src, expanded_dims) in enumerate(
941
+ zip(static_inputs, new_inputs, inps_expanded_dims)
942
+ ):
943
+ if not isinstance(dst, torch.Tensor):
944
+ pass
945
+ elif idx in static_input_idxs:
946
+ assert dst.data_ptr() == src.data_ptr()
947
+ else:
948
+ # TODO - could make one single op of multiple slices
949
+ # and avoid dispatch.
950
+ # Could also pre-index the `dst` tensors
951
+ index_expanded_dims_and_copy_(dst, src, expanded_dims)
952
+ new_inputs.clear()
953
+ graph.replay()
954
+ return static_outputs
955
+
956
+ else:
957
+ copy_indices = [
958
+ idx for idx in range(len(static_inputs)) if idx not in static_input_idxs
959
+ ]
960
+
961
+ def run(new_inputs):
962
+ for idx in copy_indices:
963
+ expanded_dims = inps_expanded_dims[idx]
964
+ index_expanded_dims_and_copy_(
965
+ static_inputs[idx], new_inputs[idx], expanded_dims
966
+ )
967
+ new_inputs.clear()
968
+ graph.replay()
969
+ return static_outputs
970
+
971
+ return align_inputs_from_check_idxs(run, check_input_idxs)
972
+
973
+
974
+ def compile_fx_aot(
975
+ model_: torch.fx.GraphModule,
976
+ example_inputs_: List[torch.Tensor],
977
+ inner_compile: Callable[..., Any] = compile_fx_inner,
978
+ config_patches: Optional[Dict[str, Any]] = None,
979
+ ):
980
+ config_patches: Dict[str, Any] = (
981
+ {"cpp_wrapper": True}
982
+ if config_patches is None
983
+ else {**config_patches, "cpp_wrapper": True}
984
+ )
985
+ if (
986
+ "aot_inductor.output_path" not in config_patches
987
+ and not config.aot_inductor.output_path
988
+ ):
989
+ config_patches = {
990
+ **config_patches,
991
+ "aot_inductor.output_path": code_hash(model_.code),
992
+ }
993
+
994
+ extern_node_serializer = config_patches.pop("extern_node_serializer", None)
995
+ with V.set_aot_compilation(True):
996
+ compiled_lib_path = compile_fx(
997
+ model_,
998
+ example_inputs_,
999
+ inner_compile=functools.partial(
1000
+ inner_compile,
1001
+ aot_mode=True,
1002
+ extern_node_serializer=extern_node_serializer,
1003
+ ),
1004
+ config_patches=config_patches,
1005
+ )
1006
+ assert os.path.exists(
1007
+ compiled_lib_path
1008
+ ), f"AOTInductor compiled library does not exist at {compiled_lib_path}"
1009
+ return compiled_lib_path
1010
+
1011
+
1012
+ _graph_counter = count(0)
1013
+
1014
+
1015
+ def fw_compiler_freezing(
1016
+ aot_autograd_model: torch.fx.GraphModule,
1017
+ aot_example_inputs: List[torch.Tensor],
1018
+ dynamo_model: torch.fx.GraphModule,
1019
+ num_example_inputs: int,
1020
+ inner_compile: Callable[..., Any],
1021
+ cudagraphs: BoxedBool,
1022
+ graph_id: int,
1023
+ forward_device: BoxedDeviceIndex,
1024
+ ):
1025
+ from torch._inductor.freezing import convert_conv_weights_to_channels_last, freeze
1026
+
1027
+ # partition_fn won't be called
1028
+ _recursive_joint_graph_passes(aot_autograd_model)
1029
+
1030
+ layout_opt = GraphLowering.decide_layout_opt(aot_autograd_model, is_inference=True)
1031
+ if layout_opt:
1032
+ # make sure meta['val'] is properly setup
1033
+ fake_tensor_prop(aot_autograd_model, aot_example_inputs, True)
1034
+ convert_conv_weights_to_channels_last(aot_autograd_model)
1035
+
1036
+ opt_model, preserved_arg_indices = freeze(
1037
+ dynamo_model,
1038
+ aot_autograd_model,
1039
+ aot_example_inputs, # type: ignore[arg-type]
1040
+ )
1041
+
1042
+ aot_example_inputs = [aot_example_inputs[ind] for ind in preserved_arg_indices]
1043
+ num_fixed = len(preserved_arg_indices) - num_example_inputs
1044
+
1045
+ fake_mode = detect_fake_mode(aot_example_inputs)
1046
+
1047
+ # for freezing, all graph outputs should be user visible
1048
+ *_, model_outputs_node = opt_model.graph.nodes
1049
+ model_outputs = model_outputs_node.args[0]
1050
+ user_visible_outputs = [
1051
+ n.name for n in model_outputs if isinstance(n, torch.fx.Node)
1052
+ ]
1053
+
1054
+ # constant params will be real tensors, not fake
1055
+ tracing_context = torch._guards.TracingContext.try_get()
1056
+ if tracing_context is not None:
1057
+ params_flat = tracing_context.params_flat
1058
+ assert params_flat is not None
1059
+ for i in range(len(params_flat)):
1060
+ if i not in preserved_arg_indices:
1061
+ params_flat[i] = None
1062
+
1063
+ with mock.patch.object(fake_mode, "allow_non_fake_inputs", True):
1064
+ optimized_function = inner_compile(
1065
+ opt_model,
1066
+ aot_example_inputs,
1067
+ num_fixed=num_fixed,
1068
+ cudagraphs=cudagraphs,
1069
+ graph_id=graph_id,
1070
+ is_inference=True,
1071
+ boxed_forward_device_index=forward_device,
1072
+ layout_opt=layout_opt,
1073
+ user_visible_outputs=user_visible_outputs,
1074
+ )
1075
+
1076
+ # aot_inductor codegens a call that takes in just the inputs, so we don't return a wrapper
1077
+ # that drops constant-ified params
1078
+ if V.aot_compilation is True:
1079
+ return optimized_function
1080
+
1081
+ def wrapper(args):
1082
+ args_new = [args[i] for i in preserved_arg_indices]
1083
+ args.clear()
1084
+ return optimized_function(args_new)
1085
+
1086
+ wrapper._boxed_call = True # type: ignore[attr-defined]
1087
+
1088
+ return wrapper
1089
+
1090
+
1091
+ @_use_lazy_graph_module(dynamo_config.use_lazy_graph_module)
1092
+ def compile_fx(
1093
+ model_: torch.fx.GraphModule,
1094
+ example_inputs_: List[torch.Tensor],
1095
+ inner_compile: Callable[..., Any] = compile_fx_inner,
1096
+ config_patches: Optional[Dict[str, Any]] = None,
1097
+ decompositions: Optional[Dict[OpOverload, Callable[..., Any]]] = None,
1098
+ ):
1099
+ """Main entrypoint to a compile given FX graph"""
1100
+ if config_patches:
1101
+ with config.patch(config_patches):
1102
+ return compile_fx(
1103
+ model_,
1104
+ example_inputs_,
1105
+ # need extra layer of patching as backwards is compiled out of scope
1106
+ inner_compile=config.patch(config_patches)(inner_compile),
1107
+ decompositions=decompositions,
1108
+ )
1109
+
1110
+ if config.cpp_wrapper:
1111
+ with config.patch(
1112
+ {
1113
+ "cpp_wrapper": False,
1114
+ "triton.autotune_cublasLt": False,
1115
+ "triton.cudagraphs": False,
1116
+ "triton.store_cubin": True,
1117
+ }
1118
+ ), V.set_real_inputs(example_inputs_):
1119
+ inputs_ = example_inputs_
1120
+ if isinstance(model_, torch.fx.GraphModule):
1121
+ fake_inputs = [
1122
+ node.meta.get("val")
1123
+ for node in model_.graph.nodes
1124
+ if node.op == "placeholder"
1125
+ ]
1126
+ if all(v is not None for v in fake_inputs):
1127
+ # Validate devices before switching to fake tensors.
1128
+ for idx, fi, i in zip(count(), fake_inputs, inputs_):
1129
+ if fi.device != i.device:
1130
+ raise ValueError(
1131
+ f"Device mismatch between fake input and example input at position #{idx}: "
1132
+ f"{fi.device} vs {i.device}. If the model was exported via torch.export(), "
1133
+ "make sure torch.export() and torch.aot_compile() run on the same device."
1134
+ )
1135
+ inputs_ = fake_inputs
1136
+ return compile_fx(
1137
+ model_,
1138
+ inputs_,
1139
+ inner_compile=functools.partial(inner_compile, cpp_wrapper=True),
1140
+ decompositions=decompositions,
1141
+ )
1142
+
1143
+ recursive_compile_fx = functools.partial(
1144
+ compile_fx,
1145
+ inner_compile=inner_compile,
1146
+ decompositions=decompositions,
1147
+ )
1148
+
1149
+ if not graph_returns_tuple(model_):
1150
+ return make_graph_return_tuple(
1151
+ model_,
1152
+ example_inputs_,
1153
+ recursive_compile_fx,
1154
+ )
1155
+
1156
+ if isinstance(model_, torch.fx.GraphModule):
1157
+ if isinstance(model_.graph._codegen, _PyTreeCodeGen):
1158
+ # this graph is the result of dynamo.export()
1159
+ return handle_dynamo_export_graph(
1160
+ model_,
1161
+ example_inputs_,
1162
+ recursive_compile_fx,
1163
+ )
1164
+
1165
+ model_ = _recursive_pre_grad_passes(model_, example_inputs_)
1166
+ optimus_scuba_log["inductor_pre_grad"] = counters["inductor"]
1167
+ signpost_event(
1168
+ "optimus",
1169
+ "compile_fx.pre_grad_passes",
1170
+ optimus_scuba_log,
1171
+ )
1172
+
1173
+ if any(isinstance(x, (list, tuple, dict)) for x in example_inputs_):
1174
+ return flatten_graph_inputs(
1175
+ model_,
1176
+ example_inputs_,
1177
+ recursive_compile_fx,
1178
+ )
1179
+
1180
+ assert not config._raise_error_for_testing
1181
+ num_example_inputs = len(example_inputs_)
1182
+ cudagraphs = BoxedBool(config.triton.cudagraphs)
1183
+ forward_device = BoxedDeviceIndex(None)
1184
+
1185
+ graph_id = next(_graph_counter)
1186
+
1187
+ decompositions = (
1188
+ decompositions if decompositions is not None else select_decomp_table()
1189
+ )
1190
+
1191
+ @dynamo_utils.dynamo_timed
1192
+ def fw_compiler_base(
1193
+ model: torch.fx.GraphModule,
1194
+ example_inputs: List[torch.Tensor],
1195
+ is_inference: bool,
1196
+ ):
1197
+ if is_inference:
1198
+ # partition_fn won't be called
1199
+ _recursive_joint_graph_passes(model)
1200
+
1201
+ fixed = torch._inductor.utils.num_fw_fixed_arguments(
1202
+ num_example_inputs, len(example_inputs)
1203
+ )
1204
+ user_visible_outputs = set()
1205
+
1206
+ if config.keep_output_stride:
1207
+ *_, model_outputs_node = model.graph.nodes
1208
+ assert model_outputs_node.op == "output"
1209
+ model_outputs = pytree.arg_tree_leaves(*model_outputs_node.args)
1210
+ num_model_outputs = len(model_outputs)
1211
+
1212
+ context = torch._guards.TracingContext.try_get()
1213
+ # See Note [User Outputs in the inductor graph]
1214
+ if context is not None and context.fw_metadata and not is_inference:
1215
+ original_output_start_index = (
1216
+ context.fw_metadata.num_mutated_inp_runtime_indices
1217
+ )
1218
+ else:
1219
+ original_output_start_index = 0
1220
+
1221
+ if isinstance(model_, torch.fx.GraphModule):
1222
+ *_, orig_model_outputs_node = model_.graph.nodes
1223
+ assert orig_model_outputs_node.op == "output"
1224
+ orig_model_outputs, _ = pytree.tree_flatten(
1225
+ orig_model_outputs_node.args
1226
+ )
1227
+ num_orig_model_outputs = len(orig_model_outputs)
1228
+ else:
1229
+ num_orig_model_outputs = num_model_outputs
1230
+
1231
+ assert num_orig_model_outputs <= num_model_outputs
1232
+
1233
+ # Note [User Outputs in the inductor graph]
1234
+ # We makes the following assumption
1235
+ # For inference
1236
+ # len(orig_model_outputs) == len(model_outputs)
1237
+ # For training
1238
+ # len(orig_model_outputs) <= len(model_outputs)
1239
+ # During training, most of the time the model_outputs starts with
1240
+ # original module's outputs followed by saved activations.
1241
+ # But this can be not true if the model have inplace updated tensors.
1242
+ # AOTAutograd will make those tensors being returned before the original
1243
+ # module's output.
1244
+ # To make things safe, we'll use original_output_start_index field
1245
+ # set by AOTAutograd to decide where the original module outputs start.
1246
+ orig_output_end_idx = original_output_start_index + num_orig_model_outputs
1247
+ # Sanity chec: we are about to splice out the "user" outputs from the full set
1248
+ # of "graph" outputs. Make sure we're within bounds.
1249
+ assert orig_output_end_idx <= num_model_outputs
1250
+
1251
+ user_visible_outputs = {
1252
+ n.name
1253
+ for n in model_outputs[original_output_start_index:orig_output_end_idx]
1254
+ if isinstance(n, torch.fx.Node)
1255
+ }
1256
+
1257
+ return inner_compile(
1258
+ model,
1259
+ example_inputs,
1260
+ num_fixed=fixed,
1261
+ cudagraphs=cudagraphs,
1262
+ graph_id=graph_id,
1263
+ is_inference=is_inference,
1264
+ boxed_forward_device_index=forward_device,
1265
+ user_visible_outputs=user_visible_outputs,
1266
+ )
1267
+
1268
+ fw_compiler = functools.partial(fw_compiler_base, is_inference=False)
1269
+
1270
+ if config.freezing and not torch.is_grad_enabled():
1271
+ inference_compiler = functools.partial(
1272
+ fw_compiler_freezing,
1273
+ dynamo_model=model_,
1274
+ num_example_inputs=num_example_inputs,
1275
+ inner_compile=inner_compile,
1276
+ cudagraphs=cudagraphs,
1277
+ graph_id=graph_id,
1278
+ forward_device=forward_device,
1279
+ )
1280
+ else:
1281
+ inference_compiler = functools.partial(fw_compiler_base, is_inference=True)
1282
+
1283
+ def partition_fn(graph, joint_inputs, **kwargs):
1284
+ _recursive_joint_graph_passes(graph)
1285
+ return min_cut_rematerialization_partition(
1286
+ graph, joint_inputs, **kwargs, compiler="inductor"
1287
+ )
1288
+
1289
+ @dynamo_utils.dynamo_timed
1290
+ @dynamo_utils.maybe_cprofile
1291
+ def bw_compiler(model: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
1292
+ fixed = count_tangents(model)
1293
+ return inner_compile(
1294
+ model,
1295
+ example_inputs,
1296
+ num_fixed=fixed,
1297
+ cudagraphs=cudagraphs,
1298
+ is_backward=True,
1299
+ graph_id=graph_id,
1300
+ boxed_forward_device_index=forward_device,
1301
+ )
1302
+
1303
+ # TODO: can add logging before/after the call to create_aot_dispatcher_function
1304
+ # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
1305
+ # once torchdynamo is merged into pytorch
1306
+
1307
+ fake_mode = detect_fake_mode(example_inputs_) or torch._subclasses.FakeTensorMode(
1308
+ allow_non_fake_inputs=True
1309
+ )
1310
+ tracing_context = (
1311
+ torch._guards.TracingContext.try_get()
1312
+ or torch._guards.TracingContext(fake_mode)
1313
+ )
1314
+
1315
+ if V.aot_compilation is True:
1316
+ gm, graph_signature = aot_export_module(
1317
+ model_, example_inputs_, trace_joint=False, decompositions=decompositions
1318
+ )
1319
+ unlifted_gm = _unlift_graph(model_, gm, graph_signature)
1320
+ if "dynamo_flat_name_to_original_fqn" in model_.meta:
1321
+ unlifted_gm.meta["dynamo_flat_name_to_original_fqn"] = model_.meta[
1322
+ "dynamo_flat_name_to_original_fqn"
1323
+ ]
1324
+ with V.set_fake_mode(fake_mode), compiled_autograd.disable():
1325
+ return inference_compiler(unlifted_gm, example_inputs_)
1326
+
1327
+ with V.set_fake_mode(fake_mode), torch._guards.tracing(
1328
+ tracing_context
1329
+ ), compiled_autograd.disable():
1330
+ return aot_autograd(
1331
+ fw_compiler=fw_compiler,
1332
+ bw_compiler=bw_compiler,
1333
+ inference_compiler=inference_compiler,
1334
+ decompositions=decompositions,
1335
+ partition_fn=partition_fn,
1336
+ keep_inference_input_mutations=True,
1337
+ )(model_, example_inputs_)
1338
+
1339
+
1340
+ def _shape_env_from_inputs(inputs: List[torch.Tensor]):
1341
+ shape_env = None
1342
+ fake_mode = detect_fake_mode(inputs)
1343
+
1344
+ # TODO(voz): It would be nice to enable this assert, but there are lots of tests that
1345
+ # pass in real inputs for now.
1346
+ # if len(inputs) > 0:
1347
+ # assert fake_mode is not None, breakpoint()
1348
+
1349
+ if fake_mode is not None:
1350
+ return fake_mode.shape_env
1351
+
1352
+ # When there are no tensor inputs, get shape_env from the first SymInt.
1353
+ for input in inputs:
1354
+ if isinstance(input, torch.SymInt):
1355
+ return input.node.shape_env
1356
+
1357
+ # TODO(voz): Should we always have one anyway?
1358
+ return None
1359
+
1360
+
1361
+ def graph_returns_tuple(gm: torch.fx.GraphModule):
1362
+ """True if a FX graph returns a tuple"""
1363
+ if not isinstance(gm, torch.fx.GraphModule):
1364
+ return True # can't check this, assume true
1365
+ (rv,) = output_node(gm).args
1366
+ if isinstance(rv, (list, tuple)):
1367
+ return True
1368
+ if (
1369
+ isinstance(rv, torch.fx.node.Node)
1370
+ and hasattr(rv.target, "_schema")
1371
+ and len(rv.target._schema.returns) > 1
1372
+ and all(str(ret.type) == "Tensor" for ret in rv.target._schema.returns)
1373
+ ):
1374
+ # for graphs whose result is one node with multiple outputs
1375
+ return True
1376
+ return False
1377
+
1378
+
1379
+ def make_graph_return_tuple(
1380
+ gm: torch.fx.GraphModule,
1381
+ inputs: List[torch.Tensor],
1382
+ compile_gm: Callable[..., Any],
1383
+ ):
1384
+ """
1385
+ Mutate gm so it returns a tuple. This is only needed for graphs
1386
+ not created by torchdynamo that return non-tuples.
1387
+ """
1388
+ node = output_node(gm)
1389
+ (rv,) = node.args
1390
+ rv, spec = pytree.tree_flatten(rv)
1391
+ with gm.graph.inserting_before(node):
1392
+ gm.graph.output(rv)
1393
+ gm.graph.erase_node(node)
1394
+ assert graph_returns_tuple(gm)
1395
+
1396
+ compiled_fn = compile_gm(gm, inputs)
1397
+
1398
+ @functools.wraps(compiled_fn)
1399
+ def wrapper(*args, **kwargs):
1400
+ return pytree.tree_unflatten(compiled_fn(*args, **kwargs), spec)
1401
+
1402
+ return wrapper
1403
+
1404
+
1405
+ def flatten_graph_inputs(gm: torch.fx.GraphModule, inputs, compile_gm):
1406
+ """
1407
+ Mutate inputs so that they are flat and wrap gm such that it
1408
+ accepts those inputs. This is only needed for graphs not created
1409
+ by torchdynamo that take bumpy inputs.
1410
+ """
1411
+ inputs, spec = pytree.tree_flatten(inputs)
1412
+
1413
+ class GmWrapper(torch.nn.Module):
1414
+ def __init__(self):
1415
+ super().__init__()
1416
+ self.gm = gm
1417
+
1418
+ def forward(self, *args):
1419
+ args: List[Any] = list(args)
1420
+ return self.gm(*pytree.tree_unflatten(args, spec))
1421
+
1422
+ compiled_fn = compile_gm(GmWrapper(), inputs)
1423
+
1424
+ @functools.wraps(compiled_fn)
1425
+ def wrapper(*args):
1426
+ # note this doesn't check the spec, assuming it is the same
1427
+ return compiled_fn(*pytree.arg_tree_leaves(*args))
1428
+
1429
+ return wrapper
1430
+
1431
+
1432
+ def handle_dynamo_export_graph(
1433
+ gm: torch.fx.GraphModule,
1434
+ inputs: List[torch.Tensor],
1435
+ compile_gm: Callable[..., Any],
1436
+ ):
1437
+ """
1438
+ `torch._dynamo.export` embeds pytrees in the FX graph codegen object,
1439
+ convert that to a normal FX graph so inductor can compile it.
1440
+ """
1441
+ codegen = gm.graph._codegen
1442
+ gm.graph._codegen = torch.fx.graph.CodeGen()
1443
+ gm.recompile()
1444
+
1445
+ compiled_fn = compile_gm(gm, codegen.process_inputs(*inputs))
1446
+
1447
+ @functools.wraps(compiled_fn)
1448
+ def wrapper(*args):
1449
+ return codegen.process_outputs(compiled_fn(*codegen.process_inputs(*args)))
1450
+
1451
+ return wrapper
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/cudagraph_utils.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ from typing import Dict, Iterable, Optional
3
+
4
+ import torch
5
+ from torch._inductor.codecache import CompiledFxGraph
6
+
7
+
8
+ def get_mutating_use_stack_trace(placeholder_node: torch.fx.Node) -> Optional[str]:
9
+ # reinplaced uses might have a single, non-copy_ use
10
+ if len(placeholder_node.users) == 1:
11
+ return next(iter(placeholder_node.users)).meta.get("stack_trace", None)
12
+
13
+ for use in placeholder_node.users:
14
+ if use.target == torch.ops.aten.copy_.default:
15
+ if stack_trace := use.meta.get("stack_trace", None):
16
+ return stack_trace
17
+
18
+ return None
19
+
20
+
21
+ def format_default_skip_message(reason: str) -> str:
22
+ return f"skipping cudagraphs due to {reason}"
23
+
24
+
25
+ def get_mutation_stack_trace(
26
+ gm: torch.fx.GraphModule, mutation_indices: Iterable[int]
27
+ ) -> str:
28
+ stack_trace: Optional[str] = ""
29
+ placeholders = [node for node in gm.graph.nodes if node.op == "placeholder"]
30
+
31
+ for idx in mutation_indices:
32
+ placeholder = placeholders[idx]
33
+ if stack_trace := get_mutating_use_stack_trace(placeholder):
34
+ break
35
+
36
+ if stack_trace:
37
+ msg = f"skipping cudagraphs due to mutation on input. Found from : \n {stack_trace}"
38
+ return msg
39
+
40
+ return format_default_skip_message("mutated inputs")
41
+
42
+
43
+ def check_for_mutation(
44
+ gm: torch.fx.GraphModule, compiled_graph: CompiledFxGraph, num_fixed: int
45
+ ) -> Optional[str]:
46
+ default_msg = format_default_skip_message("mutated inputs")
47
+
48
+ # doesnt work for non-trees because the warmup run would apply mutation twice
49
+ if torch._inductor.config.triton.cudagraph_trees:
50
+ # checking if mutation is only on parameters/static inputs
51
+ mutation_indices = [
52
+ idx for idx in compiled_graph.mutated_input_idxs if idx >= num_fixed
53
+ ]
54
+ has_mutation = len(mutation_indices) != 0
55
+ if not has_mutation:
56
+ return None
57
+
58
+ return get_mutation_stack_trace(gm, mutation_indices)
59
+
60
+ else:
61
+ has_mutation = len(compiled_graph.mutated_inputs) != 0
62
+ return None if not has_mutation else default_msg
63
+
64
+
65
+ def get_use_stack_trace(node) -> Optional[str]:
66
+ for use in node.users:
67
+ if stack_trace := use.meta.get("stack_trace", None):
68
+ return stack_trace
69
+ return None
70
+
71
+
72
+ def check_multiple_devices_or_any_cpu_nodes(
73
+ device_node_mapping: Dict[torch.device, torch.fx.Node]
74
+ ) -> Optional[str]:
75
+ if cpu_node := device_node_mapping.get(torch.device("cpu")):
76
+ if stack_trace := get_use_stack_trace(cpu_node):
77
+ return format_default_skip_message(
78
+ f"cpu device. Found from : \n {stack_trace}"
79
+ )
80
+
81
+ return format_default_skip_message("cpu device")
82
+
83
+ if (
84
+ len(device_node_mapping) == 1
85
+ and next(iter(device_node_mapping.keys())).type == "cuda"
86
+ ):
87
+ return None
88
+
89
+ keys_repr = (repr(key) for key in device_node_mapping.keys())
90
+ return format_default_skip_message(f"multiple devices: {', '.join(keys_repr)}")
91
+
92
+
93
+ def check_lowering_disable_cudagraph(
94
+ device_node_mapping: Dict[torch.device, torch.fx.Node]
95
+ ):
96
+ return check_multiple_devices_or_any_cpu_nodes(device_node_mapping)
97
+
98
+
99
+ @dataclasses.dataclass
100
+ class BoxedDeviceIndex:
101
+ value: Optional[int]
102
+
103
+ def set(self, device_idx: Optional[int]):
104
+ assert device_idx is None or isinstance(device_idx, int)
105
+ self.value = device_idx
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/dependencies.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import dataclasses
3
+ import itertools
4
+ import logging
5
+ import re
6
+ import typing
7
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
8
+ from unittest.mock import patch
9
+
10
+ import sympy
11
+
12
+ import torch
13
+ from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols
14
+
15
+ from .codegen.common import index_prevent_reordering
16
+ from .utils import (
17
+ get_dtype_size,
18
+ reduction_num_outputs,
19
+ sympy_index_symbol,
20
+ sympy_str,
21
+ sympy_subs,
22
+ VarRanges,
23
+ )
24
+ from .virtualized import OpsHandler, ReductionType, V
25
+
26
+ log = logging.getLogger(__name__)
27
+ is_indirect = re.compile(r"indirect|tmp").search
28
+ Dep = Union["MemoryDep", "StarDep", "WeakDep"]
29
+
30
+
31
+ class MemoryDep(typing.NamedTuple):
32
+ name: str
33
+ index: sympy.Expr # type: ignore[assignment]
34
+ var_names: Tuple[sympy.Symbol, ...]
35
+ size: Tuple[sympy.Expr, ...]
36
+
37
+ def __repr__(self):
38
+ return f"MemoryDep({self.name!r}, {self.index}, {self.ranges})"
39
+
40
+ @property
41
+ def ranges(self) -> Dict[sympy.Symbol, sympy.Expr]:
42
+ """{c0: 128, c1: 512, ...}"""
43
+ return dict(zip(self.var_names, self.size))
44
+
45
+ def get_numel(self) -> sympy.Expr:
46
+ if self.is_indirect():
47
+ numel = V.graph.get_numel(self.name)
48
+ else:
49
+ vars = set(self.index.free_symbols)
50
+ numel = sympy.Integer(1)
51
+ for var, size in zip(self.var_names, self.size):
52
+ if var in vars:
53
+ numel = numel * size
54
+ return numel
55
+
56
+ def rename(self, renames: Dict[str, str]) -> "MemoryDep":
57
+ if self.name in renames:
58
+ return MemoryDep(
59
+ renames[self.name], self.index, var_names=self.var_names, size=self.size
60
+ )
61
+ return self
62
+
63
+ def numbytes_hint(self):
64
+ return V.graph.sizevars.size_hint(self.get_numel()) * get_dtype_size(
65
+ V.graph.get_dtype(self.name)
66
+ )
67
+
68
+ def has_unbacked_symbols(self):
69
+ return len(free_unbacked_symbols(self.get_numel())) > 0
70
+
71
+ def is_contiguous(self) -> bool:
72
+ return isinstance(self.index, sympy.Symbol) and self.index in self.var_names
73
+
74
+ def is_scalar(self) -> bool:
75
+ if isinstance(self.index, sympy.Symbol):
76
+ return self.index not in self.var_names and not self.is_indirect()
77
+ return isinstance(self.index, (int, sympy.Integer))
78
+
79
+ def is_indirect(self) -> bool:
80
+ return any(is_indirect(v.name) for v in self.index.free_symbols) # type: ignore[attr-defined]
81
+
82
+
83
+ class StarDep(typing.NamedTuple):
84
+ # depends on the entire buffer
85
+ name: str
86
+
87
+ @property
88
+ def index(self):
89
+ raise NotImplementedError("StarDep does not have an index")
90
+
91
+ def get_numel(self) -> sympy.Expr:
92
+ return V.graph.get_numel(self.name)
93
+
94
+ def rename(self, renames: Dict[str, str]) -> "StarDep":
95
+ if self.name in renames:
96
+ return StarDep(renames[self.name])
97
+ return self
98
+
99
+ def numbytes_hint(self):
100
+ return V.graph.sizevars.size_hint(self.get_numel()) * get_dtype_size(
101
+ V.graph.get_dtype(self.name)
102
+ )
103
+
104
+ def has_unbacked_symbols(self):
105
+ return len(free_unbacked_symbols(self.get_numel())) > 0
106
+
107
+ def is_contiguous(self) -> bool:
108
+ return False
109
+
110
+ def is_scalar(self) -> bool:
111
+ return False
112
+
113
+ def is_indirect(self) -> bool:
114
+ return False
115
+
116
+
117
+ # Used for tracking mutation ordering
118
+ # if A reads a buffer and B mutates it
119
+ # B must be ordered after A
120
+ #
121
+ # It is weak because if it turns out A's read is never used, we can still
122
+ # eliminate it
123
+ class WeakDep(typing.NamedTuple):
124
+ name: str
125
+
126
+ @property
127
+ def index(self):
128
+ raise NotImplementedError("WeakDep does not have an index")
129
+
130
+ def get_numel(self) -> sympy.Expr:
131
+ return sympy.Integer(1)
132
+
133
+ def rename(self, renames: Dict[str, str]) -> "WeakDep":
134
+ if self.name in renames:
135
+ return WeakDep(renames[self.name])
136
+ return self
137
+
138
+ def numbytes_hint(self):
139
+ return 1 # Purely inserted for ordering, not an actual dep
140
+
141
+ def has_unbacked_symbols(self):
142
+ return False
143
+
144
+ def is_contiguous(self) -> bool:
145
+ return False
146
+
147
+
148
+ class IndexExprDep(typing.NamedTuple):
149
+ index: sympy.Expr # type: ignore[assignment]
150
+ var_names: Tuple[sympy.Symbol, ...]
151
+ size: Tuple[sympy.Expr, ...]
152
+
153
+
154
+ @dataclasses.dataclass
155
+ class ReadWrites:
156
+ reads: Set[Dep]
157
+ writes: Set[Dep]
158
+ index_exprs: Set[IndexExprDep]
159
+ range_vars: Optional[List[sympy.Expr]] = None
160
+ var_ranges: Optional[VarRanges] = None
161
+ op_counts: typing.Counter[str] = dataclasses.field(
162
+ default_factory=collections.Counter
163
+ )
164
+
165
+ def rename(self, renames: typing.Dict[str, str]) -> "ReadWrites":
166
+ return ReadWrites(
167
+ {dep.rename(renames) for dep in self.reads},
168
+ {dep.rename(renames) for dep in self.writes},
169
+ self.index_exprs,
170
+ self.range_vars,
171
+ self.var_ranges,
172
+ op_counts=self.op_counts,
173
+ )
174
+
175
+ def with_read(self, dep: Dep) -> "ReadWrites":
176
+ assert isinstance(dep, (WeakDep, StarDep))
177
+ return ReadWrites(
178
+ set.union(self.reads, {dep}),
179
+ self.writes,
180
+ self.index_exprs,
181
+ self.range_vars,
182
+ self.var_ranges,
183
+ op_counts=self.op_counts,
184
+ )
185
+
186
+ def merge(self, other: "ReadWrites"):
187
+ reads = set.union(self.reads, other.reads)
188
+ writes = set.union(self.writes, other.writes)
189
+ index_exprs = set.union(self.index_exprs, other.index_exprs)
190
+ op_counts = collections.Counter(self.op_counts)
191
+ op_counts.update(other.op_counts)
192
+ return ReadWrites(reads - writes, writes, index_exprs, op_counts=op_counts)
193
+
194
+ @staticmethod
195
+ def merge_list(read_writes: List["ReadWrites"]):
196
+ all_writes = set.union(*[rw.writes for rw in read_writes])
197
+ all_reads = set.union(*[rw.reads for rw in read_writes]) - all_writes
198
+ all_index_exprs = set.union(*[rw.index_exprs for rw in read_writes])
199
+
200
+ op_counts: typing.Counter[Any] = collections.Counter()
201
+ for rw in read_writes:
202
+ op_counts.update(rw.op_counts)
203
+
204
+ return ReadWrites(all_reads, all_writes, all_index_exprs, op_counts=op_counts)
205
+
206
+ def remove_reads(self, rem_reads):
207
+ return ReadWrites(
208
+ self.reads - rem_reads,
209
+ self.writes,
210
+ self.index_exprs,
211
+ self.range_vars,
212
+ self.var_ranges,
213
+ op_counts=self.op_counts,
214
+ )
215
+
216
+ def reads_and_writes(self):
217
+ return itertools.chain(self.reads, self.writes)
218
+
219
+
220
+ class _RecordLoadStoreInner(V.MockHandler): # type: ignore[name-defined]
221
+ def __init__(self, var_ranges: VarRanges, normalize: bool):
222
+ super().__init__()
223
+ self._reads: Set[Dep] = set()
224
+ self._writes: Set[MemoryDep] = set()
225
+ self._index_exprs: Set[IndexExprDep] = set()
226
+ self._var_ranges: VarRanges = var_ranges
227
+ self._normalize: bool = normalize
228
+
229
+ def canonicalize(
230
+ self, index: sympy.Expr
231
+ ) -> Tuple[sympy.Expr, Tuple[sympy.Symbol, ...], Tuple[sympy.Expr, ...]]:
232
+ if not self._normalize:
233
+ sizes = [V.graph.sizevars.simplify(x) for x in self._var_ranges.values()]
234
+ var_names = tuple(
235
+ k for k, v in zip(self._var_ranges.keys(), sizes) if v != 1
236
+ )
237
+ sizes = tuple(v for v in sizes if v != 1)
238
+ return index, var_names, sizes # type: ignore[return-value]
239
+
240
+ # Try to further simplify the indexes even if simplify_loops didn't
241
+ # convert it to the simplest form because of the interference from
242
+ # different indexing formulas.
243
+ free_symbols = index.free_symbols
244
+ var_ranges = {
245
+ k: V.graph.sizevars.simplify(v)
246
+ for k, v in self._var_ranges.items()
247
+ # TODO(jansel): explore this further normalization
248
+ # if k in free_symbols
249
+ }
250
+ index_vars = [*var_ranges.keys()]
251
+ sizes = tuple(var_ranges.values())
252
+ new_sizes, reindex, prune = V.graph.sizevars._simplify_loops(
253
+ index_vars,
254
+ sizes,
255
+ index_prevent_reordering([index], index_vars, sizes),
256
+ )
257
+
258
+ # assign new variables each dimension to deal with numbering mismatches
259
+ # d0, d1, d2 could become d0, d2 -- which won't match d0, d1
260
+ new_vars, add_var = var_builder(canonicalization_prefix())
261
+ replacement = dict(zip(index_vars, reindex([add_var(x) for x in new_sizes])))
262
+ index = sympy_subs(sympy.expand(index), replacement)
263
+
264
+ new_vars = [*new_vars.keys()]
265
+ new_sizes = [*new_sizes]
266
+ free_symbols = index.free_symbols
267
+ while new_vars and new_vars[-1] not in free_symbols:
268
+ # Reduction has last (reduced) dim in its sizes, but
269
+ # downstream users won't. Normalize this away.
270
+ new_vars.pop()
271
+ new_sizes.pop()
272
+ return index, tuple(new_vars), tuple(new_sizes) # type: ignore[arg-type]
273
+
274
+ def load(self, name: str, index: sympy.Expr) -> str:
275
+ self._reads.add(MemoryDep(name, *self.canonicalize(index)))
276
+ return f"load({name}, {sympy_str(index)})"
277
+
278
+ def load_seed(self, name: str, index: int):
279
+ assert isinstance(index, int)
280
+ return self.load(name, sympy.Integer(index))
281
+
282
+ def store(self, name: str, index: sympy.Expr, value: str, mode=None) -> str:
283
+ self._writes.add(MemoryDep(name, *self.canonicalize(index)))
284
+ return f"store({name}, {sympy_str(index)}, {value}, {mode})"
285
+
286
+ def store_reduction(self, name: str, index, value) -> str:
287
+ return self.store(name, index, f"store_reduction({value})")
288
+
289
+ def index_expr(self, index: sympy.Expr, dtype) -> str:
290
+ self._index_exprs.add(IndexExprDep(*self.canonicalize(index)))
291
+ return f"index_expr({sympy_str(index)}, {dtype})"
292
+
293
+ def bucketize(
294
+ self,
295
+ values,
296
+ offsets_name: str,
297
+ offsets_size: sympy.Expr,
298
+ indexing_dtype: torch.dtype,
299
+ right: bool,
300
+ ):
301
+ self._reads.add(StarDep(offsets_name))
302
+ return f"bucketize({values}, {offsets_name}, {sympy_str(offsets_size)}, {indexing_dtype}, {right})"
303
+
304
+
305
+ class _OpCounter:
306
+ """Shim to count how many times each op is used"""
307
+
308
+ def __init__(self, inner):
309
+ super().__init__()
310
+ self.parent_handler = inner
311
+ self._op_counts: typing.Counter[Any] = collections.Counter()
312
+
313
+ def __getattr__(self, name):
314
+ self._op_counts[name] += 1
315
+ return getattr(self.parent_handler, name)
316
+
317
+
318
+ class RecordLoadStore(V.KernelFormatterHandler): # type: ignore[name-defined]
319
+ def __init__(self, var_ranges: VarRanges, normalize: bool):
320
+ parent_handler = _RecordLoadStoreInner(
321
+ var_ranges=var_ranges, normalize=normalize
322
+ )
323
+ parent_handler = _OpCounter(parent_handler)
324
+ super().__init__(parent_handler=parent_handler)
325
+
326
+
327
+ def var_builder(prefix: str) -> Tuple[VarRanges, Callable[[sympy.Expr], sympy.Symbol]]:
328
+ cnt = itertools.count()
329
+ var_ranges: VarRanges = dict()
330
+
331
+ def add_var(length: sympy.Expr) -> sympy.Symbol:
332
+ v = sympy_index_symbol(f"{prefix}{next(cnt)}")
333
+ var_ranges[v] = length
334
+ return v
335
+
336
+ return var_ranges, add_var
337
+
338
+
339
+ def index_vars_no_squeeze(*argsizes: Tuple[sympy.Expr, ...], prefix: str):
340
+ var_ranges, add_var = var_builder(prefix)
341
+ args: List[List[sympy.Symbol]] = []
342
+ for size in argsizes:
343
+ args.append(list(map(add_var, size)))
344
+ return args, var_ranges
345
+
346
+
347
+ def index_vars_squeeze(*argsizes: Tuple[sympy.Expr, ...], prefix: str = "d"):
348
+ from .ir import SqueezeView
349
+
350
+ var_ranges, add_var = var_builder(prefix)
351
+ args: List[List[sympy.Expr]] = []
352
+ new_sizes: List[List[sympy.Expr]] = []
353
+ for size in argsizes:
354
+ new_size, reindex = SqueezeView.squeezer(size)
355
+ new_sizes.append(new_size)
356
+ args.append(reindex(list(map(add_var, new_size))))
357
+ return args, var_ranges
358
+
359
+
360
+ def extract_read_writes(
361
+ fn: Callable[..., Any],
362
+ *argsizes: Tuple[sympy.Expr, ...],
363
+ normalize: bool = False,
364
+ prefix: str = "d",
365
+ ):
366
+ args, var_ranges = index_vars_squeeze(*argsizes, prefix=prefix)
367
+ rw = RecordLoadStore(var_ranges, normalize=normalize)
368
+ with V.set_ops_handler(rw):
369
+ fn(*args)
370
+
371
+ if normalize:
372
+ range_vars = [] # Number of vars could differ due to normalization
373
+ else:
374
+ range_vars = list(itertools.chain.from_iterable(args))
375
+
376
+ inner = rw.parent_handler.parent_handler
377
+ return ReadWrites(
378
+ set(inner._reads),
379
+ set(inner._writes),
380
+ inner._index_exprs,
381
+ range_vars,
382
+ var_ranges,
383
+ rw.parent_handler._op_counts,
384
+ )
385
+
386
+
387
+ def extract_input_node_reduction_ranges(
388
+ input_node: "torch._inductor.ir.TensorBox",
389
+ ) -> Tuple[Optional[List[sympy.Expr]], Optional[List[sympy.Expr]]]:
390
+ """
391
+ Returns the size and reduction size of all inputs, if the sizes and reduction_sizes (if exist) are all the same.
392
+ It's possible that a node has multiple inputs, some are Reduction nodes and others are Pointwise nodes.
393
+ In this case, reduction_sizes of the Reduction nodes need to be the same.
394
+ Otherwise returns (None, None).
395
+ """
396
+
397
+ from .ir import ComputedBuffer, Loops
398
+
399
+ if isinstance(input_node.data, ComputedBuffer):
400
+ # Input node has already been realized. Return its size and reduction_size.
401
+ size = input_node.get_size()
402
+ reduction_size = input_node.get_reduction_size()
403
+ if len(reduction_size) > 0:
404
+ return (size, reduction_size)
405
+ else:
406
+ return (None, None)
407
+
408
+ if not isinstance(input_node.data.data, Loops): # type: ignore[attr-defined]
409
+ # Other IRNodes do not have reduction_ranges.
410
+ return (None, None)
411
+
412
+ # There is one issue: what if there are views / permutations between the input node and its dependent realized nodes?
413
+ # The current method still uses reduction ranges from the dependent realized node, which is not ideal.
414
+ # Is there a way to check whether there are permutations inbetween?
415
+ reads = input_node.get_reads()
416
+ reduction_size = None
417
+ size = None
418
+ while reduction_size is None and len(reads) > 0:
419
+ seen = set()
420
+ new_reads = []
421
+ for read in reads:
422
+ if not isinstance(read, MemoryDep):
423
+ continue
424
+ if read.name in seen:
425
+ continue
426
+ seen.add(read.name)
427
+ buffer = V.graph.get_buffer(read.name)
428
+ if buffer is None:
429
+ continue
430
+ if (
431
+ isinstance(buffer, ComputedBuffer)
432
+ and len(buffer.get_reduction_size()) > 0
433
+ ):
434
+ if reduction_size is None:
435
+ reduction_size = buffer.get_reduction_size()
436
+ size = buffer.get_size()
437
+ elif (
438
+ reduction_size != buffer.get_reduction_size()
439
+ or size != buffer.get_size()
440
+ ):
441
+ return (None, None)
442
+ else:
443
+ new_reads.extend(buffer.get_reads())
444
+ if reads == new_reads:
445
+ return (size, reduction_size)
446
+ else:
447
+ reads = new_reads
448
+ return (size, reduction_size)
449
+
450
+
451
+ def canonicalization_prefix():
452
+ return "c"
453
+
454
+
455
+ # ops handler which computes all the free unbacked symbols for an IR
456
+ class FreeUnbackedSymbolsOpsHandler:
457
+ symbols: Set[sympy.Symbol]
458
+
459
+ def __init__(self):
460
+ self.symbols = set()
461
+
462
+ def __getattr__(self, name: str) -> Callable[..., Any]:
463
+ def inner(*args, **kwargs):
464
+ for a in itertools.chain(args, kwargs.values()):
465
+ if isinstance(a, (sympy.Expr, sympy.logic.boolalg.Boolean)):
466
+ self.symbols |= free_unbacked_symbols(a)
467
+
468
+ return inner
469
+
470
+ def indirect_indexing(self, index_var, size, check=True) -> sympy.Symbol:
471
+ assert not isinstance(index_var, (sympy.Expr, sympy.logic.boolalg.Boolean))
472
+ self.symbols |= free_unbacked_symbols(size)
473
+ return sympy_index_symbol(f"({str(index_var)})")
474
+
475
+ def frexp(self, x):
476
+ return (None,) * 2
477
+
478
+ def reduction(
479
+ self,
480
+ dtype: torch.dtype,
481
+ src_dtype: torch.dtype,
482
+ reduction_type: ReductionType,
483
+ value: Union[None, Tuple[None, ...]],
484
+ ) -> Union[None, Tuple[None, ...]]:
485
+ num_values = reduction_num_outputs(reduction_type)
486
+ return (None,) * num_values if num_values > 1 else None
487
+
488
+
489
+ def _typecheck_FreeUnbackedSymbolsOpsHandler(
490
+ h: FreeUnbackedSymbolsOpsHandler,
491
+ ) -> OpsHandler[None]:
492
+ return h
493
+
494
+
495
+ def extract_free_unbacked_symbols(fn: Callable[..., Any], index, rindex=None):
496
+ from .ir import FlexibleLayout
497
+
498
+ args = [index, rindex] if rindex is not None else [index]
499
+ handler = FreeUnbackedSymbolsOpsHandler()
500
+ # NB: I cargo culted the allow_indexing patch here, I don't understand why
501
+ # people do this all over
502
+ with V.set_ops_handler(handler), patch.object(
503
+ FlexibleLayout, "allow_indexing", True
504
+ ):
505
+ fn(*args)
506
+ return handler.symbols
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/exc.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import tempfile
5
+ import textwrap
6
+ from functools import lru_cache
7
+
8
+ if os.environ.get("TORCHINDUCTOR_WRITE_MISSING_OPS") == "1":
9
+
10
+ @lru_cache(None)
11
+ def _record_missing_op(target):
12
+ with open(f"{tempfile.gettempdir()}/missing_ops.txt", "a") as fd:
13
+ fd.write(str(target) + "\n")
14
+
15
+ else:
16
+
17
+ def _record_missing_op(target): # type: ignore[misc]
18
+ pass
19
+
20
+
21
+ class OperatorIssue(RuntimeError):
22
+ @staticmethod
23
+ def operator_str(target, args, kwargs):
24
+ lines = [f"target: {target}"] + [
25
+ f"args[{i}]: {arg}" for i, arg in enumerate(args)
26
+ ]
27
+ if kwargs:
28
+ lines.append(f"kwargs: {kwargs}")
29
+ return textwrap.indent("\n".join(lines), " ")
30
+
31
+
32
+ class MissingOperatorWithoutDecomp(OperatorIssue):
33
+ def __init__(self, target, args, kwargs):
34
+ _record_missing_op(target)
35
+ super().__init__(f"missing lowering\n{self.operator_str(target, args, kwargs)}")
36
+
37
+
38
+ class MissingOperatorWithDecomp(OperatorIssue):
39
+ def __init__(self, target, args, kwargs):
40
+ _record_missing_op(target)
41
+ super().__init__(
42
+ f"missing decomposition\n{self.operator_str(target, args, kwargs)}"
43
+ + textwrap.dedent(
44
+ f"""
45
+
46
+ There is a decomposition available for {target} in
47
+ torch._decomp.get_decompositions(). Please add this operator to the
48
+ `decompositions` list in torch._inductor.decompositions
49
+ """
50
+ )
51
+ )
52
+
53
+
54
+ class LoweringException(OperatorIssue):
55
+ def __init__(self, exc: Exception, target, args, kwargs):
56
+ super().__init__(
57
+ f"{type(exc).__name__}: {exc}\n{self.operator_str(target, args, kwargs)}"
58
+ )
59
+
60
+
61
+ class InvalidCxxCompiler(RuntimeError):
62
+ def __init__(self):
63
+ from . import config
64
+
65
+ super().__init__(
66
+ f"No working C++ compiler found in {config.__name__}.cpp.cxx: {config.cpp.cxx}"
67
+ )
68
+
69
+
70
+ class CppWrapperCodeGenError(RuntimeError):
71
+ def __init__(self, msg: str):
72
+ super().__init__(f"C++ wrapper codegen error: {msg}")
73
+
74
+
75
+ class CppCompileError(RuntimeError):
76
+ def __init__(self, cmd: list[str], output: str):
77
+ if isinstance(output, bytes):
78
+ output = output.decode("utf-8")
79
+
80
+ super().__init__(
81
+ textwrap.dedent(
82
+ """
83
+ C++ compile error
84
+
85
+ Command:
86
+ {cmd}
87
+
88
+ Output:
89
+ {output}
90
+ """
91
+ )
92
+ .strip()
93
+ .format(cmd=" ".join(cmd), output=output)
94
+ )
95
+
96
+
97
+ class CUDACompileError(CppCompileError):
98
+ pass
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_utils.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import operator
2
+ from collections import defaultdict
3
+ from typing import Any, Callable, DefaultDict, Dict, Optional, Tuple, Type
4
+
5
+ import torch
6
+ import torch.fx
7
+ from torch.fx.experimental.symbolic_shapes import statically_known_true, sym_eq
8
+ from torch.utils import _pytree as pytree
9
+ from torch.utils._pytree import tree_map
10
+ from .virtualized import V
11
+
12
+
13
+ # Check the pattern: (nn.module, F.function/torch.Tensor.method) matched.
14
+ # Works for length 2 patterns with 1 module and 1 function/method.
15
+ def matches_module_function_pattern(
16
+ pattern: Tuple[Type[torch.nn.modules.Module], Callable[..., Any]],
17
+ node: torch.fx.node.Node,
18
+ modules: Dict[str, torch.nn.modules.Module],
19
+ ) -> bool:
20
+ if len(node.args) == 0:
21
+ return False
22
+ if not isinstance(node.args[0], torch.fx.Node) or not isinstance(
23
+ node, torch.fx.Node
24
+ ):
25
+ return False
26
+ # the first node is call_module
27
+ if node.args[0].op != "call_module":
28
+ return False
29
+ if not isinstance(node.args[0].target, str):
30
+ return False
31
+ if node.args[0].target not in modules:
32
+ return False
33
+ if type(modules[node.args[0].target]) is not pattern[0]:
34
+ return False
35
+ # the second node is call_function or call_method
36
+ if node.op != "call_function" and node.op != "call_method":
37
+ return False
38
+ if node.target != pattern[1]:
39
+ return False
40
+ # make sure node.args[0] output is only used by current node.
41
+ if len(node.args[0].users) > 1:
42
+ return False
43
+ return True
44
+
45
+
46
+ class FakeTensorUpdater:
47
+ """
48
+ The main idea here is that it's difficult to maintain accurate fake
49
+ tensors (our primary form of metadata) for each node in our graph as we
50
+ transform it.
51
+
52
+ The most reliable way to obtain this information is by rerunning
53
+ faketensor propagation. However, in general, faketensor propagation is
54
+ fairly expensive. So, instead we'd like to only rerun faketensor
55
+ propagation on nodes that have changed.
56
+
57
+ In order to detect which nodes have changed, we first hash its node,
58
+ target, and argument lists (which are immutable in FX).
59
+
60
+ Then, whenever we call incremental_update, we check which FX nodes have a
61
+ new hash, and recompute the faketensor metadata for that node. Then, we
62
+ continue to recursively compute the faketensors for all users until the
63
+ fake tensors stop changing.
64
+ """
65
+
66
+ def __init__(self, graph: torch.fx.Graph):
67
+ self.processed_hashes = set()
68
+ self.graph = graph
69
+
70
+ for node in self.graph.nodes:
71
+ self.processed_hashes.add(self.hash_node(node))
72
+
73
+ def hash_node(self, node: torch.fx.Node):
74
+ # todo(chilli): Not a great hash function
75
+ return (node, node.target, id(node.args), id(node.kwargs))
76
+
77
+ def incremental_update(self):
78
+ processed = set()
79
+ existing_storages: DefaultDict[Optional[int], int] = defaultdict(int)
80
+ for node in self.graph.nodes:
81
+ existing_storages[get_node_storage(node)] += 1
82
+
83
+ def is_intlist_same(new, old):
84
+ return statically_known_true(sym_eq(new, old))
85
+
86
+ def is_fake_tensor_same(new, old):
87
+ if type(new) != type(old):
88
+ return False
89
+ if isinstance(new, (list, tuple)):
90
+ if len(new) != len(old):
91
+ return False
92
+ return all(
93
+ is_fake_tensor_same(new_i, old_i) for new_i, old_i in zip(new, old)
94
+ )
95
+ assert isinstance(new, torch.Tensor)
96
+ if not is_intlist_same(new.shape, old.shape) or new.layout != old.layout:
97
+ return False
98
+ if new.layout == torch.strided and (
99
+ not is_intlist_same(new.stride(), old.stride())
100
+ or not statically_known_true(
101
+ new.storage_offset() == old.storage_offset()
102
+ )
103
+ ):
104
+ return False
105
+
106
+ if get_storage(new) == get_storage(old):
107
+ return True
108
+
109
+ # This is the case where it returns a completely fresh storage that's used nowhere else.
110
+ if (
111
+ existing_storages[get_storage(old)] == 1
112
+ and get_storage(new) not in existing_storages
113
+ ):
114
+ return True
115
+ return False
116
+
117
+ for node in self.graph.nodes:
118
+ if self.hash_node(node) in self.processed_hashes:
119
+ continue
120
+
121
+ def is_aten_node(node):
122
+ return node.op == "call_function" and isinstance(
123
+ node.target, torch._ops.OpOverload
124
+ )
125
+
126
+ if not is_aten_node(node):
127
+ continue
128
+
129
+ processing = [node]
130
+ while len(processing) > 0:
131
+ updating_node = processing.pop()
132
+ if updating_node in processed:
133
+ continue
134
+ if is_aten_node(updating_node):
135
+ continue
136
+
137
+ is_valid, args, kwargs = get_fake_args_kwargs(updating_node)
138
+ if not is_valid:
139
+ continue
140
+ with V.fake_mode:
141
+ new_fake_tensor = updating_node.target(*args, **kwargs)
142
+ if "val" in updating_node.meta and is_fake_tensor_same(
143
+ new_fake_tensor, updating_node.meta["val"]
144
+ ):
145
+ continue
146
+ updating_node.meta["val"] = new_fake_tensor
147
+
148
+ # todo(chilli): This code path is not exercised by our existing
149
+ # tests - add a test
150
+ existing_storages[get_node_storage(new_fake_tensor)] += 1
151
+ processed.add(updating_node)
152
+ processing.extend(updating_node.users)
153
+
154
+ self.processed_hashes.add(self.hash_node(updating_node))
155
+
156
+
157
+ def get_storage(t: torch.Tensor) -> int:
158
+ return t.untyped_storage()._cdata
159
+
160
+
161
+ def get_node_storage(node: torch.fx.Node) -> Optional[int]:
162
+ if "val" not in node.meta:
163
+ return None
164
+ if not isinstance(node.meta["val"], torch.Tensor):
165
+ return None
166
+ if not torch._C._has_storage(node.meta["val"]):
167
+ return None
168
+ return get_storage(node.meta["val"])
169
+
170
+
171
+ def get_fake(x):
172
+ if isinstance(x, torch.fx.Node):
173
+ if "val" not in x.meta:
174
+ return x
175
+ return x.meta["val"]
176
+ return x
177
+
178
+
179
+ def get_fake_args_kwargs(x: torch.fx.Node) -> Tuple[bool, Tuple[Any], Dict[str, Any]]:
180
+ """
181
+ First value returns a boolean if any of the input nodes don't have a faketensor.
182
+ """
183
+ args, kwargs = tree_map(get_fake, (x.args, x.kwargs))
184
+ if any(
185
+ isinstance(a, torch.fx.Node) for a in pytree.arg_tree_leaves(*args, **kwargs)
186
+ ):
187
+ return False, args, kwargs
188
+ return True, args, kwargs
189
+
190
+
191
+ def is_node_realized(node: torch.fx.Node) -> bool:
192
+ """Returns true if a node is always realized when lowered to inductor IR.
193
+
194
+ NOTE: This may return some false negatives. e.g. it doesn't
195
+ handle buffers realized heuristically during lowering, or
196
+ buffers realized indirectly through view ops.
197
+ """
198
+ from torch._inductor.lowering import fallbacks, needs_realized_inputs
199
+
200
+ def is_buffer(node: torch.fx.Node) -> bool:
201
+ if node.op == "call_function" and node.target is operator.getitem:
202
+ # For nodes with multiple outputs, we get the fx graph:
203
+ # foo = torch.ops.aten.foo(...)
204
+ # getitem = foo[0]
205
+ # getitem_1 = foo[1]
206
+ # where we need to check if foo is a fallback kernel
207
+ return is_buffer(node.args[0]) # type: ignore[arg-type]
208
+ return node.op in ("placeholder", "output") or node.target in fallbacks
209
+
210
+ if is_buffer(node):
211
+ return True
212
+
213
+ def realizes_inputs(node: torch.fx.Node) -> bool:
214
+ return node.op == "output" or node.target in needs_realized_inputs
215
+
216
+ if any(realizes_inputs(user) for user in node.users):
217
+ return True
218
+
219
+ # Otherwise, assume node isn't realized
220
+ return False
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/index_propagation.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This file implements the IndexPropagation ops handler, which wraps an
2
+ underlying handler to add a limited form of constant propagation, as well as
3
+ propagation of sympy expressions downstream of ops.index_expr calls.
4
+
5
+ For example, say we have the IR:
6
+
7
+ tmp0 = ops.index_expr(x, torch.int32)
8
+ tmp1 = ops.constant(2, torch.int32)
9
+ tmp2 = ops.mul(tmp0, tmp1)
10
+ tmp3 = ops.indirect_indexing(tmp2, x_size)
11
+ tmp4 = ops.load("buf0", tmp3)
12
+
13
+ The underlying handler would just see:
14
+
15
+ ops.load("buf0", x * 2)
16
+
17
+ This is limited by the set of operators handled in the sympy expression
18
+ printers. So simple operations like minimum and maximum cannot be translated to
19
+ SymPy expressions yet, despite sympy.Min and sympy.Max existing.
20
+
21
+ """
22
+ import itertools
23
+ from dataclasses import dataclass
24
+ from typing import Any, Callable, Dict, Literal, Optional, overload, Tuple, Union
25
+
26
+ import sympy
27
+
28
+ from typing_extensions import TypeAlias
29
+
30
+ import torch
31
+ from torch._prims_common import is_boolean_dtype, is_integer_dtype
32
+ from torch.utils._sympy.functions import FloorDiv, ModularIndexing, Where
33
+
34
+
35
@dataclass
class TypedExpr:
    """A SymPy expression with associated type"""

    expr: sympy.Expr  # the symbolic value
    dtype: torch.dtype  # the torch dtype this expression is interpreted as
41
+
42
+
43
class SymPyOps:
    """An ops handler where all IR values are SymPy expressions

    When a value cannot be represented as a SymPy expression, the method is
    either not defined, or returns NotImplemented

    """

    @staticmethod
    def identity(value: Any) -> Any:
        # No-op, exists so the handler protocol is complete.
        return value

    @staticmethod
    def constant(value: Union[int, float, bool], dtype: torch.dtype) -> TypedExpr:
        # Coerce through the dtype's python type so e.g. a float constant
        # with an integer dtype is truncated here, at build time.
        if is_boolean_dtype(dtype):
            expr = sympy.Integer(bool(value))
        elif is_integer_dtype(dtype):
            expr = sympy.Integer(int(value))
        else:
            expr = sympy.Float(float(value))
        return TypedExpr(expr, dtype)

    @staticmethod
    def index_expr(value: sympy.Expr, dtype: torch.dtype) -> Union[int, TypedExpr]:
        # Plain python ints are normalized to sympy.Integer first.
        if isinstance(value, int):
            value = sympy.Integer(value)
        return TypedExpr(value, dtype)

    @staticmethod
    def to_dtype(
        value: Any, dtype: torch.dtype, src_dtype: Optional[torch.dtype] = None
    ) -> Union[int, TypedExpr]:
        # Constants can always be re-typed; int->int casts stay symbolic.
        if isinstance(value.expr, (sympy.Integer, sympy.Float)):
            return SymPyOps.constant(value.expr, dtype)
        elif is_integer_dtype(dtype) and is_integer_dtype(value.dtype):
            return SymPyOps.index_expr(value.expr, dtype)
        else:
            # TODO: Inductor doesn't handle floating point in sympy expressions well at the moment
            return NotImplemented

    @staticmethod
    def square(x: TypedExpr) -> TypedExpr:
        return TypedExpr(x.expr * x.expr, x.dtype)

    @staticmethod
    def add(x: TypedExpr, y: TypedExpr) -> TypedExpr:
        result_type = torch.promote_types(x.dtype, y.dtype)
        return TypedExpr(x.expr + y.expr, result_type)

    @staticmethod
    def sub(x: TypedExpr, y: TypedExpr) -> TypedExpr:
        result_type = torch.promote_types(x.dtype, y.dtype)
        return TypedExpr(x.expr - y.expr, result_type)

    @staticmethod
    def mul(x: TypedExpr, y: TypedExpr) -> TypedExpr:
        result_type = torch.promote_types(x.dtype, y.dtype)
        return TypedExpr(x.expr * y.expr, result_type)

    @staticmethod
    def neg(x: TypedExpr) -> TypedExpr:
        return TypedExpr(-x.expr, x.dtype)

    @staticmethod
    def floordiv(x: TypedExpr, y: TypedExpr) -> TypedExpr:
        # Only integer floordiv is representable (FloorDiv helper).
        result_type = torch.promote_types(x.dtype, y.dtype)
        if not is_integer_dtype(result_type):
            return NotImplemented

        return TypedExpr(FloorDiv(x.expr, y.expr), result_type)

    @staticmethod
    def mod(x: TypedExpr, y: TypedExpr) -> Optional[TypedExpr]:
        # ModularIndexing(x, 1, y) encodes x % y for integer expressions.
        result_type = torch.promote_types(x.dtype, y.dtype)
        if not is_integer_dtype(result_type):
            return NotImplemented

        result_expr = ModularIndexing(x.expr, sympy.Integer(1), y.expr)
        return TypedExpr(result_expr, result_type)

    @staticmethod
    def remainder(x: TypedExpr, y: TypedExpr) -> Optional[TypedExpr]:
        result_type = torch.promote_types(x.dtype, y.dtype)
        if not is_integer_dtype(result_type):
            return NotImplemented
        # In these cases, remainder in Python == remainder in C++, so this transformation
        # is sound
        if (
            x.expr.is_nonnegative is not None
            and x.expr.is_nonnegative == y.expr.is_positive
        ):
            result_expr = ModularIndexing(x.expr, sympy.Integer(1), y.expr)
            return TypedExpr(result_expr, result_type)
        return NotImplemented

    @staticmethod
    def minimum(x: TypedExpr, y: TypedExpr) -> TypedExpr:
        result_type = torch.promote_types(x.dtype, y.dtype)
        return TypedExpr(sympy.Min(x.expr, y.expr), result_type)

    @staticmethod
    def maximum(x: TypedExpr, y: TypedExpr) -> TypedExpr:
        result_type = torch.promote_types(x.dtype, y.dtype)
        return TypedExpr(sympy.Max(x.expr, y.expr), result_type)
147
+
148
+
149
@dataclass
class IndexPropVar:
    # value is either an IR value (opaque handle from the wrapped ops
    # handler), or a TypedExpr when is_symbolic is true.
    value: Any  # Either an IR value, or TypedExpr if is_symbolic is true
    is_symbolic: bool = False

    @staticmethod
    def new_symbolic(expr: TypedExpr) -> "IndexPropVar":
        """Wrap a TypedExpr as a symbolic variable."""
        return IndexPropVar(expr, is_symbolic=True)

    def __post_init__(self):
        # Invariant: symbolic vars always carry a TypedExpr payload.
        assert not self.is_symbolic or isinstance(
            self.value, TypedExpr
        ), "Symbolic IndexPropVar must contain a TypedExpr"
162
+
163
+
164
+ IndexPropResult: TypeAlias = Union[IndexPropVar, Tuple["IndexPropResult", ...]]
165
+
166
+
167
class IndexPropagation:
    """Ops wrapper that tries to propagate constant and index_expr values through the computation.

    This aims to maximize the compile time simplification possible, and convert
    indirect indexing from arange into normal static indexing.

    """

    def __init__(self, inner: Any):
        # `inner` is the wrapped ops handler; anything we cannot keep
        # symbolic is forwarded to it.
        self._inner = inner

    def materialize_expr(self, expr: sympy.Expr, dtype: torch.dtype) -> Any:
        """Lower a SymPy expression back into an inner-handler IR value."""
        # Construct a new constant/index_expr from the SymPy expression
        if isinstance(expr, sympy.Integer):
            return self._inner.constant(int(expr), dtype)
        elif expr.is_number:
            return self._inner.constant(float(expr), dtype)
        return self._inner.index_expr(expr, dtype)

    def unwrap(self, a: Union[Any, IndexPropVar]) -> Any:
        """Convert an IndexPropVar (or nested tuple of them) to inner IR values."""
        if isinstance(a, (list, tuple)):
            return tuple(self.unwrap(v) for v in a)

        if not isinstance(a, IndexPropVar):
            return a

        # Prefer the sympy representation if possible
        if a.is_symbolic:
            return self.materialize_expr(a.value.expr, a.value.dtype)

        return a.value

    def wrap(self, a) -> IndexPropResult:
        """Wrap an inner IR value (or nested tuple) in IndexPropVar."""
        if isinstance(a, (list, tuple)):
            return tuple(self.wrap(v) for v in a)
        return IndexPropVar(a)

    @overload
    def fallback(
        self,
        name: Literal["indirect_indexing"],
        args: Tuple[Any, ...],
        kwargs: Dict[str, Any],
    ) -> IndexPropVar:
        ...

    @overload
    def fallback(
        self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
    ) -> IndexPropResult:
        ...

    def fallback(
        self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
    ) -> IndexPropResult:
        # Fallback to the wrapped handler
        new_args = [self.unwrap(a) for a in args]
        new_kwargs = {k: self.unwrap(v) for k, v in kwargs.items()}
        return self.wrap(getattr(self._inner, name)(*new_args, **new_kwargs))

    def propagate_sympy(
        self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
    ) -> IndexPropResult:
        # Build a new SymPy expression from this ops call
        def unwrap(a: Union[Any, IndexPropVar]) -> Any:
            if not isinstance(a, IndexPropVar):
                return a
            return a.value

        new_args = [unwrap(a) for a in args]
        new_kwargs = {k: unwrap(v) for k, v in kwargs.items()}
        new_expr = getattr(SymPyOps, name)(*new_args, **new_kwargs)
        is_valid_expr = new_expr is not NotImplemented and (
            # Inductor doesn't expect floating point in sympy expressions, but
            # allow floating point constants to be propagated
            isinstance(new_expr.expr, sympy.Number)
            or new_expr.expr.is_integer
        )
        if not is_valid_expr:
            # Cannot stay symbolic; evaluate through the wrapped handler.
            return self.fallback(name, args, kwargs)
        return IndexPropVar.new_symbolic(new_expr)

    def __getattr__(self, name: str) -> Callable[..., IndexPropResult]:
        # Intercept every ops call; keep it symbolic when SymPyOps supports
        # the op and all IndexPropVar inputs are symbolic.
        def inner(*args: Any, **kwargs: Any) -> IndexPropResult:
            if not hasattr(SymPyOps, name):
                return self.fallback(name, args, kwargs)

            var_arguments = [
                a
                for a in itertools.chain(args, kwargs.values())
                if isinstance(a, IndexPropVar)
            ]
            if not all(v.is_symbolic for v in var_arguments):
                return self.fallback(name, args, kwargs)

            return self.propagate_sympy(name, args, kwargs)

        return inner

    def indirect_indexing(
        self, index: Union[Any, IndexPropVar], size: Any, check: bool = True
    ) -> Any:
        # nb. We do index + Where(...) rather than Where(idx >= 0, idx, idx + sz) because we don't have CSE
        # for SymPy expressions, so we don't want to repeat idx too much

        # indirect_indexing returns a sympy value, so no need to wrap in IndexPropVar here
        if isinstance(index, IndexPropVar) and index.is_symbolic:
            # If we are turning a indirect indexing into direct, we need to wrap it.
            index = index.value.expr
            return index + Where(index >= 0, 0, size)
        return self.fallback("indirect_indexing", (index, size, check), {}).value
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/metrics.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import inspect
5
+ import os
6
+ import re
7
+ from dataclasses import dataclass
8
+ from functools import lru_cache
9
+
10
+ from typing import Dict, List, Set, Tuple, TYPE_CHECKING, Union
11
+
12
+ from torch._inductor import config
13
+ from torch._inductor.utils import get_benchmark_name
14
+
15
# Prevent circular import
if TYPE_CHECKING:
    from torch._inductor.scheduler import (
        BaseSchedulerNode,
        ExternKernelSchedulerNode,
        NopKernelSchedulerNode,
        SchedulerNode,
    )

# counter for tracking how many kernels have been generated
generated_kernel_count = 0
# counter for cpp kernels that were emitted in vectorized form
generated_cpp_vec_kernel_count = 0
# total bytes read/written, accumulated elsewhere in inductor
num_bytes_accessed = 0
# (scheduler node, element count) pairs, appended during lowering
nodes_num_elem: List[
    Tuple[
        Union[NopKernelSchedulerNode, SchedulerNode, ExternKernelSchedulerNode],
        int,
    ]
] = []
# (scheduler node, estimated runtime) pairs
node_runtimes: List[Tuple[BaseSchedulerNode, float]] = []

# counters for tracking fusions
ir_nodes_pre_fusion = 0

# counters for tracking to_dtype inserted
cpp_to_dtype_count = 0

# counters for tracking cpp_wrapper disabled
disable_cpp_wrapper = 0
45
+
46
# reset all counters
def reset():
    """Reset every module-level metrics counter/accumulator to its initial state."""
    global generated_kernel_count
    global generated_cpp_vec_kernel_count
    global num_bytes_accessed, nodes_num_elem
    global ir_nodes_pre_fusion
    global cpp_to_dtype_count
    global disable_cpp_wrapper

    generated_kernel_count = 0
    generated_cpp_vec_kernel_count = 0
    num_bytes_accessed = 0
    # The lists are cleared in place (no rebinding), so node_runtimes needs
    # no `global` declaration here.
    nodes_num_elem.clear()
    node_runtimes.clear()
    ir_nodes_pre_fusion = 0
    cpp_to_dtype_count = 0
    disable_cpp_wrapper = 0
63
+
64
+
65
@dataclass
class CachedMetricsDeltas:
    """
    The subset of metrics we want to update across cache hits, e.g., the
    FxGraphCache.
    """

    generated_kernel_count: int
    generated_cpp_vec_kernel_count: int
    ir_nodes_pre_fusion: int
    cpp_to_dtype_count: int
76
+
77
+
78
class CachedMetricsHelper:
    """
    A helper class to help calculate and apply counter deltas for those
    metrics we want to save with cache entries (e.g., FxGraphCache) and
    apply on a cache hit.
    """

    # Module-level counters that are snapshotted and delta'd, listed in
    # CachedMetricsDeltas field order.
    _metric_names = (
        "generated_kernel_count",
        "generated_cpp_vec_kernel_count",
        "ir_nodes_pre_fusion",
        "cpp_to_dtype_count",
    )

    def __init__(self):
        # Snapshot the current value of each tracked module-level counter.
        for name in self._metric_names:
            setattr(self, name, globals()[name])

    def get_deltas(self) -> CachedMetricsDeltas:
        """Current counter values minus the snapshot taken in __init__."""
        return CachedMetricsDeltas(
            *(globals()[name] - getattr(self, name) for name in self._metric_names)
        )

    @staticmethod
    def apply_deltas(delta: CachedMetricsDeltas):
        """Re-apply previously recorded deltas to the live module counters."""
        for name in CachedMetricsHelper._metric_names:
            globals()[name] += getattr(delta, name)
120
+
121
+
122
+ REGISTERED_METRIC_TABLES: Dict[str, MetricTable] = {}
123
+
124
+
125
@dataclass
class MetricTable:
    """A CSV-backed metrics table, registered globally by table_name."""

    table_name: str
    column_names: List[str]

    num_rows_added: int = 0

    def add_row(self, row_fn):
        """Evaluate row_fn (only when this table is enabled) and append its row."""
        if self.table_name not in enabled_metric_tables():
            return

        row_dict = row_fn()
        assert len(self.column_names) == len(
            row_dict
        ), f"{len(self.column_names)} v.s. {len(row_dict)}"
        assert set(self.column_names) == set(
            row_dict.keys()
        ), f"{set(self.column_names)} v.s. {set(row_dict.keys())}"

        # The benchmark/model name is always the implicit first column.
        row = [get_benchmark_name()]
        row.extend(row_dict[column_name] for column_name in self.column_names)
        self._write_row(row)

    def output_filename(self):
        return f"metric_table_{self.table_name}.csv"

    def write_header(self):
        """(Re)create the CSV file containing only the header row."""
        with open(self.output_filename(), "w") as fd:
            writer = csv.writer(fd, lineterminator="\n")
            writer.writerow(["model_name"] + self.column_names)

    def _write_row(self, row):
        filename = self.output_filename()
        # Lazily create the file with a header on the first write.
        if self.num_rows_added == 0 and not os.path.exists(filename):
            self.write_header()

        self.num_rows_added += 1

        # Normalize cells: floats to 6 decimal places, None to empty string.
        def normalize(value):
            if isinstance(value, float):
                return f"{value:.6f}"
            if value is None:
                return ""
            return value

        with open(filename, "a") as fd:
            writer = csv.writer(fd, lineterminator="\n")
            writer.writerow([normalize(value) for value in row])

    @staticmethod
    def register_table(name, column_names):
        """Create a table and record it in the global registry."""
        REGISTERED_METRIC_TABLES[name] = MetricTable(name, column_names)
183
+
184
+
185
# compare the latency of the two pre-fusion kernels against the fused kernel
MetricTable.register_table(
    "slow_fusion",
    [
        "kernel1_path",
        "kernel1_latency",
        "kernel2_path",
        "kernel2_latency",
        "fused_kernel_path",
        "fused_kernel_latency",
        "slow_down_ratio",
    ],
)

# track the fusion statistics for each graph
MetricTable.register_table(
    "graph_stats",
    [
        "graph_id",
        "num_nodes_before_fusion",
        "num_nodes_after_fusion",
    ],
)

# track the perf difference between persistent reduction and non-persistent
# reductions
MetricTable.register_table(
    "persistent_red_perf",
    [
        "kernel1_name",
        "kernel2_name",
        "kernel1_latency",
        "kernel2_latency",
        "size_hints",
        "reduction_hint",
        "speedup",
    ],
)

# Log metadata for pointwise/reduction kernels. E.g., model name, kernel path, numel, rnumel, reduction hint
MetricTable.register_table(
    "kernel_metadata",
    [
        "kernel_name",
        "kernel_path",
        "kernel_category",  # pointwise/reduction/foreach etc.
        "size_hints",
        "reduction_hint",
        "line_of_code",
        "num_load",
        "num_store",
        "num_for_loop",
        "num_atomic_add",
        "num_args",
        # xyz numel can be different to size_hints since size_hints are rounded
        # up to the nearest power of 2.
        # Inductor kernel will burn in the xyz numel in kernel code for static
        # shape kernels.
        # Logging them will be helpful to find unaligned shape for reduction
        "xnumel",
        "ynumel",
        "rnumel",
        "kernel_args_num_gb",
    ],
)
249
+
250
+
251
def _parse_kernel_fn_code(kernel_module_code):
    """
    The kernel_module_code is the python module that contains kernel function code.
    kernel function is the proper triton kernel function annotated with
    @triton.jit
    """
    from .codecache import PyCodeCache
    from .wrapper_benchmark import get_triton_kernel

    kernel = get_triton_kernel(PyCodeCache.load(kernel_module_code))
    # kernel is a CachingAutotune; kernel.fn is the JITFunction;
    # kernel.fn.fn is the function being decorate by triton.jit
    return inspect.getsource(kernel.fn.fn)
265
+
266
+
267
+ def _parse_kernel_line_of_code(proper_kernel_fn_code):
268
+ """
269
+ Return the line of code for the kernel excluding the decorators.
270
+ """
271
+ return len(proper_kernel_fn_code.splitlines())
272
+
273
+
274
+ def _parse_size_hints(kernel_module_code, kernel_category):
275
+ if kernel_category == "foreach":
276
+ # foreach kernel does not have size_hints
277
+ return None
278
+ m = re.search(r"size_hints=(\[[0-9, ]*\]),", kernel_module_code)
279
+ assert m, "size_hints missing!"
280
+ return m.group(1)
281
+
282
+
283
+ def _parse_reduction_hint(kernel_category, kernel_module_code):
284
+ if kernel_category not in ("reduction", "persistent_reduction"):
285
+ return None
286
+ m = re.search(r"reduction_hint=ReductionHint\.(\w*),", kernel_module_code)
287
+ assert m, "reduction_hint not found in kernel source code!"
288
+ return m.group(1)
289
+
290
+
291
+ def _count_pattern(proper_kernel_fn_code, pattern):
292
+ return proper_kernel_fn_code.count(pattern)
293
+
294
+
295
+ def _count_args(proper_kernel_fn_code):
296
+ def_line = proper_kernel_fn_code.splitlines()[0]
297
+ assert def_line.startswith("def ")
298
+ start_idx = def_line.index("(")
299
+ end_idx = def_line.index("):")
300
+ decl_csv = def_line[start_idx + 1 : end_idx]
301
+ comps = decl_csv.split(",")
302
+ return len(comps)
303
+
304
+
305
+ def _parse_proper_kernel_fn_code(kernel_fn_code):
306
+ """
307
+ Skip decorators.
308
+ """
309
+ start_pos = kernel_fn_code.index("def ")
310
+ return kernel_fn_code[start_pos:]
311
+
312
+
313
+ def _parse_numel(proper_kernel_fn_code, numel_arg_name):
314
+ m = re.search(f"{numel_arg_name} = ([\\d]+)", proper_kernel_fn_code)
315
+ if m:
316
+ return int(m.group(1))
317
+ else:
318
+ return None
319
+
320
+
321
+ def _parse_kernel_args_num_gb(kernel_fn_code, kernel_category):
322
+ """
323
+ inductor meta looks like:
324
+ inductor_meta={... 'mutated_arg_names': [], 'no_x_dim': False, 'kernel_num_gb': 2.0},
325
+ """
326
+ m = re.search(r".kernel_num_gb.:\s*([0-9.]+)", kernel_fn_code)
327
+ if m:
328
+ return float(m.group(1))
329
+ else:
330
+ """
331
+ There are a few cases that kernel_num_gdb field can be missing:
332
+ 1. the field will be missing if config.benchmark_kernel and
333
+ config.profile_bandwidth are false
334
+ 2. even if config.benchmark_kernel or config.profile_bandwidth is true.
335
+ foreach kernel does not have kernel_num_gb field in the metadata
336
+ """
337
+ return None
338
+
339
+
340
def log_kernel_metadata(kernel_name, kernel_path, kernel_module_code):
    """
    An utility to log kernel metadata. We may parse metadata from kernel source code here.

    It's fine to parse the generated kernel code here since the logging is
    disabled by default. It would hurt compilation time.
    """
    from .wrapper_benchmark import get_kernel_category_by_source_code

    kernel_category = get_kernel_category_by_source_code(kernel_module_code)
    reduction_hint = _parse_reduction_hint(kernel_category, kernel_module_code)
    size_hints = _parse_size_hints(kernel_module_code, kernel_category)
    kernel_fn_code = _parse_kernel_fn_code(kernel_module_code)
    proper_kernel_fn_code = _parse_proper_kernel_fn_code(kernel_fn_code)
    # line count excludes the decorators
    kernel_line_of_code = _parse_kernel_line_of_code(proper_kernel_fn_code)

    def make_row():
        return {
            "kernel_name": kernel_name,
            "kernel_path": kernel_path,
            "kernel_category": kernel_category,
            "size_hints": size_hints,
            "reduction_hint": reduction_hint,
            "line_of_code": kernel_line_of_code,
            "num_load": _count_pattern(proper_kernel_fn_code, "tl.load"),
            "num_store": _count_pattern(proper_kernel_fn_code, "tl.store"),
            "num_for_loop": _count_pattern(proper_kernel_fn_code, "for "),
            "num_atomic_add": _count_pattern(proper_kernel_fn_code, "tl.atomic_add"),
            "num_args": _count_args(proper_kernel_fn_code),
            "xnumel": _parse_numel(proper_kernel_fn_code, "xnumel"),
            "ynumel": _parse_numel(proper_kernel_fn_code, "ynumel"),
            "rnumel": _parse_numel(proper_kernel_fn_code, "rnumel"),
            "kernel_args_num_gb": _parse_kernel_args_num_gb(
                kernel_fn_code, kernel_category
            ),
        }

    # add_row only evaluates make_row when the table is enabled
    get_metric_table("kernel_metadata").add_row(make_row)
380
+
381
+
382
def purge_old_log_files():
    """
    Purge the old log file at the beginning when the benchmark script runs.
    Should do it in the parent process rather than the child processes running
    each individual model.
    """
    for name, table in REGISTERED_METRIC_TABLES.items():
        if name not in enabled_metric_tables():
            continue
        filename = table.output_filename()
        if os.path.exists(filename):
            os.unlink(filename)
        # Always start enabled tables from a fresh header.
        table.write_header()
395
+
396
+
397
@lru_cache
def enabled_metric_tables() -> Set[str]:
    """Parse config.enabled_metric_tables (comma separated) into a set of names.

    Cached: the config value is read once on first call.
    """
    enabled = set()
    for raw_name in config.enabled_metric_tables.split(","):
        name = raw_name.strip()
        if not name:
            continue
        assert (
            name in REGISTERED_METRIC_TABLES
        ), f"Metric table name {name} is not registered"
        enabled.add(name)
    return enabled
411
+
412
+
413
def is_metric_table_enabled(name):
    """Whether the named metric table is enabled via config.enabled_metric_tables."""
    return name in enabled_metric_tables()
415
+
416
+
417
def get_metric_table(name):
    """Look up a registered MetricTable by name; asserts it was registered."""
    assert name in REGISTERED_METRIC_TABLES, f"Metric table {name} is not defined"
    return REGISTERED_METRIC_TABLES[name]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/triton_helpers.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import triton
2
+ import triton.language as tl
3
+
4
# In the latest triton, math functions were shuffled around into different modules:
# https://github.com/openai/triton/pull/3172
if hasattr(tl.extra.cuda, "libdevice"):
    # New layout: libdevice intrinsics under tl.extra.cuda, math under tl.math.
    libdevice = tl.extra.cuda.libdevice
    math = tl.math
else:
    # Old layout: libdevice functions live on tl.math, basic math on tl itself.
    libdevice = tl.math
    math = tl
12
+
13
+
14
@triton.jit
def promote_to_tensor(x):
    # Addition promotes to tensor for us
    return x + tl.zeros((1,), tl.int1)


@triton.jit
def is_floating(x):
    # Promote first so scalar inputs also expose a .dtype.
    return promote_to_tensor(x).dtype.is_floating()
23
+
24
+
25
@triton.jit
def _prod_accumulate(a, b):
    # Combine function for the product reduction below.
    return a * b


@triton.jit
def prod(input, axis):
    """Product reduction along `axis`."""
    return tl.reduce(input, axis, _prod_accumulate)
33
+
34
+
35
@triton.jit
def minimum(a, b):
    """Elementwise min that propagates NaN (a != a is true only for NaN)."""
    mask = a < b
    if is_floating(a):
        mask |= a != a
    return tl.where(mask, a, b)


@triton.jit
def maximum(a, b):
    """Elementwise max that propagates NaN."""
    mask = a > b
    if is_floating(a):
        mask |= a != a
    return tl.where(mask, a, b)


@triton.jit
def min2(a, dim):
    """NaN-propagating min reduction along `dim`."""
    return tl.reduce(a, dim, minimum)


@triton.jit
def max2(a, dim):
    """NaN-propagating max reduction along `dim`."""
    return tl.reduce(a, dim, maximum)
59
+
60
+
61
@triton.jit
def minimum_with_index(a_value, a_index, b_value, b_index):
    """Combine fn for argmin-style reductions: pick the smaller value,
    preferring NaN over numbers and the lowest index on ties."""
    mask = a_value < b_value
    equal = a_value == b_value
    if is_floating(a_value):
        a_isnan = a_value != a_value
        b_isnan = b_value != b_value
        # NOTE(review): `and`/`not` on tensor values here rely on Triton's
        # handling of python bool ops inside @triton.jit — confirm against
        # the Triton version in use.
        mask |= a_isnan and not b_isnan
        # Consider NaNs as equal
        equal |= a_isnan and b_isnan

    # Prefer lowest index if values are equal
    mask |= equal & (a_index < b_index)
    return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index)


@triton.jit
def maximum_with_index(a_value, a_index, b_value, b_index):
    """Combine fn for argmax-style reductions: pick the larger value,
    preferring NaN over numbers and the lowest index on ties."""
    mask = a_value > b_value
    equal = a_value == b_value
    if is_floating(a_value):
        a_isnan = a_value != a_value
        b_isnan = b_value != b_value
        # NOTE(review): see minimum_with_index about `and`/`not` semantics.
        mask |= a_isnan and not b_isnan
        # Consider NaNs as equal
        equal |= a_isnan and b_isnan

    # Prefer lowest index if values are equal
    mask |= equal & (a_index < b_index)
    return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index)


@triton.jit
def min_with_index(value, index, dim):
    """Reduce (value, index) pairs along `dim`, returning min value and its index."""
    return tl.reduce((value, index), dim, minimum_with_index)


@triton.jit
def max_with_index(value, index, dim):
    """Reduce (value, index) pairs along `dim`, returning max value and its index."""
    return tl.reduce((value, index), dim, maximum_with_index)
101
+
102
+
103
@triton.jit
def welford_reduce(value, mean, m2, weight, first_iteration):
    """One step of Welford's online mean/variance update with a new sample."""
    if first_iteration:
        # Initialize the accumulators from the first sample.
        new_weight = tl.full(weight.shape, 1, weight.dtype)
        new_mean = value
        new_m2 = tl.zeros_like(m2)
    else:
        delta = value - mean
        new_weight = weight + 1
        new_mean = mean + delta / new_weight
        new_m2 = m2 + delta * (value - new_mean)
    return new_mean, new_m2, new_weight


@triton.jit
def welford_combine(mean_1, m2_1, weight_1, mean_2, m2_2, weight_2):
    """Merge two Welford accumulators (parallel/Chan et al. combination)."""
    delta = mean_2 - mean_1
    new_weight = weight_1 + weight_2
    # Guard against dividing by a zero combined weight.
    w2_over_w = tl.where(new_weight == 0.0, 0.0, weight_2 / new_weight)
    return (
        mean_1 + delta * w2_over_w,
        m2_1 + m2_2 + delta * delta * weight_1 * w2_over_w,
        new_weight,
    )


@triton.jit
def welford(mean, m2, weight, dim):
    """Reduce Welford accumulators along `dim`."""
    return tl.reduce((mean, m2, weight), dim, welford_combine)
132
+
133
+
134
@triton.jit
def device_assert_then(cond, msg, r):
    """Assert `cond` on device, then pass `r` through (forces ordering)."""
    tl.device_assert(cond, msg)
    return r


@triton.jit
def randint64(seed, offset, low, high):
    """Draw a pseudo-random int64 in [low, high) by combining two 32-bit draws.

    NOTE(review): the modulo mapping into the range is as written upstream;
    any modulo bias is inherited from that choice.
    """
    r0, r1, r2, r3 = tl.randint4x(seed, offset)
    r0 = r0.to(tl.uint64)
    r1 = r1.to(tl.uint64)
    # Pack two 32-bit values into one 64-bit value.
    result = r0 | (r1 << 32)
    size = high - low
    result = result % size.to(tl.uint64)
    result = result.to(tl.int64) + low
    return result
150
+
151
+
152
@triton.jit
def _any_combine(a, b):
    # Bitwise-or works as logical-or for the boolean inputs `any` reduces.
    return a | b


@triton.jit
def any(a, dim):
    """Logical-or reduction along `dim`."""
    return tl.reduce(a, dim, _any_combine)
160
+
161
+
162
@triton.jit
def bucketize_binary_search(
    values,  # 1D tensor
    offsets_ptr,
    indexing_dtype,
    right,  # bool: if true, use intervals closed on the left; see [Note: Inductor bucketize op]
    OFFSETS_SIZE: int,
    BLOCK_SHAPE,  # tuple/list of block shape
):
    """
    See [Note: Inductor bucketize op]
    """

    # Per-element search bounds: bucket index lies in [low, high].
    low = tl.zeros(BLOCK_SHAPE, dtype=indexing_dtype)
    high = tl.full(BLOCK_SHAPE, OFFSETS_SIZE, dtype=indexing_dtype)

    # Iterate a fixed number of halvings so every lane runs the same loop.
    full_range = OFFSETS_SIZE + 1
    while full_range > 1:
        mid = (high + low) // 2
        # Guard loads at the end of the offsets array.
        mask = mid < OFFSETS_SIZE
        bucket_upper_bound = tl.load(offsets_ptr + mid, mask=mask)
        if right:
            is_above = values >= bucket_upper_bound
        else:
            is_above = values > bucket_upper_bound

        low = tl.where(is_above & mask, mid + 1, low)
        high = tl.where(is_above, high, mid)

        full_range = (full_range + 1) // 2

    return low
194
+
195
+
196
@triton.jit
def pack_value_flag(
    value,
    flag,
    DTYPE_VALUE_AS_UINT: tl.constexpr,
    DTYPE_PACK: tl.constexpr,
):
    """Pack (value, flag) into one wide integer: flag in the low bits,
    the bit-cast value in the high bits."""
    # Workaround for triton bug, tensor.to doesn't unwrap constexpr values
    DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT)
    bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth
    uv = value.to(DTYPE_VALUE_AS_UINT, bitcast=True).to(DTYPE_PACK)
    return flag.to(DTYPE_PACK) | (uv << bitwidth)


@triton.jit
def unpack_value(
    pack,
    DTYPE_VALUE,
    DTYPE_VALUE_AS_UINT,
):
    """Recover the value from the high bits of a packed word."""
    # Workaround for triton bug, tensor.to doesn't unwrap constexpr values
    DTYPE_VALUE = tl.core._constexpr_to_value(DTYPE_VALUE)
    DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT)
    bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth
    value_uint = (pack >> bitwidth).to(DTYPE_VALUE_AS_UINT)
    return value_uint.to(DTYPE_VALUE, bitcast=True)


@triton.jit
def unpack_flag(pack, DTYPE_FLAG):
    """Recover the flag from the low bits of a packed word."""
    return pack.to(DTYPE_FLAG)
227
+
228
+
229
@triton.jit
def exclusive_scan_decoupled_lookback(
    scratch_base,
    block_value,
    index,
    combine_fn,
    init,
    DTYPE_VALUE_AS_UINT: tl.constexpr,
    DTYPE_PACK: tl.constexpr,
):
    """Compute exclusive scan of a scalar value between blocks

    Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back

    scratch_base: Pointer to scratch space in global memory
    block_value: Scalar value for this block
    index: Scalar index of this block relative to the current scan
    combine_fn: Function ``(value, value) -> value`` which is scanned over
    init: Scalar value equal to the identiy of combine_fn
    DTYPE_VALUE_AS_UINT: A tl.uint{n} type equal in size to ``block_value``
    DTYPE_PACK: Unsigned type twice the width of block_value

    NOTE: This function is limited to values which are 32-bits or less.
    """
    DTYPE_VALUE = block_value.dtype
    # Publish this block's local aggregate with flag=1 ("aggregate available").
    pack = pack_value_flag(
        block_value,
        tl.full(block_value.shape, 1, DTYPE_VALUE_AS_UINT),
        DTYPE_VALUE_AS_UINT,
        DTYPE_PACK,
    )
    tl.atomic_xchg(scratch_base + index, pack, sem="relaxed")

    # Look back over preceding blocks, accumulating their contributions.
    exclusive_prefix = init
    test_target = index - 1
    while test_target >= 0:
        # tl.atomic_load
        # Spin until the predecessor has published something (flag != 0);
        # atomic_add of 0 is used as an atomic load.
        flag = tl.full([], 0, DTYPE_VALUE_AS_UINT)
        while flag == 0:
            pack = tl.atomic_add(scratch_base + test_target, 0, sem="relaxed")
            flag = unpack_flag(pack, DTYPE_VALUE_AS_UINT)

        value = unpack_value(pack, DTYPE_VALUE, DTYPE_VALUE_AS_UINT)
        exclusive_prefix = combine_fn(value, exclusive_prefix)

        # flag == 2 means the predecessor published its full inclusive
        # prefix, so the look-back can stop here.
        if flag == 2:
            test_target = -1
        else:
            test_target = test_target - 1

    # Make inclusive block sum visible to other blocks
    inclusive_prefix = combine_fn(exclusive_prefix, block_value)
    pack = pack_value_flag(
        inclusive_prefix,
        tl.full([], 2, DTYPE_VALUE_AS_UINT),
        DTYPE_VALUE_AS_UINT,
        DTYPE_PACK,
    )
    tl.atomic_xchg(scratch_base + index, pack, sem="relaxed")
    return exclusive_prefix
289
+
290
+
291
+ @triton.jit
292
+ def exclusive_scan_decoupled_lookback_64(
293
+ scratch_base, block_value, index, combine_fn, init
294
+ ):
295
+ """Compute exclusive scan of a scalar value between blocks
296
+
297
+ Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back
298
+
299
+ scratch_base: Pointer to scratch space in global memory
300
+ block_value: Scalar value for this block, must be 64-bits wide
301
+ index: Scalar index of this block relative to the current scan
302
+ combine_fn: Function ``(value, value) -> value`` which is scanned over
303
+ init: Scalar value equal to the identiy of combine_fn
304
+ """
305
+ block_value_u64 = block_value.to(tl.uint64, bitcast=True)
306
+ tl.store(scratch_base + 3 * index + 1, block_value_u64)
307
+ tl.debug_barrier()
308
+ flag_one = tl.full([], 1, tl.uint64)
309
+ tl.atomic_xchg(scratch_base + 3 * index + 0, flag_one, sem="release")
310
+
311
+ exclusive_prefix = init
312
+ test_target = index - 1
313
+ while test_target >= 0:
314
+ flag = tl.full([], 0, tl.uint64)
315
+ while flag == 0:
316
+ flag = tl.atomic_add(scratch_base + 3 * test_target + 0, 0, sem="acquire")
317
+
318
+ value_u64 = tl.load(scratch_base + 3 * test_target + flag.to(tl.int32))
319
+ value = value_u64.to(block_value.dtype, bitcast=True)
320
+ exclusive_prefix = combine_fn(value, exclusive_prefix)
321
+
322
+ if flag == 2:
323
+ test_target = -1
324
+ else:
325
+ test_target = test_target - 1
326
+
327
+ # Make inclusive block sum visible to other blocks
328
+ inclusive_prefix = combine_fn(exclusive_prefix, block_value)
329
+ inclusive_prefix_u64 = inclusive_prefix.to(tl.uint64, bitcast=True)
330
+ tl.store(scratch_base + 3 * index + 2, inclusive_prefix_u64)
331
+ tl.debug_barrier()
332
+ flag_two = tl.full([], 2, tl.uint64)
333
+ tl.atomic_xchg(scratch_base + 3 * index + 0, flag_two, sem="release")
334
+
335
+ return exclusive_prefix
336
+
337
+
338
+ @triton.jit
339
+ def frexp(x):
340
+ # TODO(isuruf): use inline_asm_elementwise here
341
+ y = libdevice.ilogb(x) + 1
342
+ exponent = tl.where(x == 0, 0, y)
343
+ mantissa = tl.where(x == 0, 0, libdevice.ldexp(x, -y))
344
+ return mantissa, exponent
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2022 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <ATen/mps/MPSAllocatorInterface.h>
6
+ #include <ATen/mps/MPSEvent.h>
7
+ #include <ATen/mps/MPSStream.h>
8
+
9
+ #include <cstdio>
10
+ #include <mutex>
11
+ #include <set>
12
+ #include <unordered_set>
13
+ #include <mach/vm_page_size.h>
14
+ #include <c10/util/flat_hash_map.h>
15
+
16
+ // this implementation is based on CUDACachingAllocator.
17
+ // It utilizes Metal Heaps to improve the performance with buffer allocation.
18
+ // Do not include this header. Use MPSAllocatorInterface.h instead.
19
+ // TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
20
+ namespace at::mps::HeapAllocator {
21
+
22
+ static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB
23
+ static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap
24
+ static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB
25
+ static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps
26
+ static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps
27
+ static const size_t kXLargeHeapD = MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
28
+ static const size_t kXLargeHeapU = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
29
+ static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation
30
+
31
+ // buffer pools could be customized with a combination of usage flags
32
+ enum UsageFlags : uint32_t {
33
+ PRIVATE = 0,
34
+ SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
35
+ SHARED = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
36
+ MANAGED = (1 << 2), // managed storage mode
37
+ HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
38
+ SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
39
+ };
40
+ // debug verbosity flags
41
+ enum DebugVerbosity : uint32_t {
42
+ SILENT = 0,
43
+ PROFILING = (1 << 0), // print generic profiling data for total system memory usage
44
+ ALLOCATIONS = (1 << 1), // print buffer allocations
45
+ RECYCLES = (1 << 2), // print buffer recycling
46
+ RELEASES = (1 << 3), // print buffer releases
47
+ LARGE_ONLY = (1 << 4), // only log large buffer pool transactions
48
+ };
49
+
50
+ struct HeapBlock;
51
+
52
+ struct BufferBlock {
53
+ id<MTLBuffer> buffer;
54
+ void* cpu_ptr = nullptr; // stores the pointer to CPU mapping of a Shared MTLBuffer
55
+ size_t size; // size after alignment
56
+ size_t requested_size; // requested size (before alignment)
57
+ // buffer shape is used for retrieving base of views in cached graphs
58
+ std::vector<int64_t> shape;
59
+ bool in_use = false;
60
+ HeapBlock* heap;
61
+ id_t buf_id;
62
+ // counter to candidate least recently used buffers for garbage collection
63
+ uint32_t gc_count = 0;
64
+ uint32_t use_count = 0;
65
+ // counter to assign unique ids to buffer blocks
66
+ static uint64_t buffer_counter;
67
+ // Metal events used to sync GPU/CPU operations on the shared-storage buffers
68
+ MPSEventPtr event;
69
+
70
+ BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr,
71
+ HeapBlock* Heap = nullptr) :
72
+ buffer(Buffer), size(Size), requested_size(RequestedSize),
73
+ heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { }
74
+
75
+ static bool Comparator(const BufferBlock* a, const BufferBlock* b) {
76
+ return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer;
77
+ }
78
+ static size_t alignUp(size_t Size, size_t Alignment) {
79
+ assert(((Alignment - 1) & Alignment) == 0);
80
+ return ((Size + Alignment - 1) & ~(Alignment - 1));
81
+ }
82
+ uint32_t retainCount() const { return [buffer retainCount]; }
83
+ };
84
+ typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*);
85
+
86
+ struct BufferPool;
87
+ struct AllocParams {
88
+ AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) :
89
+ search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { }
90
+ size_t size() const { return search_key.size; }
91
+
92
+ BufferBlock search_key;
93
+ BufferPool* pool;
94
+ BufferBlock* buffer_block = nullptr;
95
+ size_t requested_size;
96
+ // true if we exceed the low watermark limit. In this case
97
+ // we apply strategies to relieve the pressure before allocation.
98
+ bool has_memory_pressure = false;
99
+ // true if we're allocating on a unified memory device
100
+ bool has_unified_memory = true;
101
+ };
102
+
103
+ struct HeapBlock {
104
+ id<MTLHeap> heap;
105
+ struct { size_t total, available; } size;
106
+ BufferPool* pool;
107
+ unsigned int n_buffers = 0;
108
+ id_t heap_id;
109
+ // indicates if we split this heap to sub-allocate 'several' buffers (otherwise single buffer)
110
+ bool is_split;
111
+ // counter to assign unique ids to heap blocks
112
+ static uint64_t heap_counter;
113
+
114
+ HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool *Pool = nullptr) :
115
+ heap(Heap), size({.total = Size, .available = Size}), pool(Pool),
116
+ heap_id(Heap ? ++heap_counter : 0), is_split(true) { }
117
+
118
+ static MTLResourceOptions getOptions(uint32_t usage) {
119
+ // TODO: check the caching performance of write-combined mode
120
+ MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache;
121
+
122
+ if (usage & UsageFlags::MANAGED)
123
+ options |= MTLResourceStorageModeManaged;
124
+ else if (usage & UsageFlags::SHARED)
125
+ options |= MTLResourceStorageModeShared;
126
+ else
127
+ options |= MTLResourceStorageModePrivate;
128
+
129
+ options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
130
+
131
+ return options;
132
+ }
133
+
134
+ static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) {
135
+ HeapBlock *heapBlock = nullptr;
136
+ bool is_split = true;
137
+ const size_t size = params.size();
138
+ MTLHeapDescriptor *d = [MTLHeapDescriptor new];
139
+ if (d) {
140
+ const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
141
+ if (size <= kMaxSmallAlloc) {
142
+ d.size = kSmallHeap;
143
+ } else if (size < kMinLargeAlloc) {
144
+ d.size = kLargeHeap;
145
+ } else if (size < kXLargeHeap / 2 && !params.has_memory_pressure) {
146
+ d.size = kXLargeHeap;
147
+ } else {
148
+ d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
149
+ is_split = false;
150
+ }
151
+ d.storageMode = (usage & UsageFlags::SHARED) ? MTLStorageModeShared : MTLStorageModePrivate;
152
+ d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
153
+ // this automatically handles Metal buffer access synchronizations at the
154
+ // cost of slightly lower performance.
155
+ d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
156
+ d.resourceOptions = getOptions(usage);
157
+ d.type = MTLHeapTypeAutomatic;
158
+ id<MTLHeap> heap = [device newHeapWithDescriptor: d];
159
+ if (heap) {
160
+ [heap setPurgeableState:MTLPurgeableStateNonVolatile];
161
+ const size_t heap_size = heapAvailableSize(heap);
162
+ heapBlock = new HeapBlock(heap_size, heap, params.pool);
163
+ if (heapBlock) {
164
+ heapBlock->is_split = is_split;
165
+ }
166
+ }
167
+ [d release];
168
+ }
169
+ return heapBlock;
170
+ }
171
+ static bool Comparator(const HeapBlock* a, const HeapBlock* b) {
172
+ return (a->size.available != b->size.available) ? a->size.available < b->size.available :
173
+ (uintptr_t)a->heap < (uintptr_t)b->heap;
174
+ }
175
+ static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) {
176
+ return [heap maxAvailableSizeWithAlignment:Alignment];
177
+ }
178
+ NSUInteger Size() {
179
+ return [heap size];
180
+ }
181
+ id<MTLBuffer> newMTLBuffer(size_t length, uint32_t usage) {
182
+ id<MTLBuffer> buf = [heap newBufferWithLength:length options:getOptions(usage)];
183
+ if (buf) {
184
+ updateAvailableSize();
185
+ n_buffers++;
186
+ }
187
+ return buf;
188
+ }
189
+ // returns the retainCount before releasing the buffer
190
+ uint32_t releaseMTLBuffer(id<MTLBuffer>& buffer) {
191
+ const uint32_t retainCount = [buffer retainCount];
192
+ [buffer release];
193
+ buffer = nil;
194
+ updateAvailableSize();
195
+ n_buffers--;
196
+ return retainCount;
197
+ }
198
+ // returns the retainCount before releasing the heap
199
+ uint32_t releaseMTLHeap() {
200
+ const uint32_t retainCount = [heap retainCount];
201
+ TORCH_INTERNAL_ASSERT(!n_buffers); // assert if heap isn't empty
202
+ [heap setPurgeableState:MTLPurgeableStateEmpty];
203
+ [heap release];
204
+ heap = nil;
205
+ size.available = 0;
206
+ return retainCount;
207
+ }
208
+ uint32_t retainCount() const { return [heap retainCount]; }
209
+ void updateAvailableSize() { size.available = heapAvailableSize(heap); }
210
+ };
211
+ typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);
212
+
213
+ struct BufferPool {
214
+ enum class Kind {
215
+ PRIVATE_SMALL,
216
+ PRIVATE_LARGE,
217
+ SHARED_SMALL,
218
+ SHARED_LARGE,
219
+ SCALAR,
220
+ };
221
+
222
+ BufferPool(const id<MTLDevice> Device, uint32_t Usage) :
223
+ device(Device), usage(Usage),
224
+ heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { }
225
+
226
+ const id<MTLDevice> device;
227
+ // usage flags to customize the pool for various purposes (see UsageFlags enum)
228
+ const uint32_t usage;
229
+ // total number of buffers in the pool
230
+ uint32_t n_buffers = 0;
231
+ // total allocations size on this pool
232
+ size_t allocated_size = 0;
233
+ // total memory available in the pool
234
+ size_t available_size = 0;
235
+ // list of heaps ordered by their "available" (not total) memory size
236
+ std::set<HeapBlock*, HeapComparison> heaps;
237
+ // list of only "available" buffers in the pool (i.e., buffers not in-use)
238
+ std::set<BufferBlock*, BufferComparison> available_buffers;
239
+ // list of buffers that are in a state of "limbo" where they've already been freed
240
+ // from PyTorch-side, but were not returned to pool due to still being
241
+ // in-use by command buffers with retainCount > 1. In this state, the buffer is
242
+ // neither ready to be recycled, nor could be returned to pool as available.
243
+ // These buffers will be returned to pool once the command buffer's
244
+ // completionHandler callbacks are called.
245
+ std::unordered_set<BufferBlock*> buffers_pending_free;
246
+ // list of heaps pending size update
247
+ std::unordered_set<HeapBlock*> heaps_pending_update;
248
+ };
249
+
250
+ class MPSHeapAllocatorImpl {
251
+ public:
252
+ explicit MPSHeapAllocatorImpl() :
253
+ m_device(at::mps::MPSDevice::getInstance()->device()),
254
+ m_max_buffer_size([m_device maxBufferLength]),
255
+ m_stream(getDefaultMPSStream()),
256
+ m_event_pool(getMPSEventPool()) {
257
+ init_allocator();
258
+ }
259
+ ~MPSHeapAllocatorImpl() {
260
+ emptyCache();
261
+ }
262
+ // interface exposed to at::Allocator
263
+ id<MTLBuffer> malloc(size_t size, uint32_t usage);
264
+ // frees a buffer and returns it into buffer pool
265
+ void free(void* ptr);
266
+ // releases all the cached buffers and their associated heaps
267
+ void emptyCache();
268
+ // free inactive buffers that are pending to be freed
269
+ void freeInactiveBuffers();
270
+ // returns true if buffer was allocated from the shared pool
271
+ bool isSharedBuffer(const void* ptr);
272
+ // get the requested unaligned size of an MTLBuffer
273
+ ssize_t getUnalignedBufferSize(const void* ptr);
274
+ // set the shape of a base tensor from a view tensor
275
+ void setBufferShape(const void* ptr, const IntArrayRef& shape);
276
+ // retrieve the shape of a base tensor from a view tensor
277
+ IntArrayRef getBufferShape(const void* ptr);
278
+ // get the unique ID of the buffer
279
+ id_t getBufferId(const void* ptr);
280
+ // allocate a buffer from a specialized pool to import CPU scalars into GPU
281
+ id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
282
+ // returns a CPU-mapping of the input buffer and its retainCount,
283
+ // if only it has Shared storage-mode and allocated on MPSAllocator
284
+ std::pair<const void*, uint32_t> getSharedBufferPtr(const void* buffer);
285
+ // records events for a list of MTLBuffers (list is used to lock the mutex once)
286
+ // returns true if records any event (given if passed buffers exist and are shared-storage)
287
+ bool recordEvents(c10::ArrayRef<const void*> buffers);
288
+ // waits for the event to signal the completion of GPU execution
289
+ // on the passed shared buffers (list is used to lock the mutex once)
290
+ // returns true if actually waited on any event
291
+ bool waitForEvents(c10::ArrayRef<const void*> buffers);
292
+ // this indicates how far (in Megabytes) the current total allocations are from the
293
+ // low watermark limit which is used to detect if we're under memory pressure
294
+ // This returns zero if we've reached the low watermark limit
295
+ ssize_t getLowWatermarkValue();
296
+ // (see m_low_watermark_ratio for description)
297
+ void setLowWatermarkRatio(double ratio);
298
+ // (see m_high_watermark_ratio for description)
299
+ void setHighWatermarkRatio(double ratio);
300
+ // (see m_low_watermark_limit for description)
301
+ size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
302
+ // (see m_max_total_allowed_size for description)
303
+ size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
304
+ // (see m_total_allocated_memory for description)
305
+ size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
306
+ // (see m_current_allocated_memory for description)
307
+ size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
308
+ // total GPU memory allocated in the process by Metal driver; including
309
+ // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
310
+ size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
311
+ // (see enum DebugVerbosity for description)
312
+ uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
313
+ // returns the device that we allocate from
314
+ inline id<MTLDevice> Device() const { return m_device; }
315
+
316
+ // TODO: make a common function to do size unit conversions in PyTorch.
317
+ inline std::string format_size(uint64_t size) const;
318
+
319
+ private:
320
+ // (see m_high_watermark_ratio for description)
321
+ constexpr static double default_high_watermark_ratio = 1.7;
322
+ // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
323
+ constexpr static double default_high_watermark_upper_bound = 2.0;
324
+ // (see m_low_watermark_ratio for description)
325
+ // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
326
+ constexpr static double default_low_watermark_ratio_unified = 1.4;
327
+ constexpr static double default_low_watermark_ratio_discrete = 1.0;
328
+
329
+ const id<MTLDevice> m_device;
330
+ std::recursive_mutex m_mutex;
331
+ // allocated buffers by device pointer
332
+ ska::flat_hash_map<const void*, BufferBlock*> m_allocated_buffers;
333
+ // using a container for pools to simplify iterating them
334
+ ska::flat_hash_map<BufferPool::Kind, std::unique_ptr<BufferPool>> m_pools;
335
+ // total memory allocated by HeapAllocator (including blocks in pools)
336
+ size_t m_total_allocated_memory = 0;
337
+ // currently active memory allocations in use (i.e., blocks not in pools)
338
+ size_t m_current_allocated_memory = 0;
339
+ // max buffer size allowed by Metal
340
+ size_t m_max_buffer_size = 0;
341
+ // maximum total size allowed to be allocated
342
+ size_t m_max_total_allowed_size = 0;
343
+ // high watermark ratio is a hard limit for the total allowed allocations
344
+ // 0. : disables high watermark limit (may cause system failure if system-wide OOM occurs)
345
+ // 1. : recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize)
346
+ // >1.: allows limits beyond the device.recommendedMaxWorkingSetSize
347
+ // e.g., value 0.95 means we allocate up to 95% of recommended maximum
348
+ // allocation size; beyond that, the allocations would fail with OOM error.
349
+ double m_high_watermark_ratio;
350
+ // low watermark ratio is a soft limit to attempt limiting memory allocations up to the lower watermark
351
+ // level by garbage collection or committing command buffers more frequently (a.k.a, adaptive commit).
352
+ // Value between 0 to m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection)
353
+ // e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of recommended maximum
354
+ // allocation size.
355
+ double m_low_watermark_ratio;
356
+ // low watermark size limit (in Bytes) at the time we initialize the allocator
357
+ size_t m_low_watermark_limit;
358
+ // use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity
359
+ uint32_t m_debug_verbosity;
360
+ // default MPS stream
361
+ MPSStream* m_stream;
362
+ // we hold a reference to MPSEventPool so it could get destroyed after MPSAllocator
363
+ std::shared_ptr<MPSEventPool> m_event_pool;
364
+
365
+ void init_allocator();
366
+ void init_buffer_pools();
367
+ HeapBlock* get_free_heap(AllocParams& params);
368
+ bool get_free_buffer(AllocParams& params);
369
+ BufferBlock* get_allocated_buffer_block(const void* ptr);
370
+ BufferBlock* alloc_buffer_block(size_t size, uint32_t usage);
371
+ bool alloc_buffer(AllocParams& params);
372
+ void free_buffer(BufferBlock* buffer_block);
373
+ // returns true if the container heap is also released
374
+ bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true);
375
+ void release_buffers(BufferPool& pool);
376
+ bool release_available_cached_buffers(AllocParams& params);
377
+ bool release_cached_buffers();
378
+ // free unused cached blocks to reclaim GPU memory if memory pressure is high
379
+ void garbage_collect_cached_buffers(AllocParams& params);
380
+ // returns the suitable buffer pool type for the usage or
381
+ // requested/allocated sizes
382
+ BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage);
383
+ // returns the aligned allocation size that is optimized
384
+ // for the buffers to get reused frequently
385
+ size_t get_allocation_size(size_t size, uint32_t usage) const;
386
+ // maximum size of device memory available for allocation in current process
387
+ // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
388
+ size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; }
389
+ // there are implicit allocations from MPS backend, so we need to query the 'device' for
390
+ // total allocated size instead of manually tracking in MPSAllocator
391
+ size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }
392
+
393
+ bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
394
+ for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
395
+ MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event);
396
+ }
397
+ return true;
398
+ }
399
+ };
400
+
401
+ } // namespace at::mps::HeapAllocator
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2023 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <c10/core/Allocator.h>
6
+ #include <c10/util/Registry.h>
7
+ #include <ATen/core/ATen_fwd.h>
8
+
9
+ #define MB(x) (x * 1048576UL)
10
+
11
+ namespace at::mps {
12
+
13
+ // this is a public interface to access MPSAllocator.
14
+ // Do not declare methods that would depend on MPS or Metal frameworks.
15
+ class IMPSAllocator : public c10::Allocator {
16
+ public:
17
+ // see the comments in MPSAllocator.h for the description of these methods.
18
+ virtual void emptyCache() const = 0;
19
+ virtual void freeInactiveBuffers() const = 0;
20
+ virtual ssize_t getUnalignedBufferSize(const void* ptr) const = 0;
21
+ virtual IntArrayRef getBufferShape(const void* ptr) const = 0;
22
+ virtual id_t getBufferId(const void* ptr) const = 0;
23
+ virtual void setBufferShape(const void* ptr, const IntArrayRef& shape) const = 0;
24
+ virtual bool isSharedBuffer(const void* ptr) const = 0;
25
+ virtual bool isSharedStorageSupported() const = 0;
26
+ virtual c10::DataPtr allocScalarBufferWithValue(void* value, size_t size) const = 0;
27
+ virtual std::string formatSize(size_t size) const = 0;
28
+ virtual void setLowWatermarkRatio(double ratio) const = 0;
29
+ virtual void setHighWatermarkRatio(double ratio) const = 0;
30
+ virtual ssize_t getLowWatermarkValue() const = 0;
31
+ virtual size_t getLowWatermarkLimit() const = 0;
32
+ virtual size_t getHighWatermarkLimit() const = 0;
33
+ virtual size_t getTotalAllocatedMemory() const = 0;
34
+ virtual size_t getCurrentAllocatedMemory() const = 0;
35
+ virtual size_t getDriverAllocatedMemory() const = 0;
36
+ virtual std::pair<const void*, uint32_t> getSharedBufferPtr(const void* ptr) const = 0;
37
+ virtual bool recordEvents(c10::ArrayRef<const void*> buffers) const = 0;
38
+ virtual bool waitForEvents(c10::ArrayRef<const void*> buffers) const = 0;
39
+ };
40
+
41
+ class IMpsAllocatorCallback {
42
+ public:
43
+ enum class EventType {
44
+ ALLOCATED, // buffer got allocated to be used immediately
45
+ RECYCLED, // buffer pulled from free list to be reused
46
+ FREED, // buffer put to free list for future recycling
47
+ RELEASED, // buffer memory released
48
+ ALLOCATION_FAILED // buffer allocation failed
49
+ };
50
+ virtual ~IMpsAllocatorCallback() = default;
51
+ virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0;
52
+ };
53
+
54
+ // MPS allocator will execute every registered callback when a block of memory is freed.
55
+ C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
56
+ #define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) \
57
+ C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__);
58
+
59
+ IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false);
60
+
61
+ } // namespace at::mps
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2023 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <ATen/mps/MPSStream.h>
6
+ #include <ctime>
7
+ #include <stack>
8
+
9
+ namespace at::mps {
10
+
11
+ // NOTE: don't create instances of this class directly.
12
+ // Use MPSEventPool to acquire instances of MPSEvent.
13
+ class MPSEvent {
14
+ public:
15
+ explicit MPSEvent(id_t ID, MPSStream* stream, bool enable_timing);
16
+ ~MPSEvent();
17
+
18
+ // records an event on the stream
19
+ void record(bool needsLock, bool syncEvent = false);
20
+ // makes all future work submitted to the stream wait for this event.
21
+ bool wait(bool needsLock, bool syncEvent = false);
22
+ // schedules a notifyListener callback for the event.
23
+ bool notify(bool needsLock, MTLSharedEventNotificationBlock block);
24
+ // checks if events are already signaled.
25
+ bool query() const;
26
+ // blocks the CPU thread until all the GPU work that were scheduled
27
+ // prior to recording this event are completed.
28
+ bool synchronize();
29
+ // resets this event with new parameters in case it gets reused from the event pool
30
+ void reset(MPSStream* stream, bool enable_timing);
31
+ // returns the unique ID of the event instance
32
+ id_t getID() const { return m_id; }
33
+ // returns the completion timestamp of the event
34
+ uint64_t getCompletionTime() const { return m_completion_time; }
35
+ // if already recorded, waits for cpu_sync_cv to be signaled
36
+ void waitForCpuSync();
37
+
38
+ private:
39
+ id_t m_id;
40
+ // enables measuring the completion time of the notifyListener of this event
41
+ bool m_enable_timing;
42
+ uint64_t m_signalCounter = 0;
43
+ MPSStream* m_stream = nullptr;
44
+ MTLSharedEvent_t m_event = nullptr;
45
+ MTLSharedEventListener* m_listener = nullptr;
46
+ // used to sync the events created on this Stream with CPU
47
+ std::mutex m_cpu_sync_mutex{};
48
+ std::condition_variable m_cpu_sync_cv{};
49
+ // CondVar predicate to sync the events created on this Stream with CPU
50
+ bool m_cpu_sync_completed = false;
51
+ // used to compute elapsed time
52
+ uint64_t m_completion_time = 0;
53
+
54
+ void recordLocked(bool syncEvent);
55
+ bool waitLocked(bool syncEvent);
56
+ bool notifyLocked(MTLSharedEventNotificationBlock block);
57
+ void notifyCpuSync();
58
+ static uint64_t getTime() {
59
+ return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
60
+ }
61
+ };
62
+
63
+ typedef std::unique_ptr<MPSEvent, std::function<void(MPSEvent*)>> MPSEventPtr;
64
+
65
+ class MPSEventPool {
66
+ public:
67
+ explicit MPSEventPool(MPSStream* default_stream);
68
+ ~MPSEventPool();
69
+
70
+ MPSEventPtr acquireEvent(bool enable_timing, MPSStream* stream);
71
+ void emptyCache();
72
+
73
+ // these are mainly used for MPSHooks and torch.mps.Event() bindings
74
+ id_t acquireEvent(bool enable_timing);
75
+ void releaseEvent(id_t event_id);
76
+ void recordEvent(id_t event_id, bool syncEvent);
77
+ void waitForEvent(id_t event_id, bool syncEvent);
78
+ void synchronizeEvent(id_t event_id);
79
+ bool queryEvent(id_t event_id);
80
+ // returns elapsed time between two recorded events in milliseconds
81
+ double elapsedTime(id_t start_event_id, id_t end_event_id);
82
+
83
+ private:
84
+ MPSStream* m_default_stream = nullptr;
85
+ std::recursive_mutex m_mutex;
86
+ std::stack<std::unique_ptr<MPSEvent>> m_pool{};
87
+ // dictionary to associate event IDs with event objects
88
+ // used to retain in-use events out of the pool
89
+ // for torch.mps.Event() bindings.
90
+ std::unordered_map<id_t, MPSEventPtr> m_in_use_events{};
91
+ uint64_t m_event_counter = 0;
92
+ std::function<void(MPSEvent*)> m_default_deleter;
93
+
94
+ MPSEvent* getInUseEvent(id_t event_id, bool locked = true);
95
+ };
96
+
97
+ // shared_ptr is used to get MPSEventPool destroyed after dependent instances
98
+ std::shared_ptr<MPSEventPool> getMPSEventPool();
99
+
100
+ } // namespace at::mps
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2022 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <ATen/Tensor.h>
6
+ #include <ATen/mps/MPSStream.h>
7
+ #include <ATen/mps/MPSAllocatorInterface.h>
8
+
9
+ #include <os/signpost.h>
10
+ #include <os/log.h>
11
+
12
+ #include <sstream>
13
+ #include <string>
14
+ #include <atomic>
15
+ #include <unordered_map>
16
+ #include <utility>
17
+ #include <ctime>
18
+
19
+ namespace at::mps {
20
+
21
+ namespace Profiler {
22
+
23
+ struct BaseInfo {
24
+ // profiling info types
25
+ enum class Type {
26
+ GRAPH,
27
+ KERNEL,
28
+ COPY,
29
+ CPU_FALLBACK,
30
+ };
31
+
32
+ BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle) :
33
+ type(infoType), profileId(Id), handle(Handle) { }
34
+ virtual ~BaseInfo() = default;
35
+
36
+ // type of profiling info
37
+ Type type;
38
+ // unique profile ID for execution instances of operations or copies
39
+ uint64_t profileId;
40
+ // ID generated by os_signpost
41
+ // since it's possible to use event and interval-based signposts at the
42
+ // same time, we need separate IDs for each.
43
+ os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0;
44
+ // accumulated GPU time in ms (obtained from CompletionHandler's "GPUEndTime - GPUStartTime")
45
+ std::atomic<double> totalGpuTime{0.0};
46
+ // accumulated Scheduling time in ms (obtained from CompletionHandler's "KernelEndTime - KernelStartTime")
47
+ std::atomic<double> totalSchedulingTime{0.0};
48
+ // indicates if the operation or copy execution has completed
49
+ std::atomic_bool completed{false};
50
+ // handle used to identify the profile info's instance (usually the pointer)
51
+ const uintptr_t handle;
52
+
53
+ virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const;
54
+ // builds a string for a tensor (format: Device:ScalarType[tensor.sizes()])
55
+ static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) {
56
+ if (tensor.defined()) {
57
+ std::stringstream tensorStr;
58
+ auto deviceType = tensor.device().type();
59
+ tensorStr << c10::DeviceTypeName(deviceType);
60
+ // see comments for INCLUDE_BUFFER_ID
61
+ if (includeBufferId && deviceType == at::kMPS) {
62
+ id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
63
+ tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer))
64
+ << ":" << buffer.retainCount << ")";
65
+ }
66
+ tensorStr << ":"
67
+ << tensor.scalar_type() << tensor.sizes();
68
+ return tensorStr.str();
69
+ } else {
70
+ return "undefined";
71
+ }
72
+ }
73
+ static uint64_t getTime() {
74
+ return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
75
+ }
76
+ };
77
+
78
+ struct OperationInfo : BaseInfo {
79
+ OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey) :
80
+ BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)), strKey(StrKey) { }
81
+
82
+ uint64_t runCount = 0;
83
+ std::string strKey;
84
+
85
+ const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
86
+
87
+ // builds a string for a kernel
88
+ static std::string buildKernelString(const std::string& kernelName,
89
+ const TensorList& tensors,
90
+ bool includeBufferId = false) {
91
+ std::stringstream kernelStr;
92
+ kernelStr << kernelName;
93
+ for (const Tensor& tensor: tensors) {
94
+ kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
95
+ }
96
+ return kernelStr.str();
97
+ }
98
+ };
99
+
100
+ struct CpuFbInfo : BaseInfo {
101
+ CpuFbInfo(uint64_t Id, const std::string& OpName) :
102
+ BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) { }
103
+
104
+ uint64_t runCount = 0;
105
+ // the current and total overhead of copies in bytes required to convert the Op's
106
+ // input tensors from MPS to CPU and then output from CPU back to MPS
107
+ size_t currentCopyOverhead = 0;
108
+ size_t totalCopyOverhead = 0;
109
+ std::string opName;
110
+ std::string strKey;
111
+ uint64_t startTime = 0;
112
+
113
+ const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
114
+
115
+ void updateCopyOverhead(const TensorList& tensors) {
116
+ currentCopyOverhead = 0;
117
+ for (const Tensor& tensor: tensors) {
118
+ if (tensor.defined()) {
119
+ currentCopyOverhead += tensor.nbytes();
120
+ }
121
+ }
122
+ totalCopyOverhead += currentCopyOverhead;
123
+ }
124
+ };
125
+
126
+ struct CopyInfo : BaseInfo {
127
+ enum class Kind {
128
+ MPS_TO_MPS,
129
+ MPS_TO_CPU,
130
+ CPU_TO_MPS,
131
+ };
132
+
133
+ CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter) :
134
+ BaseInfo(Type::COPY, Id, uintptr_t(Handle)), kind(Kind::MPS_TO_MPS),
135
+ length(Length), isNonBlocking(IsNonBlocking), usesBlitter(UsesBlitter) { }
136
+
137
+ Kind kind;
138
+ size_t length;
139
+ bool isNonBlocking;
140
+ bool usesBlitter;
141
+ std::string srcStrKey;
142
+ std::string dstStrKey;
143
+ // for copies that don't use blitters, we measure CPU time
144
+ uint64_t startTime = 0;
145
+
146
+ const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
147
+
148
+ static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false);
149
+
150
+ static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) {
151
+ if (tensor.has_value()) {
152
+ return tensor->device().type() == at::kMPS;
153
+ }
154
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer);
155
+ // getUnalignedBufferSize() returns -1 if input buffer is not on MPS device
156
+ return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0;
157
+ }
158
+
159
+ static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer,
160
+ const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) {
161
+ const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor);
162
+ const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor);
163
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS);
164
+ if (isSrcOnMPS && !isDstOnMPS) {
165
+ return Kind::MPS_TO_CPU;
166
+ } else if (!isSrcOnMPS && isDstOnMPS) {
167
+ return Kind::CPU_TO_MPS;
168
+ }
169
+ return Kind::MPS_TO_MPS;
170
+ }
171
+ };
172
+
173
+ struct CopyStat : CopyInfo {
174
+ explicit CopyStat(std::string CopyKindStr) :
175
+ CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {}
176
+ // total number of copies
177
+ size_t totalCount = 0;
178
+ // number of Scalar copies (i.e., less than sizeof(int64))
179
+ size_t scalarsCount = 0;
180
+ // number of blocking copies (i.e., require syncing to GPU)
181
+ size_t blockingCount = 0;
182
+ // number of copies that used memcpy(), instead of Metal Blit Encoder
183
+ size_t memcpyCount = 0;
184
+ // accumulated GPU time in ms for the scalar copies
185
+ std::atomic<double> scalarsGpuTime{0.0};
186
+ // copy kind in string type
187
+ std::string kindStr;
188
+ };
189
+
190
+ class MPSProfiler {
191
+ public:
192
+ // lower 16 bits used for profiler options
193
+ enum ProfileOptions : uint32_t {
194
+ OPTIONS_NONE = 0,
195
+ // ALL_* means, all signpost types (RUN_OPERATION|BLIT_COPY|CPU_FALLBACK, etc.)
196
+ // (used for convenience to not compute bit flags by OR-ing manually)
197
+ // trace all signpost types using events
198
+ ALL_SIGNPOST_EVENTS = (1 << 0),
199
+ // trace all signpost types using intervals
200
+ ALL_SIGNPOST_INTERVALS = (1 << 1),
201
+ // always wait for command buffer to finish executing after each commit
202
+ WAIT_UNTIL_COMPLETED = (1 << 2),
203
+ // for interval-based signposts, include the scheduling portion of
204
+ // Graph/Kernel/Copy executions as well.
205
+ // if flag is disable, only "GPU run time" is included in interval,
206
+ // and not schedule time.
207
+ INCLUDE_SCHEDULE_INTERVAL = (1 << 3),
208
+
209
+ // use these if you need to trace signposts types individually (rarely required)
210
+ // trace signpost using intervals
211
+ USE_INTERVALS = (1 << 4),
212
+ // trace signpost by emitting events
213
+ USE_EVENTS = (1 << 5),
214
+ // used for sanity check (Change this when new option added)
215
+ OPTIONS_COUNT = (USE_EVENTS << 1) - 1,
216
+ };
217
+
218
+ // when adding new types, #define the type string in MPSProfiler.mm as well.
219
+ // upper 16 bits used for event types
220
+ enum SignpostTypes : uint32_t {
221
+ SIGNPOST_NONE = 0,
222
+ // trace signposts for PyTorch operation executions
223
+ RUN_OPERATION = (1 << 16),
224
+ // trace signposts for blitter copies
225
+ BLIT_COPY = (1 << 17),
226
+ // trace signposts for ops that fall back on CPU
227
+ CPU_FALLBACK = (1 << 18),
228
+ // used for sanity check (Change this when new type added)
229
+ SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1,
230
+ };
231
+
232
+ enum LogOptions : uint32_t {
233
+ LOG_NONE = 0,
234
+
235
+ // Info logging options during execution
236
+ // -------------------------------------
237
+ // prints operation info (id/key/run_count) during execution
238
+ OPERATION_INFO = (1 << 0),
239
+ // prints copy info (src/dst tensors/buffers, size, etc.) during execution
240
+ COPY_INFO = (1 << 1),
241
+ // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution
242
+ CPU_FALLBACK_INFO = (1 << 2),
243
+
244
+ // Profiling Statistics logging options when process terminates
245
+ // ------------------------------------------------------------
246
+ // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before process terminates
247
+ // this is convenient to not combine following stats bit flags manually
248
+ ALL_STATS = (1 << 3),
249
+ // prints operation stats (GPU times, run count, etc.) before process terminates
250
+ OPERATION_STATS = (1 << 4),
251
+ // prints copies stats (GPU times, copy kinds, sizes, etc.) before process terminates
252
+ COPY_STATS = (1 << 5),
253
+ // prints CPU Fallback stats (CPU times, run times, size of MPS<->CPU copies
254
+ // for tensors, etc.) before process terminates
255
+ CPU_FALLBACK_STATS = (1 << 6),
256
+
257
+ // Metadata format options when logging the info
258
+ // ---------------------------------------------
259
+ // if enabled, includes GPU run time in metadata (i.e., GPUEndTime-GPUStartTime
260
+ // from Metal Command Buffers) (e.g., [GPU=0.324 ms])
261
+ INCLUDE_GPU_TIME = (1 << 7),
262
+ // if enabled, includes GPU scheduling time in metadata separately
263
+ // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers)
264
+ // e.g., [GPU=0.324 ms, KRNL=0.036 ms]
265
+ INCLUDE_KERNEL_TIME = (1 << 8),
266
+ // if enabled, includes the unique buffer ID in metadata for the storage
267
+ // of a tensor that was allocated on MPSAllocator. This is useful (along with
268
+ // the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers that are involved
269
+ // with various operations.
270
+ INCLUDE_BUFFER_ID = (1 << 9),
271
+
272
+ // used for sanity check (Change this when new option added)
273
+ LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1,
274
+ };
275
+
276
+ explicit MPSProfiler();
277
+ ~MPSProfiler();
278
+
279
+ // the handle is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
280
+ // the beginProfile*() functions return a profileId which is unique per graph/kernel/copy
281
+ uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph);
282
+ uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors);
283
+ uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer,
284
+ const OptionalTensorRef srcTensor,
285
+ const OptionalTensorRef dstTensor,
286
+ size_t length, bool isNonBlocking, bool usesBlitter = true);
287
+ uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors);
288
+ void beginProfileGPUInterval(const void* handle);
289
+
290
+ void endProfileCopy(uint64_t profileId, SyncType syncType);
291
+ void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE);
292
+ void endProfileCPUFallback(const std::string& opName);
293
+
294
+ // these are used to hook into Python bindings for torch.mps.profiler module.
295
+ // this enables generating OS Signpost traces from MPSProfiler on-demand
296
+ // during runtime (instead of environment variables).
297
+ // The "mode" could be either "interval", "event", or both "interval,event"
298
+ // for interval-based and/or event-based signpost tracing.
299
+ void StartTrace(const string& mode, bool waitUntilCompleted);
300
+ void StopTrace();
301
+
302
+ // convenience functions to indicate whether signpost tracing or
303
+ // logging are enabled for the SignpostTypes
304
+ bool isOperationProfilingEnabled() const {
305
+ return (m_signpost_types & SignpostTypes::RUN_OPERATION) ||
306
+ (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS));
307
+ }
308
+ bool isCopyProfilingEnabled() const {
309
+ return (m_signpost_types & SignpostTypes::BLIT_COPY) ||
310
+ (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS));
311
+ }
312
+ bool isCPUFallbackProfilingEnabled() const {
313
+ return (m_signpost_types & SignpostTypes::CPU_FALLBACK) ||
314
+ (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS));
315
+ }
316
+ bool isSignpostTracingEnabled() const {
317
+ return (m_signpost_types != SignpostTypes::SIGNPOST_NONE);
318
+ }
319
+
320
+ private:
321
+ // indicates what type of signpost types are enabled and traced by MPS profiler.
322
+ uint32_t m_signpost_types = 0;
323
+ uint32_t m_profile_options = 0;
324
+ uint32_t m_log_options = 0;
325
+ uint64_t m_kernel_counter = 0;
326
+ uint64_t m_graph_counter = 0;
327
+ uint64_t m_cpu_fb_counter = 0;
328
+ uint64_t m_copy_counter = 0;
329
+ // technically, it's possible to trace both events and intervals at the same time
330
+ // so we use separate os_log categories for them
331
+ os_log_t m_os_log_events;
332
+ os_log_t m_os_log_intervals;
333
+ // stats logging could run either from destructor or signal handler
334
+ // so this is used to check if logging has already started.
335
+ std::atomic_bool hasLoggedStats{false};
336
+ // indicates there are pending completionHandler callbacks that haven't been called yet.
337
+ std::atomic_bool hasPendingCompletionHandlers{false};
338
+ // used to capture sigint signal to log profiling stats
339
+ static struct sigaction currentSigint, previousSigint;
340
+
341
+ // We use the following lists for two reasons:
342
+ // 1- for interval-based signposts the "begin" point won't be in same function
343
+ // as the "end" point where we need to be able to retrieve signpost's info
344
+ // 2- if Operations info need to be logged when process ends using LogOptions::OPERATION_INFO.
345
+
346
+ // the pointer key for this map is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
347
+ // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
348
+ std::unordered_map<uintptr_t, std::unique_ptr<OperationInfo>> m_op_info_list{};
349
+ // the string key for this map is the op name that we fall back to execute on CPU
350
+ // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
351
+ std::unordered_map<std::string, std::unique_ptr<CpuFbInfo>> m_cpu_fb_info_list{};
352
+ // this list contains the info for copies, and its key is the unique profileId
353
+ // which is generated from m_copy_counter
354
+ // The copyInfo list is not retained.
355
+ std::unordered_map<uint64_t, std::unique_ptr<CopyInfo>> m_copy_info_list{};
356
+ // a short list that contains copy stats
357
+ std::unordered_map<CopyInfo::Kind, std::unique_ptr<CopyStat>> m_copy_stat_list{};
358
+
359
+ void initialize();
360
+ void beginProfileExecution(BaseInfo& info, bool cpuExecution = false);
361
+ void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id,
362
+ os_signpost_id_t interval_signpost_id,
363
+ double gpuTime, double schedulingTime);
364
+ void addProfilerScheduledHandler(BaseInfo& info);
365
+ void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType);
366
+ void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
367
+ const std::string& msg) const;
368
+ void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
369
+ const std::string& msg) const;
370
+ void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const;
371
+
372
+ void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime);
373
+ // returns true if logging the profiling info "during the execution" is enabled
374
+ bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded);
375
+ // logs all the profiling stats that are enabled
376
+ void logProfilingStats();
377
+ // logs kernel profiling stats when the process ends.
378
+ void logOperationsProfilingStats(std::FILE* f) const;
379
+ // logs CPU Fallback profiling stats when the process ends.
380
+ void logCPUFallbackProfilingStats(std::FILE* f) const;
381
+ // logs copy profiling stats when the process ends.
382
+ void logCopyProfilingStats(std::FILE* f) const;
383
+
384
+ os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr);
385
+ static SignpostTypes getSignpostType(BaseInfo::Type infoType);
386
+ static void handleIntSignal(int signal);
387
+ };
388
+
389
+ } // namespace Profiler
390
+
391
+ Profiler::MPSProfiler& getMPSProfiler();
392
+
393
+ } // namespace at::mps
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <c10/util/Optional.h>
4
+ #include <c10/util/string_view.h>
5
+ #include <ATen/Config.h>
6
+ #include <ATen/native/DispatchStub.h>
7
+
8
+ // Forward declare TI
9
+ namespace at {
10
+ class Tensor;
11
+ struct TensorIterator;
12
+
13
+ namespace native {
14
+ enum class TransposeType;
15
+ }
16
+
17
+ }
18
+
19
+ namespace at::native {
20
+
21
+ enum class LapackLstsqDriverType : int64_t { Gels, Gelsd, Gelsy, Gelss};
22
+
23
+ #if AT_BUILD_WITH_LAPACK()
24
+ // Define per-batch functions to be used in the implementation of batched
25
+ // linear algebra operations
26
+
27
+ template <class scalar_t>
28
+ void lapackCholesky(char uplo, int n, scalar_t *a, int lda, int *info);
29
+
30
+ template <class scalar_t>
31
+ void lapackCholeskyInverse(char uplo, int n, scalar_t *a, int lda, int *info);
32
+
33
+ template <class scalar_t, class value_t=scalar_t>
34
+ void lapackEig(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *w, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, value_t *rwork, int *info);
35
+
36
+ template <class scalar_t>
37
+ void lapackGeqrf(int m, int n, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info);
38
+
39
+ template <class scalar_t>
40
+ void lapackOrgqr(int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info);
41
+
42
+ template <class scalar_t>
43
+ void lapackOrmqr(char side, char trans, int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *c, int ldc, scalar_t *work, int lwork, int *info);
44
+
45
+ template <class scalar_t, class value_t = scalar_t>
46
+ void lapackSyevd(char jobz, char uplo, int n, scalar_t* a, int lda, value_t* w, scalar_t* work, int lwork, value_t* rwork, int lrwork, int* iwork, int liwork, int* info);
47
+
48
+ template <class scalar_t>
49
+ void lapackGels(char trans, int m, int n, int nrhs,
50
+ scalar_t *a, int lda, scalar_t *b, int ldb,
51
+ scalar_t *work, int lwork, int *info);
52
+
53
+ template <class scalar_t, class value_t = scalar_t>
54
+ void lapackGelsd(int m, int n, int nrhs,
55
+ scalar_t *a, int lda, scalar_t *b, int ldb,
56
+ value_t *s, value_t rcond, int *rank,
57
+ scalar_t* work, int lwork,
58
+ value_t *rwork, int* iwork, int *info);
59
+
60
+ template <class scalar_t, class value_t = scalar_t>
61
+ void lapackGelsy(int m, int n, int nrhs,
62
+ scalar_t *a, int lda, scalar_t *b, int ldb,
63
+ int *jpvt, value_t rcond, int *rank,
64
+ scalar_t *work, int lwork, value_t* rwork, int *info);
65
+
66
+ template <class scalar_t, class value_t = scalar_t>
67
+ void lapackGelss(int m, int n, int nrhs,
68
+ scalar_t *a, int lda, scalar_t *b, int ldb,
69
+ value_t *s, value_t rcond, int *rank,
70
+ scalar_t *work, int lwork,
71
+ value_t *rwork, int *info);
72
+
73
+ template <LapackLstsqDriverType, class scalar_t, class value_t = scalar_t>
74
+ struct lapackLstsq_impl;
75
+
76
+ template <class scalar_t, class value_t>
77
+ struct lapackLstsq_impl<LapackLstsqDriverType::Gels, scalar_t, value_t> {
78
+ static void call(
79
+ char trans, int m, int n, int nrhs,
80
+ scalar_t *a, int lda, scalar_t *b, int ldb,
81
+ scalar_t *work, int lwork, int *info, // Gels flavor
82
+ int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
83
+ value_t *s, // Gelss flavor
84
+ int *iwork // Gelsd flavor
85
+ ) {
86
+ lapackGels<scalar_t>(
87
+ trans, m, n, nrhs,
88
+ a, lda, b, ldb,
89
+ work, lwork, info);
90
+ }
91
+ };
92
+
93
+ template <class scalar_t, class value_t>
94
+ struct lapackLstsq_impl<LapackLstsqDriverType::Gelsy, scalar_t, value_t> {
95
+ static void call(
96
+ char trans, int m, int n, int nrhs,
97
+ scalar_t *a, int lda, scalar_t *b, int ldb,
98
+ scalar_t *work, int lwork, int *info, // Gels flavor
99
+ int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
100
+ value_t *s, // Gelss flavor
101
+ int *iwork // Gelsd flavor
102
+ ) {
103
+ lapackGelsy<scalar_t, value_t>(
104
+ m, n, nrhs,
105
+ a, lda, b, ldb,
106
+ jpvt, rcond, rank,
107
+ work, lwork, rwork, info);
108
+ }
109
+ };
110
+
111
+ template <class scalar_t, class value_t>
112
+ struct lapackLstsq_impl<LapackLstsqDriverType::Gelsd, scalar_t, value_t> {
113
+ static void call(
114
+ char trans, int m, int n, int nrhs,
115
+ scalar_t *a, int lda, scalar_t *b, int ldb,
116
+ scalar_t *work, int lwork, int *info, // Gels flavor
117
+ int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
118
+ value_t *s, // Gelss flavor
119
+ int *iwork // Gelsd flavor
120
+ ) {
121
+ lapackGelsd<scalar_t, value_t>(
122
+ m, n, nrhs,
123
+ a, lda, b, ldb,
124
+ s, rcond, rank,
125
+ work, lwork,
126
+ rwork, iwork, info);
127
+ }
128
+ };
129
+
130
+ template <class scalar_t, class value_t>
131
+ struct lapackLstsq_impl<LapackLstsqDriverType::Gelss, scalar_t, value_t> {
132
+ static void call(
133
+ char trans, int m, int n, int nrhs,
134
+ scalar_t *a, int lda, scalar_t *b, int ldb,
135
+ scalar_t *work, int lwork, int *info, // Gels flavor
136
+ int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
137
+ value_t *s, // Gelss flavor
138
+ int *iwork // Gelsd flavor
139
+ ) {
140
+ lapackGelss<scalar_t, value_t>(
141
+ m, n, nrhs,
142
+ a, lda, b, ldb,
143
+ s, rcond, rank,
144
+ work, lwork,
145
+ rwork, info);
146
+ }
147
+ };
148
+
149
+ template <LapackLstsqDriverType driver_type, class scalar_t, class value_t = scalar_t>
150
+ void lapackLstsq(
151
+ char trans, int m, int n, int nrhs,
152
+ scalar_t *a, int lda, scalar_t *b, int ldb,
153
+ scalar_t *work, int lwork, int *info, // Gels flavor
154
+ int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
155
+ value_t *s, // Gelss flavor
156
+ int *iwork // Gelsd flavor
157
+ ) {
158
+ lapackLstsq_impl<driver_type, scalar_t, value_t>::call(
159
+ trans, m, n, nrhs,
160
+ a, lda, b, ldb,
161
+ work, lwork, info,
162
+ jpvt, rcond, rank, rwork,
163
+ s,
164
+ iwork);
165
+ }
166
+
167
+ template <class scalar_t>
168
+ void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info);
169
+
170
+ template <class scalar_t>
171
+ void lapackLu(int m, int n, scalar_t *a, int lda, int *ipiv, int *info);
172
+
173
+ template <class scalar_t>
174
+ void lapackLdlHermitian(
175
+ char uplo,
176
+ int n,
177
+ scalar_t* a,
178
+ int lda,
179
+ int* ipiv,
180
+ scalar_t* work,
181
+ int lwork,
182
+ int* info);
183
+
184
+ template <class scalar_t>
185
+ void lapackLdlSymmetric(
186
+ char uplo,
187
+ int n,
188
+ scalar_t* a,
189
+ int lda,
190
+ int* ipiv,
191
+ scalar_t* work,
192
+ int lwork,
193
+ int* info);
194
+
195
+ template <class scalar_t>
196
+ void lapackLdlSolveHermitian(
197
+ char uplo,
198
+ int n,
199
+ int nrhs,
200
+ scalar_t* a,
201
+ int lda,
202
+ int* ipiv,
203
+ scalar_t* b,
204
+ int ldb,
205
+ int* info);
206
+
207
+ template <class scalar_t>
208
+ void lapackLdlSolveSymmetric(
209
+ char uplo,
210
+ int n,
211
+ int nrhs,
212
+ scalar_t* a,
213
+ int lda,
214
+ int* ipiv,
215
+ scalar_t* b,
216
+ int ldb,
217
+ int* info);
218
+
219
+ template<class scalar_t, class value_t=scalar_t>
220
+ void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info);
221
+ #endif
222
+
223
+ #if AT_BUILD_WITH_BLAS()
224
+ template <class scalar_t>
225
+ void blasTriangularSolve(char side, char uplo, char trans, char diag, int n, int nrhs, scalar_t* a, int lda, scalar_t* b, int ldb);
226
+ #endif
227
+
228
+ using cholesky_fn = void (*)(const Tensor& /*input*/, const Tensor& /*info*/, bool /*upper*/);
229
+ DECLARE_DISPATCH(cholesky_fn, cholesky_stub);
230
+
231
+ using cholesky_inverse_fn = Tensor& (*)(Tensor& /*result*/, Tensor& /*infos*/, bool /*upper*/);
232
+
233
+ DECLARE_DISPATCH(cholesky_inverse_fn, cholesky_inverse_stub);
234
+
235
+ using linalg_eig_fn = void (*)(Tensor& /*eigenvalues*/, Tensor& /*eigenvectors*/, Tensor& /*infos*/, const Tensor& /*input*/, bool /*compute_eigenvectors*/);
236
+
237
+ DECLARE_DISPATCH(linalg_eig_fn, linalg_eig_stub);
238
+
239
+ using geqrf_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/);
240
+ DECLARE_DISPATCH(geqrf_fn, geqrf_stub);
241
+
242
+ using orgqr_fn = Tensor& (*)(Tensor& /*result*/, const Tensor& /*tau*/);
243
+ DECLARE_DISPATCH(orgqr_fn, orgqr_stub);
244
+
245
+ using ormqr_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/, const Tensor& /*other*/, bool /*left*/, bool /*transpose*/);
246
+ DECLARE_DISPATCH(ormqr_fn, ormqr_stub);
247
+
248
+ using linalg_eigh_fn = void (*)(
249
+ const Tensor& /*eigenvalues*/,
250
+ const Tensor& /*eigenvectors*/,
251
+ const Tensor& /*infos*/,
252
+ bool /*upper*/,
253
+ bool /*compute_eigenvectors*/);
254
+ DECLARE_DISPATCH(linalg_eigh_fn, linalg_eigh_stub);
255
+
256
+ using lstsq_fn = void (*)(
257
+ const Tensor& /*a*/,
258
+ Tensor& /*b*/,
259
+ Tensor& /*rank*/,
260
+ Tensor& /*singular_values*/,
261
+ Tensor& /*infos*/,
262
+ double /*rcond*/,
263
+ std::string /*driver_name*/);
264
+ DECLARE_DISPATCH(lstsq_fn, lstsq_stub);
265
+
266
+ using triangular_solve_fn = void (*)(
267
+ const Tensor& /*A*/,
268
+ const Tensor& /*B*/,
269
+ bool /*left*/,
270
+ bool /*upper*/,
271
+ TransposeType /*transpose*/,
272
+ bool /*unitriangular*/);
273
+ DECLARE_DISPATCH(triangular_solve_fn, triangular_solve_stub);
274
+
275
+ using lu_factor_fn = void (*)(
276
+ const Tensor& /*input*/,
277
+ const Tensor& /*pivots*/,
278
+ const Tensor& /*infos*/,
279
+ bool /*compute_pivots*/);
280
+ DECLARE_DISPATCH(lu_factor_fn, lu_factor_stub);
281
+
282
+ using unpack_pivots_fn = void(*)(
283
+ TensorIterator& iter,
284
+ const int64_t dim_size,
285
+ const int64_t max_pivot);
286
+ DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub);
287
+
288
+ using lu_solve_fn = void (*)(
289
+ const Tensor& /*LU*/,
290
+ const Tensor& /*pivots*/,
291
+ const Tensor& /*B*/,
292
+ TransposeType /*trans*/);
293
+ DECLARE_DISPATCH(lu_solve_fn, lu_solve_stub);
294
+
295
+ using ldl_factor_fn = void (*)(
296
+ const Tensor& /*LD*/,
297
+ const Tensor& /*pivots*/,
298
+ const Tensor& /*info*/,
299
+ bool /*upper*/,
300
+ bool /*hermitian*/);
301
+ DECLARE_DISPATCH(ldl_factor_fn, ldl_factor_stub);
302
+
303
+ using svd_fn = void (*)(
304
+ const Tensor& /*A*/,
305
+ const bool /*full_matrices*/,
306
+ const bool /*compute_uv*/,
307
+ const c10::optional<c10::string_view>& /*driver*/,
308
+ const Tensor& /*U*/,
309
+ const Tensor& /*S*/,
310
+ const Tensor& /*Vh*/,
311
+ const Tensor& /*info*/);
312
+ DECLARE_DISPATCH(svd_fn, svd_stub);
313
+
314
+ using ldl_solve_fn = void (*)(
315
+ const Tensor& /*LD*/,
316
+ const Tensor& /*pivots*/,
317
+ const Tensor& /*result*/,
318
+ bool /*upper*/,
319
+ bool /*hermitian*/);
320
+ DECLARE_DISPATCH(ldl_solve_fn, ldl_solve_stub);
321
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/core/Tensor.h>
2
+ #include <ATen/Config.h>
3
+ #include <cstdint>
4
+
5
+ #ifdef USE_FBGEMM
6
+ #include <fbgemm/FbgemmEmbedding.h>
7
+ #endif
8
+
9
+ namespace at::native {
10
+
11
+ void check_arguments(
12
+ const Tensor& weight,
13
+ const Tensor& indices,
14
+ const Tensor& offsets,
15
+ const int64_t mode,
16
+ const c10::optional<Tensor>& per_sample_weights,
17
+ bool include_last_offset);
18
+
19
+ void make_bag_size_out(
20
+ Tensor& bag_size_out,
21
+ const Tensor& offsets,
22
+ const Tensor& indices,
23
+ const int64_t mode,
24
+ const bool include_last_offset,
25
+ const bool requires_grad);
26
+
27
+ void make_max_indices_out(
28
+ Tensor& max_indices_out,
29
+ const Tensor& weight,
30
+ const Tensor& indices,
31
+ const Tensor& offsets,
32
+ const Tensor& bag_size,
33
+ const int64_t mode,
34
+ bool include_last_offset);
35
+
36
+ void make_offset2bag_out(
37
+ Tensor& offset2bag,
38
+ Tensor& output,
39
+ const Tensor& weight,
40
+ const Tensor& indices,
41
+ const Tensor& offsets,
42
+ const int64_t mode,
43
+ const c10::optional<Tensor>& per_sample_weights,
44
+ const int64_t padding_idx = -1);
45
+
46
+ #ifdef USE_FBGEMM
47
+
48
+ template<bool has_weight, typename TIndex, typename TData>
49
+ struct _CallbackAndBlockSize {
50
+ using TCallback = typename fbgemm::EmbeddingSpMDMKernelSignature<TData, TIndex, TIndex, TData>::Type;
51
+
52
+ int64_t blockSize = -1;
53
+ TCallback callback = nullptr;
54
+
55
+ static TCallback generateCallback(int64_t block_size) {
56
+ return fbgemm::GenerateEmbeddingSpMDM<TData, TIndex, TIndex, TData>(
57
+ block_size,
58
+ has_weight,
59
+ /* normalize_by_lengths */false,
60
+ /* prefetch */16,
61
+ /* is_weight_positional */false,
62
+ /* use_offsets */true);
63
+ }
64
+
65
+ _CallbackAndBlockSize() = default;
66
+
67
+ explicit _CallbackAndBlockSize(c10::optional<int64_t> maybe_block_size)
68
+ : blockSize(maybe_block_size.value_or(-1))
69
+ , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr)
70
+ {}
71
+ };
72
+
73
+ template<typename... StorageMixins>
74
+ struct _EmbeddingBagKernelCacheImpl : private StorageMixins... {
75
+
76
+ _EmbeddingBagKernelCacheImpl() = default;
77
+ // use each of the mixins to store corresponding kernel and block size
78
+ explicit _EmbeddingBagKernelCacheImpl(c10::optional<int64_t> maybe_block_size)
79
+ : StorageMixins(maybe_block_size)...
80
+ {}
81
+
82
+ // this method is thread safe (call sites may call from different threads)
83
+ template<bool has_weight, typename TIndex, typename TData>
84
+ typename _CallbackAndBlockSize<has_weight, TIndex, TData>::TCallback
85
+ getCallback(int64_t block_size) const {
86
+ // if the cache doesn't store the kernel for the incoming block size
87
+ // (so it is different from the one stored in corresponding mixin)
88
+ // regenerate the kernel (not writing it into the cache so we avoid locks)
89
+ if (block_size != _CallbackAndBlockSize<has_weight, TIndex, TData>::blockSize) {
90
+ return _CallbackAndBlockSize<has_weight, TIndex, TData>::generateCallback(block_size);
91
+ }
92
+ // else retrieve the cached kernel from the corresponding mixin
93
+ return _CallbackAndBlockSize<has_weight, TIndex, TData>::callback;
94
+ }
95
+ };
96
+
97
+ // instantiate the cache with the list of storage mixins
98
+ // for each of the 8 _EmbeddingBagKernelCache* usages in the EmbeddingBag.cpp impl file
99
+ using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl<
100
+ _CallbackAndBlockSize<true, int32_t, float>,
101
+ _CallbackAndBlockSize<false, int32_t, float>,
102
+ _CallbackAndBlockSize<true, int64_t, float>,
103
+ _CallbackAndBlockSize<false, int64_t, float>,
104
+ _CallbackAndBlockSize<true, int32_t, unsigned short>,
105
+ _CallbackAndBlockSize<false, int32_t, unsigned short>,
106
+ _CallbackAndBlockSize<true, int64_t, unsigned short>,
107
+ _CallbackAndBlockSize<false, int64_t, unsigned short>>;
108
+ #else
109
+ struct _EmbeddingBagKernelCache {
110
+ explicit _EmbeddingBagKernelCache(c10::optional<int64_t> /* maybe_block_size */) {}
111
+ };
112
+ #endif
113
+
114
+ void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag,
115
+ Tensor& bag_size, Tensor* max_indices,
116
+ const Tensor &weight, const Tensor &indices,
117
+ const Tensor &offsets, const int64_t mode = 0,
118
+ const c10::optional<Tensor>& per_sample_weights = c10::nullopt,
119
+ bool include_last_offset = false,
120
+ int64_t padding_idx = -1,
121
+ _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr);
122
+
123
+ void _embedding_bag_cpu_out(
124
+ at::Tensor& output,
125
+ at::Tensor& offset2bag,
126
+ at::Tensor& bag_size,
127
+ at::Tensor* p_max_indices,
128
+ const at::Tensor& weight,
129
+ const at::Tensor& indices,
130
+ const at::Tensor& offsets,
131
+ const bool scale_grad_by_freq,
132
+ const int64_t mode,
133
+ const bool sparse,
134
+ const c10::optional<at::Tensor>& per_sample_weights,
135
+ const bool include_last_offset,
136
+ const c10::optional<int64_t>& padding_idx,
137
+ _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr);
138
+
139
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Fill.h ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Functions that fill Tensors with constants. Implementations are in Fill.cpp.
2
+
3
+ #pragma once
4
+
5
+ #include <ATen/native/DispatchStub.h>
6
+
7
+ namespace c10 {
8
+ class Scalar;
9
+ }
10
+
11
+ namespace at {
12
+ class Tensor;
13
+ struct TensorIterator;
14
+
15
+ namespace native {
16
+
17
+ DECLARE_DISPATCH(void(*)(TensorIterator&, const c10::Scalar&), fill_stub);
18
+
19
+ Tensor& fill_out(Tensor& self, const Scalar& value);
20
+
21
+ }} // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LossMulti.h ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+ #include <ATen/AccumulateType.h>
4
+ #include <ATen/Dispatch.h>
5
+ #include <ATen/TensorUtils.h>
6
+
7
+ namespace at::native {
8
+ namespace {
9
+ static C10_UNUSED void multilabel_margin_loss_shape_check(
10
+ int64_t& nframe,
11
+ int64_t& dim,
12
+ const int64_t& ndims,
13
+ const Tensor& input,
14
+ const Tensor& target) {
15
+ TORCH_CHECK(
16
+ (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0,
17
+ "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ",
18
+ input.sizes());
19
+
20
+ if (ndims <= 1) {
21
+ nframe = 1;
22
+ dim = ndims == 0 ? 1 : input.size(0);
23
+ TORCH_CHECK(
24
+ target.dim() <= 1 && target.numel() == dim,
25
+ "inconsistent target size: ", target.sizes(), " for input of size: ",
26
+ input.sizes());
27
+ } else {
28
+ nframe = input.size(0);
29
+ dim = input.size(1);
30
+ TORCH_CHECK(
31
+ target.dim() == 2 && target.size(0) == nframe &&
32
+ target.size(1) == dim,
33
+ "inconsistent target size: ", target.sizes(), " for input of size: ",
34
+ input.sizes());
35
+ }
36
+ }
37
+
38
+ static C10_UNUSED void multi_margin_loss_shape_check(
39
+ int64_t& nframe,
40
+ int64_t& dim,
41
+ const int64_t& ndims,
42
+ const Tensor& input,
43
+ const Tensor& target,
44
+ const c10::optional<Tensor>& weight) {
45
+ TORCH_CHECK(
46
+ (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0,
47
+ "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ",
48
+ input.sizes());
49
+
50
+ if (ndims <= 1) {
51
+ nframe = 1;
52
+ dim = ndims == 0 ? 1 : input.size(0);
53
+ } else {
54
+ nframe = input.size(0);
55
+ dim = input.size(1);
56
+ }
57
+
58
+ TORCH_CHECK(
59
+ target.dim() <= 1 && target.numel() == nframe,
60
+ "inconsistent target size, expected ", nframe, " but got ",
61
+ target.sizes());
62
+ if (weight && weight->defined()) {
63
+ TORCH_CHECK(
64
+ weight->dim() <= 1 && weight->numel() == dim,
65
+ "inconsistent weight size, expected ", dim, " but got ",
66
+ weight->sizes());
67
+ }
68
+ }
69
+
70
+
71
+ } // anonymous namespace
72
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Normalization.h ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/TensorIterator.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+
6
+ namespace at::native {
7
+
8
+ using renorm_scale_factor_fn = void (*) (TensorIteratorBase& iter, double maxnorm);
9
+ DECLARE_DISPATCH(renorm_scale_factor_fn, renorm_scale_factor_stub);
10
+
11
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pow.h ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+
5
+ namespace c10 {
6
+ class Scalar;
7
+ }
8
+
9
+ namespace at {
10
+
11
+ struct TensorIterator;
12
+ struct TensorIteratorBase;
13
+
14
+ namespace native {
15
+
16
+ #if defined(__CUDACC__) || defined(__HIPCC__)
17
+ #define HOST_DEVICE __host__ __device__
18
+ #else
19
+ #define HOST_DEVICE
20
+ #endif
21
+
22
+ // integral power in pytorch allows for negative exponents, giving truncated integral results.
23
+ // e.g. since 2**-1==0.5, the truncated integral result is zero. 1**negative_exponent is the
24
+ // only non-zero result.
25
+ template <class T,
26
+ typename std::enable_if<std::is_integral<T>::value, T>::type* = nullptr>
27
+ static inline HOST_DEVICE __ubsan_ignore_signed_int_overflow__ T powi_impl(T a, T b) {
28
+ T result = 1;
29
+ while (b) {
30
+ if (b & 1) {
31
+ result *= a;
32
+ }
33
+ b /= 2;
34
+ a *= a;
35
+ }
36
+ return result;
37
+ }
38
+
39
+ template <class T,
40
+ typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, T>::type* = nullptr>
41
+ static inline HOST_DEVICE T powi(T a, T b) {
42
+ return powi_impl(a, b);
43
+ }
44
+
45
+ template <class T,
46
+ typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, T>::type* = nullptr>
47
+ static inline HOST_DEVICE T powi(T a, T b) {
48
+ if ( b < 0 ) {
49
+ if ( a == 1 ) {
50
+ return 1;
51
+ } else if ( a == -1 ) {
52
+ auto negative = (-b) % static_cast<T>(2);
53
+ return negative ? -1 : 1;
54
+ } else {
55
+ return 0;
56
+ }
57
+ }
58
+ return powi_impl(a, b);
59
+ }
60
+
61
+ using pow_tensor_tensor_fn = void (*)(TensorIteratorBase&);
62
+ using pow_tensor_scalar_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
63
+
64
+ DECLARE_DISPATCH(pow_tensor_tensor_fn, pow_tensor_tensor_stub);
65
+ DECLARE_DISPATCH(pow_tensor_scalar_fn, pow_tensor_scalar_stub);
66
+
67
+ } // namespace native
68
+
69
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceOps.h ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+ #include <c10/util/ArrayRef.h>
5
+ #include <c10/util/Optional.h>
6
+
7
+ namespace c10 {
8
+ class Scalar;
9
+ }
10
+
11
+ namespace at {
12
+ struct TensorIterator;
13
+ class Tensor;
14
+ }
15
+
16
+ namespace at::native {
17
+
18
+ using reduce_fn = void(*)(TensorIterator &);
19
+
20
+ DECLARE_DISPATCH(reduce_fn, sum_stub);
21
+ DECLARE_DISPATCH(reduce_fn, nansum_stub);
22
+ DECLARE_DISPATCH(reduce_fn, prod_stub);
23
+ DECLARE_DISPATCH(reduce_fn, mean_stub);
24
+ DECLARE_DISPATCH(reduce_fn, and_stub);
25
+ DECLARE_DISPATCH(reduce_fn, or_stub);
26
+ DECLARE_DISPATCH(reduce_fn, min_values_stub);
27
+ DECLARE_DISPATCH(reduce_fn, max_values_stub);
28
+ DECLARE_DISPATCH(reduce_fn, argmax_stub);
29
+ DECLARE_DISPATCH(reduce_fn, argmin_stub);
30
+
31
+ using reduce_std_var_function =
32
+ void (*)(TensorIterator&, double correction, bool take_sqrt);
33
+ DECLARE_DISPATCH(reduce_std_var_function, std_var_stub);
34
+
35
+ using reduce_norm_fn =
36
+ void (*)(Tensor&, const Tensor&, const c10::Scalar&, c10::optional<int64_t>);
37
+ DECLARE_DISPATCH(reduce_norm_fn, norm_kernel);
38
+
39
+ using reduce_fn_flag = void(*)(TensorIterator &, const c10::Scalar&);
40
+ DECLARE_DISPATCH(reduce_fn_flag, norm_stub);
41
+
42
+ using structured_cum_fn = void (*)(const Tensor&, const Tensor&, int64_t);
43
+ using cum_fn = void (*)(Tensor&, const Tensor&, int64_t);
44
+ DECLARE_DISPATCH(structured_cum_fn, cumsum_stub);
45
+ DECLARE_DISPATCH(structured_cum_fn, cumprod_stub);
46
+ DECLARE_DISPATCH(cum_fn, logcumsumexp_stub);
47
+
48
+ DECLARE_DISPATCH(void (*)(const Tensor&, int64_t, bool, Tensor&, Tensor&), aminmax_stub);
49
+ DECLARE_DISPATCH(void (*)(const Tensor&, Tensor&, Tensor&), aminmax_allreduce_stub);
50
+
51
+ // Used in cuda/Normalization.cu
52
+ TORCH_API std::tuple<Tensor&,Tensor&> var_mean_out(
53
+ Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim,
54
+ int64_t correction, bool keepdim);
55
+
56
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /// This file contains some tensor-agnostic operations to be used in the
2
+ /// core functions of the `SobolEngine`
3
+ #include <ATen/core/Tensor.h>
4
+
5
+ #ifndef AT_PER_OPERATOR_HEADERS
6
+ #include <ATen/Functions.h>
7
+ #else
8
+ #include <ATen/ops/arange.h>
9
+ #include <ATen/ops/mul.h>
10
+ #include <ATen/ops/pow.h>
11
+ #endif
12
+
13
+ namespace at::native::sobol_utils {
14
+
15
+ /// Function to return the minimum of number of bits to represent the integer `n`
16
+ inline int64_t bit_length(const int64_t n) {
17
+ int64_t nbits, nloc;
18
+ for (nloc = n, nbits = 0; nloc > 0; nloc /= 2, nbits++);
19
+ return nbits;
20
+ }
21
+
22
+ /// Function to get the position of the rightmost zero in the bit representation of an integer
23
+ /// This value is the zero-indexed position
24
+ inline int64_t rightmost_zero(const int64_t n) {
25
+ int64_t z, i;
26
+ for (z = n, i = 0; z % 2 == 1; z /= 2, i++);
27
+ return i;
28
+ }
29
+
30
+ /// Function to get a subsequence of bits in the representation of an integer starting from
31
+ /// `pos` and of length `length`
32
+ inline int64_t bitsubseq(const int64_t n, const int64_t pos, const int64_t length) {
33
+ return (n >> pos) & ((1 << length) - 1);
34
+ }
35
+
36
+ /// Function to perform the inner product between a batched square matrix and a power of 2 vector
37
+ inline at::Tensor cdot_pow2(const at::Tensor& bmat) {
38
+ at::Tensor inter = at::arange(bmat.size(-1) - 1, -1, -1, bmat.options());
39
+ inter = at::pow(2, inter).expand_as(bmat);
40
+ return at::mul(inter, bmat).sum(-1);
41
+ }
42
+
43
+ /// All definitions below this point are data. These are constant, and should not be modified
44
+ /// without notice
45
+
46
+ constexpr int64_t MAXDIM = 21201;
47
+ constexpr int64_t MAXDEG = 18;
48
+ constexpr int64_t MAXBIT = 30;
49
+ constexpr int64_t LARGEST_NUMBER = 1 << MAXBIT;
50
+ constexpr float RECIPD = 1.0 / LARGEST_NUMBER;
51
+
52
+ extern const int64_t poly[MAXDIM];
53
+ extern const int64_t initsobolstate[MAXDIM][MAXDEG];
54
+
55
+ } // namespace at::native::sobol_utils
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorCompare.h ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+
5
+ namespace c10 {
6
+ class Scalar;
7
+ }
8
+
9
+ namespace at {
10
+ class Tensor;
11
+ struct TensorIterator;
12
+ struct TensorIteratorBase;
13
+ }
14
+
15
+ namespace at::native {
16
+
17
+ using reduce_minmax_fn =
18
+ void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool);
19
+ using structured_reduce_minmax_fn =
20
+ void (*)(const Tensor&, const Tensor&, const Tensor&, int64_t, bool);
21
+
22
+ DECLARE_DISPATCH(structured_reduce_minmax_fn, max_stub);
23
+ DECLARE_DISPATCH(structured_reduce_minmax_fn, min_stub);
24
+
25
+ using where_fn = void (*)(TensorIterator &);
26
+ DECLARE_DISPATCH(where_fn, where_kernel);
27
+
28
+ using is_infinity_op_fn = void (*)(TensorIteratorBase &);
29
+ DECLARE_DISPATCH(is_infinity_op_fn, isposinf_stub);
30
+ DECLARE_DISPATCH(is_infinity_op_fn, isneginf_stub);
31
+
32
+ using mode_fn = void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool);
33
+ DECLARE_DISPATCH(mode_fn, mode_stub);
34
+
35
+ using clamp_tensor_fn = void (*)(TensorIteratorBase &);
36
+ DECLARE_DISPATCH(clamp_tensor_fn, clamp_stub);
37
+
38
+ namespace detail {
39
+ enum class ClampLimits {Min, Max, MinMax};
40
+ }
41
+
42
+ DECLARE_DISPATCH(void (*)(TensorIteratorBase &, const c10::Scalar&, const c10::Scalar&), clamp_scalar_stub);
43
+ DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_min_scalar_stub);
44
+ DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_max_scalar_stub);
45
+
46
+ using isin_default_fn = void (*)(const Tensor&, const Tensor&, bool, const Tensor&);
47
+ DECLARE_DISPATCH(isin_default_fn, isin_default_stub);
48
+
49
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIterator.h ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ #pragma once
2
+ #include <ATen/TensorIterator.h>
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/core/Tensor.h>
2
+ #include <ATen/native/LinearAlgebraUtils.h>
3
+
4
+ namespace at::native {
5
+
6
+ /*
7
+ * Given batches of matrices with arbitrary batch dim,
8
+ * computes the number of batches for Triu and Tril. This ignores stride 0 dimension
9
+ */
10
+ static inline int64_t batchCountTrilTriu(const Tensor& batched_matrices) {
11
+ int64_t result = 1;
12
+ for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) {
13
+ if (batched_matrices.stride(i) != 0) {
14
+ result *= batched_matrices.size(i);
15
+ }
16
+ }
17
+ return result;
18
+ }
19
+
20
+ /* Checks a necessary property for the triu and tril implementations, hence the name.
21
+ * Here batch contiguity is checked for tensors with greater than 4 dimensions.
22
+ * Contiguous tensors and tensors with less than 3 dimensions pass this check
23
+ */
24
+ static inline std::tuple<bool, Tensor> checkTrilTriuBatchContiguous(const Tensor& tensor, bool allow_zero_stride) {
25
+ // Complete contiguity is the most desired property, which is why
26
+ // we return true if the tensor is contiguous
27
+ if (tensor.is_contiguous()) {
28
+ auto default_strides_for_size = batched_matrix_contiguous_strides(tensor.sizes());
29
+ if (tensor.strides() == default_strides_for_size) {
30
+ return std::make_tuple(true, tensor);
31
+ } else {
32
+ return std::make_tuple(false, tensor.as_strided(tensor.sizes(), default_strides_for_size));
33
+ }
34
+ }
35
+
36
+ int64_t dims = tensor.dim();
37
+
38
+ // Tensors with dimension less than 4 are handled by default
39
+ if (allow_zero_stride && dims <= 3) {
40
+ return std::make_tuple(true, tensor);
41
+ }
42
+
43
+ int64_t expected_stride = tensor.size(-1) * tensor.size(-2);
44
+ for (int64_t i = dims - 3; i >= 0; i--) {
45
+ // Skip trivial dimension;
46
+ if (allow_zero_stride && i == 0 && (tensor.stride(i) == 0 || tensor.size(i) == 1)) {
47
+ continue;
48
+ }
49
+ if (expected_stride != tensor.stride(i)) {
50
+ return std::make_tuple(false, tensor.contiguous());
51
+ }
52
+ expected_stride *= tensor.size(i);
53
+ }
54
+ return std::make_tuple(true, tensor);
55
+ }
56
+
57
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IsContiguous.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ namespace at { namespace native { inline namespace CPU_CAPABILITY {
4
+
5
+ // n: number of function arguments (arity)
6
+ // traits: function_traits (see FunctionTraits.h)
7
+ // s: index of scalar argument or -1
8
+ template <int n, int stride_index, typename traits, int s=-1>
9
+ struct IsContiguous {
10
+ static bool eval(const int64_t* strides) {
11
+ using type = typename traits::template arg<n - 1>::type;
12
+ return strides[stride_index] == (s == n ? 0 : sizeof(type)) &&
13
+ IsContiguous<n - 1, stride_index - 1, traits, s>::eval(strides);
14
+ }
15
+ };
16
+
17
+ // will be called when there is an output exists
18
+ template <typename traits, int s>
19
+ struct IsContiguous<0, 0, traits, s> {
20
+ static bool eval(const int64_t* strides) {
21
+ return strides[0] == sizeof(typename traits::result_type);
22
+ }
23
+ };
24
+
25
+ // will be called when there is no output
26
+ template <typename traits, int s>
27
+ struct IsContiguous<0, -1, traits, s> {
28
+ static bool eval(const int64_t* /*strides*/) {
29
+ return true;
30
+ }
31
+ };
32
+
33
+ // output and all inputs are contiguous
34
+ template <typename traits,
35
+ typename std::enable_if<std::is_void<typename traits::result_type>::value>::type* = nullptr>
36
+ static inline bool is_contiguous(const int64_t* strides) {
37
+ return IsContiguous<traits::arity, traits::arity - 1, traits>::eval(strides);
38
+ }
39
+
40
+ template <typename traits,
41
+ typename std::enable_if<!std::is_void<typename traits::result_type>::value>::type* = nullptr>
42
+ static inline bool is_contiguous(const int64_t* strides) {
43
+ return IsContiguous<traits::arity, traits::arity, traits>::eval(strides);
44
+ }
45
+
46
+ // input at `s` is scalar (stride 0); output and other inputs are contiguous
47
+ // NB: output is typically at strides[0] so first input corresponds to s=1
48
+ template <typename traits, int s,
49
+ typename std::enable_if<std::is_void<typename traits::result_type>::value>::type* = nullptr>
50
+ static inline bool is_contiguous_scalar(const int64_t* strides) {
51
+ static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");
52
+ return IsContiguous<traits::arity, traits::arity - 1, traits, s>::eval(strides);
53
+ }
54
+
55
+ template <typename traits, int s,
56
+ typename std::enable_if<!std::is_void<typename traits::result_type>::value>::type* = nullptr>
57
+ static inline bool is_contiguous_scalar(const int64_t* strides) {
58
+ static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");
59
+ return IsContiguous<traits::arity, traits::arity, traits, s>::eval(strides);
60
+ }
61
+
62
+ }}}
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+ #include <cstdint>
5
+
6
+ namespace at {
7
+ class Tensor;
8
+
9
+ namespace native {
10
+
11
+ using forward_fn = void (*)(const Tensor&, const Tensor&);
12
+ using backward_fn = void(*)(const Tensor &, const Tensor &, const Tensor&);
13
+
14
+ DECLARE_DISPATCH(forward_fn, softmax_lastdim_kernel);
15
+ DECLARE_DISPATCH(forward_fn, log_softmax_lastdim_kernel);
16
+ DECLARE_DISPATCH(backward_fn, softmax_backward_lastdim_kernel);
17
+ DECLARE_DISPATCH(backward_fn, log_softmax_backward_lastdim_kernel);
18
+
19
+ using forward_fn_with_dim = void(*)(const Tensor &, const Tensor &, const int64_t);
20
+ using backward_fn_with_dim =
21
+ void (*)(const Tensor&, const Tensor&, const Tensor&, const int64_t);
22
+
23
+ DECLARE_DISPATCH(forward_fn_with_dim, softmax_kernel);
24
+ DECLARE_DISPATCH(forward_fn_with_dim, log_softmax_kernel);
25
+ DECLARE_DISPATCH(backward_fn_with_dim, softmax_backward_kernel);
26
+ DECLARE_DISPATCH(backward_fn_with_dim, log_softmax_backward_kernel);
27
+ }
28
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/jit_macros.h>
3
+
4
+ // Jiterator functions are guarded behind this macro
5
+ #if AT_USE_JITERATOR()
6
+
7
+ #include <ATen/OpMathType.h>
8
+ #include <ATen/TensorIterator.h>
9
+ #include <ATen/core/Array.h>
10
+ #include <ATen/cuda/CUDAContext.h>
11
+ #include <ATen/cuda/detail/OffsetCalculator.cuh>
12
+ #include <ATen/native/cuda/jit_utils.h>
13
+ #include <ATen/native/cuda/MemoryAccess.cuh>
14
+ #include <ATen/native/cuda/thread_constants.h>
15
+
16
+ #include <ATen/native/cuda/Loops.cuh>
17
+
18
+ #include <c10/macros/Macros.h>
19
+ #include <c10/core/ScalarType.h>
20
+ #include <c10/util/SmallBuffer.h>
21
+
22
+ #include <initializer_list>
23
+ #include <type_traits>
24
+ #include <tuple>
25
+ #include <mutex>
26
+
27
+ namespace at {
28
+ namespace native {
29
+
30
+ template <typename Tuple, std::size_t... I>
31
+ constexpr auto tuple_to_array_helper(Tuple& t, std::index_sequence<I...> seq) {
32
+ constexpr auto size = seq.size();
33
+ (void)t; // warning : unused parameter when tuple is empty.
34
+ return std::array<void*, size>{static_cast<void*>(&std::get<I>(t))...};
35
+ }
36
+
37
+ // Helper function convert tuple to std::array<void*, N>
38
+ // for passing the arguments to CUDA Kernel
39
+ // NOTE: We capture tuple by reference,
40
+ // so the pointers in returned array are only valid
41
+ // till tuple is alive.
42
+ template <typename ...Args>
43
+ constexpr auto tuple_to_array(std::tuple<Args...>& extra_args) {
44
+ constexpr auto tuple_size = sizeof...(Args);
45
+ return tuple_to_array_helper(extra_args, std::make_index_sequence<tuple_size>{});
46
+ }
47
+
48
+ struct JittedVecKernelCache {
49
+ // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
50
+ at::cuda::jit::NvrtcFunction vec1;
51
+ at::cuda::jit::NvrtcFunction vec2;
52
+ at::cuda::jit::NvrtcFunction vec4;
53
+ };
54
+
55
+ struct JittedKernelVariantCache {
56
+ JittedVecKernelCache vec;
57
+ at::cuda::jit::NvrtcFunction noncontiguous;
58
+ at::cuda::jit::NvrtcFunction dynamic_contiguous;
59
+ at::cuda::jit::NvrtcFunction dynamic_noncontiguous;
60
+ };
61
+
62
+ inline c10::SmallBuffer<void*, 64> pack_kernel_args(
63
+ std::initializer_list<void*> args,
64
+ c10::ArrayRef<void*> extra_args) {
65
+ c10::SmallBuffer<void*, 64> ret(args.size() + extra_args.size());
66
+ std::copy(args.begin(), args.end(), ret.data());
67
+ std::copy(extra_args.begin(), extra_args.end(), ret.data() + args.size());
68
+ return ret;
69
+ }
70
+
71
+ template<typename array_t,
72
+ typename inp_calc_t,
73
+ typename out_calc_t,
74
+ typename loader_t,
75
+ typename storer_t>
76
+ void launch_jitted_unrolled_kernel(
77
+ std::mutex &jiterator_mutex,
78
+ at::cuda::jit::NvrtcFunction &fn_cache,
79
+ const at::cuda::jit::KernelDescriptor &desc,
80
+ int64_t N,
81
+ array_t data,
82
+ inp_calc_t ic,
83
+ out_calc_t oc,
84
+ loader_t l,
85
+ storer_t s,
86
+ bool contiguous,
87
+ at::cuda::jit::BinaryFuncVariant scalar_pos,
88
+ void* scalar_val,
89
+ c10::ArrayRef<void*> extra_args) {
90
+
91
+ TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
92
+ //casting result to int is always safe, intermediate is int64 and won't overflow
93
+ const uint32_t grid = (N + block_work_size() - 1) / block_work_size();
94
+
95
+ if (!fn_cache.function) {
96
+ const std::lock_guard<std::mutex> lock{jiterator_mutex};
97
+ if (!fn_cache.function) {
98
+ constexpr bool dynamic_casting = !std::is_same<decltype(l), memory::LoadWithoutCast>() ||
99
+ !std::is_same<decltype(s), memory::StoreWithoutCast>();
100
+ auto code = at::cuda::jit::generate_code(
101
+ desc, contiguous, dynamic_casting, scalar_pos);
102
+ fn_cache = at::cuda::jit::jit_pwise_function(code, desc.name);
103
+ }
104
+ }
105
+
106
+ auto args = pack_kernel_args({&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args);
107
+ at::cuda::jit::launch_jitted_pwise_function(fn_cache, args.data(), {grid, 1u, 1u},
108
+ {num_threads(), 1u, 1u});
109
+ }
110
+
111
+ template<int arity, typename array_t>
112
+ void launch_jitted_vectorized_kernel(
113
+ std::mutex &jiterator_mutex, JittedVecKernelCache &fn_cache,
114
+ const at::cuda::jit::KernelDescriptor &desc, int64_t N, array_t data,
115
+ at::cuda::jit::BinaryFuncVariant scalar_pos,
116
+ void *scalar_val, c10::ArrayRef<void*> extra_args) {
117
+ TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
118
+ // N is still int64_t for the computation, but it's always safe to cast result to int
119
+ const uint32_t grid = (N + block_work_size() - 1) / block_work_size();
120
+ const int vec_size = at::cuda::jit::can_vectorize_up_to(
121
+ desc, c10::ArrayRef<char*>(data.data, data.size()));
122
+
123
+ // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
124
+ // fn_ptr is set to the appropriate function based on the vec size and GPU used
125
+ at::cuda::jit::NvrtcFunction* fn_ptr;
126
+ if (vec_size == 4) {
127
+ fn_ptr = &fn_cache.vec4;
128
+ } else if (vec_size == 2) {
129
+ fn_ptr = &fn_cache.vec2;
130
+ } else if (vec_size ==1) {
131
+ fn_ptr = &fn_cache.vec1;
132
+ } else {
133
+ TORCH_INTERNAL_ASSERT(false, "unexpected vec_size for jitter vectorized kernel");
134
+ }
135
+
136
+ bool vectorized = vec_size > 1;
137
+
138
+ if (!fn_ptr->function) {
139
+ const std::lock_guard<std::mutex> lock{jiterator_mutex};
140
+ if (!fn_ptr->function) { // cache miss!
141
+
142
+ // Generates program
143
+ auto code = at::cuda::jit::generate_code(
144
+ desc, /*contiguous=*/true, /*dynamic_casting=*/false,
145
+ scalar_pos, vectorized, vec_size);
146
+ std::string kernel_name = vectorized ? desc.name + "_vectorized" + std::to_string(vec_size) : desc.name;
147
+
148
+ // Acquires the program
149
+ *fn_ptr = at::cuda::jit::jit_pwise_function(code, kernel_name);
150
+ }
151
+ }
152
+
153
+ if (vectorized) {
154
+ auto args = pack_kernel_args({&N, &data, scalar_val}, extra_args);
155
+ at::cuda::jit::launch_jitted_pwise_function(
156
+ *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u});
157
+ } else {
158
+ // NVCC complains about unused variables l and s.
159
+ // It should be false positive in most cases, so we suppress the warnings.
160
+ #pragma nv_diagnostic push
161
+ #pragma nv_diag_suppress 177
162
+ auto ic = TrivialOffsetCalculator<arity>();
163
+ auto oc = TrivialOffsetCalculator<1>();
164
+ auto l = memory::LoadWithoutCast();
165
+ auto s = memory::StoreWithoutCast();
166
+
167
+ auto args = pack_kernel_args(
168
+ {&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args);
169
+ at::cuda::jit::launch_jitted_pwise_function(
170
+ *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u});
171
+ #pragma nv_diagnostic pop
172
+ }
173
+ }
174
+
175
+ template <int arity>
176
+ void jitted_gpu_kernel_generic(
177
+ std::mutex &jiterator_mutex,
178
+ JittedKernelVariantCache &cache,
179
+ const at::cuda::jit::KernelDescriptor &desc,
180
+ at::cuda::jit::BinaryFuncVariant scalar_pos,
181
+ c10::ArrayRef<void*> extra_args,
182
+ TensorIteratorBase& iter,
183
+ const bool dynamic_casting,
184
+ void *scalar_val) {
185
+ TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
186
+ TORCH_INTERNAL_ASSERT(iter.ninputs() == arity);
187
+ TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
188
+
189
+ constexpr int ntensors = arity + 1;
190
+ at::detail::Array<char*, ntensors> data;
191
+ for (auto i : c10::irange(ntensors)) {
192
+ data[i] = (char*)iter.data_ptr(i);
193
+ }
194
+
195
+ int64_t numel = iter.numel();
196
+ bool contiguous = iter.is_contiguous();
197
+
198
+ // Decides which of 4 kernel types to launch
199
+ // Variations are:
200
+ // - Case 1: no dynamic casting and contiguous
201
+ // - Case 2: no dynamic casting and noncontiguous
202
+ // - Case 3: dynamic casting and contiguous
203
+ // - Case 4: dynamic casting and noncontiguous
204
+ // These cases align with the non-jitted CUDALoops.cuh cases in gpu_kernel_impl
205
+
206
+ if (!dynamic_casting) {
207
+ if (contiguous) {
208
+ // Case 1: no dynamic casting and contiguous
209
+ launch_jitted_vectorized_kernel<arity>(
210
+ jiterator_mutex, cache.vec, desc,
211
+ numel, data, scalar_pos, scalar_val, extra_args);
212
+ return;
213
+ }
214
+
215
+ // Case 2: no dynamic casting and noncontiguous
216
+ auto input_offset_calculator = make_input_offset_calculator<arity>(iter);
217
+ auto output_offset_calculator = make_output_offset_calculator(iter);
218
+ auto loader = memory::LoadWithoutCast();
219
+ auto storer = memory::StoreWithoutCast();
220
+ launch_jitted_unrolled_kernel(
221
+ jiterator_mutex, cache.noncontiguous, desc, numel, data,
222
+ input_offset_calculator, output_offset_calculator, loader,
223
+ storer, contiguous, scalar_pos, scalar_val, extra_args);
224
+ return;
225
+ }
226
+
227
+ // Cases 3 and 4 are handled below
228
+ // Both require construction of a storer (this asserts 1 output) and one or more loaders
229
+
230
+ // Creates store cast to output (the zeroth tensor in TensorIterator)
231
+ auto storer = memory::StoreWithCast<1>(iter);
232
+
233
+ // Creates load casts from inputs (note offset indexing into the iterators 1...n tensors)
234
+ auto loader = memory::LoadWithCast<arity>(iter);
235
+
236
+ if (contiguous) {
237
+ // Case 3: dynamic casting and contiguous
238
+ auto input_offset_calculator = TrivialOffsetCalculator<arity>();
239
+ auto output_offset_calculator = TrivialOffsetCalculator<1>();
240
+ launch_jitted_unrolled_kernel(
241
+ jiterator_mutex, cache.dynamic_contiguous, desc, numel, data, input_offset_calculator,
242
+ output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args);
243
+ return;
244
+ }
245
+
246
+ // Case 4: dynamic casting and noncontiguous
247
+ auto input_offset_calculator = make_input_offset_calculator<arity>(iter);
248
+ auto output_offset_calculator = make_output_offset_calculator(iter);
249
+ launch_jitted_unrolled_kernel(
250
+ jiterator_mutex, cache.dynamic_noncontiguous, desc, numel, data, input_offset_calculator,
251
+ output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args);
252
+ }
253
+
254
+ // NOTE: static to reduce chances of name collision.
255
+ template <
256
+ char const* name,
257
+ typename result_type,
258
+ typename f_inputs_type,
259
+ int arity,
260
+ at::cuda::jit::BinaryFuncVariant scalar_pos =
261
+ at::cuda::jit::BinaryFuncVariant::NoScalar,
262
+ typename... ExtraArgs>
263
+ static void jitted_gpu_kernel_impl(
264
+ TensorIteratorBase& iter,
265
+ const std::string &f,
266
+ const bool dynamic_casting,
267
+ at::opmath_type<f_inputs_type> scalar_val,
268
+ std::tuple<ExtraArgs...> extra_args) {
269
+
270
+ // TODO: Memory use can probably be optimized by re-using kernels across GPUs with
271
+ // the same compute capability
272
+ static std::mutex jiterator_mutex;
273
+ static std::vector<JittedKernelVariantCache> device_caches(c10::cuda::device_count());
274
+
275
+ constexpr int nInputs = arity;
276
+ constexpr int nOutputs = 1; // TODO: Support more than 1 output
277
+ static const auto desc = at::cuda::jit::make_kernel_descriptor<
278
+ result_type, f_inputs_type, ExtraArgs...>(name, f, nInputs, nOutputs);
279
+
280
+ auto &cache = device_caches[iter.device().index()];
281
+ auto extra_args_array = tuple_to_array(extra_args);
282
+ return jitted_gpu_kernel_generic<arity>(
283
+ jiterator_mutex,
284
+ cache,
285
+ desc,
286
+ scalar_pos,
287
+ extra_args_array,
288
+ iter,
289
+ dynamic_casting,
290
+ &scalar_val
291
+ );
292
+ }
293
+
294
+ }} // at::native
295
+
296
+ #endif // AT_USE_JITERATOR()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <array>
3
+ #include <cstdint>
4
+
5
+ namespace at {
6
+ class TensorBase;
7
+ }
8
+
9
+ namespace at {
10
+ namespace native {
11
+
12
+ void launch_grid_sampler_2d_forward_kernel(
13
+ const TensorBase &output, const TensorBase &input, const TensorBase &grid,
14
+ int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
15
+
16
+ void launch_grid_sampler_3d_forward_kernel(
17
+ const TensorBase &output, const TensorBase &input, const TensorBase &grid,
18
+ int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
19
+
20
+ void launch_grid_sampler_2d_backward_kernel(
21
+ const TensorBase &grad_input, const TensorBase &grad_grid,
22
+ const TensorBase &grad_output, const TensorBase &input,
23
+ const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
24
+ bool align_corners, std::array<bool, 2> output_mask);
25
+
26
+ void launch_grid_sampler_3d_backward_kernel(
27
+ const TensorBase &grad_input, const TensorBase &grad_grid,
28
+ const TensorBase &grad_output, const TensorBase &input,
29
+ const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
30
+ bool align_corners, std::array<bool, 2> output_mask);
31
+
32
+ }} // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+ #include <type_traits>
5
+ #include <c10/core/DynamicCast.h>
6
+ #include <c10/util/Exception.h>
7
+ #include <c10/util/TypeCast.h>
8
+ #include <c10/macros/Macros.h>
9
+ #include <ATen/core/Array.h>
10
+ #include <ATen/detail/FunctionTraits.h>
11
+ #include <ATen/cuda/detail/OffsetCalculator.cuh>
12
+ #include <ATen/native/cuda/thread_constants.h>
13
+
14
+ #include <thrust/tuple.h>
15
+
16
+ // References:
17
+ // https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access/
18
+
19
+ namespace at { namespace native { namespace memory {
20
+
21
+ namespace detail {
22
+
23
+ // What does the `static_unroll` do?
24
+ //
25
+ // We want to do something like:
26
+ //
27
+ // using args_t = typename traits::ArgsTuple;
28
+ // args_t args;
29
+ // #pragma unroll
30
+ // for (int i = 0; i < traits::arity; i++) {
31
+ // std::get<i>(args) = ....
32
+ // }
33
+ //
34
+ // but unfortunately the above code does not work because
35
+ // the template argument has to be a compile time constant
36
+ // so `static_unroll` is created to simulate `#pragma unroll`
37
+ // using template metaprogramming.
38
+
39
+ template<template<int i> typename func, int end, int current=0>
40
+ struct static_unroll {
41
+ template<typename... Args>
42
+ static inline C10_HOST_DEVICE void with_args(Args&&... args) {
43
+ func<current>::apply(std::forward<Args>(args)...);
44
+ static_unroll<func, end, current+1>::with_args(args...);
45
+ }
46
+ };
47
+
48
+ template<template<int i> typename func, int end>
49
+ struct static_unroll<func, end, end> {
50
+ template<typename... Args>
51
+ static inline C10_HOST_DEVICE void with_args(Args... args) {}
52
+ };
53
+
54
+ // helper structs to be used with static_unroll to load arguments
55
+ // one by one
56
+
57
+ template<int arg_index>
58
+ struct vectorized_load_helper {
59
+ template <typename args_t, typename policy_t>
60
+ static __device__ void apply(policy_t &self, args_t *args, int idx) {
61
+ using arg_t = std::tuple_element_t<arg_index, args_t>;
62
+ // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we
63
+ // need a +1 offset to get the input
64
+ auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + block_work_size() * idx;
65
+ auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get<arg_index>(args[thread_unroll_idx]); };
66
+ self.load_single_arg(args_accessor, ptr);
67
+ }
68
+ };
69
+
70
+ template<int arg_index>
71
+ struct unroll_load_helper {
72
+ template <typename args_t, typename policy_t, typename offset_t, typename loader_t>
73
+ static __device__ void apply(policy_t &self, args_t *args, offset_t offset, loader_t loader, int j, int num_outputs) {
74
+ using arg_t = std::tuple_element_t<arg_index, args_t>;
75
+ // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we
76
+ // need a +1 offset to get the input
77
+ std::get<arg_index>(args[j]) = loader.template load<arg_t>(self.data[arg_index + num_outputs], offset[arg_index], arg_index);
78
+ }
79
+ };
80
+
81
+ template <int current>
82
+ struct multi_outputs_store_helper {
83
+ template<int ntensors, int num_outputs, typename ...Args>
84
+ C10_HOST_DEVICE static void apply(
85
+ at::detail::Array<char*, ntensors> data,
86
+ at::detail::Array<uint32_t, num_outputs> offsets,
87
+ thrust::tuple<Args...> ret) {
88
+ using T = typename thrust::tuple_element<current, thrust::tuple<Args...>>::type;
89
+ T *to = reinterpret_cast<T *>(data[current]) + offsets[current];
90
+ *to = thrust::get<current>(ret);
91
+ }
92
+ };
93
+
94
+ } // namespace detail
95
+
96
+ struct LoadWithoutCast {
97
+ template<typename scalar_t>
98
+ __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) {
99
+ return c10::load(reinterpret_cast<scalar_t *>(base_ptr) + offset);
100
+ }
101
+ };
102
+
103
+ template <int N>
104
+ struct LoadWithCast {
105
+ using array_t = at::detail::Array<at::ScalarType, std::max<int>(N, 1)>;
106
+ using size_array_t = at::detail::Array<uint32_t, std::max<int>(N, 1)>;
107
+
108
+ array_t dtypes;
109
+ size_array_t element_sizes;
110
+
111
+ LoadWithCast(const TensorIteratorBase& iter) {
112
+ CUDA_KERNEL_ASSERT(iter.ninputs() == N);
113
+ #pragma unroll
114
+ for (auto i = 0; i < N; ++i) {
115
+ this->dtypes[i] = iter.dtype(i + iter.noutputs());
116
+ element_sizes[i] = c10::elementSize(iter.dtype(i + iter.noutputs()));
117
+ }
118
+ }
119
+
120
+ template<typename scalar_t>
121
+ __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) {
122
+ void *ptr = base_ptr + element_sizes[arg] * offset;
123
+ return c10::fetch_and_cast<scalar_t>(dtypes[arg], ptr);
124
+ }
125
+ };
126
+
127
+ struct StoreWithoutCast {
128
+ template<typename scalar_t>
129
+ __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) {
130
+ *(reinterpret_cast<scalar_t *>(base_ptr) + offset) = value;
131
+ }
132
+ };
133
+
134
+ template <int N = 1>
135
+ struct StoreWithCast {
136
+ using array_t = at::detail::Array<at::ScalarType, std::max<int>(N, 1)>;
137
+ using size_array_t = at::detail::Array<uint32_t, std::max<int>(N, 1)>;
138
+
139
+ array_t dtypes;
140
+ size_array_t element_sizes;
141
+
142
+ StoreWithCast(const TensorIteratorBase& iter) {
143
+ CUDA_KERNEL_ASSERT(iter.noutputs() == N);
144
+ #pragma unroll
145
+ for (auto i = 0; i < N; ++i) {
146
+ this->dtypes[i] = iter.dtype(i);
147
+ element_sizes[i] = c10::elementSize(iter.dtype(i));
148
+ }
149
+ }
150
+
151
+ template<typename scalar_t>
152
+ __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) {
153
+ void *ptr = base_ptr + element_sizes[arg] * offset;
154
+ c10::cast_and_store<scalar_t>(dtypes[arg], ptr, value);
155
+ }
156
+ };
157
+
158
+ // aligned vector generates vectorized load/store on CUDA
159
+ template<typename scalar_t, int vec_size>
160
+ struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
161
+ scalar_t val[vec_size];
162
+ };
163
+
164
+ template <int vec_size, typename scalar_t>
165
+ __device__ aligned_vector<scalar_t, vec_size> load_vector(const scalar_t *base_ptr, uint32_t offset) {
166
+ using vec_t = aligned_vector<scalar_t, vec_size>;
167
+ auto *from = reinterpret_cast<const vec_t *>(base_ptr);
168
+ return from[offset];
169
+ }
170
+
171
+ template <int vec_size>
172
+ __device__ aligned_vector<bool, vec_size> load_vector(const bool *base_ptr, uint32_t offset) {
173
+ // See NOTE [Loading boolean values]
174
+ auto tmp = load_vector<vec_size>(reinterpret_cast<const uint8_t*>(base_ptr), offset);
175
+ aligned_vector<bool, vec_size> ret;
176
+ for (int i = 0; i < vec_size; ++i) {
177
+ ret.val[i] = bool(tmp.val[i]);
178
+ }
179
+ return ret;
180
+ }
181
+
182
+ namespace policies {
183
+
184
+ // Assumption:
185
+ // all tensors are contiguous, that is: stride == sizeof(type) for all tensors
186
+ template<typename data_t, typename inp_calc_t, typename out_calc_t, typename loader_t, typename storer_t, int num_outputs = 1>
187
+ struct unroll {
188
+
189
+ data_t data;
190
+ int remaining;
191
+ inp_calc_t input_offset_calculator;
192
+ out_calc_t output_offset_calculator;
193
+ loader_t loader;
194
+ storer_t storer;
195
+
196
+ __device__ unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s):
197
+ data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {}
198
+
199
+ __device__ inline bool check_inbounds(int thread_work_elem) {
200
+ return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining);
201
+ }
202
+
203
+ template<typename args_t>
204
+ __device__ inline void load(args_t *args, int idx) {
205
+ constexpr int arity = std::tuple_size<args_t>::value;
206
+ int thread_idx = threadIdx.x;
207
+ #pragma unroll
208
+ for (int i = 0; i < thread_work_size(); i++) {
209
+ if (thread_idx >= remaining) {
210
+ return;
211
+ }
212
+ int linear_idx = thread_idx + block_work_size() * idx;
213
+ auto offset = input_offset_calculator.get(linear_idx);
214
+ detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
215
+ thread_idx += num_threads();
216
+ }
217
+ }
218
+
219
+ template<typename scalar_t>
220
+ __device__ inline void store(scalar_t *from, int idx) {
221
+ int thread_idx = threadIdx.x;
222
+ #pragma unroll
223
+ for (int i = 0; i < thread_work_size(); i++) {
224
+ if (thread_idx >= remaining) {
225
+ return;
226
+ }
227
+ int linear_idx = thread_idx + block_work_size() * idx;
228
+ int offset = output_offset_calculator.get(linear_idx)[0];
229
+ storer.store(from[i], data[0], offset);
230
+ thread_idx += num_threads();
231
+ }
232
+ }
233
+ };
234
+
235
+ // Assumption:
236
+ // all tensors are contiguous, that is: stride == sizeof(type) for all tensors
237
+ // Note:
238
+ // Functions in vectorized policy does not do boundary check. It assumes the whole block
239
+ // has its job to do. So the reminders should be handled by the caller manually.
240
+ template <int vec_size, typename data_t> // vec_size: number of scalars, can be 1, 2, or 4.
241
+ struct vectorized {
242
+
243
+ static_assert(thread_work_size() % vec_size == 0, "The workload per thread must be a multiple of vec_size");
244
+ static constexpr int loop_size = thread_work_size() / vec_size;
245
+
246
+ data_t data;
247
+
248
+ __device__ vectorized(data_t data) : data(data) {}
249
+
250
+ __device__ inline constexpr bool check_inbounds(int thread_work_elem) {
251
+ return true;
252
+ }
253
+
254
+ template<typename accessor_t, typename scalar_t>
255
+ __device__ inline void load_single_arg(accessor_t to, scalar_t *from) {
256
+ int thread_idx = threadIdx.x;
257
+ #pragma unroll
258
+ for (int i = 0; i < loop_size; i++) {
259
+ int index = thread_idx + i * num_threads();
260
+ auto v = load_vector<vec_size>(from, index);
261
+ #pragma unroll
262
+ for (int j = 0; j < vec_size; j++) {
263
+ to(vec_size * i + j) = v.val[j];
264
+ }
265
+ }
266
+ }
267
+
268
+ template<typename args_t>
269
+ __device__ inline void load(args_t *args, int idx) {
270
+ constexpr int arity = std::tuple_size<args_t>::value;
271
+ detail::static_unroll<detail::vectorized_load_helper, arity>::with_args(*this, args, idx);
272
+ }
273
+
274
+ template<typename scalar_t>
275
+ __device__ inline void store(scalar_t *from, int idx) {
276
+ using vec_t = aligned_vector<scalar_t, vec_size>;
277
+ scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size() * idx;
278
+ vec_t *to_ = reinterpret_cast<vec_t *>(to);
279
+ int thread_idx = threadIdx.x;
280
+ #pragma unroll
281
+ for (int i = 0; i < loop_size; i++) {
282
+ int index = thread_idx + i * num_threads();
283
+ vec_t v;
284
+ for (int j = 0; j < vec_size; j++) {
285
+ v.val[j] = from[vec_size * i + j];
286
+ }
287
+ to_[index] = v;
288
+ }
289
+ }
290
+ };
291
+
292
+ template <typename data_t, typename inp_calc_t, typename out_calc_t, int num_outputs>
293
+ struct multi_outputs_unroll {
294
+ //multi_outputs_unroll struct members and check_inbounds and load methods are copypasted from unroll struct
295
+ //we don't use inheritance because of compiler bug in cuda 10.2+
296
+ data_t data;
297
+ int remaining;
298
+ inp_calc_t input_offset_calculator;
299
+ out_calc_t output_offset_calculator;
300
+ LoadWithoutCast loader;
301
+ StoreWithoutCast storer;
302
+
303
+ __device__ multi_outputs_unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc):
304
+ data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {}
305
+
306
+ __device__ inline bool check_inbounds(int thread_work_elem) {
307
+ return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining);
308
+ }
309
+
310
+ template<typename args_t>
311
+ __device__ inline void load(args_t *args, int idx) {
312
+ constexpr int arity = std::tuple_size<args_t>::value;
313
+ int thread_idx = threadIdx.x;
314
+ #pragma unroll
315
+ for (int i = 0; i < thread_work_size(); i++) {
316
+ if (thread_idx >= remaining) {
317
+ return;
318
+ }
319
+ int linear_idx = thread_idx + block_work_size() * idx;
320
+ auto offset = input_offset_calculator.get(linear_idx);
321
+ detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
322
+ thread_idx += num_threads();
323
+ }
324
+ }
325
+
326
+
327
+ template <typename return_t>
328
+ __device__ inline void store(return_t *from, int idx) {
329
+ int thread_idx = threadIdx.x;
330
+ #pragma unroll
331
+ for (int i = 0; i < thread_work_size(); i++) {
332
+ if (thread_idx >= this->remaining) {
333
+ return;
334
+ }
335
+ int linear_idx = thread_idx + block_work_size() * idx;
336
+ auto offsets = this->output_offset_calculator.get(linear_idx);
337
+ memory::detail::static_unroll<detail::multi_outputs_store_helper, num_outputs>::with_args(this->data, offsets, from[i]);
338
+ thread_idx += num_threads();
339
+ }
340
+ }
341
+ };
342
+
343
+ } // namespace policies
344
+
345
+ // This is only used in host, but we will wrap this into some templates
346
+ // which is C10_HOST_DEVICE, so we have to make this C10_HOST_DEVICE
347
+ // in order to compile
348
+ template<typename scalar_t>
349
+ inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) {
350
+ uint64_t address = reinterpret_cast<uint64_t>(pointer);
351
+ constexpr int vec2_alignment = std::alignment_of<aligned_vector<scalar_t, 2>>::value;
352
+ constexpr int vec4_alignment = std::alignment_of<aligned_vector<scalar_t, 4>>::value;
353
+ if (address % vec4_alignment == 0) {
354
+ return 4;
355
+ } else if (address % vec2_alignment == 0) {
356
+ return 2;
357
+ }
358
+ return 1;
359
+ }
360
+
361
+ template<int i>
362
+ struct can_vectorize_up_to_helper {
363
+ template <typename array_t, typename traits>
364
+ static C10_HOST_DEVICE void apply(int &result, array_t pointers, traits _) {
365
+ using arg_t = typename traits::template arg<i>::type;
366
+ // `pointers` hold the data_ptr for tensors [output, input0, input1, ...], so we
367
+ // need a +1 offset to get the input
368
+ result = std::min<int>(result, can_vectorize_up_to<arg_t>(pointers[i + 1]));
369
+ }
370
+ };
371
+
372
+ template<typename func_t, typename array_t>
373
+ inline int can_vectorize_up_to(array_t pointers) {
374
+ using traits = function_traits<func_t>;
375
+ using return_t = typename traits::result_type;
376
+ constexpr int arity = traits::arity;
377
+ int result = can_vectorize_up_to<return_t>(pointers[0]);
378
+ // We need to get the type for each argument of `func_t`, this can only
379
+ // be done at compile time.
380
+ detail::static_unroll<can_vectorize_up_to_helper, arity>::with_args(result, pointers, traits());
381
+ return result;
382
+ }
383
+
384
+ }}} // namespace at::native::memory
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+ #include <ATen/cuda/CUDAContext.h>
4
+ #include <c10/cuda/CUDAGuard.h>
5
+ #include <ATen/native/cuda/Loops.cuh>
6
+ #include <ATen/native/cuda/MemoryAccess.cuh>
7
+ #include <vector>
8
+
9
+ namespace at::native {
10
+
11
+ namespace {
12
+
13
+ static constexpr int64_t kILP = 4;
14
+ static constexpr int64_t kChunkSize = 65536;
15
+ static constexpr int64_t kBlockSize = 512;
16
+
17
+ // TODO(crcrpar): Add `n>5` for `low prec params & their higher prec copy`
18
+ // TensorListMetadata has to be < 4KB - the limit for kernel launch argument
19
+ static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
20
+ static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
21
+ static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30};
22
+ static constexpr int depth_to_max_tensors_scalarlist_of_complex_double[2] = {
23
+ 72,
24
+ 60};
25
+
26
+ template <typename T>
27
+ __device__ __forceinline__ bool is_aligned(T* p) {
28
+ return ((uint64_t)p) % (kILP * sizeof(T)) == 0;
29
+ }
30
+
31
+ template <typename T>
32
+ __device__ __forceinline__ void load_store(
33
+ T* dst,
34
+ T* src,
35
+ int64_t dst_offset,
36
+ int64_t src_offset) {
37
+ using LT = at::native::memory::aligned_vector<T, kILP>;
38
+ ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
39
+ }
40
+
41
+ template <int n>
42
+ struct TensorListMetadata {
43
+ const void* addresses[n][depth_to_max_tensors[n - 1]];
44
+ int64_t numel_for_tensor[depth_to_max_tensors[n - 1]];
45
+ unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
46
+ int block_to_chunk[depth_to_max_blocks[n - 1]];
47
+ int start_tensor_this_launch;
48
+ };
49
+
50
+ template <typename scalar_vals_t, int n>
51
+ struct TensorListScalarListMetadata {
52
+ const void* addresses[n][depth_to_max_tensors_scalarlist[n - 1]];
53
+ int64_t numel_for_tensor[depth_to_max_tensors_scalarlist[n - 1]];
54
+ scalar_vals_t scalar_vals[depth_to_max_tensors_scalarlist[n - 1]];
55
+ unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
56
+ int block_to_chunk[depth_to_max_blocks[n - 1]];
57
+ };
58
+
59
+ // note(mkozuki): `n` of 1&2 violate the limit of cuda kernel argument size of
60
+ // 4kb with `c10::complex<double>`
61
+ template <>
62
+ struct TensorListScalarListMetadata<c10::complex<double>, 1> {
63
+ const void* addresses[1]
64
+ [depth_to_max_tensors_scalarlist_of_complex_double[0]];
65
+ int64_t
66
+ numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[0]];
67
+ c10::complex<double>
68
+ scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[0]];
69
+ unsigned char block_to_tensor[depth_to_max_blocks[1 - 1]];
70
+ int block_to_chunk[depth_to_max_blocks[1 - 1]];
71
+ };
72
+
73
+ template <>
74
+ struct TensorListScalarListMetadata<c10::complex<double>, 2> {
75
+ const void* addresses[2]
76
+ [depth_to_max_tensors_scalarlist_of_complex_double[1]];
77
+ int64_t
78
+ numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[1]];
79
+ c10::complex<double>
80
+ scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[1]];
81
+ unsigned char block_to_tensor[depth_to_max_blocks[2 - 1]];
82
+ int block_to_chunk[depth_to_max_blocks[2 - 1]];
83
+ };
84
+
85
+ // NOTE(crcrpar): This is a conservative resolution to handle `state_steps`
86
+ // whose each element is `at::Tensor` of 1 element representing the number of
87
+ // `step`s called so far.
88
+ template <int n>
89
+ struct FusedOptimizerTensorListMetadata {
90
+ const void* addresses[n][depth_to_max_tensors[n - 1]];
91
+ int64_t numel_for_tensor[depth_to_max_tensors[n - 1]];
92
+ const void* state_steps_addresses[depth_to_max_tensors_scalarlist[n - 1]];
93
+ unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
94
+ int block_to_chunk[depth_to_max_blocks[n - 1]];
95
+ int start_tensor_this_launch;
96
+ };
97
+
98
+ template <typename T, typename U, typename... ArgTypes>
99
+ C10_LAUNCH_BOUNDS_1(kBlockSize)
100
+ __global__ void multi_tensor_apply_kernel(
101
+ T tensorListMeta,
102
+ U callable,
103
+ ArgTypes... args) {
104
+ // Hand the chunk information to the user-supplied functor to process however
105
+ // it likes.
106
+ callable(kChunkSize, tensorListMeta, args...);
107
+ }
108
+
109
+ } // namespace
110
+
111
+ // multi_tensor_apply enables horizontal fusion across lists of tensors.
112
+ // For example, whereas you once had a for-loop of a + b = c, where a, b,
113
+ // and c are individual tensors in lists as, bs, and cs, you can now with
114
+ // fewer kernel launches compute as + bs = cs.
115
+ //
116
+ // You can also imagine bs to be a scalar list vs a tensor list.
117
+ //
118
+ // The function below takes in tensor lists, scalars, and a callable and
119
+ // chunks up the computation to launch as few kernels as possible by iterating
120
+ // through every "chunk" in every tensor (thus the nested for loops). In the
121
+ // simplest case, everything gets bundled into just one kernel launch, but
122
+ // due to blocksize constraints, we may need to launch multiple kernels.
123
+ // Each kernel launch is defined by one tensorListMeta construct, which we
124
+ // use to track and reset the necessary metadata for each launch.
125
+ template <int depth, typename scalar_T, typename T, typename... ArgTypes>
126
+ void multi_tensor_apply(
127
+ std::vector<std::vector<at::Tensor>>& tensor_lists,
128
+ at::ArrayRef<Scalar> scalars,
129
+ T callable,
130
+ ArgTypes... args) {
131
+ TORCH_CHECK(
132
+ tensor_lists.size() == depth,
133
+ "Number of tensor lists has to match the depth.");
134
+ const size_t n_tensors = tensor_lists[0].size();
135
+ using scalar_vals_t = typename T::opmath_t;
136
+ TensorListScalarListMetadata<scalar_vals_t, depth> tensorListMeta;
137
+
138
+ int loc_block_info = 0;
139
+ int loc_tensor_info = 0;
140
+ for (size_t t = 0; t < n_tensors; t++) {
141
+ // short-circuit to avoid adding empty tensors to tensorListMeta
142
+ if (tensor_lists[0][t].numel() == 0) {
143
+ continue;
144
+ }
145
+ tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t].to<scalar_T>();
146
+ tensorListMeta.numel_for_tensor[loc_tensor_info] =
147
+ tensor_lists[0][t].numel();
148
+ for (int d = 0; d < depth; d++) {
149
+ tensorListMeta.addresses[d][loc_tensor_info] =
150
+ tensor_lists[d][t].const_data_ptr();
151
+ }
152
+ loc_tensor_info++;
153
+
154
+ // now we enter [chunking territory].
155
+ // we will launch a kernel when EITHER the blocks get filled up OR
156
+ // the tensors get filled up. There will always be at least one block
157
+ // per tensor since the zero-sized ones will not enter the loop, so
158
+ // the nested forloop within represents iterating through the chunks
159
+ // of a single tensor.
160
+ const auto numel = tensor_lists[0][t].numel();
161
+ const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0);
162
+ for (auto chunk = 0; chunk < chunks; chunk++) {
163
+ tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
164
+ tensorListMeta.block_to_chunk[loc_block_info] = chunk;
165
+ loc_block_info++;
166
+
167
+ // a tensor is not considered full unless all its chunks have been
168
+ // processed
169
+ const bool tensors_full =
170
+ (loc_tensor_info == depth_to_max_tensors_scalarlist[depth - 1] &&
171
+ chunk == chunks - 1);
172
+ const bool blocks_full =
173
+ (loc_block_info == depth_to_max_blocks[depth - 1]);
174
+
175
+ if (tensors_full || blocks_full) {
176
+ multi_tensor_apply_kernel<<<
177
+ loc_block_info,
178
+ kBlockSize,
179
+ 0,
180
+ at::cuda::getCurrentCUDAStream()>>>(
181
+ tensorListMeta, callable, args...);
182
+ C10_CUDA_KERNEL_LAUNCH_CHECK();
183
+
184
+ // Reset.
185
+ loc_block_info = 0;
186
+ // all chunks have already been handled in the kernel
187
+ if (chunk == chunks - 1) {
188
+ loc_tensor_info = 0;
189
+ } else { // blocks were full and tensor chunks remain
190
+ tensorListMeta.numel_for_tensor[0] =
191
+ tensorListMeta.numel_for_tensor[loc_tensor_info - 1];
192
+ tensorListMeta.scalar_vals[0] =
193
+ tensorListMeta.scalar_vals[loc_tensor_info - 1];
194
+ for (int d = 0; d < depth; d++) {
195
+ tensorListMeta.addresses[d][0] =
196
+ tensorListMeta.addresses[d][loc_tensor_info - 1];
197
+ }
198
+ loc_tensor_info = 1;
199
+ }
200
+ }
201
+ }
202
+ }
203
+
204
+ // note: [finishing what we started]
205
+ // if there's remaining work to be done but the tensors/blocks aren't full
206
+ // yet we are at the end, submit the kernel to do the work!
207
+ if (loc_block_info != 0) {
208
+ multi_tensor_apply_kernel<<<
209
+ loc_block_info,
210
+ kBlockSize,
211
+ 0,
212
+ at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...);
213
+ C10_CUDA_KERNEL_LAUNCH_CHECK();
214
+ }
215
+ }
216
+
217
+ template <int depth, typename T, typename... ArgTypes>
218
+ void multi_tensor_apply(
219
+ std::vector<std::vector<at::Tensor>>& tensor_lists,
220
+ T callable,
221
+ ArgTypes... args) {
222
+ TORCH_CHECK(
223
+ tensor_lists.size() == depth,
224
+ "Number of tensor lists has to match the depth.");
225
+ const size_t n_tensors = tensor_lists[0].size();
226
+ TensorListMetadata<depth> tensorListMeta;
227
+ tensorListMeta.start_tensor_this_launch = 0;
228
+
229
+ int loc_block_info = 0;
230
+ int loc_tensor_info = 0;
231
+ for (size_t t = 0; t < n_tensors; t++) {
232
+ // short-circuit to avoid adding empty tensors to tensorListMeta
233
+ if (tensor_lists[0][t].numel() == 0) {
234
+ continue;
235
+ }
236
+ tensorListMeta.numel_for_tensor[loc_tensor_info] =
237
+ tensor_lists[0][t].numel();
238
+ for (int d = 0; d < depth; d++) {
239
+ tensorListMeta.addresses[d][loc_tensor_info] =
240
+ tensor_lists[d][t].const_data_ptr();
241
+ }
242
+ loc_tensor_info++;
243
+
244
+ // see note: [chunking territory].
245
+ const auto numel = tensor_lists[0][t].numel();
246
+ const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0);
247
+ for (auto chunk = 0; chunk < chunks; chunk++) {
248
+ tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
249
+ tensorListMeta.block_to_chunk[loc_block_info] = chunk;
250
+ loc_block_info++;
251
+
252
+ const bool tensors_full =
253
+ (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
254
+ chunk == chunks - 1);
255
+ const bool blocks_full =
256
+ (loc_block_info == depth_to_max_blocks[depth - 1]);
257
+
258
+ if (tensors_full || blocks_full) {
259
+ multi_tensor_apply_kernel<<<
260
+ loc_block_info,
261
+ kBlockSize,
262
+ 0,
263
+ at::cuda::getCurrentCUDAStream()>>>(
264
+ tensorListMeta, callable, args...);
265
+ C10_CUDA_KERNEL_LAUNCH_CHECK();
266
+
267
+ // Reset.
268
+ loc_block_info = 0;
269
+ if (chunk == chunks - 1) {
270
+ loc_tensor_info = 0;
271
+ tensorListMeta.start_tensor_this_launch = t + 1;
272
+ } else {
273
+ tensorListMeta.numel_for_tensor[0] =
274
+ tensorListMeta.numel_for_tensor[loc_tensor_info - 1];
275
+ for (int d = 0; d < depth; d++) {
276
+ tensorListMeta.addresses[d][0] =
277
+ tensorListMeta.addresses[d][loc_tensor_info - 1];
278
+ }
279
+ loc_tensor_info = 1;
280
+ tensorListMeta.start_tensor_this_launch = t;
281
+ }
282
+ }
283
+ }
284
+ }
285
+
286
+ // see note: [finishing what we started]
287
+ if (loc_block_info != 0) {
288
+ multi_tensor_apply_kernel<<<
289
+ loc_block_info,
290
+ kBlockSize,
291
+ 0,
292
+ at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...);
293
+ C10_CUDA_KERNEL_LAUNCH_CHECK();
294
+ }
295
+ }
296
+
297
+ template <int depth, typename T, typename... ArgTypes>
298
+ void multi_tensor_apply_for_fused_optimizer(
299
+ std::vector<std::vector<at::Tensor>>& tensor_lists,
300
+ at::TensorList state_steps,
301
+ T callable,
302
+ ArgTypes... args) {
303
+ TORCH_CHECK(
304
+ tensor_lists.size() == depth,
305
+ "Number of tensor lists has to match the depth");
306
+ const auto num_tensors = tensor_lists[0].size();
307
+ FusedOptimizerTensorListMetadata<depth> tensorListMeta;
308
+
309
+ int loc_block_info = 0;
310
+ int loc_tensor_info = 0;
311
+ for (const auto& tensor_index : c10::irange(num_tensors)) {
312
+ // short-circuit to avoid adding empty tensors to tensorListMeta
313
+ if (tensor_lists[0][tensor_index].numel() == 0) {
314
+ continue;
315
+ }
316
+ tensorListMeta.state_steps_addresses[loc_tensor_info] =
317
+ state_steps[tensor_index].const_data_ptr();
318
+ tensorListMeta.numel_for_tensor[loc_tensor_info] =
319
+ tensor_lists[0][tensor_index].numel();
320
+ for (const auto& d : c10::irange(depth)) {
321
+ tensorListMeta.addresses[d][loc_tensor_info] =
322
+ tensor_lists[d][tensor_index].const_data_ptr();
323
+ }
324
+ loc_tensor_info++;
325
+
326
+ // see above note: [chunking territory]
327
+ const auto numel = tensor_lists[0][tensor_index].numel();
328
+ const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0);
329
+ TORCH_CHECK(chunks > -1);
330
+ for (const auto& chunk : c10::irange(chunks)) {
331
+ tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
332
+ tensorListMeta.block_to_chunk[loc_block_info] = chunk;
333
+ loc_block_info++;
334
+
335
+ const auto tensor_full =
336
+ (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
337
+ chunk == chunks - 1);
338
+ const auto blocks_full = loc_block_info == depth_to_max_blocks[depth - 1];
339
+
340
+ if (tensor_full || blocks_full) {
341
+ multi_tensor_apply_kernel<<<
342
+ loc_block_info,
343
+ kBlockSize,
344
+ 0,
345
+ at::cuda::getCurrentCUDAStream()>>>(
346
+ tensorListMeta, callable, args...);
347
+ C10_CUDA_KERNEL_LAUNCH_CHECK();
348
+
349
+ // Reset.
350
+ loc_block_info = 0;
351
+ if (chunk == chunks - 1) {
352
+ loc_tensor_info = 0;
353
+ } else {
354
+ tensorListMeta.numel_for_tensor[0] =
355
+ tensorListMeta.numel_for_tensor[loc_tensor_info - 1];
356
+ tensorListMeta.state_steps_addresses[0] =
357
+ tensorListMeta.state_steps_addresses[loc_tensor_info - 1];
358
+ for (const auto& d : c10::irange(depth)) {
359
+ tensorListMeta.addresses[d][0] =
360
+ tensorListMeta.addresses[d][loc_tensor_info - 1];
361
+ }
362
+ loc_tensor_info = 1;
363
+ }
364
+ }
365
+ }
366
+ }
367
+
368
+ // see above note: [finishing what we've started]
369
+ if (loc_block_info != 0) {
370
+ multi_tensor_apply_kernel<<<
371
+ loc_block_info,
372
+ kBlockSize,
373
+ 0,
374
+ at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...);
375
+ C10_CUDA_KERNEL_LAUNCH_CHECK();
376
+ }
377
+ }
378
+
379
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Resize.h ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/EmptyTensor.h>
4
+ #include <ATen/native/ResizeCommon.h>
5
+
6
+ #include <c10/cuda/CUDAGuard.h>
7
+
8
+ namespace at { namespace native {
9
+
10
+ TORCH_CUDA_CPP_API void resize_bytes_cuda(StorageImpl* storage, size_t size_bytes);
11
+
12
+ static inline void maybe_resize_storage_cuda(TensorImpl* self, size_t new_size_bytes) {
13
+ // It does not make sense to try to resize a storage
14
+ // to hold 0 elements, and this can break
15
+ // if storage_offset is positive but
16
+ // new_size is 0, so just bail in that case
17
+ // (same comment is in Resize.h)
18
+ if (self->numel() == 0) {
19
+ return;
20
+ }
21
+
22
+ const Storage &storage = self->unsafe_storage();
23
+ TORCH_CHECK(storage, "Tensor: invalid null storage");
24
+ if (new_size_bytes > storage.nbytes()) {
25
+ resize_bytes_cuda(storage.unsafeGetStorageImpl(), new_size_bytes);
26
+ }
27
+ }
28
+
29
+ inline TensorImpl* resize_impl_cuda_(
30
+ TensorImpl* self,
31
+ IntArrayRef size,
32
+ at::OptionalIntArrayRef stride,
33
+ bool device_guard = true) {
34
+ if (self->sizes() == size && (!stride || self->strides() == stride)) {
35
+ return self;
36
+ }
37
+
38
+ // NB: We don't need to hold the device guard when calling from TH
39
+ cuda::OptionalCUDAGuard guard;
40
+ if (device_guard) {
41
+ guard.set_index(self->storage().device().index());
42
+ }
43
+
44
+ const auto itemsize = self->dtype().itemsize();
45
+ const auto storage_offset = self->storage_offset();
46
+ size_t storage_size = 1;
47
+ if (stride) {
48
+ self->set_sizes_and_strides(size, *stride);
49
+ storage_size = at::detail::computeStorageNbytes(
50
+ size, *stride, itemsize, storage_offset);
51
+ } else {
52
+ self->set_sizes_contiguous(size);
53
+ storage_size = at::detail::computeStorageNbytesContiguous(
54
+ size, itemsize, storage_offset);
55
+ }
56
+ maybe_resize_storage_cuda(self, storage_size);
57
+
58
+ return self;
59
+ }
60
+
61
+ }}
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <cstdint>
3
+ #include <ATen/core/TensorBase.h>
4
+ #include <ATen/native/cuda/SortStable.h>
5
+
6
+ namespace at {
7
+ namespace native {
8
+
9
+ inline bool should_use_small_sort(const TensorBase &self, int64_t dim) {
10
+ return self.size(dim) <= 4096;
11
+ }
12
+
13
+ void sortKeyValueInplace(
14
+ const TensorBase &key, const TensorBase &value, int dim,
15
+ bool descending, bool stable=false);
16
+
17
+ }} // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+
4
+ namespace at {
5
+ namespace native {
6
+
7
+ void _fused_adam_amsgrad_cuda_impl_(
8
+ at::TensorList params,
9
+ at::TensorList grads,
10
+ at::TensorList exp_avgs,
11
+ at::TensorList exp_avg_sqs,
12
+ at::TensorList max_exp_avg_sqs,
13
+ at::TensorList state_steps,
14
+ const double lr,
15
+ const double beta1,
16
+ const double beta2,
17
+ const double weight_decay,
18
+ const double eps,
19
+ const bool maximize,
20
+ const c10::optional<at::Tensor>& grad_scale,
21
+ const c10::optional<at::Tensor>& found_inf);
22
+
23
+ void _fused_adam_amsgrad_cuda_impl_(
24
+ at::TensorList params,
25
+ at::TensorList grads,
26
+ at::TensorList exp_avgs,
27
+ at::TensorList exp_avg_sqs,
28
+ at::TensorList max_exp_avg_sqs,
29
+ at::TensorList state_steps,
30
+ const at::Tensor& lr,
31
+ const double beta1,
32
+ const double beta2,
33
+ const double weight_decay,
34
+ const double eps,
35
+ const bool maximize,
36
+ const c10::optional<at::Tensor>& grad_scale,
37
+ const c10::optional<at::Tensor>& found_inf);
38
+
39
+ } // namespace native
40
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+
4
+ namespace at {
5
+ namespace native {
6
+
7
+ void _fused_adam_cuda_impl_(
8
+ at::TensorList params,
9
+ at::TensorList grads,
10
+ at::TensorList exp_avgs,
11
+ at::TensorList exp_avg_sqs,
12
+ at::TensorList state_steps,
13
+ const double lr,
14
+ const double beta1,
15
+ const double beta2,
16
+ const double weight_decay,
17
+ const double eps,
18
+ const bool maximize,
19
+ const c10::optional<at::Tensor>& grad_scale,
20
+ const c10::optional<at::Tensor>& found_inf);
21
+
22
+ void _fused_adam_cuda_impl_(
23
+ at::TensorList params,
24
+ at::TensorList grads,
25
+ at::TensorList exp_avgs,
26
+ at::TensorList exp_avg_sqs,
27
+ at::TensorList state_steps,
28
+ const at::Tensor& lr,
29
+ const double beta1,
30
+ const double beta2,
31
+ const double weight_decay,
32
+ const double eps,
33
+ const bool maximize,
34
+ const c10::optional<at::Tensor>& grad_scale,
35
+ const c10::optional<at::Tensor>& found_inf);
36
+
37
+ } // namespace native
38
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+
4
+ namespace at {
5
+ namespace native {
6
+
7
+ void _fused_adamw_amsgrad_cuda_impl_(
8
+ at::TensorList params,
9
+ at::TensorList grads,
10
+ at::TensorList exp_avgs,
11
+ at::TensorList exp_avg_sqs,
12
+ at::TensorList max_exp_avg_sqs,
13
+ at::TensorList state_steps,
14
+ const double lr,
15
+ const double beta1,
16
+ const double beta2,
17
+ const double weight_decay,
18
+ const double eps,
19
+ const bool maximize,
20
+ const c10::optional<at::Tensor>& grad_scale,
21
+ const c10::optional<at::Tensor>& found_inf);
22
+
23
+ void _fused_adamw_amsgrad_cuda_impl_(
24
+ at::TensorList params,
25
+ at::TensorList grads,
26
+ at::TensorList exp_avgs,
27
+ at::TensorList exp_avg_sqs,
28
+ at::TensorList max_exp_avg_sqs,
29
+ at::TensorList state_steps,
30
+ const at::Tensor& lr,
31
+ const double beta1,
32
+ const double beta2,
33
+ const double weight_decay,
34
+ const double eps,
35
+ const bool maximize,
36
+ const c10::optional<at::Tensor>& grad_scale,
37
+ const c10::optional<at::Tensor>& found_inf);
38
+
39
+ } // namespace native
40
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+
4
+ namespace at {
5
+ namespace native {
6
+
7
+ void _fused_adamw_cuda_impl_(
8
+ at::TensorList params,
9
+ at::TensorList grads,
10
+ at::TensorList exp_avgs,
11
+ at::TensorList exp_avg_sqs,
12
+ at::TensorList state_steps,
13
+ const double lr,
14
+ const double beta1,
15
+ const double beta2,
16
+ const double weight_decay,
17
+ const double eps,
18
+ const bool maximize,
19
+ const c10::optional<at::Tensor>& grad_scale,
20
+ const c10::optional<at::Tensor>& found_inf);
21
+
22
+ void _fused_adamw_cuda_impl_(
23
+ at::TensorList params,
24
+ at::TensorList grads,
25
+ at::TensorList exp_avgs,
26
+ at::TensorList exp_avg_sqs,
27
+ at::TensorList state_steps,
28
+ const at::Tensor& lr,
29
+ const double beta1,
30
+ const double beta2,
31
+ const double weight_decay,
32
+ const double eps,
33
+ const bool maximize,
34
+ const c10::optional<at::Tensor>& grad_scale,
35
+ const c10::optional<at::Tensor>& found_inf);
36
+
37
+ } // namespace native
38
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh ADDED
@@ -0,0 +1,680 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ namespace at {
2
+ namespace cuda {
3
+ //windows doesn't like large string literals, so split in two
4
+ const std::string reduction_template_0 = R"ESCAPE(
5
+ #define C10_HOST_DEVICE __host__ __device__
6
+ #define C10_DEVICE __device__
7
+ #if defined(__clang__) && defined(__HIP__)
8
+ #ifndef __forceinline__
9
+ #define __forceinline__ inline __attribute__((always_inline))
10
+ #endif
11
+ // until ROCm support for kernel asserts is restored
12
+ #define assert(expr) (static_cast<void>(0))
13
+ #endif
14
+
15
+ template <typename T>
16
+ __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
17
+ {
18
+ #if defined(__clang__) && defined(__HIP__)
19
+ return __shfl_down(value, delta, width);
20
+ #else
21
+ return __shfl_down_sync(mask, value, delta, width);
22
+ #endif
23
+ }
24
+
25
+
26
+ #if ${complex}
27
+ template <typename T>
28
+ __device__ __forceinline__ std::complex<T> WARP_SHFL_DOWN(std::complex<T> value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
29
+ {
30
+ return std::complex<T>(
31
+ #if defined(__clang__) && defined(__HIP__)
32
+ __shfl_down(value.real(), delta, width),
33
+ __shfl_down(value.imag(), delta, width));
34
+ #else
35
+ __shfl_down_sync(mask, value.real(), delta, width),
36
+ __shfl_down_sync(mask, value.imag(), delta, width));
37
+ #endif
38
+ }
39
+ #endif
40
+
41
+ // aligned vector generates vectorized load/store on CUDA
42
+ template<typename scalar_t, int vec_size>
43
+ struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
44
+ scalar_t val[vec_size];
45
+ };
46
+
47
+
48
+ C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) {
49
+ // get GCD of num and denom using Euclid's algorithm.
50
+ // Can replace this with std::gcd if we ever support c++17.
51
+ size_t a = denominator;
52
+ size_t b = numerator;
53
+ while (b != 0) {
54
+ a %= b;
55
+ // swap(a,b)
56
+ size_t tmp = a;
57
+ a = b;
58
+ b = tmp;
59
+ }
60
+
61
+ // a is now the GCD
62
+ numerator /= a;
63
+ denominator /= a;
64
+ }
65
+
66
+
67
+
68
+
69
+ struct ReduceConfig {
70
+ //has to match host-side ReduceConfig in the eager code
71
+ static constexpr int BLOCK_X = 0;
72
+ static constexpr int BLOCK_Y = 1;
73
+ static constexpr int CTA = 2;
74
+
75
+ static constexpr int input_vec_size = 4;
76
+ int element_size_bytes;
77
+ int num_inputs;
78
+ int num_outputs;
79
+ int step_input = 1;
80
+ int step_output = 1;
81
+ int ctas_per_output = 1;
82
+ int input_mult[3] = {0, 0, 0};
83
+ int output_mult[2] = {0, 0};
84
+
85
+ int block_width;
86
+ int block_height;
87
+ int num_threads;
88
+
89
+ bool vectorize_input = false;
90
+ int output_vec_size = 1;
91
+
92
+ C10_HOST_DEVICE bool should_block_x_reduce() const {
93
+ return input_mult[BLOCK_X] != 0;
94
+ }
95
+
96
+ C10_HOST_DEVICE bool should_block_y_reduce() const {
97
+ return input_mult[BLOCK_Y] != 0;
98
+ }
99
+
100
+ C10_HOST_DEVICE bool should_global_reduce() const {
101
+ return input_mult[CTA] != 0;
102
+ }
103
+
104
+ C10_DEVICE bool should_store(int output_idx) const {
105
+ return output_idx < num_outputs &&
106
+ (!should_block_x_reduce() || threadIdx.x == 0) &&
107
+ (!should_block_y_reduce() || threadIdx.y == 0);
108
+ }
109
+
110
+ C10_DEVICE bool should_reduce_tail() const {
111
+ return (!should_block_y_reduce() || threadIdx.y == 0) &&
112
+ (!should_global_reduce() || blockIdx.y == 0);
113
+ }
114
+
115
+ C10_HOST_DEVICE int input_idx() const {
116
+ int lane = threadIdx.x;
117
+ int warp = threadIdx.y;
118
+ int cta2 = blockIdx.y;
119
+ return (lane * input_mult[BLOCK_X] +
120
+ warp * input_mult[BLOCK_Y] +
121
+ cta2 * input_mult[CTA]);
122
+ }
123
+
124
+ template <int output_vec_size>
125
+ C10_HOST_DEVICE int output_idx() const {
126
+ int lane = threadIdx.x;
127
+ int warp = threadIdx.y;
128
+ int cta1 = blockIdx.x;
129
+ return (lane * output_mult[BLOCK_X] +
130
+ warp * output_mult[BLOCK_Y] +
131
+ cta1 * step_output) * output_vec_size;
132
+ }
133
+
134
+ C10_DEVICE int shared_memory_offset(int offset) const {
135
+ return threadIdx.x + (threadIdx.y + offset) * blockDim.x;
136
+ }
137
+
138
+ C10_DEVICE int staging_memory_offset(int cta2) const {
139
+ int offset = cta2 + blockIdx.x * gridDim.y;
140
+ if (!should_block_x_reduce()) {
141
+ offset = threadIdx.x + offset * blockDim.x;
142
+ }
143
+ return offset;
144
+ }
145
+
146
+
147
+ };
148
+
149
+
150
+ //TODO this will need to be different for more generic reduction functions
151
+ namespace reducer {
152
+
153
+ using scalar_t = ${scalar_type};
154
+ using arg_t = ${reduction_accum_type};
155
+ using out_scalar_t = ${result_type};
156
+
157
+
158
+ inline __device__ ${functor}
159
+
160
+ inline __device__ out_scalar_t project(arg_t arg) {
161
+ return (out_scalar_t) arg;
162
+ }
163
+
164
+ inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) {
165
+ return WARP_SHFL_DOWN(arg, offset);
166
+ }
167
+
168
+ inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) {
169
+ return acc;
170
+ }
171
+
172
+ // wrap a normal reduction that ignores the index
173
+ inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) {
174
+ return combine(acc, val);
175
+ }
176
+ }
177
+
178
+
179
+ struct ReduceJitOp {
180
+ using scalar_t = ${scalar_type};
181
+ using arg_t = ${reduction_accum_type};
182
+ using out_scalar_t = ${result_type};
183
+
184
+ using InputCalculator = OffsetCalculator<1>;
185
+ using OutputCalculator = OffsetCalculator<2>;
186
+
187
+ // static constexpr bool can_accumulate_in_output =
188
+ // std::is_convertible<arg_t, out_scalar_t>::value
189
+ // && std::is_convertible<out_scalar_t, arg_t>::value;
190
+
191
+ static constexpr int input_vec_size = ReduceConfig::input_vec_size;
192
+
193
+ arg_t ident;
194
+ ReduceConfig config;
195
+ InputCalculator input_calc;
196
+ OutputCalculator output_calc;
197
+ const void* src;
198
+ const char* dst[2]; //it accepts at most two destinations
199
+ // acc_buf used for accumulation among sub Tensor Iterator when accumulation on
200
+ // output is not permissible
201
+ void* acc_buf;
202
+ // cta_buf used for accumulation between blocks during global reduction
203
+ void* cta_buf;
204
+ int* semaphores;
205
+ int64_t base_idx;
206
+ bool accumulate;
207
+ bool final_output;
208
+ int noutputs;
209
+
210
+
211
+ C10_DEVICE void run() const {
212
+ extern __shared__ char shared_memory[];
213
+ uint32_t output_idx = config.output_idx<${output_vec_size}>();
214
+ uint32_t input_idx = config.input_idx();
215
+ auto base_offsets1 = output_calc.get(output_idx)[1];
216
+
217
+ using arg_vec_t = Array<arg_t, ${output_vec_size}>;
218
+ arg_vec_t value;
219
+
220
+ if (output_idx < config.num_outputs && input_idx < config.num_inputs) {
221
+ const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1);
222
+
223
+ value = thread_reduce<${output_vec_size}>(input_slice);
224
+ }
225
+
226
+ if (config.should_block_y_reduce()) {
227
+ value = block_y_reduce<${output_vec_size}>(value, shared_memory);
228
+ }
229
+ if (config.should_block_x_reduce()) {
230
+ value = block_x_reduce<${output_vec_size}>(value, shared_memory);
231
+ }
232
+
233
+ using out_ptr_vec_t = Array<out_scalar_t*, ${output_vec_size}>;
234
+ using offset_vec_t = Array<uint32_t, ${output_vec_size}>;
235
+ offset_vec_t base_offsets;
236
+ out_ptr_vec_t out;
237
+
238
+ #pragma unroll
239
+ for (int i = 0; i < ${output_vec_size}; i++) {
240
+ base_offsets[i] = output_calc.get(output_idx + i)[0];
241
+ out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
242
+ }
243
+
244
+ arg_vec_t* acc = nullptr;
245
+ if (acc_buf != nullptr) {
246
+ size_t numerator = sizeof(arg_t);
247
+ size_t denominator = sizeof(out_scalar_t);
248
+ reduce_fraction(numerator, denominator);
249
+ acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator));
250
+ }
251
+
252
+ if (config.should_global_reduce()) {
253
+ value = global_reduce<${output_vec_size}>(value, acc, shared_memory);
254
+ } else if (config.should_store(output_idx)) {
255
+ if (accumulate) {
256
+ #pragma unroll
257
+ for (int i = 0; i < ${output_vec_size}; i++) {
258
+ value[i] = reducer::translate_idx(value[i], base_idx);
259
+ }
260
+ }
261
+
262
+ if (acc == nullptr) {
263
+ if (accumulate) {
264
+ value = accumulate_in_output<${output_vec_size}>(out, value);
265
+ }
266
+ if (final_output) {
267
+ set_results_to_output<${output_vec_size}>(value, base_offsets);
268
+ } else {
269
+ #pragma unroll
270
+ for (int i = 0; i < ${output_vec_size}; i++) {
271
+ *(out[i]) = get_accumulated_output(out[i], value[i]);
272
+ }
273
+ }
274
+ } else {
275
+ if (accumulate) {
276
+ #pragma unroll
277
+ for (int i = 0; i < ${output_vec_size}; i++) {
278
+ value[i] = reducer::combine((*acc)[i], value[i]);
279
+ }
280
+ }
281
+ if (final_output) {
282
+ set_results_to_output<${output_vec_size}>(value, base_offsets);
283
+ } else {
284
+ *acc = value;
285
+ }
286
+ }
287
+ }
288
+ }
289
+
290
+ template <int output_vec_size>
291
+ C10_DEVICE Array<arg_t, output_vec_size> thread_reduce(const scalar_t* data) const {
292
+ if (config.vectorize_input) {
293
+ assert(output_vec_size == 1);
294
+ // reduce at the header of input_slice where memory is not aligned,
295
+ // so that thread_reduce will have an aligned memory to work on.
296
+ return {input_vectorized_thread_reduce_impl(data)};
297
+ } else {
298
+ uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t);
299
+ bool is_contiguous = (input_calc.dims == 1 && element_stride == 1);
300
+ if (is_contiguous) {
301
+ return thread_reduce_impl<output_vec_size>(data, [](uint32_t idx) { return idx; });
302
+ } else if (input_calc.dims == 1) {
303
+ return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return idx * element_stride; });
304
+ } else {
305
+ return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); });
306
+ }
307
+ }
308
+ }
309
+
310
+ C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const {
311
+ uint32_t end = config.num_inputs;
312
+
313
+ // Handle the head of input slice where data is not aligned
314
+ arg_t value = ident;
315
+ constexpr int align_bytes = alignof(aligned_vector<scalar_t, input_vec_size>);
316
+ constexpr int align_elements = align_bytes / sizeof(scalar_t);
317
+ int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t);
318
+ if (shift > 0) {
319
+ data -= shift;
320
+ end += shift;
321
+ if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){
322
+ value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift);
323
+ }
324
+ end -= align_elements;
325
+ data += align_elements;
326
+ shift = align_elements - shift;
327
+ }
328
+
329
+ // Do the vectorized reduction
330
+ using load_t = aligned_vector<scalar_t, input_vec_size>;
331
+
332
+ uint32_t idx = config.input_idx();
333
+ const uint32_t stride = config.step_input;
334
+
335
+ // Multiple accumulators to remove dependency between unrolled loops.
336
+ arg_t value_list[input_vec_size];
337
+ value_list[0] = value;
338
+
339
+ #pragma unroll
340
+ for (int i = 1; i < input_vec_size; i++) {
341
+ value_list[i] = ident;
342
+ }
343
+
344
+ scalar_t values[input_vec_size];
345
+
346
+ load_t *values_vector = reinterpret_cast<load_t*>(&values[0]);
347
+
348
+ while (idx * input_vec_size + input_vec_size - 1 < end) {
349
+ *values_vector = reinterpret_cast<const load_t*>(data)[idx];
350
+ #pragma unroll
351
+ for (uint32_t i = 0; i < input_vec_size; i++) {
352
+ value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i);
353
+ }
354
+ idx += stride;
355
+ }
356
+
357
+ // tail
358
+ uint32_t tail_start = end - end % input_vec_size;
359
+ if (config.should_reduce_tail()) {
360
+ int idx = tail_start + threadIdx.x;
361
+ if (idx < end) {
362
+ value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift);
363
+ }
364
+ }
365
+
366
+ // combine accumulators
367
+ #pragma unroll
368
+ for (int i = 1; i < input_vec_size; i++) {
369
+ value_list[0] = reducer::combine(value_list[0], value_list[i]);
370
+ }
371
+ return value_list[0];
372
+ }
373
+
374
+ template <int output_vec_size, typename offset_calc_t>
375
+ C10_DEVICE Array<arg_t, output_vec_size> thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const {
376
+ uint32_t idx = config.input_idx();
377
+ const uint32_t end = config.num_inputs;
378
+ const uint32_t stride = config.step_input;
379
+ const int vt0=${vt0};
380
+
381
+ using arg_vec_t = Array<arg_t, output_vec_size>;
382
+ using load_t = aligned_vector<scalar_t, output_vec_size>;
383
+ const load_t* data = reinterpret_cast<const load_t*>(data_);
384
+
385
+ // Multiple accumulators to remove dependency between unrolled loops.
386
+ arg_vec_t value_list[vt0];
387
+
388
+ #pragma unroll
389
+ for (int i = 0; i < vt0; i++) {
390
+ #pragma unroll
391
+ for (int j = 0; j < output_vec_size; j++) {
392
+ value_list[i][j] = ident;
393
+ }
394
+ }
395
+
396
+ load_t values[vt0];
397
+
398
+ while (idx + (vt0 - 1) * stride < end) {
399
+ #pragma unroll
400
+ for (uint32_t i = 0; i < vt0; i++) {
401
+ values[i] = data[calc(idx + i * stride) / output_vec_size];
402
+ }
403
+ #pragma unroll
404
+ for (uint32_t i = 0; i < vt0; i++) {
405
+ #pragma unroll
406
+ for (uint32_t j = 0; j < output_vec_size; j++) {
407
+ value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride);
408
+ }
409
+ }
410
+ idx += stride * vt0;
411
+ }
412
+
413
+ // tail
414
+ int idx_ = idx;
415
+ #pragma unroll
416
+ for (uint32_t i = 0; i < vt0; i++) {
417
+ if (idx >= end) {
418
+ break;
419
+ }
420
+ values[i] = data[calc(idx) / output_vec_size];
421
+ idx += stride;
422
+ }
423
+ idx = idx_;
424
+ #pragma unroll
425
+ for (uint32_t i = 0; i < vt0; i++) {
426
+ if (idx >= end) {
427
+ break;
428
+ }
429
+ #pragma unroll
430
+ for (uint32_t j = 0; j < output_vec_size; j++) {
431
+ value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx);
432
+ }
433
+ idx += stride;
434
+ }
435
+
436
+ // combine accumulators
437
+ #pragma unroll
438
+ for (int i = 1; i < vt0; i++) {
439
+ #pragma unroll
440
+ for (uint32_t j = 0; j < output_vec_size; j++) {
441
+ value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]);
442
+ }
443
+ }
444
+ return value_list[0];
445
+ }
446
+ template <int output_vec_size>
447
+ C10_DEVICE Array<arg_t, output_vec_size> block_x_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
448
+ using args_vec_t = Array<arg_t, output_vec_size>;
449
+ int dim_x = blockDim.x;
450
+ args_vec_t* shared = (args_vec_t*)shared_memory;
451
+ if (dim_x > warpSize) {
452
+ int address_base = threadIdx.x + threadIdx.y*blockDim.x;
453
+ shared[address_base] = value;
454
+ for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) {
455
+ __syncthreads();
456
+ if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) {
457
+ args_vec_t other = shared[address_base + offset];
458
+ #pragma unroll
459
+ for (int i = 0; i < output_vec_size; i++) {
460
+ value[i] = reducer::combine(value[i], other[i]);
461
+ }
462
+ shared[address_base] = value;
463
+ }
464
+ }
465
+ dim_x = warpSize;
466
+ }
467
+
468
+ __syncthreads();
469
+
470
+ for (int offset = 1; offset < dim_x; offset <<= 1) {
471
+ #pragma unroll
472
+ for (int i = 0; i < output_vec_size; i++) {
473
+ arg_t other = reducer::warp_shfl_down(value[i], offset);
474
+ value[i] = reducer::combine(value[i], other);
475
+ }
476
+ }
477
+ return value;
478
+ }
479
+
480
+ template <int output_vec_size>
481
+ C10_DEVICE Array<arg_t, output_vec_size> block_y_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
482
+ using args_vec_t = Array<arg_t, output_vec_size>;
483
+ args_vec_t* shared = (args_vec_t*)shared_memory;
484
+ shared[config.shared_memory_offset(0)] = value;
485
+ for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
486
+ __syncthreads();
487
+ if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
488
+ args_vec_t other = shared[config.shared_memory_offset(offset)];
489
+ #pragma unroll
490
+ for (int i = 0; i < output_vec_size; i++) {
491
+ value[i] = reducer::combine(value[i], other[i]);
492
+ }
493
+ shared[config.shared_memory_offset(0)] = value;
494
+ }
495
+ }
496
+ return value;
497
+ }
498
+ )ESCAPE";
499
+
500
+ const std::string reduction_template_1 = R"ESCAPE(
501
+
502
+ C10_DEVICE bool mark_block_finished() const {
503
+ __shared__ bool is_last_block_done_shared;
504
+
505
+ __syncthreads();
506
+ if (threadIdx.x == 0 && threadIdx.y == 0) {
507
+ int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1);
508
+ is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1);
509
+ }
510
+
511
+ __syncthreads();
512
+
513
+ return is_last_block_done_shared;
514
+ }
515
+
516
+ template <int output_vec_size>
517
+ C10_DEVICE Array<arg_t, output_vec_size> accumulate_in_output(
518
+ Array<out_scalar_t*, output_vec_size> out,
519
+ Array<arg_t, output_vec_size> value
520
+ ) const {
521
+ Array<arg_t, output_vec_size> ret;
522
+ #pragma unroll
523
+ for (int i = 0; i < output_vec_size; i++) {
524
+ ret[i] = reducer::combine(*(out[i]), value[i]);
525
+ }
526
+ return ret;
527
+ }
528
+
529
+
530
+ C10_DEVICE out_scalar_t get_accumulated_output(
531
+ out_scalar_t* out, arg_t value
532
+ ) const {
533
+ assert(!final_output);
534
+ return (out_scalar_t)value;
535
+ }
536
+
537
+ template<class T>
538
+ C10_DEVICE void set_results(const T x, const uint32_t base_offset) const {
539
+ assert(noutputs == 1);
540
+ auto res = (out_scalar_t*)((char*)dst[0] + base_offset);
541
+ *res = x;
542
+ }
543
+
544
+ //TODO - multi-output reduction - we won't be able to use thrust::pair
545
+ //just explicitly specify typed output reads/writes
546
+ //Currently implemented for max of two outputs
547
+ // template<class T1, class T2>
548
+ // C10_DEVICE void set_results(const thrust::pair<T1, T2> x, const index_t base_offset) const {
549
+ // if (noutputs >= 1) {
550
+ // auto res0 = (T1*)((char*)dst[0] + base_offset);
551
+ // *res0 = x.first;
552
+ // }
553
+ // if (noutputs >= 2) {
554
+ // // base offset is computed assuming element size being sizeof(T1), so we need to make a
555
+ // // correction to obtain the correct base offset
556
+ // auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2));
557
+ // *res1 = x.second;
558
+ // }
559
+ // }
560
+
561
+ template <int output_vec_size>
562
+ C10_DEVICE void set_results_to_output(Array<arg_t, output_vec_size> value, Array<uint32_t, output_vec_size> base_offset) const {
563
+ assert(final_output);
564
+ #pragma unroll
565
+ for (int i = 0; i < output_vec_size; i++) {
566
+ set_results(reducer::project(value[i]), base_offset[i]);
567
+ }
568
+ }
569
+
570
+ template <int output_vec_size>
571
+ C10_DEVICE Array<arg_t, output_vec_size> global_reduce(Array<arg_t, output_vec_size> value, Array<arg_t, output_vec_size> *acc, char* shared_memory) const {
572
+ using arg_vec_t = Array<arg_t, output_vec_size>;
573
+ using out_ptr_vec_t = Array<out_scalar_t*, output_vec_size>;
574
+ using offset_vec_t = Array<uint32_t, output_vec_size>;
575
+
576
+ arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf;
577
+ uint32_t output_idx = config.output_idx<output_vec_size>();
578
+ offset_vec_t base_offsets;
579
+ out_ptr_vec_t out;
580
+
581
+ #pragma unroll
582
+ for (int i = 0; i < output_vec_size; i++) {
583
+ base_offsets[i] = output_calc.get(output_idx + i)[0];
584
+ out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
585
+ }
586
+
587
+ bool should_store = config.should_store(output_idx);
588
+ if (should_store) {
589
+ uint32_t offset = config.staging_memory_offset(blockIdx.y);
590
+ reduce_buffer[offset] = value;
591
+ }
592
+
593
+ __threadfence(); // make sure writes are globally visible
594
+ __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
595
+ bool is_last_block_done = mark_block_finished();
596
+
597
+ if (is_last_block_done) {
598
+ value = ident;
599
+ if (config.should_block_x_reduce()) {
600
+ uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x;
601
+ uint32_t step = blockDim.x * blockDim.y;
602
+ for (; input_offset < config.ctas_per_output; input_offset += step) {
603
+ uint32_t idx = config.staging_memory_offset(input_offset);
604
+ arg_vec_t next = reduce_buffer[idx];
605
+ #pragma unroll
606
+ for (int i = 0; i < output_vec_size; i++) {
607
+ value[i] = reducer::combine(value[i], next[i]);
608
+ }
609
+ }
610
+ } else {
611
+ uint32_t input_offset = threadIdx.y;
612
+ uint32_t step = blockDim.y;
613
+ for (; input_offset < config.ctas_per_output; input_offset += step) {
614
+ uint32_t idx = config.staging_memory_offset(input_offset);
615
+ arg_vec_t next = reduce_buffer[idx];
616
+ #pragma unroll
617
+ for (int i = 0; i < output_vec_size; i++) {
618
+ value[i] = reducer::combine(value[i], next[i]);
619
+ }
620
+ }
621
+ }
622
+ value = block_y_reduce(value, shared_memory);
623
+ if (config.should_block_x_reduce()) {
624
+ value = block_x_reduce<output_vec_size>(value, shared_memory);
625
+ }
626
+ if (should_store) {
627
+ if (accumulate) {
628
+ #pragma unroll
629
+ for (int i = 0; i < output_vec_size; i++) {
630
+ value[i] = reducer::translate_idx(value[i], base_idx);
631
+ }
632
+ }
633
+
634
+ if (acc == nullptr) {
635
+ if (accumulate) {
636
+ value = accumulate_in_output<output_vec_size>(out, value);
637
+ }
638
+ if (final_output) {
639
+ set_results_to_output<output_vec_size>(value, base_offsets);
640
+ } else {
641
+ #pragma unroll
642
+ for (int i = 0; i < output_vec_size; i++) {
643
+ *(out[i]) = get_accumulated_output(out[i], value[i]);
644
+ }
645
+ }
646
+ } else {
647
+ if (accumulate) {
648
+ #pragma unroll
649
+ for (int i = 0; i < output_vec_size; i++) {
650
+ value[i] = reducer::combine((*acc)[i], value[i]);
651
+ }
652
+ }
653
+ if (final_output) {
654
+ set_results_to_output<output_vec_size>(value, base_offsets);
655
+ } else {
656
+ *acc = value;
657
+ }
658
+ }
659
+ }
660
+ }
661
+
662
+ return value;
663
+ }
664
+ };
665
+
666
+ extern "C"
667
+ __launch_bounds__(${max_threads_lb}, 4)
668
+ __global__ void reduction_${name}_kernel(ReduceJitOp r){
669
+ r.run();
670
+ }
671
+ )ESCAPE";
672
+
673
+ const std::string reduction_template = reduction_template_0 + reduction_template_1;
674
+
675
+
676
+ const std::string &get_reduction_template() {
677
+ return reduction_template;
678
+ }
679
+
680
+ }}
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/thread_constants.h ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <c10/macros/Macros.h>
3
+
4
+ // Marks a lambda as executable on both the host and device. The __host__
5
+ // attribute is important so that we can access static type information from
6
+ // the host, even if the function is typically only executed on the device.
7
+ #ifndef GPU_LAMBDA
8
+ #define GPU_LAMBDA __host__ __device__
9
+ #endif
10
+
11
+ #if defined(USE_ROCM)
12
+ constexpr int num_threads() {
13
+ return 256;
14
+ }
15
+ #else
16
+ constexpr uint32_t num_threads() {
17
+ return C10_WARP_SIZE * 4;
18
+ }
19
+ #endif
20
+
21
+ constexpr int thread_work_size() { return 4; }
22
+ constexpr int block_work_size() { return thread_work_size() * num_threads(); }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/OperationUtils.h ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2022 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
6
+ #include <ATen/Tensor.h>
7
+ #include <ATen/Utils.h>
8
+ #include <ATen/mps/MPSStream.h>
9
+ #include <ATen/native/mps/TensorFactory.h>
10
+ #include <c10/util/Optional.h>
11
+ #include <c10/core/ScalarType.h>
12
+ #include <torch/library.h>
13
+ #include <exception>
14
+ #include <unordered_map>
15
+
16
+ #ifndef AT_PER_OPERATOR_HEADERS
17
+ #include <ATen/Functions.h>
18
+ #include <ATen/NativeFunctions.h>
19
+ #else
20
+ #include <ATen/ops/empty.h>
21
+ #include <ATen/ops/empty_like.h>
22
+ #include <ATen/ops/zeros.h>
23
+ #include <ATen/ops/zeros_like.h>
24
+ #endif
25
+
26
+ #include <MetalPerformanceShaders/MetalPerformanceShaders.h>
27
+
28
+ // Fwd declarations
29
+ namespace at {
30
+ struct TensorIteratorBase;
31
+ }
32
+ using namespace at::mps;
33
+
34
+ namespace at::native::mps {
35
+
36
+ void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)());
37
+
38
+ struct MPSScalar {
39
+ id<MTLBuffer> getMTLBuffer() const { return __builtin_bit_cast(id<MTLBuffer>, buffer.get()); }
40
+
41
+ size_t size = 0;
42
+ ScalarType type = ScalarType::Undefined;
43
+ c10::DataPtr buffer; // stores MTLBuffer (frees buffer if MPSScalar instance goes out of scope)
44
+ union {
45
+ float f; // MPS doesn't support 'double'
46
+ at::Half h;
47
+ int64_t i;
48
+ bool b;
49
+ c10::complex<float> cf;
50
+ c10::complex<at::Half> ch;
51
+ at::BFloat16 bf16;
52
+ } value {};
53
+ };
54
+
55
+ void runMPSGraph(MPSStream* mpsStream,
56
+ MPSGraph* mpsGraph,
57
+ NSDictionary* feeds,
58
+ NSDictionary* results);
59
+
60
+ MPSDataType getMPSDataType(ScalarType scalar_type);
61
+ static inline MPSDataType getMPSDataType(const Tensor& t) {
62
+ return getMPSDataType(t.scalar_type());
63
+ }
64
+ MPSDataType getMPSScalarType(ScalarType scalar_type);
65
+ static inline MPSDataType getMPSScalarType(const Tensor& t) {
66
+ return getMPSScalarType(t.scalar_type());
67
+ }
68
+ MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type);
69
+ std::string getMPSTypeString(ScalarType scalar_type, bool short_name = false);
70
+ static inline std::string getMPSTypeString(const Tensor& t, bool short_name = false) {
71
+ return getMPSTypeString(t.scalar_type(), short_name);
72
+ }
73
+ std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type);
74
+ NSArray<NSNumber*>* getTensorAxes(const Tensor& t);
75
+ NSArray<NSNumber*>* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim);
76
+ std::string getMPSShapeString(MPSShape* shape);
77
+ std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true);
78
+ std::string getArrayRefString(const IntArrayRef s);
79
+ // use has_storage() on the returned tensor to determine if src actually is a view
80
+ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst);
81
+ Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output);
82
+ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape);
83
+ MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType);
84
+ MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input, bool includesInt64 = false);
85
+ MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input, bool includesInt64 = false);
86
+
87
+ // The MPSShape could vary based on memory format
88
+ MPSShape* getMPSShape(const Tensor& t, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);
89
+ MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);
90
+
91
+ static inline id<MTLBuffer> getMTLBufferStorage(const at::Tensor& tensor) {
92
+ return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
93
+ }
94
+
95
+ class Placeholder {
96
+ public:
97
+ Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {}
98
+ Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr), _tensor(Tensor()) {}
99
+ Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr,
100
+ bool gatherTensorData = true, MPSDataType dataType = MPSDataTypeInvalid);
101
+ MPSGraphTensor* getMPSGraphTensor() {
102
+ return _placeholder;
103
+ }
104
+ MPSGraphTensorData* getMPSGraphTensorData() {
105
+ return _value;
106
+ }
107
+ bool isIntermediate() {
108
+ return _value == nullptr;
109
+ }
110
+
111
+ private:
112
+ MPSGraphTensor* _placeholder;
113
+ MPSGraphTensorData* _value;
114
+ Tensor _tensor;
115
+ };
116
+
117
+ void resize_tensor(Tensor* output);
118
+ Tensor wrapped_scalar_tensor_mps(const Scalar& scalar, const Device device);
119
+ MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor);
120
+ MPSGraphTensor* convertNHWCtoNCHW(MPSGraph *mpsGraph, MPSGraphTensor* tensor);
121
+ MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, ScalarType toType);
122
+ MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, MPSDataType toType);
123
+ MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStream, const Tensor& tensor);
124
+ MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar);
125
+
126
+ MPSGraph* make_mps_graph();
127
+ void printTensorNDArray(const Tensor& t);
128
+ MPSNDArray* ndArrayFromTensor(const Tensor& tensor, MPSShape *shape, MPSDataType mpsType);
129
+
130
+ MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType);
131
+ MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType, MPSShape* mpsShape);
132
+ MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, const Tensor& tensor);
133
+ MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType);
134
+ MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, const Scalar& scalar);
135
+
136
+ string get_mem_format_string(c10::MemoryFormat memory_format);
137
+
138
+ using MPSCacheKey = uint64_t;
139
+
140
+ // derive this class to cache a graph and its inputs/outputs
141
+ // can be used to store any NSObject
142
+ struct MPSCachedGraph
143
+ {
144
+ MPSCachedGraph(NSObject *object) : _object([object retain]) {}
145
+ virtual ~MPSCachedGraph() {
146
+ [_object release];
147
+ _object = nullptr;
148
+ }
149
+
150
+ template<typename T>
151
+ inline T* as() {
152
+ return static_cast<T*>(this);
153
+ }
154
+
155
+ MPSGraph *graph() const { return (MPSGraph *)_object; }
156
+ NSObject *object() const { return _object; }
157
+ private:
158
+ NSObject *_object = nullptr;
159
+ };
160
+
161
+ struct MPSUnaryCachedGraph : public MPSCachedGraph
162
+ {
163
+ MPSUnaryCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
164
+ MPSGraphTensor *inputTensor_ = nil;
165
+ MPSGraphTensor *outputTensor_ = nil;
166
+ };
167
+
168
+ struct MPSUnaryGradCachedGraph : public MPSCachedGraph
169
+ {
170
+ MPSUnaryGradCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
171
+ MPSGraphTensor *gradOutputTensor_ = nil;
172
+ MPSGraphTensor *inputTensor_ = nil;
173
+ MPSGraphTensor *outputTensor_ = nil; // some backward input is actually the forward's output
174
+ MPSGraphTensor *gradInputTensor_ = nil;
175
+ };
176
+
177
+ struct MPSBinaryCachedGraph : public MPSCachedGraph
178
+ {
179
+ MPSBinaryCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
180
+ MPSGraphTensor *inputTensor_ = nil;
181
+ MPSGraphTensor *otherTensor_ = nil;
182
+ MPSGraphTensor *outputTensor_ = nil;
183
+ };
184
+
185
+ struct MPSBinaryGradCachedGraph : public MPSCachedGraph
186
+ {
187
+ MPSBinaryGradCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
188
+ MPSGraphTensor *gradOutputTensor_ = nil;
189
+ MPSGraphTensor *inputTensor_ = nil;
190
+ MPSGraphTensor *otherTensor_ = nil;
191
+ MPSGraphTensor *gradInputTensor_ = nil;
192
+ };
193
+
194
+ // TODO: Improve the overall design of MPSGraphCache.
195
+ // https://github.com/pytorch/pytorch/issues/77176
196
+ // Cache holding various keys mapped to graphs
197
+ struct MPSGraphCache
198
+ {
199
+ typedef MPSCachedGraph * (^CreateCachedGraphBlock)();
200
+
201
+ struct CacheEntry {
202
+ CacheEntry(const std::string& key, MPSCachedGraph *cachedGraph) : cachedGraph_(cachedGraph), key_(key) {}
203
+ MPSCachedGraph* cachedGraph_ = nullptr;
204
+ std::string key_;
205
+ };
206
+
207
+ public:
208
+
209
+ static MPSGraphCache* getInstance() {
210
+ if(_instance_cache == nullptr) {
211
+ _instance_cache = new MPSGraphCache();
212
+ }
213
+ return _instance_cache;
214
+ }
215
+
216
+ ~MPSGraphCache() {
217
+ dispatch_release(serialQueue_);
218
+
219
+ for (const auto& i : cache_) {
220
+ delete i.second.cachedGraph_;
221
+ }
222
+ }
223
+
224
+ // Disallow the copy constructor and operator= functions
225
+ MPSGraphCache(const MPSGraphCache&) = delete;
226
+ void operator=(const MPSGraphCache&) = delete;
227
+
228
+ MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock) {
229
+
230
+ __block MPSCachedGraph* cachedGraph = nil;
231
+
232
+ MPSCacheKey hash = std::hash<std::string>{}(key);
233
+
234
+ dispatch_sync_with_rethrow(serialQueue_, ^() {
235
+ // verify the cached entry doesn't already exist
236
+ if (cache_.count(hash) != 0) {
237
+ auto& entry = cache_.at(hash);
238
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n");
239
+ cachedGraph = entry.cachedGraph_;
240
+ } else {
241
+ cachedGraph = createCacheBlock();
242
+ CacheEntry entry(key, cachedGraph);
243
+ cache_.emplace(hash, entry);
244
+ profileCachedGraph(entry);
245
+ }
246
+ });
247
+ return cachedGraph;
248
+ }
249
+
250
+ template<typename T>
251
+ inline T* CreateCachedGraphAs(const std::string& key, CreateCachedGraphBlock createCacheBlock) {
252
+ return static_cast<T *>(CreateCachedGraph(key, createCacheBlock));
253
+ }
254
+
255
+ MPSCachedGraph* LookUp(const std::string& key) const {
256
+
257
+ __block MPSCachedGraph* cachedGraph = nullptr;
258
+
259
+ MPSCacheKey hash = std::hash<std::string>{}(key);
260
+
261
+ dispatch_sync(serialQueue_, ^() {
262
+
263
+ if (cache_.count(hash) != 0) {
264
+ auto& entry = cache_.at(hash);
265
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n");
266
+ cachedGraph = entry.cachedGraph_;
267
+ profileCachedGraph(entry);
268
+ }
269
+ });
270
+ return cachedGraph;
271
+ }
272
+
273
+ template<typename T>
274
+ inline T* LookUpAs(const std::string& key) const {
275
+ return static_cast<T *>(LookUp(key));
276
+ }
277
+
278
+ private:
279
+ MPSGraphCache() {
280
+ serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL);
281
+ }
282
+ // this is defined in OperationUtils.mm to not include
283
+ // MPSProfiler.h in header OperationUtils.h
284
+ void profileCachedGraph(const CacheEntry& cacheEntry) const;
285
+
286
+ static MPSGraphCache* _instance_cache;
287
+ std::unordered_map<MPSCacheKey, CacheEntry> cache_;
288
+ dispatch_queue_t serialQueue_ = nullptr;
289
+
290
+ };
291
+
292
+ // Common template for creating graph with a specified cache if missing
293
+ template<typename T>
294
+ inline T* LookUpOrCreateCachedGraph(const std::string& key, std::function<void(MPSGraph*, T*)> instantiate) {
295
+ auto cache_ = MPSGraphCache::getInstance();
296
+ if (auto rc = cache_->LookUpAs<T>(key)) {
297
+ return rc;
298
+ }
299
+ return cache_->CreateCachedGraphAs<T>(key, ^mps::MPSCachedGraph*() {
300
+ T* newCachedGraph = nil;
301
+ @autoreleasepool {
302
+ // Initialize graph
303
+ auto mpsGraph = mps::make_mps_graph();
304
+ newCachedGraph = new T(mpsGraph);
305
+ instantiate(mpsGraph, newCachedGraph);
306
+ }
307
+ return newCachedGraph;
308
+ });
309
+ }
310
+
311
+ // Common math operations
312
+ MPSGraphTensor* log1p(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor);
313
+
314
+ #define MPS_CHECK_INT64_OP_SUPPORTED(input_tensor, mac_os_13_3_plus, op_name) \
315
+ if (!mac_os_13_3_plus && input_tensor.scalar_type() == kLong) { \
316
+ TORCH_WARN_ONCE("MPS: no support for int64 for ", op_name, \
317
+ ", downcasting to a smaller data type (int32/float32). Native support for int64 has been added in macOS 13.3."); \
318
+ }
319
+
320
+ /**
321
+ * Returns distance from lowest to highest element offset in given tensor.
322
+ */
323
+ size_t compute_storage_numel_distance(const at::Tensor& t);
324
+
325
+ /**
326
+ * Checks whether tensor is mapped to a contiguous area in the storage.
327
+ */
328
+ inline bool is_dense_in_storage(const at::Tensor& t) {
329
+ return compute_storage_numel_distance(t) == static_cast<size_t>(t.numel());
330
+ }
331
+
332
+ static inline void mtl_setBuffer(id<MTLComputeCommandEncoder> encoder, const Tensor& t, unsigned idx) {
333
+ [encoder setBuffer:getMTLBufferStorage(t)
334
+ offset:t.storage_offset() * t.element_size()
335
+ atIndex:idx];
336
+ }
337
+
338
+ static inline void mtl_dispatch1DJob(id<MTLComputeCommandEncoder> encoder,
339
+ id<MTLComputePipelineState> cplState,
340
+ uint32_t length) {
341
+ const uint32_t maxThreadsPerGroup = [cplState maxTotalThreadsPerThreadgroup];
342
+ auto size = MTLSizeMake(length, 1, 1);
343
+ auto threadGroupSize = MTLSizeMake(std::min(maxThreadsPerGroup, length), 1, 1);
344
+ [encoder dispatchThreads:size threadsPerThreadgroup:threadGroupSize];
345
+ }
346
+
347
+ id<MTLBuffer> generateKernelDataOffsets(id<MTLComputeCommandEncoder> commandEncoder, const TensorIteratorBase& iter, bool use_64bit_index = false);
348
+
349
+ inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1) {
350
+ return @{ p1.getMPSGraphTensor(): p1.getMPSGraphTensorData() };
351
+ }
352
+
353
+ inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2) {
354
+ return @{
355
+ p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(),
356
+ p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(),
357
+ };
358
+ }
359
+
360
+ inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3) {
361
+ return @{
362
+ p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(),
363
+ p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(),
364
+ p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(),
365
+ };
366
+ }
367
+
368
+ inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3, Placeholder& p4) {
369
+ return @{
370
+ p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(),
371
+ p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(),
372
+ p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(),
373
+ p4.getMPSGraphTensor(): p4.getMPSGraphTensorData(),
374
+ };
375
+ }
376
+
377
+ inline void runMPSGraph(MPSStream* stream, MPSGraph* graph, NSDictionary* feeds, Placeholder& result) {
378
+ runMPSGraph(stream, graph, feeds, dictionaryFromPlaceholders(result));
379
+ }
380
+
381
+ inline bool supportsComplex() {
382
+ return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS);
383
+ }
384
+
385
+ // MPS yet to support double types, but starting from MacOS 14, supports bfloat16
386
+ inline bool supportedFloatingType(ScalarType dtype) {
387
+ return dtype == kFloat || dtype == kHalf || dtype == kBFloat16;
388
+ }
389
+
390
+ inline bool supportedFloatingType(const Tensor& t) {
391
+ return supportedFloatingType(t.scalar_type());
392
+ }
393
+
394
+ } // namespace at::native::mps
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/TensorFactory.h ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2022 Apple Inc.
2
+
3
+ #define AT_DISPATCH_MPS_TYPES(TYPE, NAME, ...) \
4
+ AT_DISPATCH_SWITCH( \
5
+ TYPE, NAME, \
6
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
7
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
8
+ AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \
9
+ AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
10
+ AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
11
+ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \
12
+ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__))
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Transformer-specific NestedTensor utility functions.
3
+ *
4
+ * Not co-located with NestedTensor core code yet because they only
5
+ * support specific cases needed in transformers.
6
+ */
7
+ #pragma once
8
+
9
+ #include <vector>
10
+
11
+ #include <c10/macros/Macros.h>
12
+ #include <c10/util/Optional.h>
13
+
14
+ namespace c10 {
15
+ class Scalar;
16
+ } // namespace c10
17
+
18
+ namespace at {
19
+ class Tensor;
20
+ namespace native {
21
+ struct NestedTensorImpl;
22
+
23
+ // Requires that self is a contiguous NestedTensor, other is not a
24
+ // NestedTensor, self.dim() == 3, and other.dim() == 2. Also, self
25
+ // must have a consistent last dimension across its included Tensors
26
+ // and that dimension must match other.size(0).
27
+ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other);
28
+
29
+ // Requires that mat1 is a contiguous NestedTensor, self & mat2 are
30
+ // not NestedTensors, mat1.dim() == 3, mat2.dim() == 2, and that mat1
31
+ // has a consistent last dimension across its included Tensors that
32
+ // matches mat2.size(0).
33
+ Tensor NestedTensor_times_Tensor_plus_Tensor_addmm(
34
+ const Tensor& self,
35
+ const Tensor& mat1,
36
+ const Tensor& mat2,
37
+ const c10::Scalar& beta,
38
+ const c10::Scalar& alpha,
39
+ c10::optional<bool> use_gelu = c10::nullopt);
40
+
41
+ Tensor NestedTensor_add_NestedTensor_in_place(
42
+ const Tensor& self,
43
+ const Tensor& other);
44
+
45
+ TORCH_API Tensor NestedTensor_batch_offsets_from_size_tensor(
46
+ const Tensor& sizes,
47
+ int64_t extra_elements);
48
+
49
+ Tensor NestedTensor_from_padded_tensor_cpu(
50
+ const Tensor& padded,
51
+ const NestedTensorImpl& nt);
52
+
53
+ Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional<int64_t> mask_dim, c10::optional<int64_t> mask_dim_length);
54
+
55
+ template <typename T>
56
+ void remove_padding_kernelLauncher(
57
+ const T* input,
58
+ T* output,
59
+ const int* offsets,
60
+ const int* input_sizes,
61
+ const int* output_sizes,
62
+ int output_dim,
63
+ const int batch_size);
64
+
65
+ template <typename T>
66
+ void remove_padding_transform0213_kernelLauncher(
67
+ const T* input,
68
+ T* output,
69
+ const int* offsets,
70
+ const int* input_sizes,
71
+ const int* output_sizes,
72
+ int output_dim,
73
+ const int batch_size);
74
+
75
+ template <typename T>
76
+ void add_padding_kernelLauncher(
77
+ T* input,
78
+ T* output,
79
+ T padding_value,
80
+ const int* offsets,
81
+ const int* input_sizes,
82
+ int input_dim,
83
+ const std::vector<int64_t>& output_sizes,
84
+ const int batch_size,
85
+ const int output_batch_size);
86
+
87
+ TORCH_API Tensor flash_attention_helper(
88
+ const Tensor& query,
89
+ const Tensor& key,
90
+ const Tensor& value,
91
+ double dropout_p,
92
+ bool need_attn_weights,
93
+ bool is_causal);
94
+
95
+ TORCH_API std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
96
+ const Tensor& query,
97
+ const Tensor& key,
98
+ const Tensor& value,
99
+ double dropout_p,
100
+ bool need_attn_weights,
101
+ bool is_causal);
102
+ } // namespace native
103
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/Dispatch.h>
5
+ #include <ATen/native/DispatchStub.h>
6
+ #include <ATen/native/quantized/AffineQuantizerBase.h>
7
+
8
+ namespace at {
9
+ namespace native {
10
+
11
+ Tensor& quantize_tensor_per_tensor_affine(
12
+ const Tensor& rtensor,
13
+ Tensor& qtensor,
14
+ double scale,
15
+ int64_t zero_point);
16
+ Tensor& quantize_tensor_per_channel_affine(
17
+ const Tensor& rtensor,
18
+ Tensor& qtensor,
19
+ Tensor scales,
20
+ Tensor zero_points,
21
+ int64_t axis);
22
+
23
+ Tensor& quantize_tensor_per_channel_float_qparams(
24
+ const Tensor& rtensor,
25
+ Tensor& qtensor,
26
+ Tensor scales,
27
+ Tensor zero_points,
28
+ int64_t axis);
29
+
30
+ Tensor& dequantize_tensor_per_tensor_affine(
31
+ const Tensor& qtensor,
32
+ Tensor& rtensor,
33
+ double scale,
34
+ int64_t zero_point);
35
+ Tensor& dequantize_tensor_per_channel_affine(
36
+ const Tensor& qtensor,
37
+ Tensor& rtensor,
38
+ Tensor scales,
39
+ Tensor zero_points,
40
+ int64_t axis);
41
+ Tensor& dequantize_tensor_per_channel_float_qparams(
42
+ const Tensor& qtensor,
43
+ Tensor& rtensor,
44
+ Tensor scales,
45
+ Tensor zero_points,
46
+ int64_t axis);
47
+
48
+ using quantize_tensor_per_tensor_affine_fn =
49
+ void (*)(const Tensor& rtensor, Tensor& qtensor, double scale, int64_t zero_point);
50
+
51
+ using quantize_tensor_per_channel_affine_fn = void (*)(
52
+ const Tensor& rtensor,
53
+ Tensor& qtensor,
54
+ const Tensor& scales,
55
+ const Tensor& zero_points,
56
+ int64_t axis);
57
+
58
+ using quantize_tensor_per_channel_float_qparams_fn = void (*)(
59
+ const Tensor& rtensor,
60
+ Tensor& qtensor,
61
+ const Tensor& scales,
62
+ const Tensor& zero_points,
63
+ int64_t axis);
64
+
65
+ using dequantize_tensor_per_tensor_affine_fn =
66
+ void (*)(const Tensor& qtensor, Tensor& rtensor, double scale, int64_t zero_point);
67
+
68
+ using dequantize_tensor_per_channel_affine_fn = void (*)(
69
+ const Tensor& qtensor,
70
+ Tensor& rtensor,
71
+ const Tensor& scales,
72
+ const Tensor& zero_points,
73
+ int64_t axis);
74
+
75
+ using dequantize_tensor_per_channel_float_qparams_fn = void (*)(
76
+ const Tensor& qtensor,
77
+ Tensor& rtensor,
78
+ const Tensor& scales,
79
+ const Tensor& zero_points,
80
+ int64_t axis);
81
+
82
+ using quantize_tensor_per_tensor_affine_sub_byte_fn =
83
+ void (*)(const Tensor& rtensor, Tensor& qtensor, float scale, float zero_point);
84
+
85
+ using dequantize_tensor_per_tensor_affine_sub_byte_fn =
86
+ void (*)(const Tensor& qtensor, Tensor& rtensor, float scale, float zero_point);
87
+
88
+ DECLARE_DISPATCH(
89
+ quantize_tensor_per_tensor_affine_fn,
90
+ quantize_tensor_per_tensor_affine_stub);
91
+ DECLARE_DISPATCH(
92
+ quantize_tensor_per_channel_affine_fn,
93
+ quantize_tensor_per_channel_affine_stub);
94
+ DECLARE_DISPATCH(
95
+ quantize_tensor_per_channel_float_qparams_fn,
96
+ quantize_tensor_per_channel_float_qparams_stub);
97
+
98
+ DECLARE_DISPATCH(
99
+ dequantize_tensor_per_tensor_affine_fn,
100
+ dequantize_tensor_per_tensor_affine_stub);
101
+ DECLARE_DISPATCH(
102
+ dequantize_tensor_per_channel_affine_fn,
103
+ dequantize_tensor_per_channel_affine_stub);
104
+ DECLARE_DISPATCH(
105
+ dequantize_tensor_per_channel_float_qparams_fn,
106
+ dequantize_tensor_per_channel_float_qparams_stub);
107
+
108
+ DECLARE_DISPATCH(
109
+ quantize_tensor_per_tensor_affine_sub_byte_fn,
110
+ quantize_tensor_per_tensor_affine_sub_byte_stub);
111
+
112
+ DECLARE_DISPATCH(
113
+ dequantize_tensor_per_tensor_affine_sub_byte_fn,
114
+ dequantize_tensor_per_tensor_affine_sub_byte_stub);
115
+
116
+ template <typename T>
117
+ TORCH_API Tensor quantize_tensor(
118
+ Tensor rtensor,
119
+ Tensor qtensor,
120
+ double scale,
121
+ int64_t zero_point);
122
+ template <typename T>
123
+ TORCH_API Tensor dequantize_tensor(
124
+ Tensor qtensor,
125
+ Tensor rtensor,
126
+ double scale,
127
+ int64_t zero_point);
128
+
129
+ } // namespace native
130
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/List.h>
3
+ #include <ATen/native/ConvUtils.h>
4
+
5
+ namespace at::native::quantized {
6
+ namespace {
7
+ // MakeConvOutputShape used from both CPU and CUDA libraries
8
+ // and exporting symbol from torch_cpu would probably take more storage
9
+ // than duplicating implementation which likely be inlined away
10
+ template <int kSpatialDim>
11
+ at::SmallVector<int64_t, kSpatialDim + 2> MakeConvOutputShape(
12
+ int N, // mini-batch
13
+ int M, // output channels
14
+ const std::array<int64_t, kSpatialDim>& input_image_shape,
15
+ const std::vector<int64_t>& kernel,
16
+ const torch::List<int64_t>& stride,
17
+ const torch::List<int64_t>& padding,
18
+ const torch::List<int64_t>& dilation);
19
+
20
+ #if defined(USE_CUDA) || defined(USE_PYTORCH_QNNPACK)
21
+ template <>
22
+ at::SmallVector<int64_t, 4> MakeConvOutputShape<2>(
23
+ int N, // mini-batch
24
+ int M, // output channels
25
+ const std::array<int64_t, 2>& input_image_shape,
26
+ const std::vector<int64_t>& kernel,
27
+ const at::List<int64_t>& stride,
28
+ const at::List<int64_t>& padding,
29
+ const at::List<int64_t>& dilation) {
30
+ const int H = input_image_shape[0];
31
+ const int W = input_image_shape[1];
32
+ const int64_t Y_H =
33
+ (H + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1;
34
+ const int64_t Y_W =
35
+ (W + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1;
36
+ return {N, M, Y_H, Y_W};
37
+ }
38
+
39
+ template <>
40
+ at::SmallVector<int64_t, 5> MakeConvOutputShape<3>(
41
+ int N, // mini-batch
42
+ int M, // output channels
43
+ const std::array<int64_t, 3>& input_image_shape,
44
+ const std::vector<int64_t>& kernel,
45
+ const at::List<int64_t>& stride,
46
+ const at::List<int64_t>& padding,
47
+ const torch::List<int64_t>& dilation) {
48
+ const int D = input_image_shape[0];
49
+ const int H = input_image_shape[1];
50
+ const int W = input_image_shape[2];
51
+ const int64_t Y_D =
52
+ (D + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1;
53
+ const int64_t Y_H =
54
+ (H + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1;
55
+ const int64_t Y_W =
56
+ (W + 2 * padding[2] - dilation[2] * (kernel[2] - 1) - 1) / stride[2] + 1;
57
+ return {N, M, Y_D, Y_H, Y_W};
58
+ }
59
+
60
+ #endif
61
+ } // anonymous namespace
62
+ } // namespace at::native::quantized
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/native/TensorIterator.h>
3
+
4
+ namespace at {
5
+ namespace native {
6
+ using masked_fill_kernel_quantized_fn = void(*)(TensorIterator& iter, const Scalar& value, double scale, int zero_point);
7
+ using index_put_kernel_quantized_fn = void(*)(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate, double scale, int zero_point);
8
+
9
+ DECLARE_DISPATCH(masked_fill_kernel_quantized_fn, masked_fill_kernel_quantized_stub);
10
+ DECLARE_DISPATCH(index_put_kernel_quantized_fn, index_put_kernel_quantized_stub);
11
+
12
+
13
+ } // native
14
+ } // at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/core/ivalue.h>
5
+
6
+ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
7
+ virtual at::Tensor apply(
8
+ at::Tensor input,
9
+ double output_scale,
10
+ int64_t output_zero_point) = 0;
11
+ virtual at::Tensor apply_relu(
12
+ at::Tensor input,
13
+ double output_scale,
14
+ int64_t output_zero_point) = 0;
15
+
16
+ // out variant of LinearPackedParamsBase::apply
17
+ virtual at::Tensor& apply_out(
18
+ const at::Tensor& /*input*/,
19
+ double /*output_scale*/,
20
+ int64_t /*output_zero_point*/,
21
+ at::Tensor& output) {
22
+ throw std::runtime_error(
23
+ "apply_out is not implemented for this packed "
24
+ "parameter type");
25
+ return output;
26
+ }
27
+
28
+ virtual at::Tensor& apply_relu_out(
29
+ const at::Tensor& /*input*/,
30
+ double /*output_scale*/,
31
+ int64_t /*output_zero_point*/,
32
+ at::Tensor& output) {
33
+ throw std::runtime_error(
34
+ "apply_relu_out is not implemented for this packed "
35
+ "parameter type");
36
+ return output;
37
+ }
38
+
39
+ // Corresponding pattern (the ops with `*` are part of the pattern that
40
+ // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_output_fp32):
41
+ // input -> q* -> dq* -> linear* ->
42
+ // qweight -> dq* /
43
+ //
44
+ // After fusion:
45
+ // input -> quantized::linear_with_input_q_dq_qweight_dq_output_fp32* ->
46
+ // qweight /
47
+ //
48
+ // Additional Note: the weight is packed as well
49
+ // Params:
50
+ // X: float32 Tensor, will be quantized to quint8 in the op
51
+ // W_prepack: packed qint8 quantized weight and bias
52
+ // Returns:
53
+ // Y: float32 Tensor
54
+ virtual at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32(
55
+ at::Tensor input,
56
+ double input_scale,
57
+ int64_t input_zero_point) {
58
+ throw std::runtime_error(
59
+ "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed "
60
+ "parameter type");
61
+ return {};
62
+ }
63
+
64
+ // Corresponding pattern (the ops with `*` are part of the pattern that
65
+ // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32):
66
+ // input -> q* -> dq* -> linear* -> relu* ->
67
+ // qweight -> dq* /
68
+ //
69
+ // After fusion:
70
+ // input -> quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32* ->
71
+ // qweight /
72
+ //
73
+ // Additional Note: the weight is packed as well
74
+ // Params:
75
+ // input: float32 Tensor, will be quantized to quint8 in the op
76
+ // Returns:
77
+ // float32 Tensor
78
+ virtual at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32(
79
+ at::Tensor input,
80
+ double input_scale,
81
+ int64_t input_zero_point) {
82
+ throw std::runtime_error(
83
+ "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed "
84
+ "parameter type");
85
+ return {};
86
+ }
87
+
88
+ virtual at::Tensor apply_dynamic(
89
+ at::Tensor input,
90
+ bool reduce_range = false) = 0;
91
+ virtual at::Tensor apply_dynamic_relu(
92
+ at::Tensor input,
93
+ bool reduce_range = false) = 0;
94
+
95
+ virtual at::Tensor& apply_dynamic_out(
96
+ const at::Tensor& /* input */,
97
+ at::Tensor& output,
98
+ bool /* reduce_range */) {
99
+ throw std::runtime_error(
100
+ "apply_dynamic_out is not implemented for this packed "
101
+ "parameter type");
102
+ return output;
103
+ }
104
+ virtual at::Tensor& apply_dynamic_relu_out(
105
+ const at::Tensor& /* input */,
106
+ at::Tensor& output,
107
+ bool /* reduce_range */) {
108
+ throw std::runtime_error(
109
+ "apply_dynamic_relu_out is not implemented for this packed "
110
+ "parameter type");
111
+ return output;
112
+ }
113
+
114
+ virtual std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() = 0;
115
+
116
+ virtual c10::optional<at::Tensor> bias() = 0;
117
+
118
+ virtual void set_bias(c10::optional<at::Tensor> /*bias*/) {
119
+ throw std::runtime_error(
120
+ "set_bias is not implemented for this packed "
121
+ "parameter type");
122
+ }
123
+ };
124
+
125
+ template <int kSpatialDim = 2>
126
+ struct ConvPackedParamsBase : public torch::jit::CustomClassHolder {
127
+ virtual at::Tensor apply(
128
+ const at::Tensor& input,
129
+ double output_scale,
130
+ int64_t output_zero_point) = 0;
131
+ virtual at::Tensor apply_relu(
132
+ const at::Tensor& input,
133
+ double output_scale,
134
+ int64_t output_zero_point) = 0;
135
+ virtual at::Tensor apply_dynamic(
136
+ const at::Tensor& input,
137
+ bool reduce_range) = 0;
138
+
139
+ virtual std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() = 0;
140
+
141
+ virtual torch::List<int64_t> stride() const = 0;
142
+ virtual torch::List<int64_t> padding() const = 0;
143
+ virtual torch::List<int64_t> output_padding() const = 0;
144
+ virtual torch::List<int64_t> dilation() const = 0;
145
+ virtual int64_t groups() const = 0;
146
+ virtual bool transpose() const = 0;
147
+ };
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/core/ivalue.h>
5
+
6
// Base interface for packed quantized embedding(-bag) weights.  Concrete
// packed-parameter types derive from this and are exposed to TorchScript
// via CustomClassHolder.
struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder {
  // Embedding-bag lookup over byte (8-bit) quantized weights.
  // `is_embedding_op` distinguishes the plain embedding call path from the
  // embedding_bag one in implementations — confirm semantics there.
  virtual at::Tensor embeddingbag_byte(
      const at::Tensor& indices,
      const c10::optional<at::Tensor>& offsets,
      bool pruned_weights,
      const c10::optional<at::Tensor>& per_sample_weights_,
      const c10::optional<at::Tensor>& compressed_indices_mapping,
      bool include_last_offset,
      bool is_embedding_op) = 0;

  // 4-bit quantized variant of the lookup above; same parameter contract.
  virtual at::Tensor embeddingbag_4bit(
      const at::Tensor& indices,
      const c10::optional<at::Tensor>& offsets,
      bool pruned_weights,
      const c10::optional<at::Tensor>& per_sample_weights_,
      const c10::optional<at::Tensor>& compressed_indices_mapping,
      bool include_last_offset,
      bool is_embedding_op) = 0;

  // Recovers the original weight tensor.
  virtual at::Tensor unpack() = 0;

  // Bits per weight element (the `byte`/`4bit` entry points suggest 8 or 4).
  virtual int64_t bit_rate() const = 0;
  // NOTE(review): presumably the packed-format/serialization version —
  // confirm against the implementations.
  virtual int64_t version() const = 0;
};
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h ADDED
@@ -0,0 +1,527 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #ifdef USE_PYTORCH_QNNPACK
4
+ #include <ATen/core/Tensor.h>
5
+ #include <c10/util/irange.h>
6
+ #include <pytorch_qnnpack.h>
7
+ #include <qnnpack_func.h>
8
+ #include <ATen/native/quantized/cpu/XnnpackUtils.h>
9
+ #include <ATen/native/quantized/PackedParams.h>
10
+ #include <ATen/native/utils/Factory.h>
11
+
12
+ #ifndef AT_PER_OPERATOR_HEADERS
13
+ #include <ATen/Functions.h>
14
+ #else
15
+ #include <ATen/ops/empty.h>
16
+ #endif
17
+
18
+ #include <utility>
19
// Number of extra output channels QNNPACK-facing buffers are padded with
// (used by make_zero_points_and_scales_tensor below).  `constexpr`: this is
// a compile-time constant; the previous plain `inline int` was a mutable
// global in a header, which invited accidental modification across TUs.
inline constexpr int kPaddingChannels = 8;
20
// Custom deleter so a std::unique_ptr can own a raw QNNPACK operator handle
// and release it through the QNNPACK API rather than `delete`.
struct QnnpackOperatorDeleter {
  void operator()(pytorch_qnnp_operator_t op) {
    pytorch_qnnp_delete_operator(op);
  }
};
25
+
26
+ // PackedWeight struct for QNNPACK stores the original Weight and Bias as
27
+ // QNNPACK currently does not support an unpack function.
28
+ // For PyTorch Mobile, once the model is scripted and serialized we don't need
29
+ // to call unpack, so we can save some memory by checking for this case and free
30
+ // the original weights after packing.
31
+ // Input scale is set to null in pre-pack step. QNNPACK needs bias quantized
32
+ // with input scale which is available at runtime in pytorch. During runtime if
33
+ // input scale value changes then we requantize bias with the updated scale. For
34
+ // inference we expect the graph to be static so the input scale should not
35
+ // change across consecutive inference calls.
36
// Packed weight container for QNNPACK quantized linear layers.
// Holds both the QNNPACK pre-packed matrix (`w`) and the original weight
// tensor, which is retained so unpack() can be implemented (see the comment
// block above: QNNPACK itself has no unpack).  `input_scale` is left unset
// at prepack time and filled in at run time, when the actual input scale is
// known and the bias can be requantized with it.
struct PackedLinearWeightsQnnp : public LinearPackedParamsBase {
  PackedLinearWeightsQnnp(
      std::unique_ptr<qnnpack::PackBMatrix> w,
      at::Tensor orig_weight,
      at::Tensor bias,
      c10::optional<double> input_scale,
      at::Tensor w_scales,
      std::vector<uint8_t>&& w_zps)
      : w(std::move(w)),
        orig_weight(std::move(orig_weight)),
        // Mobile allocator may need padded/contiguous storage for the bias.
        bias_(at::native::mobile::allocate_padded_contiguous_if_needed(
            bias, bias.suggest_memory_format())),
        per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine),
        input_scale(std::move(input_scale)),
        w_scales(std::move(w_scales)),
        w_zero_points(std::move(w_zps)),
        q_scheme(this->orig_weight.qscheme()) {
    weight_sizes = this->orig_weight.sizes().vec();
  }

  std::unique_ptr<qnnpack::PackBMatrix> w;  // QNNPACK pre-packed weight matrix
  at::Tensor orig_weight;                   // original quantized weight (for unpack())
  at::Tensor bias_;                         // bias, padded/contiguous if needed
  bool per_channel_;                        // true iff weight is per-channel quantized
  c10::optional<double> input_scale;        // nullopt until first run (see above)
  at::Tensor w_scales;                      // weight scales
  std::vector<uint8_t> w_zero_points;       // weight zero points
  std::vector<float> requantization_scales; // computed at run time
  std::vector<int64_t> weight_sizes;        // cached orig_weight.sizes()
  c10::QScheme q_scheme;                    // quantization scheme of orig_weight

  // Quantized linear with fixed output quantization parameters.
  at::Tensor apply(
      at::Tensor input,
      double output_scale,
      int64_t output_zero_point) override;
  // Same, with fused ReLU.
  at::Tensor apply_relu(
      at::Tensor input,
      double output_scale,
      int64_t output_zero_point) override;

  // Dynamic variants (output quantization parameters not fixed up front).
  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override;
  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override;

  std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() override;

  c10::optional<at::Tensor> bias() override {
    return bias_;
  }

  // Builds a packed-parameter object from a quantized weight + optional bias.
  static c10::intrusive_ptr<LinearPackedParamsBase> prepack(
      at::Tensor weight,
      c10::optional<at::Tensor> bias);

  bool per_channel() const {
    return per_channel_;
  }

 private:
  // Guards QNNPACK state shared across calls (defined in the .cpp).
  std::mutex qnnp_mutex_;

#ifdef USE_XNNPACK
  xnnpack_operator xnnp_linear_op;

  template <typename scalar_t, bool kReluFused>
  at::Tensor apply_impl_xnnp(
      const at::Tensor& input,
      double output_scale,
      int64_t output_zero_point);
#endif // USE_XNNPACK

  template <bool ReluFused>
  at::Tensor apply_impl(
      at::Tensor input,
      double output_scale,
      int64_t output_zero_point);

  template <bool ReluFused>
  at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range);
};
115
+
116
// Packed weight container for QNNPACK quantized convolutions (2-D or 3-D).
// The constructor pre-selects the QNNPACK micro-kernel type from the conv
// geometry, allocates the raw pytorch_qnnp_operator via calloc (so all
// fields start zeroed), hands ownership to `convolution_op`, and sizes the
// zero-padding buffer the kernels read from.
template <int kSpatialDim = 2>
struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
  PackedConvWeightsQnnp(
      std::unique_ptr<qnnpack::PrePackConvWeights> w,
      at::Tensor orig_weight,
      at::Tensor bias,
      torch::List<int64_t> stride,
      torch::List<int64_t> padding,
      torch::List<int64_t> output_padding,
      torch::List<int64_t> dilation,
      int64_t groups,
      bool transpose,
      c10::optional<double> input_scale,
      std::vector<int64_t> kernel,
      at::Tensor w_scale,
      std::vector<uint8_t>&& w_zps,
      bool is_per_channel)
      : w(std::move(w)),
        orig_weight(std::move(orig_weight)),
        bias(std::move(bias)),
        stride_(std::move(stride)),
        padding_(std::move(padding)),
        output_padding_(std::move(output_padding)),
        dilation_(std::move(dilation)),
        groups_(groups),
        transpose_(transpose),
        is_per_channel_(is_per_channel),
        input_scale(input_scale),
        kernel_(std::move(kernel)),
        w_scales(std::move(w_scale)),
        w_zero_points(std::move(w_zps)) {
    const bool any_padding = std::any_of(
        padding_.begin(), padding_.end(), [](const auto& e) { return e != 0; });
    const size_t kernel_size =
        std::accumulate(kernel_.begin(), kernel_.end(), 1, std::multiplies<>());

    // Weight layout differs for transposed convs: the output-channel
    // dimension is dim 1 and dim 0 is divided across groups.
    const size_t group_input_channels = transpose
        ? this->orig_weight.size(0) / groups
        : this->orig_weight.size(1);
    const size_t group_output_channels = transpose
        ? this->orig_weight.size(1)
        : this->orig_weight.size(0) / groups;

    const size_t kernel_depth = kSpatialDim == 3 ? kernel_[0] : 1;
    const size_t kernel_height = kernel_[kSpatialDim - 2];
    const size_t kernel_width = kernel_[kSpatialDim - 1];

    // Pick the QNNPACK micro-kernel: transposed convs always use the
    // generic conv kernel; otherwise prefer depthwise, then GEMM for 1x1
    // stride-1 unpadded convs, falling back to the generic conv kernel.
    pytorch_qnnp_ukernel_type ukernel_type;
    if (transpose_) {
      ukernel_type = pytorch_qnnp_ukernel_type_conv;
    } else {
      ukernel_type = pytorch_qnnp_ukernel_type_none;

      const bool has_depthwise_dimensions =
          (kSpatialDim == 2 &&
           ((kernel_height == 3 && kernel_width == 3) ||
            (kernel_height == 5 && kernel_width == 5))) ||
          (kSpatialDim == 3 && kernel_height == 3 && kernel_width == 3 &&
           kernel_depth == 3);
      const bool has_depthwise_grouping =
          group_input_channels == 1 && group_output_channels == 1 && groups > 1;

      if (has_depthwise_dimensions && has_depthwise_grouping) {
        ukernel_type = pytorch_qnnp_ukernel_type_dwconv;
      } else if (
          kernel_size == 1 &&
          std::all_of(
              stride_.begin(),
              stride_.end(),
              [](const auto& e) { return e == 1; }) &&
          !any_padding) {
        // NOTE(review): `group_input_channels >= SIZE_MAX` can only hold in
        // the degenerate case, so the XZP GEMM branch is effectively never
        // taken here — confirm whether XZP is intentionally disabled.
        ukernel_type = group_input_channels >= SIZE_MAX
            ? pytorch_qnnp_ukernel_type_xzp_gemm
            : pytorch_qnnp_ukernel_type_gemm;
      } else {
        ukernel_type = pytorch_qnnp_ukernel_type_conv;
      }
    }

    if (is_per_channel && ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) {
      TORCH_INTERNAL_ASSERT(
          false, "Per channel quantized weights are not supported for XZP kernels");
    }

    pytorch_qnnp_operator_t convolution{nullptr};
    // Initially all the params are set to zero.
    convolution = static_cast<pytorch_qnnp_operator_t>(
        calloc(1, sizeof(struct pytorch_qnnp_operator)));
    if (convolution == nullptr) {
      TORCH_INTERNAL_ASSERT(
          false, "failed to allocate %zu bytes for pytorch_qnnp_operator structure",
          sizeof(struct pytorch_qnnp_operator));
    }

    // From here on, `convolution_op` owns the raw operator and frees it via
    // QnnpackOperatorDeleter.
    convolution_op =
        std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>(
            convolution);

    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
    convolution->ukernel_type = ukernel_type;
    convolution->groups = groups;
    convolution->group_input_channels = group_input_channels;
    convolution->group_output_channels = group_output_channels;
    convolution->kernel_depth = kernel_depth;
    convolution->kernel_height = kernel_height;
    convolution->kernel_width = kernel_width;
    // Depth components default to 1 (stride/dilation) or 0 (padding) in 2-D.
    convolution->stride_depth = kSpatialDim == 3 ? stride_[0] : 1;
    convolution->stride_height = stride_[kSpatialDim - 2];
    convolution->stride_width = stride_[kSpatialDim - 1];
    convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1;
    convolution->dilation_height = dilation_[kSpatialDim - 2];
    convolution->dilation_width = dilation_[kSpatialDim - 1];
    convolution->input_padding_height = padding_[kSpatialDim - 2];
    convolution->input_padding_width = padding_[kSpatialDim - 1];
    convolution->input_padding_depth = kSpatialDim == 3 ? padding_[0] : 0;
    convolution->per_channel = is_per_channel_;
    convolution->transpose = transpose_;

    // Round the per-group input channels up to the kernel's kr multiple.
    const uint32_t kr = pytorch_qnnp_params.q8conv.kr;
    const size_t k_stride = (group_input_channels + (kr - 1)) & -kr;

    size_t zero_size = sizeof(uint8_t) * k_stride;
    size_t zero_offset = 0;

    if (transpose_) {
      convolution->adjustment_width = output_padding_[1];
      convolution->adjustment_height = output_padding_[0];
      if (group_input_channels < 8) {
        zero_size += 8;
        zero_offset = 8;
      }
    } else {
      zero_buffer_size = 0;
      if (any_padding) {
        zero_size = 0;
        zero_offset = 0;
        if (ukernel_type == pytorch_qnnp_ukernel_type_dwconv) {
          const uint32_t cr = pytorch_qnnp_params.q8dw9.cr;
          const size_t group_stride = (groups + (cr - 1)) & -cr;
          if (groups >= 8) {
            zero_size = sizeof(uint8_t) * group_stride;
            zero_offset = 0;
          } else {
            // Small channel counts get 8 bytes of leading slack so kernels
            // can read behind the pointer safely.
            zero_size = sizeof(uint8_t) * group_stride + 8;
            zero_offset = sizeof(uint8_t) * 8;
          }
        } else if (
            ukernel_type == pytorch_qnnp_ukernel_type_conv ||
            ukernel_type == pytorch_qnnp_ukernel_type_gemm) {
          if (group_input_channels >= 8) {
            zero_size = sizeof(uint8_t) * k_stride;
            zero_offset = 0;
          } else {
            zero_size = sizeof(uint8_t) * k_stride + 8;
            zero_offset = 8;
          }
        }
      }
    }

    // NOLINTNEXTLINE(clang-analyzer-optin.portability.UnixAPI)
    void* zero_buffer = malloc(zero_size);
    if (zero_buffer == nullptr) {
      pytorch_qnnp_delete_operator(convolution);
      TORCH_INTERNAL_ASSERT(
          false, "failed to allocate %zu bytes for zero padding",
          zero_size);
    }
    // Need to set to input zero point
    // memset(zero_buffer, input_zero_point, zero_size);
    zero_buffer_size = zero_size;
    convolution->zero_buffer = zero_buffer;
    convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);
  }

  // Owns the raw QNNPACK operator allocated in the constructor.
  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter> convolution_op;
#ifdef USE_XNNPACK
  xnnpack_operator xnnp_convolution_op;
#endif // USE_XNNPACK
  std::unique_ptr<qnnpack::PrePackConvWeights> w; // pre-packed weights
  at::Tensor orig_weight;                         // original quantized weight (for unpack())
  at::Tensor bias;
  torch::List<int64_t> stride_;
  torch::List<int64_t> padding_;
  torch::List<int64_t> output_padding_;
  torch::List<int64_t> dilation_;
  int64_t groups_;
  bool transpose_;
  bool is_per_channel_;
  c10::optional<double> input_scale;        // unset until run time
  std::vector<int64_t> kernel_;
  at::Tensor w_scales;
  std::vector<uint8_t> w_zero_points;
  std::vector<float> requantization_scales; // computed at run time
  size_t zero_buffer_size;

  at::Tensor apply(
      const at::Tensor& input,
      double output_scale,
      int64_t output_zero_point) override;

  at::Tensor apply_relu(
      const at::Tensor& input,
      double output_scale,
      int64_t output_zero_point) override;

  at::Tensor apply_dynamic(
      const at::Tensor& input,
      bool reduce_range=false) override;

  std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() override;

  // Builds a packed-parameter object from a quantized weight and the conv
  // geometry; mirrors the accessor set below.
  static c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> prepack(
      at::Tensor weight,
      c10::optional<at::Tensor> bias,
      torch::List<int64_t> stride,
      torch::List<int64_t> padding,
      torch::List<int64_t> output_padding,
      torch::List<int64_t> dilation,
      int64_t groups,
      bool transpose);

  torch::List<int64_t> stride() const override {
    return stride_;
  }

  torch::List<int64_t> padding() const override {
    return padding_;
  }

  torch::List<int64_t> output_padding() const override {
    return output_padding_;
  }

  torch::List<int64_t> dilation() const override {
    return dilation_;
  }

  int64_t groups() const override {
    return groups_;
  }

  bool transpose() const override {
    return transpose_;
  }

  bool per_channel() const {
    return is_per_channel_;
  }

 private:
  // Guards QNNPACK state shared across calls (defined in the .cpp).
  std::mutex qnnp_mutex_;
  template <bool ReluFused>
  at::Tensor apply_impl(
      const at::Tensor& input,
      double output_scale,
      int64_t output_zero_point);

#ifdef USE_XNNPACK
  template <typename scalar_t, bool ReluFused>
  at::Tensor apply_impl_xnnp(
      const at::Tensor& input,
      double output_scale,
      int64_t output_zero_point);
#endif // USE_XNNPACK
};
382
+
383
// Fused-activation tag used when computing clamping bounds for quantized ops.
enum class Activation : uint8_t { NONE = 0, RELU = 1 };

#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
// Old Android toolchains lack std::nearbyint; fall back to the C functions.
template <class T>
inline float Round(const float x) {
  return ::nearbyintf(x);
}
inline double Round(const double x) {
  return ::nearbyint(x);
}
#else
template <class T>
inline T Round(const T x) {
  return std::nearbyint(x);
}
#endif

// Quantizes a real `value` to integer type T using the affine mapping
// q = zero_point + round(value / scale), saturating to T's range.
template<typename T>
inline T QuantizeValue(float scale, int32_t zero_point, float value) {
  constexpr int32_t qmin = std::numeric_limits<T>::min();
  constexpr int32_t qmax = std::numeric_limits<T>::max();
  const auto quantized =
      zero_point + static_cast<int32_t>(Round(value / scale));
  // Saturate into [qmin, qmax] before narrowing to T.
  return static_cast<T>(std::min(std::max(quantized, qmin), qmax));
}

// Returns the [min, max] clamp bounds (in the quantized domain) implied by
// the fused activation: NONE keeps T's full range; RELU clamps below the
// quantized representation of real 0.
template<typename T>
inline std::pair<T, T> activationLimits(
    float scale,
    int32_t zero_point,
    Activation Ac) {
  const T upper = std::numeric_limits<T>::max();
  if (Ac == Activation::RELU) {
    return {QuantizeValue<T>(scale, zero_point, 0.0), upper};
  }
  if (Ac == Activation::NONE) {
    return {std::numeric_limits<T>::min(), upper};
  }
  // Activation has exactly the two values handled above.
#ifdef _MSC_VER
  __assume(0);
#else
  __builtin_unreachable();
#endif
}
430
+
431
namespace at {
namespace native {
namespace qnnp_avgpool_helper {
// QNNPACK-backed average pooling over a quantized 2-D input.  Declaration
// only; the definition lives in the corresponding quantized-cpu source file.
Tensor qnnpack_avg_pool2d(
    Tensor input,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    bool ceil_mode,
    bool count_include_pad,
    c10::optional<int64_t> divisor_override);
} // qnnp_avgpool_helper
} // namespace native
} // namespace at
445
+
446
+ namespace {
447
+ C10_UNUSED std::vector<float> generate_requantization_scales(
448
+ const at::Tensor& weight_scales,
449
+ const float input_scale,
450
+ const float output_scale,
451
+ std::vector<float>& requant_scales) {
452
+ // Since weight scale is allocated with padding
453
+ // weight_scales.numel() gives us padded num elements.
454
+ const auto num_output_channels_padded = weight_scales.numel();
455
+ float *const weight_scales_data = weight_scales.data_ptr<float>();
456
+ if (static_cast<int64_t>(requant_scales.size()) < num_output_channels_padded) {
457
+ requant_scales.resize(num_output_channels_padded);
458
+ }
459
+ for (const auto i : c10::irange(num_output_channels_padded)) {
460
+ const auto inverse_output_scale = 1.f /output_scale;
461
+ requant_scales[i] = (weight_scales_data[i] * input_scale) * inverse_output_scale;
462
+ TORCH_CHECK(
463
+ (requant_scales[i] > 0.0f && std::isnormal(requant_scales[i])),
464
+ "failed to create op with requantization scale: ",
465
+ requant_scales[i],
466
+ ": requantization scale must be finite and positive");
467
+ }
468
+ return requant_scales;
469
+ }
470
+
471
+ C10_UNUSED std::pair<std::vector<uint8_t>, at::Tensor> make_zero_points_and_scales_tensor(
472
+ const at::Tensor& weight_contig,
473
+ bool transpose = false,
474
+ uint32_t groups = 1
475
+ ) {
476
+ const int out_ch_idx = transpose ? 1 : 0;
477
+ const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1);
478
+ // Add 8 to account for bufferring needed by QNNPACK.
479
+ const auto num_output_channels_padded = num_output_channels + kPaddingChannels;
480
+ const auto qtype = weight_contig.qscheme();
481
+ std::vector<uint8_t> weight_zp(num_output_channels_padded, 0);
482
+ // Adjust weight zero point, similar to weight data.
483
+ if (qtype == at::kPerTensorAffine) {
484
+ for (const auto i : c10::irange(num_output_channels)) {
485
+ weight_zp[i] = (uint8_t)(weight_contig.q_zero_point() + 128);
486
+ }
487
+ } else if (qtype == at::kPerChannelAffine) {
488
+ TORCH_CHECK(
489
+ weight_contig.q_per_channel_zero_points().scalar_type() == at::kLong,
490
+ "Per channel zero points dtype must be long int.");
491
+ const int64_t* per_channel_zero_points =
492
+ weight_contig.q_per_channel_zero_points().data_ptr<int64_t>();
493
+ for (const auto i : c10::irange(num_output_channels)) {
494
+ weight_zp[i] = (uint8_t)(per_channel_zero_points[i] + 128);
495
+ }
496
+ } else {
497
+ TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
498
+ }
499
+ at:: Tensor weight_scales =
500
+ at::empty(
501
+ {num_output_channels_padded},
502
+ at::device(at::kCPU).dtype(at::kFloat));
503
+ float *const weight_scales_data = weight_scales.data_ptr<float>();
504
+ if (qtype == at::kPerTensorAffine) {
505
+ for (const auto i : c10::irange(num_output_channels)) {
506
+ weight_scales_data[i] = weight_contig.q_scale();
507
+ }
508
+ } else if (qtype == at::kPerChannelAffine) {
509
+ TORCH_CHECK(
510
+ weight_contig.q_per_channel_scales().scalar_type() == at::kDouble,
511
+ "Per channel scales dtype must be double.");
512
+ const double *const per_channel_scales =
513
+ weight_contig.q_per_channel_scales().data_ptr<double>();
514
+ for (const auto i : c10::irange(num_output_channels)) {
515
+ weight_scales_data[i] = static_cast<float>(per_channel_scales[i]);
516
+ }
517
+ } else {
518
+ TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
519
+ }
520
+ for (const auto i : c10::irange(num_output_channels, num_output_channels_padded)) {
521
+ weight_scales_data[i] = 1.f;
522
+ }
523
+ return {weight_zp, weight_scales};
524
+ }
525
+ } // namespace
526
+
527
+ #endif