koichi12 commited on
Commit
a8eed2c
·
verified ·
1 Parent(s): 49fc886

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn.so.8 +3 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/comm_analysis.cpython-311.pyc +0 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/comms.cpython-311.pyc +0 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/cudagraph_utils.cpython-311.pyc +0 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/exc.cpython-311.pyc +0 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/graph.cpython-311.pyc +0 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/index_propagation.cpython-311.pyc +0 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/optimize_indexing.cpython-311.pyc +0 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/comms.py +363 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/config.py +752 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/constant_folding.py +264 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/graph.py +1324 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/ir.py +0 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/pattern_matcher.py +1524 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/scheduler.py +2445 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/select_algorithm.py +1156 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/sizevars.py +643 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/utils.py +1428 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/wrapper_benchmark.py +299 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/DimVector.h +2 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dimname.h +1 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/DynamicLibrary.h +34 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Formatting.h +1 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MetaFunctions_inl.h +324 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorSubclassLikeUtils.h +86 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/ATenCUDAGeneral.h +9 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAContext.h +9 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDADataType.h +115 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/Exceptions.h +174 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSDevice.h +85 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AmpKernels.h +28 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUBlas.h +189 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h +263 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvolutionMM3d.h +14 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Copy.h +20 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h +229 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ForeachUtils.h +371 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebra.h +18 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SortingUtils.h +88 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnaryOps.h +130 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h +37 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CatKernel.h +12 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h +14 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h +21 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Intrinsics.h +33 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Loops.h +394 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h +14 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h +238 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h +22 -0
.gitattributes CHANGED
@@ -76,3 +76,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/
76
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ModuleNode.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
77
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.11 filter=lfs diff=lfs merge=lfs -text
78
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
 
 
76
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ModuleNode.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
77
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.11 filter=lfs diff=lfs merge=lfs -text
78
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
79
+ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn.so.8 filter=lfs diff=lfs merge=lfs -text
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn.so.8 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26a7288b7315d658acab1073f02c4f18cd1d27eeadde102958f0317dad6656e0
3
+ size 150200
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/comm_analysis.cpython-311.pyc ADDED
Binary file (8.74 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/comms.cpython-311.pyc ADDED
Binary file (18.7 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/cudagraph_utils.cpython-311.pyc ADDED
Binary file (6.13 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/exc.cpython-311.pyc ADDED
Binary file (7.37 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/graph.cpython-311.pyc ADDED
Binary file (67.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/index_propagation.cpython-311.pyc ADDED
Binary file (18.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/optimize_indexing.cpython-311.pyc ADDED
Binary file (4.85 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/comms.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pyre-strict
2
+
3
+ from typing import List
4
+
5
+ import torch
6
+
7
+ from . import config, ir, scheduler
8
+ from .dependencies import WeakDep
9
+ from .utils import tuple_sorted
10
+
11
+ overlap_log = torch._logging.getArtifactLogger(__name__, "overlap")
12
+
13
+
14
+ def sink_waits(
15
+ snodes: List["scheduler.BaseSchedulerNode"],
16
+ ) -> List["scheduler.BaseSchedulerNode"]:
17
+ """
18
+ Greedily moves waits as late as possible (i.e. until we reach a use). Optimal in terms of
19
+ communication overlap.
20
+ """
21
+ new_order = []
22
+ cur_waits = set()
23
+ for snode in snodes:
24
+ if isinstance(snode.node, ir.Wait):
25
+ cur_waits.add(snode)
26
+ else:
27
+ for wait in tuple_sorted(cur_waits):
28
+ if snode in wait.node_users:
29
+ new_order.append(wait)
30
+ cur_waits.remove(wait)
31
+ new_order.append(snode)
32
+ new_order.extend(tuple_sorted(cur_waits))
33
+ return new_order
34
+
35
+
36
+ def raise_comms(
37
+ snodes: List["scheduler.BaseSchedulerNode"],
38
+ ) -> List["scheduler.BaseSchedulerNode"]:
39
+ """
40
+ Greedily moves comms as early as possible (i.e. until we reach an input).
41
+ Optimal in terms of communication overlap.
42
+
43
+ TODO: We might want to adjust this in the future to account for memory limitations.
44
+ e.g. when we are compiling FSDP, this heuristics will cause the all-gathers to be prefetched as soon as possible,
45
+ which is the beginning of the forwards pass. We'll have to either do a special pass for FSDP,
46
+ or we'll want to redo this pass with memory considerations so we handle the FSDP case in a general way.
47
+ """
48
+ new_order_reversed: List["scheduler.BaseSchedulerNode"] = []
49
+ cur_comms: List["scheduler.BaseSchedulerNode"] = []
50
+ for snode in reversed(snodes):
51
+ if isinstance(snode.node, ir.CollectiveKernel):
52
+ cur_comms.append(snode)
53
+ else:
54
+ for comm in cur_comms:
55
+ assert len(comm.inverse_users) > 0
56
+ while len(cur_comms) > 0 and any(
57
+ snode in comm.inverse_users for comm in cur_comms
58
+ ):
59
+ comm = cur_comms.pop(0)
60
+ new_order_reversed.append(comm)
61
+ new_order_reversed.append(snode)
62
+ assert len(cur_comms) <= 1
63
+ new_order_reversed.extend(tuple_sorted(cur_comms))
64
+ return new_order_reversed[::-1]
65
+
66
+
67
+ def get_ancestors(node):
68
+ ancestors = set()
69
+ cur_nodes = [node]
70
+ while len(cur_nodes) > 0:
71
+ new_nodes = []
72
+ for node in cur_nodes:
73
+ for inp in node.inverse_users:
74
+ if inp not in ancestors:
75
+ ancestors.add(inp)
76
+ new_nodes.append(inp)
77
+ cur_nodes = new_nodes
78
+ return ancestors
79
+
80
+
81
+ def get_descendants(node):
82
+ descendants = set()
83
+ cur_nodes = [node]
84
+ while len(cur_nodes) > 0:
85
+ new_nodes = []
86
+ for node in cur_nodes:
87
+ for inp in node.node_users:
88
+ if inp not in descendants:
89
+ descendants.add(inp)
90
+ new_nodes.append(inp)
91
+ cur_nodes = new_nodes
92
+ return descendants
93
+
94
+
95
+ def decide_global_ordering_of_comms(nodes: List["scheduler.BaseSchedulerNode"]):
96
+ """
97
+ Decide global ordering of comms, by just enforcing the ordering that's in the input graph
98
+ (might not be the same ordering as the eager mode program).
99
+ TODO: Come up with a better approach
100
+ """
101
+ comm_nodes = [n for n in nodes if isinstance(n.node, ir.CollectiveKernel)]
102
+ for i in range(1, len(comm_nodes)):
103
+ # Enforce ordering by making previous comm a `WeakDep` dependency of the next comm
104
+ comm_nodes[i].add_fake_dep(WeakDep(comm_nodes[i - 1].get_name()))
105
+
106
+
107
+ def assert_no_comm_nodes(snodes: List["scheduler.BaseSchedulerNode"]) -> None:
108
+ assert not any(isinstance(snode.node, ir.CollectiveKernel) for snode in snodes)
109
+
110
+
111
+ def estimate_op_runtime(snode: "scheduler.BaseSchedulerNode") -> float:
112
+ """
113
+ Returns estimated op runtime in nanoseconds (ns)
114
+ """
115
+ if config.estimate_op_runtime == "default":
116
+ runtime = snode.get_estimated_runtime()
117
+ else:
118
+ assert callable(config.estimate_op_runtime)
119
+ runtime = config.estimate_op_runtime(snode)
120
+ return runtime
121
+
122
+
123
+ def reorder_compute_for_overlap(
124
+ snodes: List["scheduler.BaseSchedulerNode"],
125
+ ) -> List["scheduler.BaseSchedulerNode"]:
126
+ """
127
+ Decides a global ordering of all compute and communication nodes,
128
+ assuming that we already have a global ordering of communication nodes.
129
+
130
+ Overall scheduling procedure is:
131
+ Step 1: Given that we've currently scheduled comm N, we now schedule all compute nodes
132
+ that are required for comm N + 1 but do not depend on comm N, to run at the same time with comm N.
133
+ Step 2: If all those compute nodes are sufficient to overlap comm N, we're done.
134
+ Otherwise, we now need to look elsewhere to find compute that overlaps with comm N.
135
+ We prioritize compute nodes that are needed sooner.
136
+ Step 3: We schedule the compute nodes dependent on comm N and required for comm N + 1.
137
+ Step 4: We schedule comm N + 1.
138
+ Repeat this for subsequent comm nodes.
139
+ """
140
+ final_order = []
141
+
142
+ comm_nodes = []
143
+ for snode in snodes:
144
+ if isinstance(snode.node, ir.CollectiveKernel):
145
+ comm_nodes.append(snode)
146
+ if len(comm_nodes) == 0:
147
+ # if there is no comm nodes, return the current order
148
+ return snodes
149
+
150
+ comm_ancestors = {node: get_ancestors(node) for node in comm_nodes}
151
+ comm_descendants = {node: get_descendants(node) for node in comm_nodes}
152
+
153
+ indeg = dict.fromkeys(snodes, 0)
154
+ for snode in snodes:
155
+ for user in snode.node_users:
156
+ if user in indeg:
157
+ indeg[user] += 1
158
+ ready_to_schedule_nodes = {node for node in snodes if indeg[node] == 0}
159
+
160
+ unscheduled_nodes = set()
161
+ unscheduled_nodes = set(snodes)
162
+
163
+ def schedule_node(snode):
164
+ """
165
+ Schedule a single node.
166
+ """
167
+ assert snode in unscheduled_nodes
168
+ assert snode in ready_to_schedule_nodes
169
+ ready_to_schedule_nodes.remove(snode)
170
+ unscheduled_nodes.remove(snode)
171
+ final_order.append(snode)
172
+ for user in tuple_sorted(snode.node_users):
173
+ if user in indeg:
174
+ indeg[user] -= 1
175
+ if indeg[user] == 0:
176
+ ready_to_schedule_nodes.add(user)
177
+
178
+ def schedule_nodes(snodes):
179
+ """
180
+ Schedules all nodes in `snodes` in an arbitrary topologically valid order.
181
+ """
182
+ all_nodes = set(snodes)
183
+ assert all(node in unscheduled_nodes for node in all_nodes)
184
+ while len(all_nodes) > 0:
185
+ # NOTE: since model graph is always a DAG and does not have circular dependency inside,
186
+ # there should be at least one node that is a "free node" (i.e. indeg == 0),
187
+ # hence infinite loop is not possible. But we check here just to be safe.
188
+ progress = False
189
+ for node in tuple_sorted(all_nodes):
190
+ if node in ready_to_schedule_nodes:
191
+ schedule_node(node)
192
+ all_nodes.remove(node)
193
+ progress = True
194
+ if not progress:
195
+ raise Exception(
196
+ "Unable to find a free node (indeg == 0). This is an impossible state to reach. "
197
+ "Please report a bug to PyTorch."
198
+ )
199
+
200
+ # First, schedule all compute nodes that are required by first comm node,
201
+ # as well as the first comm node itself.
202
+ assert len(comm_nodes) > 0
203
+ schedule_nodes(
204
+ list(comm_ancestors[comm_nodes[0]]) + [comm_nodes[0]],
205
+ )
206
+
207
+ rolled_over_compute_cost = 0
208
+ for idx in range(1, len(comm_ancestors)):
209
+ # Step 1: Given that we've currently scheduled comm `idx-1`, we now schedule
210
+ # all compute nodes that are required for comm `idx` but do not depend on comm `idx-1`,
211
+ # to run at the same time with comm `idx-1`.
212
+ needed_by_next_comm_and_ready_compute_nodes = unscheduled_nodes & (
213
+ comm_ancestors[comm_nodes[idx]] - comm_descendants[comm_nodes[idx - 1]]
214
+ )
215
+ assert_no_comm_nodes(needed_by_next_comm_and_ready_compute_nodes)
216
+
217
+ total_compute_runtime_cost = rolled_over_compute_cost + sum(
218
+ [
219
+ estimate_op_runtime(node)
220
+ for node in needed_by_next_comm_and_ready_compute_nodes
221
+ ]
222
+ )
223
+ prev_comm_runtime_cost = estimate_op_runtime(comm_nodes[idx - 1])
224
+ schedule_nodes(tuple_sorted(needed_by_next_comm_and_ready_compute_nodes))
225
+
226
+ # Step 2: If all those compute nodes are sufficient to overlap comm `idx-1`, we're done.
227
+ # Otherwise, we now need to look elsewhere to find compute that overlaps with comm `idx`.
228
+ # We prioritize compute nodes that are needed sooner.
229
+ step1_runtime_cost = total_compute_runtime_cost
230
+ if step1_runtime_cost >= prev_comm_runtime_cost:
231
+ pass
232
+ else:
233
+ # Find all ready to schedule compute nodes that do not depend on comm `idx-1`.
234
+ ready_to_schedule_compute_nodes = tuple_sorted(
235
+ ready_to_schedule_nodes - comm_descendants[comm_nodes[idx - 1]]
236
+ )
237
+ assert_no_comm_nodes(ready_to_schedule_compute_nodes)
238
+
239
+ def earliest_comm_descendant(node):
240
+ for idx in range(len(comm_nodes)):
241
+ if node in comm_ancestors[comm_nodes[idx]]:
242
+ return idx
243
+ return len(comm_nodes)
244
+
245
+ # Prioritize compute nodes that are needed sooner.
246
+ ready_to_schedule_compute_nodes = sorted(
247
+ ready_to_schedule_compute_nodes, key=earliest_comm_descendant
248
+ )
249
+
250
+ for snode in ready_to_schedule_compute_nodes:
251
+ if total_compute_runtime_cost >= prev_comm_runtime_cost:
252
+ # If accumulated compute runtime cost is greater than comm `idx-1` runtime cost,
253
+ # it means we have maximized overlap for comm `idx-1`, and hence we stop looking
254
+ # for more compute to schedule.
255
+ break
256
+ compute_runtime_cost = estimate_op_runtime(snode)
257
+ # If we're not able to leverage more than half of this
258
+ # node's compute to overlap, we skip it.
259
+ # TODO: Smarter heuristics here
260
+ if (
261
+ prev_comm_runtime_cost - total_compute_runtime_cost
262
+ ) <= compute_runtime_cost / 2:
263
+ continue
264
+ schedule_node(snode)
265
+ total_compute_runtime_cost += compute_runtime_cost
266
+ rollable_compute_cost = total_compute_runtime_cost - step1_runtime_cost
267
+
268
+ # Step 3: We schedule the compute nodes dependent on comm `idx-1` and required for comm `idx`.
269
+ needed_by_next_comm_nodes = unscheduled_nodes & comm_ancestors[comm_nodes[idx]]
270
+ schedule_nodes(list(needed_by_next_comm_nodes))
271
+
272
+ # Step 4: We schedule comm `idx`.
273
+ schedule_nodes([comm_nodes[idx]])
274
+
275
+ is_prev_comm_blocking_next_comm = len(needed_by_next_comm_nodes) > 0
276
+ # The idea here is that if there are no compute nodes from Step 3
277
+ # (i.e. if prev comm is not blocking next comm), we can roll over the compute nodes
278
+ # in Step 2 to overlap with the next comm, since they're not required to finish
279
+ # before the next comm starts.
280
+ if is_prev_comm_blocking_next_comm:
281
+ rolled_over_compute_cost = 0
282
+ else:
283
+ rolled_over_compute_cost = rollable_compute_cost # type: ignore[assignment]
284
+
285
+ schedule_nodes(unscheduled_nodes)
286
+ return final_order
287
+
288
+
289
+ def node_summary(snode):
290
+ detail = ""
291
+ if isinstance(snode.node, ir.ExternKernelOut):
292
+ detail = f" ({snode.node.python_kernel_name})"
293
+ out_tensor_info = ""
294
+ if (
295
+ hasattr(snode.node, "layout")
296
+ and hasattr(snode.node.layout, "size")
297
+ and hasattr(snode.node.layout, "stride")
298
+ ):
299
+ out_tensor_info = (
300
+ f" (size={snode.node.layout.size}, stride={snode.node.layout.stride})"
301
+ )
302
+ node_name = ""
303
+ if hasattr(snode.node, "name"):
304
+ node_name = snode.node.name
305
+ return f"{snode.node.__class__.__name__}{detail}{out_tensor_info} ({node_name})"
306
+
307
+
308
+ def visualize_overlap(order):
309
+ total_est_runtime: float = 0.0
310
+ cur_comm_node = None
311
+ for snode in order:
312
+ if cur_comm_node is None:
313
+ if isinstance(snode.node, ir.CollectiveKernel):
314
+ total_est_runtime += estimate_op_runtime(snode)
315
+ cur_comm_node = snode.node
316
+ elif isinstance(snode.node, ir.Wait):
317
+ raise Exception(
318
+ "Wait is not expected when there is no collective running"
319
+ )
320
+ else: # exposed compute op
321
+ total_est_runtime += estimate_op_runtime(snode)
322
+ overlap_log.debug(f"{node_summary(snode)}") # noqa: G004
323
+ else: # cur_comm_node is not None
324
+ if isinstance(snode.node, ir.CollectiveKernel):
325
+ raise Exception(
326
+ "Found two collectives running at the same time. "
327
+ "`visualize_overlap` needs to be updated to handle this case"
328
+ )
329
+ elif isinstance(snode.node, ir.Wait): # end of this comm op
330
+ overlap_log.debug(f"{node_summary(snode)}") # noqa: G004
331
+ cur_comm_node = None
332
+ else: # overlapped compute op
333
+ overlap_log.debug(f"| {node_summary(snode)}") # noqa: G004
334
+ overlap_log.debug(
335
+ f"Est. runtime (ms): {total_est_runtime / 1000 / 1000}" # noqa: G004
336
+ )
337
+
338
+
339
+ def reorder_compute_and_comm_for_overlap(
340
+ snodes: List["scheduler.BaseSchedulerNode"],
341
+ ) -> List["scheduler.BaseSchedulerNode"]:
342
+ order = snodes
343
+ for p in config.reorder_for_compute_comm_overlap_passes:
344
+ if isinstance(p, str) and p in globals():
345
+ p = globals()[p] # it is a builtin pass
346
+ if torch.distributed.get_rank() == 0:
347
+ overlap_log.debug(
348
+ f"==== Visualize overlap before reordering pass {p} ====" # noqa: G004
349
+ )
350
+ try:
351
+ visualize_overlap(order)
352
+ except Exception as e:
353
+ overlap_log.debug(str(e))
354
+ order = p(order) # type: ignore[operator]
355
+ if torch.distributed.get_rank() == 0:
356
+ overlap_log.debug(
357
+ f"==== Visualize overlap after reordering pass {p} ====" # noqa: G004
358
+ )
359
+ try:
360
+ visualize_overlap(order)
361
+ except Exception as e:
362
+ overlap_log.debug(str(e))
363
+ return order
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/config.py ADDED
@@ -0,0 +1,752 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os # noqa: C101
2
+ import sys
3
+ from typing import Any, Callable, Dict, Optional, TYPE_CHECKING
4
+
5
+ import torch
6
+
7
+
8
+ def is_fbcode():
9
+ return not hasattr(torch.version, "git_version")
10
+
11
+
12
+ # add some debug printouts
13
+ debug = False
14
+
15
+ # add inf and NaN checkers
16
+ debug_check_inf_and_nan = False
17
+
18
+ # Whether to disable a progress bar for autotuning
19
+ disable_progress = True
20
+
21
+ # Whether to enable printing the source code for each future
22
+ verbose_progress = False
23
+
24
+ # use fx aot graph codegen cache
25
+ fx_graph_cache = os.environ.get("TORCHINDUCTOR_FX_GRAPH_CACHE") == "1"
26
+
27
+ # use cpp wrapper instead of python wrapper
28
+ cpp_wrapper = os.environ.get("TORCHINDUCTOR_CPP_WRAPPER", "0") == "1"
29
+
30
+ # codegen cpp wrapper code in an ABI compatible mode
31
+ abi_compatible = (
32
+ os.environ.get("TORCHINDUCTOR_ABI_COMPATIBLE", "1" if is_fbcode() else "0") == "1"
33
+ )
34
+
35
+ c_shim_version = os.environ.get(
36
+ "TORCHINDUCTOR_C_SHIM_VERSION", "1" if is_fbcode() else "2"
37
+ )
38
+
39
+ # dead code elimination
40
+ dce = False
41
+
42
+ # assume weight tensors are fixed size
43
+ static_weight_shapes = True
44
+
45
+ # put correctness assertions in generated code
46
+ size_asserts = os.environ.get("TORCHINDUCTOR_SIZE_ASSERTS", "1") == "1"
47
+ nan_asserts = os.environ.get("TORCHINDUCTOR_NAN_ASSERTS") == "1"
48
+
49
+ # enable loop reordering based on input orders
50
+ pick_loop_orders = True
51
+
52
+ # reuse a kernel input as the output
53
+ inplace_buffers = True
54
+
55
+ # reuse a buffer for an unrelated purpose
56
+ allow_buffer_reuse = True
57
+
58
+ # Enable pooled allocations for non-output tensors
59
+ memory_planning = os.environ.get("TORCHINDUCTOR_MEMORY_PLANNING", "0") == "1"
60
+
61
+ # How to organize memory under memory_planning=True:
62
+ # - "none": do not try to pool storage, just reuse
63
+ # - "intermediates": all non-outputs share storage, outputs each get unique storage
64
+ # - "outputs": two pools, one for intermediates (freed on return) and one for outputs
65
+ # - "combined": a single pool for both intermediates and outputs
66
+ memory_pool = os.environ.get("TORCHINDUCTOR_MEMORY_POOL", "intermediates")
67
+
68
+ # codegen benchmark harness
69
+ benchmark_harness = True
70
+
71
+ # fuse pointwise into templates
72
+ epilogue_fusion = True
73
+
74
+ # do epilogue fusions before other fusions
75
+ epilogue_fusion_first = False
76
+
77
+ # enable pattern match+replace optimizations
78
+ pattern_matcher = True
79
+
80
+ # register custom graph optimization pass hook. so far, pre/post passes are
81
+ # only applied before/after pattern_matcher in post_grad_passes.
82
+ #
83
+ # def my_custom_pre_pass(graph: torch.fx.graph.Graph):
84
+ # # my custom graph optimization pass
85
+ # ...
86
+ #
87
+ # def my_custom_post_pass(graph: torch.fx.graph.Graph):
88
+ # # my custom graph optimization pass
89
+ # ...
90
+ #
91
+ # torch._inductor.config.post_grad_custom_pre_pass = my_custom_pre_pass
92
+ # torch._inductor.config.post_grad_custom_post_pass = my_custom_post_pass
93
+ post_grad_custom_pre_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None
94
+ post_grad_custom_post_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None
95
+
96
+ # Registers a custom pregrad pass. Note that the pre-grad IR is 1.
97
+ # non-functional, 2. non-normalized, and 3. prone to change. Ideally we should
98
+ # use post-grad passes.
99
+ pre_grad_custom_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None
100
+
101
+ # Optimize away split cat patterns (Experimental)
102
+ split_cat_fx_passes = True
103
+
104
+ # Optimize conv-batchnorm if batchnorm is in eval mode. Slightly reduces numerical stability.
105
+ efficient_conv_bn_eval_fx_passes = False
106
+
107
+ # Enable predispatch aten IR for export
108
+ is_predispatch = False
109
+
110
+ # Deprecated
111
+ group_fusion = False
112
+
113
+ # Deprecated
114
+ batch_fusion = True
115
+
116
+ # Pre grad group/batch fusion and options in order, set to empty dict to disable fusion.
117
+ # Call `torch._inductor.fx_passes.group_batch_fusion.list_group_batch_fusions()` to see available fusions.
118
+ pre_grad_fusion_options: Dict[str, Dict[str, Any]] = {
119
+ "batch_linear": {},
120
+ "batch_linear_lhs": {},
121
+ "batch_layernorm": {},
122
+ "batch_tanh": {},
123
+ "batch_relu": {},
124
+ "batch_sigmoid": {},
125
+ }
126
+
127
+ # Post grad group/batch fusion and options, set to empty dict to disable fusion.
128
+ # Call `torch._inductor.fx_passes.group_batch_fusion.list_group_batch_fusions(False)` to see available fusions.
129
+ post_grad_fusion_options: Dict[str, Dict[str, Any]] = {}
130
+
131
+ # enable reordering pass for improving memory locality
132
+ reorder_for_locality = True
133
+
134
+ # Scale down RBLOCK for better occupancy
135
+ dynamic_scale_rblock = os.environ.get("TORCHINDUCTOR_DYNAMIC_SCALE_RBLOCK", "1") == "1"
136
+
137
+ # this forces fusion for int_mm with mul. Needed when you want to avoid realizing the int32
138
+ # but the mul gets fused with other pointwise ops instead.
139
+ force_fuse_int_mm_with_mul = False
140
+
141
# for pattern torch.mm(a, b.to(dtype)) with cuda tensors,
# enable torch._inductor.kernel.mm.tuned_mixed_mm fused kernel.
# Autotune will compare perf with normal cast->then->mm option
use_mixed_mm = False

# enable runtime numeric check for pre/post grad fx passes
# floating point provides limited accuracy (about 7 decimal digits for single precision
# floating point numbers, about 16 decimal digits for double precision floating point numbers)
# according to PyTorch documentation.
# https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations
fx_passes_numeric_check: Dict[str, Any] = {
    "pre_grad": False,
    "precision": 1e-4,
    "num_iterations": 1,
    "requires_optimizer": True,
}

# for pattern torch.mm(a, b.to(dtype)) with cuda tensors, always use
# torch._inductor.kernel.mm.tuned_mixed_mm's fused kernel.
# Autotune will not compare with normal cast->then->mm option.
# (if force_mixed_mm is true, the use_mixed_mm flag will be ignored)
force_mixed_mm = False

# enable reordering pass for increasing overlap between compute and communication
reorder_for_compute_comm_overlap = False

# passes (in execution order) for increasing overlap between compute and communication
# for built-in passes, use string name; for user-defined passes, pass in the function handle
reorder_for_compute_comm_overlap_passes = [
    "reorder_compute_for_overlap",
    "sink_waits",
    "raise_comms",
]

# runtime estimation function for ops
# for built-in estimation function, pass in "default"; for user-defined estimation function, pass in the function handle
estimate_op_runtime = "default"

# unit: GB/s, uni-directional P2P bandwidth per card
# default value is NVLink
intra_node_bw = 300

# unit: GB/s, uni-directional P2P bandwidth per node
# default value is InfiniBand
inter_node_bw = 25

# enable slow autotuning passes to select algorithms
max_autotune = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE") == "1"

# enable slow autotuning passes to select pointwise/reductions algorithms
max_autotune_pointwise = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE") == "1"

# enable slow autotuning passes to select gemm algorithms
max_autotune_gemm = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_GEMM") == "1"

# enable autotune local cache
use_autotune_local_cache = True

# enable autotune remote cache
use_autotune_remote_cache = (
    os.environ.get("TORCH_INDUCTOR_AUTOTUNE_REMOTE_CACHE") == "1"
)

# force cublas and triton to use the same precision; cublas supports TF32 for matmul operations
# when m, n, k are multiples of 16, 16, 8, whereas triton supports TF32 for matmul operations
# for any combinations of m, n, k, regardless of their alignment. setting this flag will ensure
# that triton does not use TF32 wherever cublas would not use TF32
force_same_precision = (
    True if is_fbcode() else os.environ.get("TORCHINDUCTOR_FORCE_SAME_PRECISION") == "1"
)
# Specify candidate backends for gemm autotune.
# Possible choices are combinations of: ATen, Triton, CUTLASS.
# ATen: default Pytorch ATen kernels.
# Triton: Triton templates defined in torch inductor.
# CUTLASS: Cutlass templates and kernels.
max_autotune_gemm_backends = os.environ.get(
    "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS", "ATEN,TRITON"
).upper()

# the value used as a fallback for the unbacked SymInts
# that can appear in the input shapes (e.g., in autotuning)
unbacked_symint_fallback = 8192

# enable searching global and local cache regardless of `max_autotune`
search_autotune_cache = os.environ.get("TORCHINDUCTOR_SEARCH_AUTOTUNE_CACHE") == "1"

save_args = os.environ.get("TORCHINDUCTOR_SAVE_ARGS") == "1"

# We will disable creating subprocess for autotuning if this is False
autotune_in_subproc = os.environ.get("TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC") == "1"

# If autotuning in subprocess, whether to use multiple devices
autotune_multi_device = os.environ.get("TORCHINDUCTOR_AUTOTUNE_MULTI_DEVICE") == "1"

# enable coordinate-descent style tuning of triton kernel launch configs
coordinate_descent_tuning = (
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_TUNING") == "1"
)
coordinate_descent_check_all_directions = (
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_CHECK_ALL_DIRECTIONS") == "1"
)
coordinate_descent_search_radius = int(
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_RADIUS", "1")
)

# Disabled by default on ROCm, opt-in if model utilises NHWC convolutions
layout_opt_default = "1" if not torch.version.hip else "0"
layout_optimization = (
    os.environ.get("TORCHINDUCTOR_LAYOUT_OPTIMIZATION", layout_opt_default) == "1"
)

force_layout_optimization = os.environ.get("TORCHINDUCTOR_FORCE_LAYOUT_OPT", "0") == "1"


# Whether to keep the output strides the same as eager after layout optimization.
keep_output_stride = os.environ.get("TORCHINDUCTOR_KEEP_OUTPUT_STRIDE", "1") == "1"

# Enabling this will let compiler print warning messages if a generated triton
# kernel has inputs with mixed layouts. This is helpful for perf debugging
# since kernel with mixed layout inputs may run much slower than one whose inputs
# have uniform layouts.
warn_mix_layout = os.environ.get("TORCHINDUCTOR_WARN_MIX_LAYOUT") == "1"

# control store vs recompute heuristic
# For fanouts, rematerialization can lead to exponential blowup. So, have
# smaller threshold
realize_reads_threshold = 4
realize_opcount_threshold = 30

# Threshold to prevent excessive accumulation of ops in one buffer during lowering
realize_acc_reads_threshold = 8

# fallback to eager for random/dropout, this is slow but useful for debugging
fallback_random = False

# automatically create fallbacks when encountering an unhandled op
implicit_fallbacks = True

# fuse even in cases without common reads
aggressive_fusion = False

# For each fused kernel in the wrapper, comment with the nodes that get fused.
# Useful for debugging fusion.
debug_fusion = os.environ.get("TORCHINDUCTOR_DEBUG_FUSION") == "1"
benchmark_fusion = os.environ.get("TORCHINDUCTOR_BENCHMARK_FUSION") == "1"
enabled_metric_tables = os.environ.get("TORCHINDUCTOR_ENABLED_METRIC_TABLES", "")

# how many nodes to allow into a single fusion
max_fusion_size = 64

# max number of inputs to generate cat as a pointwise op with masked loads
max_pointwise_cat_inputs = 8

# replace small reductions with pointwise, disable with `= 1`
unroll_reductions_threshold = 8

# Add extra comments to output code (causes compile cache misses)
comment_origin = False

# Convert 1x1 convs into matmuls
conv_1x1_as_mm = False

# Enable split reductions for better utilization when the dimension
# being reduced over is large (by splitting it)
split_reductions = True

benchmark_kernel = os.environ.get("TORCHINDUCTOR_BENCHMARK_KERNEL", "0") == "1"

# Enable constant and index_expr folding
constant_and_index_propagation = True

# we always add constants into graph.constants without
# performing any constant-inlining optimization
always_keep_tensor_constants = False

# assert that indirect indexing does not read / write out of bounds
assert_indirect_indexing = True

# constant folding on the joint graph
joint_graph_constant_folding = True

# Enable indirect_indexing asserts for decompositions and lowerings
debug_index_asserts = False

# warnings intended for PyTorch developers, disable for point releases
is_nightly_or_source = "dev" in torch.__version__ or "git" in torch.__version__
developer_warnings = is_fbcode() or is_nightly_or_source

# The multiprocessing start method to use for inductor workers in the codecache.
# TODO: fork is not safe in a multithreaded environment, we should evaluate changing
# the default to spawn.
worker_start_method = "fork"
332
+
333
+
334
def decide_compile_threads():
    """
    Here are the precedence to decide compile_threads
    1. User can override it by TORCHINDUCTOR_COMPILE_THREADS. One may want to disable async compiling by
       setting this to 1 to make pdb happy.
    2. Set to 1 if it's win32 platform or it's a fbcode build
    3. decide by the number of CPU cores
    """
    env_override = os.environ.get("TORCHINDUCTOR_COMPILE_THREADS")
    if env_override is not None:
        return int(env_override)

    # Async compile workers are disabled on Windows and in fbcode builds.
    if sys.platform == "win32" or is_fbcode():
        return 1

    # Prefer the affinity mask (cores actually usable by this process) when
    # the platform exposes it; fall back to the total logical core count.
    try:
        usable_cores = len(os.sched_getaffinity(0))
    except AttributeError:
        usable_cores = os.cpu_count()
    assert usable_cores
    return min(32, usable_cores)
354
+
355
+
356
# number of parallel workers used for async compilation (see decide_compile_threads)
compile_threads = decide_compile_threads()

# gemm autotuning global cache dir
if is_fbcode():
    from libfb.py import parutil

    try:
        if __package__:
            global_cache_dir = parutil.get_dir_path(
                os.path.join(__package__.replace(".", os.sep), "fb/cache")
            )
        else:
            global_cache_dir = parutil.get_dir_path("fb/cache")
    except ValueError:
        global_cache_dir = None
else:
    global_cache_dir = None

# If kernel is fused, the name is generated from the origin node op names
# for larger kernels limit this
kernel_name_max_ops = 10

# Pad input tensors of matmul/bmm/addmm to leverage Tensor Cores in NVIDIA GPUs
shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "1") == "1"

# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"

# Mark the wrapper call in PyTorch profiler
profiler_mark_wrapper_call = False

# Generate hook calls to torch._inductor.hooks.run_intermediate_hooks for
# every intermediate for which we can correlate it with an intermediate
# from the original FX graph
generate_intermediate_hooks = False

# Populate traceback field on IRNode; good for debugging why origin_node is
# not populated, or finding out where an IRNode was constructed
debug_ir_traceback = False

# used for debugging to make sure config is properly set
_raise_error_for_testing = False

# TORCHINDUCTOR_PROFILE turns on bandwidth profiling; any value other than
# "1" is additionally treated as a regex filtering which kernels are profiled.
_profile_var = os.environ.get("TORCHINDUCTOR_PROFILE", "")
profile_bandwidth = _profile_var != ""
profile_bandwidth_regex = "" if _profile_var == "1" else _profile_var
# Specify a file where we print out the profiling results.
# None means we do not dump results to a file.
profile_bandwidth_output = os.environ.get("TORCHINDUCTOR_PROFILE_OUTPUT", None)

# TODO: remove later
disable_cpp_codegen = False


# Freezing will attempt to inline weights as constants in optimization
# and run constant folding and other optimizations on them. After freezing, weights
# can no longer be updated.
freezing: bool = os.environ.get("TORCHINDUCTOR_FREEZING", "0") == "1"

# Make freezing invalidate the eager Parameters of nn modules, to avoid memory overhead
# of potentially keeping multiple copies of weights.
freezing_discard_parameters: bool = False

# Kill switch for allowing temporary tensors to be allocated as stack arrays. Tests
# should be run with this flag both on and off to make sure we have coverage.
allow_stack_allocation: bool = (
    os.environ.get("TORCHINDUCTOR_STACK_ALLOCATION", "1") == "1"
)

# Enables an alternate DSO interface (the "minimal ArrayRef interface") intended
# to maximize performance for use cases that it can accommodate at the expense of
# generality. In brief:
# - inputs and outputs are ArrayRefTensor<T> (note that strides are required, but the
#   tensor must be contiguous)
# - constant handling is unchanged because it is not a per-inference-iteration bottleneck
#
# When the DSO is generated in this mode, the usual interface will also be supported,
# but performance for that interface may be degraded.
use_minimal_arrayref_interface: bool = False

# decompose some memory bound matmul/bmm to mul
decompose_mem_bound_mm: bool = False
438
+
439
+
440
# config specific to codegen/cpp.py
class cpp:
    # number of threads for C++ kernels; -1 means use torch.get_num_threads()
    threads = -1

    # Do not generate loops when the condition doesn't hold, like:
    # for(long i0=4096; i0<4096; i0+=1)
    no_redundant_loops = True

    # Assume number of threads is dynamic, don't specialize thread number.
    # Kernels don't recompile on thread number changes with this flag on.
    # For single-threaded workload, turning it on would incur a slight
    # performance degradation.
    dynamic_threads = False

    # SIMD vector width to target; None means auto-detect
    simdlen: Optional[int] = None
    # minimum number of elements in a loop before it is parallelized
    min_chunk_size = 4096
    # candidate C++ compilers, tried in order
    cxx = (
        None,  # download gcc12 from conda-forge if conda is installed
        # "g++-12",
        # "g++-11",
        # "g++-10",
        # "clang++",
        os.environ.get("CXX", "clang++" if sys.platform == "darwin" else "g++"),
        # "g++.par",
    )
    # Allow kernel performance profiling via PyTorch profiler
    enable_kernel_profile = False

    # enable weight prepacking to get a better performance; may lead to large memory footprint
    weight_prepack = True

    # Inject a bug into our relu implementation; useful for testing our repro
    # extraction and minification functionality.
    # Valid values: "compile_error", "runtime_error", "accuracy"
    inject_relu_bug_TESTING_ONLY: Optional[str] = None
    inject_log1p_bug_TESTING_ONLY: Optional[str] = None

    # If None, autodetect whether or not AVX512/AVX2 can be used.  Otherwise,
    # force usage as specified, without testing.
    vec_isa_ok: Optional[bool] = None

    # similar to config.triton.descriptive_names
    descriptive_names = "original_aten"

    # how many nodes to allow into a single horizontal fusion
    max_horizontal_fusion_size = 16

    # Make scatter_reduce fallback when reduce is sum to avoid performance regression
    # using atomic_add.
    fallback_scatter_reduce_sum = True

    # Use -funsafe-math-optimizations when compiling
    enable_unsafe_math_opt_flag = False

    # Use -ffp-contract when compiling
    enable_floating_point_contract_flag = False
497
+
498
+
499
# config specific to codegen/triton.py
class triton:
    # Use cudagraphs on output code
    cudagraphs = False

    # Use cudagraph trees for memory pooling if `cudagraphs` is True
    cudagraph_trees = True

    # assertions not on the fast path, steady state
    slow_path_cudagraph_asserts = True

    # TODO - need to debug why this prevents cleanup
    cudagraph_trees_history_recording = False

    # assertions on the fast path
    fast_path_cudagraph_asserts = False

    # skip warmup for cudagraph trees
    skip_cudagraph_warmup = False

    # Synchronize before and after every compiled graph.
    debug_sync_graph = False

    # Synchronize after every kernel launch, to help pinpoint bugs
    debug_sync_kernel = False

    # Always load full blocks (rather than broadcasting inside the block)
    dense_indexing = False

    # limit tiling dimensions
    max_tiles = 2

    # use triton.autotune for pointwise ops with complex layouts
    # this should only be disabled for debugging/testing
    autotune_pointwise = True

    # max autotune gemm with cublasLt
    autotune_cublasLt = True

    # should we stop a fusion to allow better tiling?
    tiling_prevents_pointwise_fusion = True
    tiling_prevents_reduction_fusion = True

    # should we give different names to kernels
    # Note: This is orthogonal to descriptive_names - this is deciding whether
    # our triton kernel names should all be `triton_` (to maximize caching) or
    # whether they should be unique.
    unique_kernel_names = os.environ.get("TORCHINDUCTOR_UNIQUE_KERNEL_NAMES") == "1"

    # should we put op names in kernel names
    # False: No special names (just triton__1, triton__2, etc.)
    # "torch": Maps to the fx op in the Dynamo graph (module name, method name, etc.)
    # "original_aten": Maps to the highest-level aten op (i.e. pre-decompositions)
    # "inductor_node": Maps to the node name in the FX graph passed to Inductor
    descriptive_names = "original_aten"

    # use alternate codegen for smaller reductions
    persistent_reductions = (
        os.environ.get("TORCHINDUCTOR_PERSISTENT_REDUCTIONS", "1") == "1"
    )

    # 0/False: disable
    # 1/True: enable, use tuning to pick between different subkernels
    # 2: enable, force using persistent reduction (for debugging)
    # 3: enable, force using non-persistent reduction (for debugging)
    multi_kernel = int(os.environ.get("TORCHINDUCTOR_MULTI_KERNEL", "0"))

    # hint to Triton when arguments are divisible by 16
    divisible_by_16 = True

    # these are not enforced, but they are used by asserts in triton_heuristics.py
    # NOTE: mobilevit_s in timm_models required X to be set to the higher value 2048

    # Max RBLOCK will be large for multi-kernel since we do more aggressive
    # persistent reduction.
    max_block = {
        "X": 2048,
        "Y": 1024,
        "Z": 1024,
        "R": 4096 * (16 if multi_kernel else 1),
    }

    # Minimum RBLOCK to be used for a TritonSplitScanKernel
    # NOTE: This also indirectly controls the size of workspace buffer required
    min_split_scan_rblock = 256

    # Store the generated cubin files for cpp wrapper code to load
    store_cubin = False

    # the max number of spills we allow for the configs we benchmark.
    # Setting this to 0 means we skip a config if it spills even a single
    # register.
    # Setting it to a larger value allows a config spilling a small amount
    # of registers being benchmarked.
    #
    # NOTE: triton will always report >0 register spills for kernels using sin/cos.
    # (check this issue https://github.com/openai/triton/issues/1756 )
    # So far we see a fixed 8 spilled registers for kernels using sin/cos.
    # Raise the threshold to 16 to be safe.
    # We should revisit this once we understand more of the source of register spills.
    spill_threshold: int = 16

    # Generate code containing the newer tl.make_block_ptr() API for loads/store
    use_block_ptr = False

    # Inject a bug into our relu implementation; useful for testing our repro
    # extraction and minification functionality.
    # Valid values: "compile_error", "runtime_error", "accuracy"
    inject_relu_bug_TESTING_ONLY: Optional[str] = None
608
+
609
+
610
# config specific to AOTInductor (ahead-of-time compiled artifacts)
class aot_inductor:
    # AOTInductor output path
    # If an absolute path is specified, the generated lib files will be stored under the directory;
    # If a relative path is specified, it will be used as a subdirectory under the default caching path;
    # If not specified, a temp directory will be created under the default caching path.
    # If the specified path contains something like "model.so", the sub-string will be used
    # to name the generated library.
    output_path = ""

    # emit debug-friendly compilation output when AOT_INDUCTOR_DEBUG_COMPILE=1
    debug_compile = os.environ.get("AOT_INDUCTOR_DEBUG_COMPILE", "0") == "1"

    # Serialized tree spec for flattening inputs
    serialized_in_spec = ""

    # Serialized tree spec for flattening outputs
    serialized_out_spec = ""

    # flag to decide whether to create a submodule for constant graph.
    use_runtime_constant_folding: bool = False
629
+
630
+
631
# config specific to CUDA (CUTLASS) template kernels
class cuda:
    # CUDA arch to use for CUDA template kernel compilation.
    # e.g. "70", "75", "80", "90", etc.
    # When arch is None, Inductor uses torch.cuda.get_device_capability(0).
    arch: Optional[str] = None

    # CUDA version to use for CUDA template kernel compilation.
    # e.g. "11.4", "12.1", etc.
    # When version is None, Inductor uses torch.version.cuda.
    version: Optional[str] = None

    # Optimization level for the host compiler.
    compile_opt_level = "-O1"

    # Whether to enable device LTO (link-time-optimization).
    enable_cuda_lto = False

    # Whether to keep intermediate files during compilation.
    # NOTE(review): the name suggests this toggles ptxas info output —
    # confirm which behavior the build actually keys off this flag.
    enable_ptxas_info = False

    # Whether to enable debug info, e.g. line number, cutlass debug info.
    enable_debug_info = False

    # Whether to use fast math.
    use_fast_math = False

    # Path to the CUTLASS repo root directory.
    # The default path only works under PyTorch local development environment.
    cutlass_dir = os.environ.get(
        "TORCHINDUCTOR_CUTLASS_DIR",
        os.path.abspath(
            os.path.join(os.path.dirname(torch.__file__), "../third_party/cutlass/")
        ),
    )

    # Configures the maximum number of CUTLASS configs to profile in max_autotune.
    # By default it's None, so that all CUTLASS configs are tuned.
    # This is mainly used to reduce test time in CI.
    cutlass_max_profiling_configs: Optional[int] = None

    # Path to CUDA NVCC.
    # NVCC search order:
    # 1) cuda_cxx set in this config
    # 2) CUDACXX environment variable
    # 3) CUDA_HOME environment variable
    # 4) default system search PATH.
    cuda_cxx: Optional[str] = None

    # If set to True, it will ensure that only GEMM ops capable of
    # epilogue fusion via CUTLASS Epilogue Visitor Trees ( EVT )
    # are enabled for the CUTLASS backend.
    cutlass_only_evt_capable_ops: bool = False
683
+
684
+
685
# create a directory containing lots of debug information
class trace:
    # master switch for all debugging flags below
    enabled = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1"

    # Save debug information to a temporary directory
    # If not specified, a temp directory will be created by system
    debug_dir: Optional[str] = None

    # Save python logger call >=logging.DEBUG
    debug_log = False

    # Save python logger call >=logging.INFO
    info_log = False

    # Save input FX graph (post decomps, pre optimization)
    fx_graph = True

    # Save FX graph after transformations
    fx_graph_transformed = True

    # Save TorchInductor IR before fusion pass
    ir_pre_fusion = True

    # Save TorchInductor IR after fusion pass
    ir_post_fusion = True

    # Copy generated code to trace dir
    output_code = True

    # SVG figure showing post-fusion graph
    graph_diagram = os.environ.get("INDUCTOR_POST_FUSION_SVG", "0") == "1"

    # SVG figure showing fx with fusion
    draw_orig_fx_graph = os.environ.get("INDUCTOR_ORIG_FX_SVG", "0") == "1"

    # We draw our fx graphs with the "record" shape attribute by default.
    # Sometimes, when the graph is very complex, we may hit dot errors like below:
    #   "flat edge between adjacent nodes one of which has a record shape -
    #    replace records with HTML-like labels"
    # and thus fail to generate a graph. So, let's give the user an option
    # to specify the shape attribute for the dot graph. For example, passing
    # INDUCTOR_DOT_GRAPH_SHAPE_SVG = "none" would let us generate HTML-like labels
    # to workaround the above failure.
    dot_graph_shape = os.environ.get("INDUCTOR_DOT_GRAPH_SHAPE_SVG", None)

    # Store cProfile (see snakeviz to view)
    compile_profile = False

    # Upload the .tar.gz file
    # Needs to be overridden based on specific environment needs
    upload_tar: Optional[Callable[[str], None]] = None

    # dump autotuning results alongside the other trace artifacts
    log_autotuning_results: bool = False
739
+
740
+
741
# Config entries that are skipped when pickling/saving the config state.
_save_config_ignore = {
    # workaround: "Can't pickle <function ...>"
    "trace.upload_tar",
}

if TYPE_CHECKING:
    from torch.utils._config_typing import *  # noqa: F401, F403

from torch.utils._config_module import install_config_module

# adds patch, save_config, etc
install_config_module(sys.modules[__name__])
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/constant_folding.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ from typing import Any, Callable, Dict, Optional
3
+
4
+ import torch
5
+ import torch.utils._pytree as pytree
6
+
7
# shorthand for the aten op namespace
aten = torch.ops.aten

# We would like to split modules into two subgraphs for runtime weight updates to work correctly.
# The use case and more information could be found at:
# https://docs.google.com/document/d/1inZC-8KarJ6gKB7G9egmYLx1V_dKX_apxon0w4zPC0Q/edit?usp=sharing
META_TAG = "MODULE_TYPE"  # node.meta key carrying the subgraph tag
MODULE_TAG = "_MAIN_MODULE"  # tag: node stays in the main module
CONST_MODULE_TAG = "_CONST_MODULE"  # tag: node belongs to the constant subgraph
15
+
16
+
17
def replace_node_with_constant(gm, node, constant, name=None):
    """Replace ``node`` in ``gm``'s graph with a get_attr referencing ``constant``.

    If ``name`` is not given, a fresh ``_frozen_param{i}`` attribute name is
    generated (tracked via ``gm._frozen_param_count`` so successive calls do
    not rescan from zero).
    """
    graph = gm.graph

    if name:
        qualname = name
    else:
        if not hasattr(gm, "_frozen_param_count"):
            gm._frozen_param_count = 0
        idx = gm._frozen_param_count
        # advance past any attribute names that are already taken
        while hasattr(gm, f"_frozen_param{idx}"):
            idx += 1
        qualname = f"_frozen_param{idx}"
        gm._frozen_param_count = idx + 1

    with graph.inserting_before(node):
        replacement = graph.create_node("get_attr", qualname, (), {})
        node.replace_all_uses_with(replacement)
        # preserve metadata (e.g. tensor meta, stack traces) on the new node
        replacement.meta.update(node.meta)
        graph.erase_node(node)

    # register_buffer suppresses the fx `does not reference an nn.Module,
    # nn.Parameter, or buffer` warning; setattr keeps plain attribute access.
    gm.register_buffer(qualname, constant)
    setattr(gm, qualname, constant)
44
+
45
+
46
class ConstantFolder(torch.fx.Interpreter):
    """Interpreter that runs the graph with placeholders marked unknown,
    recording every node whose output is a tensor computable purely from
    constants in ``self.node_replacements`` (candidates for folding).
    """

    def __init__(
        self,
        gm,
        skip_constructors=False,
    ):
        super().__init__(gm)
        # node -> the constant tensor that could replace it
        self.node_replacements: Dict[torch.fx.Node, Any] = {}
        # node -> number of its uses consumed by folded users so far
        self.replaced_uses: Dict[torch.fx.Node, int] = collections.Counter()
        # sentinel for values that are not statically known
        self.unknown_value = object()
        # if True, do not fold constructor-like ops (no tensor inputs)
        self.skip_constructors: bool = skip_constructors

        # overwrite this to deallocate env values if their only remaining use
        # is the output
        self.user_to_last_uses = self.node_to_last_non_output_use()

    def is_impure(self, node: torch.fx.node.Node):
        # Nodes reported impure are evaluated but never folded.
        if node.target in [
            torch.ops.quantized_decomposed.dequantize_per_channel.default,
            torch.ops.quantized_decomposed.dequantize_per_tensor.default,
            torch.ops.quantized_decomposed.dequantize_per_tensor.tensor,
        ]:
            # For the pattern fp32_weight -> q -> dq
            # We only folding fp32_weight -> q
            # int8_weight and leave dq in graph to be fused
            return True
        return False

    def node_to_last_non_output_use(self):
        """Map each node to the inputs for which it is the last non-output use
        (walking the graph in reverse), so env entries can be freed eagerly."""
        last_non_output_use = collections.defaultdict(list)
        seen_uses = set()
        output_node = next(iter(reversed(self.module.graph.nodes)))

        for node in reversed(self.module.graph.nodes):
            if node.target == "output":
                continue

            def add_use(inp):
                if inp in seen_uses:
                    return

                seen_uses.add(inp)
                last_non_output_use[node].append(inp)

            pytree.tree_map_only(torch.fx.Node, add_use, (node.args, node.kwargs))

            # if this node is only used in output, we want to gc it right away
            if len(node.users) == 1 and output_node in node.users:
                last_non_output_use[node].append(node)

        return last_non_output_use

    def run_node(self, node):
        """Evaluate one node; returns its value or ``self.unknown_value`` when
        the result cannot be (or should not be) constant-folded."""
        if node.target == "output":
            # because we remove nodes from env on last non output use,
            # re-define them now or we'll get error in interpreter
            def set_env(arg):
                self.env[arg] = self.unknown_value

            pytree.tree_map_only(torch.fx.Node, set_env, node.args)
            return super().run_node(node)

        args, kwargs = self.fetch_args_kwargs_from_env(node)
        flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs)

        # unknown-ness propagates: any unknown input makes the output unknown
        if self.unknown_value in flattened_inputs:
            return self.unknown_value

        # TODO - fix errors with this
        if (
            node.op == "call_function"
            and node.target == aten._efficientzerotensor.default
        ):
            return self.unknown_value

        # TODO - constant folding triton kernel returns the inputs -- fix this
        # NOTE(review): this compares node.name (which fx may suffix with _1,
        # _2, ...) rather than node.target — confirm this matches all wrappers.
        if (
            node.op == "call_function"
            and node.name == "triton_kernel_wrapper_functional_proxy"
        ):
            return self.unknown_value

        # skip constructors, since inductor generates optimal code for them already
        # and turning into tensor would result in an additional global memory read
        # TODO - more complicated strategy
        if (
            self.skip_constructors
            and node.op != "get_attr"
            and not any(isinstance(e, torch.Tensor) for e in flattened_inputs)
        ):
            return self.unknown_value

        # All mutations should either be removed or on inputs which we did not make constant
        if (
            isinstance(node.target, torch._ops.OpOverload)
            and torch.Tag.nondeterministic_seeded in node.target.tags
        ):
            return self.unknown_value

        out = super().run_node(node)

        if node.op != "get_attr" and isinstance(out, torch.Tensor):
            if not self.insertable_tensor_check(out):
                return out

            if self.is_impure(node):
                return self.unknown_value

            self.add_node_replacement(node, out)

            flattened_node_inps = pytree.arg_tree_leaves(*node.args, **node.kwargs)

            # count the uses of each input consumed by this foldable node
            for n in flattened_node_inps:
                if not isinstance(n, torch.fx.Node):
                    continue

                self.replaced_uses[n] += 1

            # if every use of an input has been folded away, it does not need
            # its own replacement constant — drop it to save memory
            for to_delete in self.user_to_last_uses.get(node, []):
                if self.replaced_uses[to_delete] == len(to_delete.users):
                    self.node_replacements.pop(to_delete, None)

        return out

    def insertable_tensor_check(self, tensor: torch.Tensor) -> bool:
        # Hook for subclasses to veto folding of particular tensors.
        return True

    def add_node_replacement(self, node: torch.fx.Node, tensor: torch.Tensor) -> None:
        # Hook for subclasses to customize how replacements are recorded.
        self.node_replacements[node] = tensor

    def run(self):
        """Run the interpreter with all placeholders marked unknown."""
        env = {}
        for n in self.module.graph.nodes:
            if n.op == "placeholder":
                env[n] = self.unknown_value
        return super().run(initial_env=env)
182
+
183
+
184
@torch.utils._python_dispatch._disable_current_modes()
def constant_fold(gm, constraint_fn: Optional[Callable[[torch.fx.Node], bool]] = None):
    """Fold constant-computable nodes in ``gm`` into get_attr constants.

    ``constraint_fn``, if given, may veto folding of individual nodes by
    returning False. Unused get_attr entries are removed afterwards, and the
    module is re-linted and recompiled in place.
    """
    cf = ConstantFolder(gm, skip_constructors=True)
    cf.run()

    for node, constant in cf.node_replacements.items():
        if constraint_fn is not None and not constraint_fn(node):
            continue
        replace_node_with_constant(gm, node, constant)

    # drop attributes whose get_attr nodes are now unused
    erased_params = []
    for node in gm.graph.nodes:
        if node.op == "get_attr" and len(node.users) == 0:
            if hasattr(gm, node.target):
                delattr(gm, node.target)
            erased_params.append(node)

    for node in erased_params:
        gm.graph.erase_node(node)

    gm.graph.eliminate_dead_code()
    gm.graph.lint()
    gm.recompile()
208
+
209
@torch.utils._python_dispatch._disable_current_modes()
def constant_graph_tag(gm: torch.fx.GraphModule):
    """Tag every node's meta[META_TAG] as either CONST_MODULE_TAG (foldable
    or consumed by folding) or MODULE_TAG (stays in the main module)."""
    cf = ConstantFolder(gm, skip_constructors=True)
    cf.run()

    for node in gm.graph.nodes:
        if (
            node.op == "get_attr"
            or node in cf.node_replacements
            or node in cf.replaced_uses
        ):
            node.meta[META_TAG] = CONST_MODULE_TAG
        else:
            node.meta[META_TAG] = MODULE_TAG
223
+
224
+
225
def run_and_get_constant_graph(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
    """
    Construct a GraphModule which corresponds to the part which could be
    constant folded in provided gm.
    """

    constant_graph_tag(gm)
    # We rewrite the tags, if it's a constant being directly consumed, without
    # any folding opportunity, we keep it in main gm.
    for node in gm.graph.nodes:
        if node.op == "get_attr":
            used_to_fold = False
            for u in node.users:
                if u.meta[META_TAG] == CONST_MODULE_TAG:
                    used_to_fold = True
                    break
            if not used_to_fold:
                node.meta[META_TAG] = MODULE_TAG

    new_graph = torch.fx.Graph()

    # copy the constant-tagged subgraph into new_graph, remembering which
    # copied nodes feed back into the main module (those become outputs)
    node_remapping: Dict[torch.fx.Node, torch.fx.Node] = {}
    output_nodes = []
    for node in gm.graph.nodes:
        if node.meta[META_TAG] == MODULE_TAG:
            continue

        new_node = new_graph.node_copy(node, lambda x: node_remapping[x])
        node_remapping[node] = new_node

        # a copied node consumed by any main-module node is an output of
        # the constant subgraph
        for user in node.users:
            if user.meta[META_TAG] == MODULE_TAG:
                output_nodes.append(new_node)
                break

    new_graph.output(tuple(output_nodes))
    new_graph.lint()
    new_gm = torch.fx.GraphModule(gm, new_graph)

    return new_gm
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/graph.py ADDED
@@ -0,0 +1,1324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import logging
3
+ import operator
4
+ import os
5
+ import re
6
+ import sys
7
+ import time
8
+ from collections import defaultdict
9
+ from contextlib import contextmanager
10
+ from typing import Any, Callable, DefaultDict, Dict, List, Optional, Set, Tuple
11
+
12
+ import sympy
13
+
14
+ import torch
15
+ import torch._logging
16
+ import torch.fx
17
+ from torch._decomp import get_decompositions
18
+ from torch._dynamo.utils import defake, dynamo_timed
19
+ from torch._logging import LazyString, trace_structured
20
+ from torch._subclasses.fake_tensor import FakeTensor
21
+ from torch.fx.experimental._backward_state import BackwardState
22
+ from torch.fx.experimental.sym_node import magic_methods, method_to_operator
23
+ from torch.fx.experimental.symbolic_shapes import has_free_symbols, ShapeEnv, SymTypes
24
+ from torch.utils._mode_utils import no_dispatch
25
+
26
+ from . import config, ir
27
+ from .codegen.common import (
28
+ DeviceOpOverrides,
29
+ get_device_op_overrides,
30
+ get_scheduling_for_device,
31
+ get_wrapper_codegen_for_device,
32
+ register_backend_for_device,
33
+ )
34
+ from .codegen.cpp_wrapper_cpu import CppWrapperCpu
35
+ from .codegen.cpp_wrapper_cuda import CppWrapperCuda
36
+ from .codegen.wrapper import WrapperCodeGen
37
+ from .exc import (
38
+ CppWrapperCodeGenError,
39
+ LoweringException,
40
+ MissingOperatorWithDecomp,
41
+ MissingOperatorWithoutDecomp,
42
+ )
43
+ from .ir import (
44
+ Constant,
45
+ FixedLayout,
46
+ InputBuffer,
47
+ Pointwise,
48
+ Reduction,
49
+ StorageBox,
50
+ TensorBox,
51
+ )
52
+ from .lowering import (
53
+ constrain_to_fx_strides,
54
+ FALLBACK_ALLOW_LIST,
55
+ fallback_handler,
56
+ fallback_node_due_to_unsupported_type,
57
+ layout_constraints,
58
+ lowerings,
59
+ make_fallback,
60
+ needs_realized_inputs,
61
+ unsupported_output_tensor,
62
+ )
63
+ from .sizevars import SizeVarAllocator
64
+ from .utils import convert_shape_to_inductor, gather_origins, get_sympy_Expr_dtype
65
+ from .virtualized import V
66
+
67
# Module-level loggers: `log` for general debug output, plus two artifact
# loggers that torch._logging can enable independently (TORCH_LOGS=perf_hints /
# output_code).
log = logging.getLogger(__name__)
perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints")
output_code_log = torch._logging.getArtifactLogger(__name__, "output_code")


# Internal (fbcode) builds log generated module code; in OSS builds the hook
# is a no-op with the same call signature.
if config.is_fbcode():
    from torch._inductor.fb.utils import log_module_code
else:

    def log_module_code(*args, **kwargs):
        pass
78
+
79
+
80
+ def supported_dtype_of_cpp_wrapper(dtype, cuda):
81
+ supported_dtype = {
82
+ torch.float32,
83
+ torch.float64,
84
+ torch.int64,
85
+ torch.int32,
86
+ torch.int16,
87
+ torch.int8,
88
+ torch.uint8,
89
+ torch.bool,
90
+ torch.bfloat16,
91
+ torch.complex32,
92
+ torch.complex64,
93
+ torch.complex128,
94
+ torch.float16,
95
+ }
96
+ if cuda:
97
+ supported_dtype.add(torch.float8_e4m3fn)
98
+ supported_dtype.add(torch.float8_e5m2)
99
+ supported_dtype.add(torch.float8_e4m3fnuz)
100
+ supported_dtype.add(torch.float8_e5m2fnuz)
101
+
102
+ return dtype in supported_dtype
103
+
104
+
105
def may_get_constant_buffer_dtype(constant_buffer):
    """
    Best-effort dtype inference for a sympy-valued constant buffer.

    Returns torch.int64 for integers, delegates general expressions to
    get_sympy_Expr_dtype, and falls back to the symbol's own
    is_integer/is_float assumptions; returns None when undecidable.
    """
    assert isinstance(
        constant_buffer, (sympy.Symbol, sympy.Expr, sympy.core.numbers.Integer)
    ), "get_constant_buffer_dtype only supports input of sympy.Symbol, sympy.Expr or sympy.core.numbers.Integer"
    if isinstance(constant_buffer, sympy.core.numbers.Integer):
        return torch.int64

    # NOTE: sympy.Symbol is itself a sympy.Expr, so symbols also take this
    # branch; the assumption-based checks below are a fallback only.
    if isinstance(constant_buffer, sympy.Expr):
        return get_sympy_Expr_dtype(constant_buffer)

    if constant_buffer.is_integer:
        return torch.int64
    if constant_buffer.is_float:
        return torch.float32
    return None
121
+
122
+
123
+ def is_magic_method(op):
124
+ magic_ops = {method_to_operator(m) for m in magic_methods}
125
+ return op in magic_ops
126
+
127
+
128
def getattr_recursive(obj, target):
    """
    Resolve a dotted attribute path (e.g. ``"sub.weight"``) starting at
    ``obj``, raising RuntimeError naming the last resolvable prefix when
    any component is missing.
    """
    parts = target.split(".")
    current = obj
    for depth, part in enumerate(parts):
        if not hasattr(current, part):
            raise RuntimeError(
                f"Node referenced nonexistent target {'.'.join(parts[:depth])}"
            )
        current = getattr(current, part)
    return current
138
+
139
+
140
+ class GraphLowering(torch.fx.Interpreter):
141
+ graph_outputs: List[ir.IRNode]
142
+
143
    def symbolic_sizes_strides(self, ex: torch.Tensor):
        """
        Support dynamic shapes and dynamic strides by assigning variables
        to each dimension. We duck-shape tensors, so if two tensors
        have the same size they get assigned the same symbolic variable.

        Returns a (sizes, strides) pair of lists of sympy expressions
        (or plain ints where the dimension is static).
        """
        if self.reuse_shape_env:
            # Dynamo already allocated symbols for this tensor; just convert
            # SymInt sizes/strides into inductor's sympy representation.
            return convert_shape_to_inductor(ex.size()), convert_shape_to_inductor(
                ex.stride()
            )
        else:
            from torch._dynamo.source import ConstantSource

            # TODO: this should not be needed once #93059 lands
            # https://github.com/pytorch/pytorch/pull/94031#discussion_r1096044816
            # TODO: make a dedicated UnknownSource for this?
            # NB: This is using the legacy default behavior from
            # create_symbolic_sizes_strides_storage_offset but we hope we can
            # just delete this entirely
            source = ConstantSource(
                f"__inductor_unknown_tensor_{len(self._shape_env.var_to_val)}"
            )
            (
                size,
                stride,
                _,  # storage offset is discarded here
            ) = self._shape_env.create_symbolic_sizes_strides_storage_offset(
                ex,
                source,
            )

            # Unwrap SymInt -> raw sympy expr; static dims stay as ints.
            size = [i.node.expr if isinstance(i, torch.SymInt) else i for i in size]
            stride = [i.node.expr if isinstance(i, torch.SymInt) else i for i in stride]
            return size, stride
177
+
178
+ def static_sizes_strides(self, ex: torch.Tensor):
179
+ """
180
+ Primarily used to weights
181
+ """
182
+ size = [sympy.Integer(i) for i in ex.size()]
183
+ stride = [sympy.Integer(i) for i in ex.stride()]
184
+ return size, stride
185
+
186
    def init_backend_registration(self):
        # Lazily register the default codegen backends, only if no backend
        # has been registered for the device yet (a custom backend may have
        # been installed via register_backend_for_device already).
        if get_scheduling_for_device("cpu") is None:
            from .codegen.cpp import CppScheduling

            register_backend_for_device("cpu", CppScheduling, WrapperCodeGen)

        if get_scheduling_for_device("cuda") is None:
            from .codegen.cuda_combined_scheduling import CUDACombinedScheduling

            # CUDACombinedScheduling combines Triton and CUDA C++ scheduling for CUDA devices via delegation
            register_backend_for_device("cuda", CUDACombinedScheduling, WrapperCodeGen)
197
+
198
    def __init__(
        self,
        gm: torch.fx.GraphModule,
        example_inputs: Optional[List[torch.Tensor]] = None,
        shape_env=None,
        num_static_inputs=None,
        graph_id=None,
        cpp_wrapper=False,
        aot_mode=False,
        user_visible_outputs=frozenset(),
        layout_opt=None,
        extern_node_serializer=None,
        is_inference=False,
        is_const_graph=False,
        const_output_index=None,
        const_code=None,
        const_module=None,
        name=None,
    ):
        """
        Lowering state for one FX graph.

        :param gm: the FX graph to lower.
        :param shape_env: shared ShapeEnv from dynamo; a fresh one is created
            (and ``reuse_shape_env`` left False) when not provided.
        :param layout_opt: force layout optimization on/off; when None it is
            decided heuristically via ``decide_layout_opt``.
        :param const_module: a previously-lowered constant graph whose
            device/constant bookkeeping is shared (note: shared by reference).
        :param name: optional prefix used by ``qualify_name`` for all
            generated buffer/constant names.
        """
        super().__init__(gm)

        self.example_inputs = example_inputs
        self.layout_opt = (
            layout_opt
            if layout_opt is not None
            else self.decide_layout_opt(gm, is_inference=is_inference)
        )
        self.num_channels_last_conv = 0
        self.is_inference = is_inference
        self.is_const_graph = is_const_graph
        self.const_code = const_code
        self.const_module = const_module

        self.extra_traceback = False  # we do our own error wrapping
        if shape_env is None:
            shape_env = ShapeEnv()
            self.reuse_shape_env = False
        else:
            # NOTE(review): this assignment is redundant — `self._shape_env`
            # is unconditionally assigned again right below.
            self._shape_env = shape_env
            self.reuse_shape_env = True
        self._shape_env = shape_env
        self.sizevars = SizeVarAllocator(shape_env)
        self.graph_input_names: List[str] = []
        self.graph_inputs: Dict[str, TensorBox] = {}
        self.graph_inputs_original: Dict[str, InputBuffer] = {}
        # Device bookkeeping is shared (by reference) with const_module if any.
        self.device_types: Set[str] = (
            const_module.device_types if const_module else set()
        )
        self.device_idxs: Set[int] = const_module.device_idxs if const_module else set()
        self.cuda = False
        self.buffers: List[ir.Buffer] = []
        self.const_output_index: Dict[str, int] = (
            const_output_index if const_output_index else {}
        )
        self.folded_constants: Set[str] = (
            set(const_output_index.keys()) if const_output_index else set()
        )
        self.constants: Dict[str, torch.Tensor] = (
            const_module.constants if const_module else {}
        )
        self.constant_reprs: Dict[str, str] = {}
        self.removed_buffers: Set[str] = set()
        self.removed_inplace_buffers: Set[str] = set()
        self.mutated_buffers: Set[str] = set()
        self.never_reuse_buffers: Set[str] = set()
        self.inplaced_to_remove: Set[str] = set()
        self.device_ops: DeviceOpOverrides = None  # type: ignore[assignment]
        self.wrapper_code: WrapperCodeGen = None  # type: ignore[assignment]
        # See `ProxyExecutor Design Note` in ir.py for more details
        self.extern_kernel_nodes: List[ir.ExternKernelNode] = []
        self.extern_node_serializer: Optional[
            Callable[[List[ir.ExternKernelNode]], Any]
        ] = extern_node_serializer
        self.current_node: torch.fx.Node = None  # type: ignore[assignment]
        self.num_static_inputs = num_static_inputs
        self.lists: Dict[str, List[str]] = {}
        self.mutated_inputs: Set[str] = set()
        self.mutated_input_idxs: List[int] = []
        self.name_to_buffer: Dict[str, ir.Buffer] = {}
        self.name_to_users: DefaultDict[str, List[ir.IRNode]] = defaultdict(list)
        self.creation_time = time.time()
        self.name = name
        self.cpp_wrapper = cpp_wrapper

        # record multi_kernel choice for cpp_wrapper so the second pass knows
        # which sub-kernel is picked. Copy cpp_wrapper to another variable
        # since cpp_wrapper flag is set to false for the first pass of codegen.
        self.record_multi_kernel_choice = cpp_wrapper
        self.multi_kernel_to_choice: Dict[str, int] = {}

        self.aot_mode = aot_mode
        self.graph_id = graph_id
        self.scheduler: "torch._inductor.scheduler.Scheduler" = None  # type: ignore[assignment]
        self.nodes_prefer_channels_last = (
            self.find_nodes_prefer_channels_last() if self.layout_opt else set()
        )
        self._warned_fallback = {"aten.convolution_backward"}
        self.user_visible_outputs = user_visible_outputs
        self.cache_key: str = ""  # This is the cache key for the compiled artifact
        self.cache_path: str = ""  # This is the path in the filesystem where the compiled artifact is stored
        self.cache_linemap: List[
            Tuple[int, str]
        ] = (
            []
        )  # This is the linemap used by the profiler to mark custom compiled kernels getting run
        # Used if lowering encounters cases where cudagraphs are not supported
        self.disable_cudagraphs_reason: Optional[str] = None

        # only keeping one node per device for stack trace purposes
        self.device_node_mapping: Dict[torch.device, torch.fx.Node] = {}
        self.orig_gm: torch.fx.GraphModule = gm.__copy__()
        self.dynamo_flat_name_to_original_fqn = self.module.meta.get(
            "dynamo_flat_name_to_original_fqn", {}
        )
        self.allocated_constant_name = (
            const_module.allocated_constant_name if const_module is not None else {}
        )
        self.init_backend_registration()
316
+
317
    @staticmethod
    def decide_layout_opt(gm, *, is_inference) -> bool:
        """
        Decide if we should enable layout optimization for this graph based on
        heuristics.

        Returns True to lower convolutions with channels-last layout.  The
        decision is driven by config flags, the number/kind of convolutions
        in the graph, and (for inference) benchmark-derived FLOP weights.
        """
        if not config.layout_optimization:
            return False

        if config.force_layout_optimization:
            return True

        conv_nodes = [
            n for n in gm.graph.nodes if n.target == torch.ops.aten.convolution.default
        ]
        nconv = len(conv_nodes)

        if nconv == 0:
            return False

        # For cpu backend and mkldnn enabled, we always use channels_last for better performance.
        if (
            torch.backends.mkldnn.enabled
            and torch.backends.mkldnn.is_available()
            and all(
                n.args[idx].meta["val"].device == torch.device("cpu")
                for n in conv_nodes
                for idx in [0, 1]
            )
        ):
            return True

        # Following models are skipped due to this:
        # jx_nest_base
        # volo_d1_224
        if len(list(gm.graph.nodes)) >= 300 * nconv:
            log.debug("Skipped layout opt because only a few conv")
            return False

        # Dynamic-shaped conv input/weight: skip entirely.
        if any(
            has_free_symbols(n.args[idx].meta["val"])
            for n in conv_nodes
            for idx in [0, 1]
        ):
            log.debug(
                "See perf regression with dynamic shape. Follow up in https://github.com/pytorch/pytorch/issues/102670"
            )
            return False

        # n.args[-1] is presumably the `groups` argument of
        # aten.convolution; args[1] is the weight — TODO confirm against the
        # aten.convolution.default schema.
        def is_grouped(n):
            return n.args[-1] > 1 and n.args[1].meta["val"].size(1) > 1

        def is_in_out_channel(n):
            return (
                n.args[1].meta["val"].size(0) * 2 <= n.args[1].meta["val"].size(1)
                and n.args[1].meta["val"].size(2) > 1
            )

        def is_small_channel(n):
            return (
                n.args[1].meta["val"].size(0) <= 64
                and n.args[1].meta["val"].size(1) <= 64
            )

        # only grouped convolutions benchmarked as slower in conv samples for inference only
        if is_inference:
            from torch.utils.flop_counter import FlopCounterMode

            flop_counts: Dict[str, float] = defaultdict(float)
            for node in conv_nodes:
                success, args, kwargs = torch._inductor.fx_utils.get_fake_args_kwargs(
                    node
                )

                if success:
                    with FlopCounterMode(display=False) as flop_counter_mode:
                        with V.fake_mode:
                            node.target(*args, **kwargs)

                    counted_flops = flop_counter_mode.get_total_flops()
                    if is_grouped(node):
                        node_type = "grouped"
                    elif is_small_channel(node):
                        node_type = "small"
                    elif is_in_out_channel(node):
                        node_type = "in_out"
                    else:
                        node_type = "default"

                    flop_counts[node_type] += counted_flops
                else:
                    log.debug("Conv inputs meta not found")

            # average benchmarked channels last speedup / slowdown, < 1 is speedup.
            # taken from the set of convolution inputs in benchmarks/dynamo/microbenchmarks/operator_inp_logs/torchbench_train/
            # To regenerate these numbers follow https://gist.github.com/eellison/55d7a6ed6f39829d68ac56f95f4df5bb
            GROUPED_MULTIPLIER = 1.358
            DEFAULT_MULTIPLIER = 0.823
            IN_OUT_MULTIPLIER = 0.725
            SMALL_MULTIPLIER = 0.783

            total_flops = sum(flop_counts.values())
            # TODO - get different values per hardware
            weighted_flops = (
                flop_counts["grouped"] * GROUPED_MULTIPLIER
                + flop_counts["small"] * SMALL_MULTIPLIER
                + flop_counts["in_out"] * IN_OUT_MULTIPLIER
                + flop_counts["default"] * DEFAULT_MULTIPLIER
            )
            do_layout_opt = weighted_flops <= total_flops
            if not do_layout_opt:
                log.debug(
                    "Skipped layout opt in inference because weighted flops indicate slowdown, default: %d, channels last: %d",
                    total_flops,
                    weighted_flops,
                )
            return do_layout_opt

        # Channels last layout can dramatically hurt grouped conv perf. E.g.
        # Conv with arguments like
        # {"input_shape": [32, 224, 112, 112], "weight_shape": [224, 112, 3, 3],
        # "stride": [2, 2], "padding": [1, 1], "groups": 2}
        # slows down 31x using channels last..

        # But a lot of timm models use depthwise separable convolution which will
        # result in grouped convolution with in-channel size == 1.
        # For those grouped convolution, channels last still helps a lot.
        # E.g.
        # Conv with arguments
        # {"input_shape": [128, 58, 56, 56], "weight_shape": [58, 1, 3, 3],
        # "stride": [2, 2], "padding": [1, 1], "groups": 58}
        # get 1.86x speedup with channels last layout.
        #
        # The following heuristics skip using channels-last if the model contains
        # grouped convolution with in-channels > 1.
        if any(map(is_grouped, conv_nodes)):
            log.debug(
                "Skip layout opt because found grouped convolution with >1 in_channels!"
            )
            return False

        # For some models that contain convolution with larger in-channel than out-channel, applying
        # channels last hurts performance.
        # Following models are skipped due to this:
        # - pytorch_unet
        # - phlippe_densenet (slightly worse)
        # - Background_Matting (1.22x -> 0.821x)
        # - pytorch_CycleGAN_and_pix2pix (1.597x -> 1.294x)
        if any(map(is_in_out_channel, conv_nodes)):
            log.debug(
                "Skip layout opt because some convolutions have smaller out_channel"
            )
            return False

        # Following models are skipped due to this:
        # - functorch_maml_omniglot
        if all(map(is_small_channel, conv_nodes)):
            log.debug("Skip layout opt because all convolution channels are too small")
            return False

        return True
478
+
479
+ def qualify_name(self, name: str) -> str:
480
+ """Prepend the given name with the graph name if any."""
481
+ if self.name is not None:
482
+ return f"{self.name}_{name}"
483
+ return name
484
+
485
    def make_subgraph(
        self,
        gm: torch.fx.GraphModule,
        example_inputs: List[torch.Tensor],
        subgraph_name: str,
    ) -> "GraphLowering":
        """
        Make a subgraph of the current graph with all inherited
        parts, except the graph module (`gm`) and `example_inputs`.
        The subgraphs are lowered separately, but intended to be
        inlined in the parent graph's codegening. Hence the need
        for maintaining the same `shape_env` and other properties.
        The subgraph name is qualified by the parent graph's name.

        Note: only the settings listed below are inherited; anything else
        (e.g. const_module, user_visible_outputs) falls back to the
        GraphLowering defaults.
        """
        return GraphLowering(
            gm=gm,
            example_inputs=example_inputs,
            shape_env=self._shape_env,
            cpp_wrapper=self.cpp_wrapper,
            aot_mode=self.aot_mode,
            extern_node_serializer=self.extern_node_serializer,
            is_inference=self.is_inference,
            name=self.qualify_name(subgraph_name),
        )
509
+
510
    def find_nodes_prefer_channels_last(self):
        """
        The rule to decide if an node prefer channels last is simple.
        1. if it's input/output of a convolution
        2. if one of its user prefers channels last

        We have rule 1 because cudnn runs a faster convolution kernel for channels last inputs;
        Rule 2 is also important. It makes sure that indirect inputs to convolution also prefers
        channels last.

        Consider the scenario: conv -> batch-norm -> relu -> conv
        Without rule 2, batch-norm output may use a contiguous layout. That will cause 2 extra copies:
        1. the output of batch-norm should be channels last initially since its input is a conv's output.
           Forcing the batch-norm's output to be contiguous results in the first copy
        2. The second conv's input is initially contiguous. This layout is propagated from the batch-norm's output.
           We need convert it to channels last layout which results in the second copy.
        With rule 2, we makes sure all the tensors in the chain uses channels last layout. So both copies
        can be saved.

        Returns the set of fx nodes that prefer channels-last layout.
        """
        output_set = set()
        # Reverse topological order: a node's users are visited before the
        # node itself, so rule 2 propagates backwards in a single pass.
        for n in reversed(self.module.graph.nodes):
            if n.target == torch.ops.aten.convolution.default:
                output_set.add(n)
                continue

            for user in n.users:
                if user in output_set:
                    output_set.add(n)
                    break

        # need a second pass to add downstream nodes of those channel last nodes to the sets.
        # This pass is especially needed to avoid mix-layout kernel inputs in backward pass.
        #
        # Let's say a conv-batchnorm 's output is passed to relu whose output is in turn returned
        # from the fwd graph. Without this second pass, we will force relu's output to be contiguous.
        # Then in the kernel in backward pass, the contiguous output of relu may be mix with other channels last
        # tensors and passed to a kernel.
        #
        # This pass improve yolov3 training speedup from 1.116x (worse than disabling layout optimization speedup 1.196x) to 1.457x.
        # It also improves dla102 training speedup from 1.240x (worse than disabling layout optimization speedup 1.523x) to 1.835x .
        # This also helps the following models:
        # - res2net101_26w_4s
        # - res2net50_14w_8s
        # - sebotnet33ts_256
        for n in self.module.graph.nodes:
            if n in output_set:
                for child in n.users:
                    output_set.add(child)

        return output_set
560
+
561
+ def warn_fallback(self, name):
562
+ if name not in self._warned_fallback:
563
+ self._warned_fallback.add(name)
564
+ perf_hint_log.info("Using FallbackKernel: %s", name)
565
+
566
    def add_device_info(self, device: torch.device):
        # Track every device type/index touched by the graph.
        self.device_types.add(device.type)
        if device.index is not None:
            self.device_idxs.add(device.index)
        # Remember the first fx node seen per device (goes through the global
        # V.graph rather than self — presumably so the active graph's current
        # node is used; confirm against virtualized.V semantics).
        if V.graph.current_node and device not in self.device_node_mapping:
            self.device_node_mapping[device] = V.graph.current_node
572
+
573
    @property
    def fake_mode(self):
        # Delegates to the globally-installed fake tensor mode (virtualized.V).
        return V.fake_mode
576
+
577
+ def get_buffer(self, buffer_name: str):
578
+ if buffer_name in self.name_to_buffer:
579
+ return self.name_to_buffer[buffer_name]
580
+ if buffer_name in self.graph_inputs:
581
+ return self.graph_inputs[buffer_name]
582
+ return None
583
+
584
+ def get_dtype(self, buffer_name: str):
585
+ if buffer_name in self.constants:
586
+ return self.constants[buffer_name].dtype
587
+ if buffer_name in self.name_to_buffer:
588
+ return self.name_to_buffer[buffer_name].get_dtype()
589
+ if buffer_name in self.graph_inputs:
590
+ return self.graph_inputs[buffer_name].get_dtype()
591
+ m = re.match(r"(as_strided|reinterpret_tensor)\(([a-zA-Z0-9_]+),", buffer_name)
592
+ if m:
593
+ return self.get_dtype(m.group(1))
594
+ raise KeyError(f"could not find {buffer_name}")
595
+
596
+ def get_numel(self, buffer_name: str):
597
+ from .ir import MultiOutputLayout
598
+
599
+ if buffer_name in self.constants:
600
+ return self.constants[buffer_name].numel()
601
+ if buffer_name in self.name_to_buffer:
602
+ buf = self.name_to_buffer[buffer_name]
603
+ if isinstance(getattr(buf, "layout", None), MultiOutputLayout):
604
+ return 1
605
+ return buf.get_numel()
606
+ if buffer_name in self.graph_inputs:
607
+ return self.graph_inputs[buffer_name].get_numel()
608
+ raise KeyError(f"could not find {buffer_name}")
609
+
610
    @dynamo_timed
    def run(self, *args):
        # Thin timed wrapper over torch.fx.Interpreter.run; dynamo_timed
        # records the lowering time for dynamo's compile-time stats.
        return super().run(*args)
613
+
614
    def register_buffer(self, buffer: ir.Buffer):
        """Assign the next sequential ``buf<N>`` name to ``buffer`` and record it."""
        name = self.qualify_name(f"buf{len(self.buffers)}")
        self.buffers.append(buffer)
        self.name_to_buffer[name] = buffer
        # Skip empty CPU tensor so that CUDA graphs can succeed, see https://github.com/pytorch/pytorch/pull/114144
        if not isinstance(buffer, ir.ComputedBuffer) or not buffer.is_zero_elements():
            self.add_device_info(buffer.get_device())
        return name
622
+
623
+ def register_list(self, buffer_names: List[str]):
624
+ name = self.qualify_name("list_" + "_".join(buffer_names))
625
+ self.lists[name] = buffer_names
626
+ return name
627
+
628
    def register_users_of(self, node_output):
        """Record, per read buffer name, the IR values that read from it."""

        def register(value):
            # Recurse into containers; note there is deliberately no early
            # return, so a value that is both a sequence and an IRNode would
            # be handled by both branches (sequences are not IRNodes today).
            if isinstance(value, (list, tuple)):
                for x in value:
                    register(x)
            if isinstance(value, ir.IRNode):
                # Only values wrapped at least two levels deep
                # (value.data.data is an IRNode) are registered — presumably
                # this targets TensorBox(StorageBox(...)) shapes; other
                # IRNodes are skipped. TODO confirm intent.
                if (
                    not hasattr(value, "data")
                    or not isinstance(value.data, ir.IRNode)
                    or not (
                        hasattr(value.data, "data")
                        and isinstance(value.data.data, ir.IRNode)
                    )
                ):
                    return

                for read_name in value.get_read_names():
                    self.name_to_users[read_name].append(value)

        register(node_output)
648
+
649
+ def mark_buffer_mutated(self, name: str):
650
+ """
651
+ When a buffer is mutated we need to make sure all the reads to
652
+ the old version are realized before the mutation happens.
653
+ """
654
+ assert isinstance(name, str)
655
+ self.mutated_buffers.add(name)
656
+
657
+ if name not in self.name_to_users:
658
+ return
659
+
660
+ for user in self.name_to_users[name]:
661
+ user.realize()
662
+
663
    def add_tensor_constant(self, data, name=None):
        """
        Register ``data`` as a graph constant and return a TensorBox over a
        ConstantBuffer referring to it.  Identical constants are deduplicated
        (unless runtime constant folding is enabled for AOT inductor), and
        names are sanitized/uniquified.
        """

        def allocate(name):
            if not config.aot_inductor.use_runtime_constant_folding:
                # Dedup: reuse an existing constant that matches exactly in
                # size/stride/dtype/device and content.
                for constant_name, value in self.constants.items():
                    if (
                        not data.is_mkldnn
                        and data.size() == value.size()
                        and data.stride() == value.stride()
                        and data.dtype == value.dtype
                        and data.device == value.device
                        and torch.eq(data, value).all()
                    ):
                        return constant_name

            if name is None:
                name = f"constant{len(self.constants)}"
            if name[0].isdigit():
                name = f"constant_{name}"
            name = self.qualify_name(name)
            # We may generate a var name for each constant in the codegen.
            # Let's only keep sane characters.
            prefix = re.sub(r"[^a-zA-Z0-9_]", "_", name)
            name = prefix
            cnt = 0
            # Uniquify against already-registered constants.
            while name in self.constants:
                name = f"{prefix}_{cnt}"
                cnt += 1
            self.constants[name] = data
            self.constant_reprs[name] = (
                f"{data.device!r} {data.dtype!r} "
                f"{tuple(data.size())!r} {tuple(data.stride())!r} "
                f"{hash(data):x}"
            )
            return name

        new_name = allocate(name)
        # Remember the original (pre-sanitization / pre-dedup) name.
        self.allocated_constant_name[new_name] = name

        return TensorBox.create(
            ir.ConstantBuffer(
                new_name,
                FixedLayout(data.device, data.dtype, *self.static_sizes_strides(data)),
            )
        )
707
+
708
    def constant_name(self, name: str, device_override: Optional[torch.device]):
        """
        We AOT copy constants to the devices they are needed on.
        If device_override doesn't match the constant's device, then
        copy it and return a different name.
        """
        # NOTE: the device comparison runs first, so an unknown `name`
        # raises KeyError even when device_override is None.
        if self.constants[name].device == device_override or device_override is None:
            return name
        # `index or 0` maps both None and 0 to 0 in the generated name.
        alt_name = f"{name}_{device_override.type}{device_override.index or 0}"
        if alt_name not in self.constants:
            self.constants[alt_name] = self.constants[name].to(device_override)
        return alt_name
720
+
721
    def placeholder(self, target: str, args, kwargs):
        """
        Lower a graph input.  Symbolic/scalar inputs become sympy
        expressions, BackwardState is ignored, and tensors become
        TensorBox-wrapped InputBuffers with static or symbolic layout.
        """
        example = super().placeholder(target, args, kwargs)
        self.graph_input_names.append(target)
        if isinstance(example, SymTypes):
            expr = example.node.expr
            self.graph_inputs[target] = expr
            return expr
        elif isinstance(example, (int, bool, float)):
            expr = sympy.sympify(example)
            self.graph_inputs[target] = expr
            return expr
        if isinstance(example, BackwardState):
            # Ignored arg, must be unused
            # Alternately we could filter this out in AotAutograd
            return None
        assert isinstance(example, torch.Tensor), example
        # todo(chilli): We can remove the last check once we turn buffers into
        # static shape tensors. That's a hack to workaround Inductor believing
        # the buffer should be static but us passing in a fake tensor with
        # symbolic shapes.
        if not example._has_symbolic_sizes_strides:
            # the first N inputs are weights
            sizes, strides = self.static_sizes_strides(example)
        else:
            sizes, strides = self.symbolic_sizes_strides(example)
        # TODO(jansel): handle input aliasing
        target = self.qualify_name(target)
        tensor = TensorBox.create(
            InputBuffer(
                target,
                FixedLayout(example.device, example.dtype, sizes, strides),
            )
        )
        self.graph_inputs[target] = tensor
        # Keep a handle on the raw InputBuffer (tensor.data.data) before any
        # later reinterpretation wraps it.
        self.graph_inputs_original[target] = tensor.data.data
        self.add_device_info(example.device)
        return tensor
758
+
759
    def call_function(self, target, args, kwargs):
        """
        Lower a call_function fx node via the registered lowering for
        ``target``, creating an implicit fallback when allowed.  Any
        exception from the lowering is re-wrapped as LoweringException.
        """
        # getitem on python containers is plain indexing, not a lowering.
        if target is operator.getitem and isinstance(args[0], (list, tuple, dict)):
            return super().call_function(target, args, kwargs)

        if hasattr(target, "_inductor_lowering_function"):
            # passthrough lowerings from .pattern_matcher
            return target(*args, **kwargs)

        def get_custom_op_layout_constraints(target, args, kwargs):
            # Custom operations that require preserving stride order
            # which run through implicit fallback must constrain their
            # arguments' fx strides
            layout_constraint = None
            if torch._C.Tag.needs_fixed_stride_order in target.tags:
                # We have to set the current args because call_function will immediately
                # evaluate this lowering after creating the fallback, without evaluating
                # the layout constraint
                args, kwargs = constrain_to_fx_strides(
                    self.current_node, *args, **kwargs
                )
                # Also register the layout constraint so when the fallback
                # is used again, we can constrain the args to the same layout
                layout_constraint = constrain_to_fx_strides
            return layout_constraint, args, kwargs

        if target not in lowerings:
            assert isinstance(
                target, torch._ops.OpOverload
            ), f"{target} is not an OpOverload"
            base_name = target.name().split(".")[0]
            if base_name in FALLBACK_ALLOW_LIST:
                make_fallback(target)
            elif config.implicit_fallbacks:
                layout_constraint, args, kwargs = get_custom_op_layout_constraints(
                    target, args, kwargs
                )
                error = (
                    MissingOperatorWithDecomp
                    if get_decompositions([target])
                    else MissingOperatorWithoutDecomp
                )
                log.info(
                    "Creating implicit fallback for:\n%s",
                    error.operator_str(target, args, kwargs),
                )
                # Registers a fallback lowering; the call below then uses it.
                make_fallback(target, layout_constraint)

            elif get_decompositions([target]):
                # There isn't a good way to dynamically patch this in
                # since AOT Autograd already ran. The error message tells
                # the user how to fix it.
                raise MissingOperatorWithDecomp(target, args, kwargs)
            else:
                raise MissingOperatorWithoutDecomp(target, args, kwargs)

        try:
            log.debug(" via %s", lowerings[target])
            out = lowerings[target](*args, **kwargs)
            return out
        except Exception as e:
            # Preserve the original traceback but surface a LoweringException
            # carrying the op and its arguments.
            raise LoweringException(e, target, args, kwargs).with_traceback(
                e.__traceback__
            ) from None
822
+
823
+ @staticmethod
824
+ def can_inline_constant(t: torch.Tensor) -> bool:
825
+ """
826
+ True if this is a small constant attr that will be inlined.
827
+ """
828
+ return len(t.shape) == 1 and t.shape[0] <= 8
829
+
830
+ def get_attr(self, target, args, kwargs):
831
+ # this is a constant
832
+ value = getattr_recursive(self.module, target)
833
+
834
+ if isinstance(value, torch.fx.GraphModule):
835
+ return ir.Subgraph(name=target, graph_module=value)
836
+
837
+ if (
838
+ config.aot_inductor.use_runtime_constant_folding
839
+ or config.always_keep_tensor_constants
840
+ or unsupported_output_tensor(value)
841
+ ):
842
+ return self.add_tensor_constant(value, target)
843
+
844
+ with no_dispatch():
845
+ if value.shape == ():
846
+ return Constant(value.item(), value.dtype, value.device)
847
+ if self.can_inline_constant(value):
848
+ # tensor lowering has constant inlining logic
849
+ from .lowering import tensor
850
+
851
+ return tensor(value.tolist(), dtype=value.dtype, device=value.device)
852
+
853
+ return self.add_tensor_constant(value, target)
854
+
855
+ def call_module(self, target, args, kwargs):
856
+ raise AssertionError()
857
+
858
+ def call_method(self, target, args, kwargs):
859
+ raise AssertionError()
860
+
861
+ def output(self, target, args, kwargs):
862
+ result = super().output(target, args, kwargs)
863
+ assert isinstance(result, (tuple, list)), type(result)
864
+ assert all(
865
+ isinstance(
866
+ x,
867
+ (
868
+ TensorBox,
869
+ ir.Constant,
870
+ type(None),
871
+ ir.ConstantBuffer,
872
+ sympy.Expr,
873
+ sympy.logic.boolalg.Boolean,
874
+ int,
875
+ ),
876
+ )
877
+ for x in result
878
+ ), result
879
+ self.graph_outputs = [ir.ExternKernel.realize_input(x) for x in result]
880
+ value: ir.IRNode
881
+ for name, value in self.graph_inputs.items():
882
+ assert isinstance(
883
+ value, (TensorBox, sympy.Expr)
884
+ ), f"Unsupported inductor graph input type: {type(value)}"
885
+ if not isinstance(value, TensorBox):
886
+ continue
887
+ value.realize()
888
+ assert isinstance(value, TensorBox)
889
+ value = value.data
890
+ assert isinstance(value, ir.StorageBox)
891
+ value_storage_box = value
892
+ value = value.data
893
+ if not isinstance(value, InputBuffer) or value.get_name() != name:
894
+ # one of our inputs was mutated, need to turn that into a copy
895
+ ir.MutationLayout.realize_into(value, self.graph_inputs_original[name])
896
+ # replace output with mutated input
897
+ try:
898
+ ind = self.graph_outputs.index(value_storage_box)
899
+ self.graph_outputs[ind] = self.graph_inputs_original[name]
900
+ except ValueError:
901
+ pass
902
+
903
+ self.finalize()
904
+ log.debug(
905
+ "Force channels last inputs for %d conv for the current graph with id %d",
906
+ self.num_channels_last_conv,
907
+ self.graph_id if self.graph_id is not None else -1,
908
+ )
909
+
910
+ def finalize(self):
911
+ for buf in self.buffers:
912
+ buf.decide_layout()
913
+
914
+ @contextmanager
915
+ def set_current_node(self, node: torch.fx.Node):
916
+ old = self.current_node
917
+ try:
918
+ self.current_node = node
919
+ yield
920
+ finally:
921
+ self.current_node = old
922
+
923
+ def run_node(self, n: torch.fx.Node):
924
+ def debug(msg):
925
+ log.debug("lowering %s %s", LazyString(n.format_node), msg)
926
+
927
+ origins = {n}
928
+ if n.op == "call_function":
929
+ args, kwargs = self.fetch_args_kwargs_from_env(n)
930
+ origins |= gather_origins(args, kwargs)
931
+ with ir.IRNode.current_origins(origins), self.set_current_node(
932
+ n
933
+ ), V.set_current_node(n):
934
+ if (
935
+ n.op == "call_function"
936
+ and n.target is not operator.getitem
937
+ and fallback_node_due_to_unsupported_type(n)
938
+ ):
939
+ debug("fallback_handler")
940
+ result = fallback_handler(n.target, add_to_fallback_set=False)(
941
+ *args, **kwargs # type: ignore[possibly-undefined]
942
+ )
943
+ elif n.op == "call_function" and n.target in layout_constraints:
944
+ debug("layout_constraints")
945
+ args, kwargs = layout_constraints[n.target](n, *args, **kwargs) # type: ignore[index]
946
+ result = self.call_function(n.target, args, kwargs)
947
+ elif is_magic_method(n.target):
948
+ # TODO: this is sus, it probably should be handled in the
949
+ # lowerings themselves similarly to sym_size/sym-stride
950
+ debug("is_magic_method")
951
+ if isinstance(n.meta["val"], torch.SymInt):
952
+ result = n.meta["val"].node.expr
953
+ else:
954
+ result = super().run_node(n)
955
+ else:
956
+ debug("")
957
+ result = super().run_node(n)
958
+
959
+ # require the same stride order for dense outputs,
960
+ # 1. user-land view() will not throw because inductor
961
+ # output different strides than eager
962
+ # long term the solution is to make view() always succeed
963
+ # with infallible strides.
964
+ # 2: as_strided ops, we need make sure its input has same size/stride with
965
+ # eager model to align with eager behavior.
966
+ as_strided_ops = [
967
+ torch.ops.aten.as_strided.default,
968
+ torch.ops.aten.as_strided_.default,
969
+ torch.ops.aten.as_strided_scatter.default,
970
+ ]
971
+ is_output = any(user.op == "output" for user in n.users)
972
+ is_input_for_as_strided = any(
973
+ user.target in as_strided_ops for user in n.users
974
+ )
975
+ if (
976
+ is_output
977
+ and isinstance(result, TensorBox)
978
+ and isinstance(result.data, ir.BaseView)
979
+ ):
980
+ # Realize so that outputs are correctly aliased
981
+ result.realize()
982
+
983
+ if (is_output or is_input_for_as_strided) and isinstance(
984
+ n.meta["val"], torch.Tensor
985
+ ):
986
+ strides = n.meta["val"].stride()
987
+ dense = torch._prims_common.is_non_overlapping_and_dense(n.meta["val"])
988
+ # requiring a stride order for a non-dense output wouldn't
989
+ # recreate the same strides, and would fail with view, defer for now.
990
+ if dense and len(strides):
991
+ stride_order = ir.get_stride_order(strides)
992
+ if (
993
+ len(result.get_size()) == 4
994
+ and n in self.nodes_prefer_channels_last
995
+ and n.name not in self.user_visible_outputs
996
+ and not is_input_for_as_strided
997
+ ):
998
+ stride_order = ir.NHWC_STRIDE_ORDER
999
+ result = ir.ExternKernel.require_stride_order(result, stride_order)
1000
+
1001
+ # Realize if (1) any user need inputs realized, or (2) there is
1002
+ # already too many reads and rematerializing can be bad.
1003
+ num_users = len(set(n.users))
1004
+ if num_users > 1 and isinstance(result, TensorBox):
1005
+ for user in n.users:
1006
+ if user.target in needs_realized_inputs:
1007
+ result.realize_hint()
1008
+ # This inclusion is somewhat controversial (from
1009
+ # discussion between Horace, Natalia, and Elias).
1010
+ # Currently, it's not very clear why this is helpful.
1011
+ # The general idea here is that even though a node may
1012
+ # have FlexibleLayout, we still often *treat* it as if
1013
+ # it was contiguous. This appears to sometimes result in
1014
+ # suboptimal behavior.
1015
+ #
1016
+ # When we do a better job selecting layout, we should
1017
+ # revisit this.
1018
+ need_fixed_layout = [
1019
+ torch.ops.aten.convolution_backward.default,
1020
+ torch.ops.aten.mm.default,
1021
+ torch.ops.aten._int_mm.default,
1022
+ ]
1023
+ if not self.layout_opt:
1024
+ need_fixed_layout.append(torch.ops.aten.convolution.default)
1025
+ if torch._C._has_mkldnn:
1026
+ need_fixed_layout += [
1027
+ torch.ops.mkldnn._convolution_pointwise.default,
1028
+ torch.ops.mkldnn._convolution_pointwise.binary,
1029
+ torch.ops.mkldnn._convolution_pointwise_.binary,
1030
+ torch.ops.mkldnn._convolution_transpose_pointwise.default,
1031
+ torch.ops.mkldnn._linear_pointwise.default,
1032
+ torch.ops.mkldnn._linear_pointwise.binary,
1033
+ torch.ops.aten.mkldnn_rnn_layer.default,
1034
+ torch.ops.onednn.qconv2d_pointwise.default,
1035
+ torch.ops.onednn.qconv2d_pointwise.binary,
1036
+ torch.ops.onednn.qlinear_pointwise.default,
1037
+ torch.ops.onednn.qlinear_pointwise.tensor,
1038
+ ]
1039
+ if torch._C.has_mkl:
1040
+ need_fixed_layout += [torch.ops.mkl._mkl_linear.default]
1041
+ if user.target in need_fixed_layout:
1042
+ result = ir.ExternKernel.require_stride_order(
1043
+ result, ir.get_stride_order(n.meta["val"].stride())
1044
+ )
1045
+ if user.op == "output":
1046
+ if isinstance(result.data.data, (Pointwise, Reduction)):
1047
+ result.realize()
1048
+
1049
+ # TODO(jansel): introduce a store vs inline choice
1050
+ result.mark_reuse(len(n.users))
1051
+
1052
+ # Realize if the IRNode already has accumulated lots of reads
1053
+ if isinstance(result, TensorBox) and result.has_exceeded_max_reads():
1054
+ # Prevent excessive accumulation in a computed buffer, when
1055
+ # there are multiple branches each with small number of memory
1056
+ # reads, but they converge to a user.
1057
+ result.realize_hint()
1058
+
1059
+ # Realize if a Pointwise has too much stuff to be inlined.
1060
+ # As this may cause RecursionError during Inductor's evaluation.
1061
+ if isinstance(result, TensorBox) and isinstance(result.data, StorageBox):
1062
+ curr = result.data.data
1063
+ if isinstance(curr, Pointwise):
1064
+ # Use inner fn as a rough proxy. Good enough.
1065
+ if curr.has_large_inner_fn():
1066
+ result.realize()
1067
+
1068
+ # This is not complete, but it doesn't have to be: origin_node
1069
+ # tracking is best effort. The logic here critically relies on direct
1070
+ # TensorBox -> StorageBox denoting a non-view; we don't bother trying
1071
+ # to get views to work. Feel free to add any extra cases as needed.
1072
+ #
1073
+ # Note: we can't YOLO tree_map over this result, because if there are
1074
+ # buffers or a view involved, we might not be able to validly assign
1075
+ # the origin_node here.
1076
+ if isinstance(result, TensorBox) and isinstance(result.data, ir.StorageBox):
1077
+ if isinstance(result.data.data, ir.Loops):
1078
+ result.data.data.origin_node = n
1079
+ elif isinstance(result.data.data, ir.Buffer):
1080
+ result.data.data.origin_node = n
1081
+ if isinstance(result.data.data, ir.ComputedBuffer) and isinstance(
1082
+ result.data.data.data, ir.Loops
1083
+ ):
1084
+ result.data.data.data.origin_node = n
1085
+ # Not really multi-output, can straightforwardly recurse in
1086
+ elif (
1087
+ isinstance(result.data.data, ir.MultiOutput)
1088
+ and not result.data.data.indices
1089
+ ):
1090
+ if isinstance(result.data.data.inputs[0], ir.Buffer):
1091
+ result.data.data.inputs[0].origin_node = n
1092
+
1093
+ self.register_users_of(result)
1094
+
1095
+ return result
1096
+
1097
+ def validate_can_generate_cpp_wrapper(self):
1098
+ if config.disable_cpp_codegen:
1099
+ raise CppWrapperCodeGenError("C++ codegen is disabled")
1100
+
1101
+ if sys.platform not in ["linux", "darwin"]:
1102
+ raise CppWrapperCodeGenError(f"Unsupported platform {sys.platform}")
1103
+
1104
+ for value in self.graph_inputs.values():
1105
+ dtype = None
1106
+ if isinstance(value, TensorBox):
1107
+ dtype = value.get_dtype()
1108
+ elif isinstance(
1109
+ value, (sympy.Symbol, sympy.Expr, sympy.core.numbers.Integer)
1110
+ ):
1111
+ dtype = may_get_constant_buffer_dtype(value)
1112
+
1113
+ if not supported_dtype_of_cpp_wrapper(dtype, self.cuda):
1114
+ raise CppWrapperCodeGenError(f"Unsupported input dtype {dtype}")
1115
+
1116
+ def init_wrapper_code(self):
1117
+ self.cuda = "cuda" in self.device_types
1118
+ if self.cpp_wrapper:
1119
+ self.validate_can_generate_cpp_wrapper()
1120
+ self.wrapper_code = CppWrapperCuda() if self.cuda else CppWrapperCpu()
1121
+ else:
1122
+ device_types = self.device_types.copy()
1123
+ device_types.discard("cpu")
1124
+ # TODO(Eikan): Only support mixing cpu and other device now.
1125
+ assert len(device_types) <= 1, "Does not support mixing {}".format(
1126
+ "+".join(device_types)
1127
+ )
1128
+ only_cpu = len(device_types) == 0
1129
+ device_type = "cpu" if only_cpu else device_types.pop()
1130
+
1131
+ self.device_ops = get_device_op_overrides(device_type)
1132
+ wrapper_code_gen_cls = get_wrapper_codegen_for_device(device_type)
1133
+ assert (
1134
+ wrapper_code_gen_cls is not None
1135
+ ), f"Device {device_type} not supported"
1136
+ self.wrapper_code = wrapper_code_gen_cls()
1137
+
1138
+ if self.const_module:
1139
+ # If we have const module, we could reuse the kernels
1140
+ # This could avoid duplication and save time on doing recompilation (if Triton.)
1141
+ self.wrapper_code._names_iter = self.const_module.wrapper_code._names_iter
1142
+ self.wrapper_code.src_to_kernel = (
1143
+ self.const_module.wrapper_code.src_to_kernel
1144
+ )
1145
+
1146
+ def codegen_with_cpp_wrapper(self):
1147
+ """
1148
+ For CPU, the cpp wrapper codegen is done in one pass.
1149
+ For GPU, the cpp wrapper codegen is done in two steps: JIT-compile the model with python
1150
+ wrapper code and run it to generate autotuned kernel binaries in the first pass; and then
1151
+ generate cpp wrapper code and compile it to a dynamic library in the second pass.
1152
+ """
1153
+ if "cuda" in self.device_types:
1154
+ # first pass
1155
+ self.cpp_wrapper = False
1156
+ compiled = self.compile_to_module().call
1157
+
1158
+ def materialize(x):
1159
+ if isinstance(x, (torch.SymInt, torch.SymFloat)):
1160
+ # Need concrete value to run dynamic shapes and tune the result
1161
+ return x.node.hint
1162
+ elif isinstance(x, FakeTensor):
1163
+ return defake(x)
1164
+ else:
1165
+ assert isinstance(
1166
+ x, torch.Tensor
1167
+ ), "Unknown type when creating real inputs" + str(type(x))
1168
+ return x
1169
+
1170
+ if tracing_context := torch._guards.TracingContext.try_get():
1171
+ if tracing_context.output_strides:
1172
+ tracing_context.output_strides.clear()
1173
+
1174
+ params_flat = [
1175
+ param
1176
+ for param in tracing_context.params_flat # type: ignore[union-attr]
1177
+ if param is not None
1178
+ ]
1179
+ real_inputs = [
1180
+ materialize(x) for x in itertools.chain(params_flat, V.real_inputs)
1181
+ ]
1182
+ else:
1183
+ real_inputs = [materialize(x) for x in V.real_inputs]
1184
+
1185
+ with torch.utils._python_dispatch._disable_current_modes():
1186
+ assert self.example_inputs is not None
1187
+ compiled(real_inputs)
1188
+ del real_inputs
1189
+
1190
+ # second pass
1191
+ # TODO: reuse self.scheduler from the first pass to speed up the second pass
1192
+ self.cpp_wrapper = True
1193
+ self.removed_buffers.clear()
1194
+ self.inplaced_to_remove.clear()
1195
+ return self.codegen()
1196
+ else:
1197
+ # cpu
1198
+ return self.codegen()
1199
+
1200
+ def codegen(self):
1201
+ from .scheduler import Scheduler
1202
+
1203
+ self.init_wrapper_code()
1204
+
1205
+ self.scheduler = Scheduler(self.buffers)
1206
+ V.debug.draw_orig_fx_graph(self.orig_gm, self.scheduler.nodes)
1207
+
1208
+ self.scheduler.codegen()
1209
+ return self.wrapper_code.generate(self.is_inference)
1210
+
1211
+ def codegen_subgraph(self, parent_graph):
1212
+ """
1213
+ This is a more compact version of the `codegen()` above
1214
+ where we codegen this graph as a subgraph of some parent
1215
+ graph. The parent graph is passed as an argument: the
1216
+ intention is to inline codegening of the subgraph in
1217
+ the parent graph's wrapper code (including the generated
1218
+ kerenls). The wrapper code is not finalized (via `.generate()`
1219
+ call), as this will be done in the parent graph's `codegen()`.
1220
+ """
1221
+ from .scheduler import Scheduler
1222
+
1223
+ self.wrapper_code = parent_graph.wrapper_code
1224
+ self.device_ops = parent_graph.device_ops
1225
+ self.cpp_wrapper = parent_graph.cpp_wrapper
1226
+
1227
+ self.scheduler = Scheduler(self.buffers)
1228
+ self.scheduler.codegen()
1229
+
1230
+ def count_bytes(self):
1231
+ from .scheduler import Scheduler
1232
+
1233
+ scheduler = Scheduler(self.buffers)
1234
+
1235
+ total_bytes = 0
1236
+ node_counts = []
1237
+ node_runtimes = []
1238
+ for node in scheduler.nodes:
1239
+ num_bytes = node.get_read_write_buffers_sizes()
1240
+ total_bytes += num_bytes
1241
+ node_counts.append((node, num_bytes // 4))
1242
+ node_runtimes.append((node, node.get_estimated_runtime()))
1243
+ return total_bytes, node_counts, node_runtimes
1244
+
1245
+ @dynamo_timed(phase_name="code_gen")
1246
+ def compile_to_module(self):
1247
+ from .codecache import PyCodeCache
1248
+
1249
+ code, linemap = (
1250
+ self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
1251
+ )
1252
+ linemap = [(line_no, node.stack_trace) for line_no, node in linemap]
1253
+ key, path = PyCodeCache.write(code)
1254
+ mod = PyCodeCache.load_by_key_path(
1255
+ key, path, linemap=linemap, attrs=self.constants
1256
+ )
1257
+ self.cache_key = key
1258
+ self.cache_path = path
1259
+ self.cache_linemap = linemap
1260
+
1261
+ # Logged twice as per https://github.com/pytorch/pytorch/pull/99038#discussion_r1167826029
1262
+ # TODO. Revisit this once the logging API is more mature
1263
+ assert mod.__file__ is not None
1264
+
1265
+ log_module_code(mod.__file__)
1266
+ log.debug("Output code written to: %s", mod.__file__)
1267
+ output_code_log.debug("Output code: \n%s", code)
1268
+ trace_structured(
1269
+ "inductor_output_code",
1270
+ lambda: {"filename": mod.__file__},
1271
+ payload_fn=lambda: code,
1272
+ )
1273
+ output_code_log.info("Output code written to: %s", mod.__file__)
1274
+ if config.benchmark_kernel:
1275
+ print(f"Compiled module path: {mod.__file__}", file=sys.stderr)
1276
+ V.debug.output_code(mod.__file__)
1277
+ V.debug.copy(os.path.splitext(mod.__file__)[0] + ".debug")
1278
+ return mod
1279
+
1280
+ def compile_to_fn(self):
1281
+ if self.aot_mode:
1282
+ from .codecache import AotCodeCompiler
1283
+
1284
+ assert self.cpp_wrapper, "AOT mode only supports C++ wrapper"
1285
+ code, linemap = self.codegen_with_cpp_wrapper()
1286
+ output_code_log.debug("Output code: \n%s", code)
1287
+
1288
+ serialized_extern_kernel_nodes = None
1289
+ if (
1290
+ config.is_fbcode()
1291
+ and self.extern_kernel_nodes
1292
+ and self.extern_node_serializer
1293
+ ):
1294
+ serialized_extern_kernel_nodes = self.extern_node_serializer(
1295
+ self.extern_kernel_nodes
1296
+ )
1297
+ output_code_log.debug(
1298
+ "Serialized Extern Kernel Nodes: \n%s",
1299
+ serialized_extern_kernel_nodes,
1300
+ )
1301
+
1302
+ # Directly return the file path with the compiled code
1303
+ return AotCodeCompiler.compile(
1304
+ self, code, serialized_extern_kernel_nodes, cuda=self.cuda
1305
+ )
1306
+ else:
1307
+ return self.compile_to_module().call
1308
+
1309
+ def get_output_names(self):
1310
+ return [
1311
+ node.get_name()
1312
+ for node in self.graph_outputs
1313
+ if not isinstance(node, ir.NoneAsConstantBuffer)
1314
+ and not isinstance(node, ir.ShapeAsConstantBuffer)
1315
+ ]
1316
+
1317
+ def is_unspec_arg(self, name: str):
1318
+ # dynamo wraps unspec variable as 0d CPU tensor,
1319
+ # need to convert to scalar during codegen (triton only)
1320
+ return (
1321
+ name in self.graph_inputs.keys()
1322
+ and self.graph_inputs[name].get_numel() == 1
1323
+ and self.graph_inputs[name].get_device().type == "cpu"
1324
+ )
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/ir.py ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/pattern_matcher.py ADDED
@@ -0,0 +1,1524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import dataclasses
4
+ import functools
5
+ import inspect
6
+ import itertools
7
+ import logging
8
+ import operator
9
+ import os
10
+ import re
11
+ from collections import defaultdict
12
+ from typing import (
13
+ Any,
14
+ Callable,
15
+ DefaultDict,
16
+ Dict,
17
+ Iterable,
18
+ List,
19
+ NoReturn,
20
+ Optional,
21
+ Set,
22
+ Union,
23
+ )
24
+
25
+ from typing_extensions import TypeGuard
26
+
27
+ import torch
28
+ import torch._guards
29
+ import torch.fx
30
+ import torch.utils._pytree as pytree
31
+ from torch._dispatch.python import enable_python_dispatcher
32
+ from torch._dynamo.utils import counters
33
+ from torch._prims_common import is_integer_dtype
34
+ from torch.fx import Node
35
+ from torch.fx.experimental.proxy_tensor import make_fx, maybe_disable_fake_tensor_mode
36
+ from torch.fx.experimental.symbolic_shapes import guard_size_oblivious
37
+ from torch.fx.immutable_collections import immutable_dict, immutable_list
38
+
39
+ from .._functorch import config as functorch_config
40
+ from .._functorch.aot_autograd import aot_function, make_boxed_func
41
+ from .._functorch.partitioners import default_partition
42
+ from .._subclasses import FakeTensorMode
43
+ from ..fx import Transformer
44
+ from . import config
45
+ from .decomposition import select_decomp_table
46
+ from .lowering import fallback_node_due_to_unsupported_type
47
+
48
+ log = logging.getLogger(__name__)
49
+ aten = torch.ops.aten
50
+ prims = torch.ops.prims
51
+
52
+ Constant = Any
53
+ NodeOrConstant = Union[Constant, torch.fx.Node]
54
+
55
+
56
+ class Multiple:
57
+ pass
58
+
59
+
60
+ # Sentinel indicating multiple quantities can be matched
61
+ MULTIPLE = Multiple()
62
+
63
+
64
+ class Match:
65
+ """
66
+ Represents a successfully matched pattern.
67
+ """
68
+
69
+ def __init__(self, pattern: PatternExpr, args=None, kwargs=None):
70
+ super().__init__()
71
+ self.pattern = pattern
72
+ # The input nodes that must be passed in to the result
73
+ self.args = args or []
74
+ self.kwargs = kwargs or {}
75
+ # The nodes matched in this expression
76
+ self.nodes: List[torch.fx.Node] = []
77
+ # Mapping CallFunction to the node.target
78
+ self.targets: Dict[_TargetExpr, torch.fx.node.Target] = {}
79
+ self.ctx: Optional[MatchContext] = None
80
+ self.replacement_graph: Optional[torch.fx.Graph] = None
81
+
82
+ @property
83
+ def graph(self) -> torch.fx.Graph:
84
+ assert self.ctx
85
+ return self.ctx.graph
86
+
87
+ def extend(self, other: Match):
88
+ if self.kwargs:
89
+ for key in set(self.kwargs.keys()) & set(other.kwargs.keys()):
90
+ if self.kwargs[key] != other.kwargs[key]:
91
+ raise FailedMatch("kwarg mismatch: {}", key)
92
+ self.args.extend(other.args)
93
+ self.nodes.extend(other.nodes)
94
+ self.kwargs.update(other.kwargs)
95
+ self.targets.update(other.targets)
96
+
97
+ def bundle(self) -> Match:
98
+ # Wrap args in an extra list
99
+ self.args = [tuple(self.args)] if self.args else []
100
+ return self
101
+
102
+ def __repr__(self):
103
+ return f"Match(..., {self.args}, {self.kwargs})"
104
+
105
+ def erase_nodes(self, graph: torch.fx.Graph):
106
+ for n in reversed(self.nodes):
107
+ if not n._erased:
108
+ graph.erase_node(n)
109
+
110
+ def output_nodes(self) -> List[Optional[torch.fx.Node]]:
111
+ assert self.ctx
112
+ return [
113
+ (self.ctx.pattern_to_node[p] if p is not None else None)
114
+ for p in self.ctx.outputs
115
+ ]
116
+
117
+ def output_node(self) -> torch.fx.Node:
118
+ return next(p for p in self.output_nodes() if p)
119
+
120
+ def replace_with_graph(self, replacement_graph, args):
121
+ assert self.ctx
122
+ ReplacementPatternEntry.replace_with_graph(
123
+ self, self.ctx.graph, replacement_graph, args
124
+ )
125
+
126
+ def replace_by_example(self, replacement_fn, args, trace_fn=None, run_dce=True):
127
+ assert self.ctx
128
+ if trace_fn is None:
129
+ trace_fn = functools.partial(fwd_only, run_dce=run_dce)
130
+ replacement = trace_fn(
131
+ replacement_fn, torch.fx.map_arg(args, lambda arg: arg.meta["val"])
132
+ )
133
+ ReplacementPatternEntry.replace_with_graph(
134
+ self,
135
+ self.ctx.graph,
136
+ replacement,
137
+ args,
138
+ )
139
+
140
+
141
+ class FailedMatch(RuntimeError):
142
+ def __init__(self, format_string, *args, **kwargs):
143
+ self.format_string = format_string
144
+ # We want to construct error messages lazily instead of eagerly, as
145
+ # constructing them eagerly can significantly worsen compile times.
146
+ if len(format_string) > 200:
147
+ raise RuntimeError(
148
+ f"Format string too long - use lazy construction of strings instead. Format string is\n {format_string}"
149
+ )
150
+ self.args = args
151
+ self.kwargs = kwargs
152
+
153
+ def __str__(self):
154
+ return self.format_string.format(*self.args, **self.kwargs)
155
+
156
+ def __bool__(self):
157
+ return False
158
+
159
+
160
+ def is_match(m: Union[Match, FailedMatch]) -> TypeGuard[Match]:
161
+ """
162
+ TypeGuards cannot act on `self`. Thus this function exists to let mypy
163
+ recognize FailedMatch.__bool__ as a TypeGuard.
164
+ """
165
+ return bool(m)
166
+
167
+
168
+ class MatchContext:
169
+ """
170
+ State needed while running PatternExpr._match().
171
+ """
172
+
173
+ def __init__(
174
+ self,
175
+ outputs: List[Optional[PatternExpr]],
176
+ pattern_to_node: Optional[Dict[PatternExpr, Node]] = None,
177
+ *,
178
+ graph: torch.fx.Graph,
179
+ ):
180
+ self.outputs = outputs
181
+ self.pattern_to_node = {} if pattern_to_node is None else pattern_to_node
182
+ self.graph = graph
183
+ self.exclusive_node_set: List[NodeOrConstant] = []
184
+
185
+ def match(self, pattern, node):
186
+ """wrapper to check reused nodes in patterns"""
187
+ if pattern in self.pattern_to_node:
188
+ if self.pattern_to_node[pattern] == node:
189
+ return Match(pattern) # already checked this node
190
+ else:
191
+ return FailedMatch("repeated pattern differs")
192
+ m = pattern._match(node, self)
193
+ assert pattern not in self.pattern_to_node
194
+ self.pattern_to_node[pattern] = node if m else None
195
+ m.ctx = self
196
+ return m
197
+
198
+ def filter_multi_user_patterns(self):
199
+ return {
200
+ pattern: node
201
+ for pattern, node in self.pattern_to_node.items()
202
+ if pattern.has_multiple_users() and node is not None
203
+ }
204
+
205
+
206
+ class PatternExpr:
207
+ """
208
+ Base class for types of patterns
209
+ """
210
+
211
+ def _match(
212
+ self, node: torch.fx.Node, ctx: MatchContext
213
+ ) -> Union[Match, FailedMatch]:
214
+ raise NotImplementedError()
215
+
216
+ def match(self, node: torch.fx.Node) -> Union[Match, FailedMatch]:
217
+ try:
218
+ return MatchContext([self], graph=node.graph).match(self, node)
219
+ except FailedMatch as e:
220
+ return e
221
+
222
+ def has_multiple_users(self) -> bool:
223
+ return False
224
+
225
+ def __repr__(self):
226
+ return self.__class__.__name__ + "()"
227
+
228
+ def find_anchor_nodes(self, ctx: MatchContext, searched):
229
+ if self in ctx.pattern_to_node:
230
+ yield ctx.pattern_to_node[self]
231
+
232
+
233
+ class Arg(PatternExpr):
234
+ """
235
+ Capture an arg which will become an input to the handler. Args are
236
+ passed in depth first order.
237
+ """
238
+
239
+ def _match(self, node: NodeOrConstant, ctx: MatchContext):
240
+ return Match(self, args=[node]) # matches anything
241
+
242
+
243
+ class Ignored(PatternExpr):
244
+ """
245
+ Match an arg, but don't pass it to handler
246
+ """
247
+
248
+ def _match(self, node: NodeOrConstant, ctx: MatchContext):
249
+ return Match(self) # matches anything
250
+
251
+ def __repr__(self):
252
+ return "*"
253
+
254
+ def pretty_print(self, pp: PatternPrettyPrinter):
255
+ return "Ignored()"
256
+
257
+
258
+ class KeywordArg(PatternExpr):
259
+ """
260
+ Capture a kwarg which will become an input to the handler.
261
+ """
262
+
263
+ def __init__(self, name: str):
264
+ super().__init__()
265
+ self.name = name
266
+
267
+ def __repr__(self):
268
+ return f"KeywordArg({self.name!r})"
269
+
270
+ def _match(self, node: NodeOrConstant, ctx: MatchContext):
271
+ return Match(self, kwargs={self.name: node}) # matches anything
272
+
273
+
274
+ class ExclusiveKeywordArg(PatternExpr):
275
+ """
276
+ Capture a kwarg which will become an input to the handler.
277
+ """
278
+
279
+ def __init__(self, name):
280
+ super().__init__()
281
+ self.name = name
282
+
283
+ def __repr__(self):
284
+ return f"ExclusiveKeywordArg({self.name!r})"
285
+
286
+ def _match(self, node: NodeOrConstant, ctx: MatchContext):
287
+ if node in ctx.exclusive_node_set:
288
+ return FailedMatch("exclusive arg appears twice")
289
+
290
+ ctx.exclusive_node_set.append(node)
291
+ return Match(self, kwargs={self.name: node}) # matches anything
292
+
293
+
294
class _TargetExpr(PatternExpr):
    """
    Base class for filtering match by node.target (and user count).

    Subclasses must set ``op`` to the FX opcode they match
    ("call_function", "call_method", or "call_module"); this class is
    abstract and cannot be instantiated directly.
    """

    op: Optional[str] = None

    def __init__(self, fns, users=1):
        if not self.op:
            # Fix: the original message referenced a stale class name
            # ("_BaseNodeMatch") that does not exist in this file.
            raise NotImplementedError("Shouldn't directly use _TargetExpr")
        super().__init__()
        # Accept a single callable/string or an iterable of them.
        fns = [fns] if callable(fns) or isinstance(fns, str) else list(fns)
        for fn in list(fns):
            if isinstance(fn, torch._ops.OpOverloadPacket):
                # Expand an overload packet into its concrete overloads so
                # that any of them will match.
                fns.extend([getattr(fn, overload) for overload in fn.overloads()])

        self.fns: List[Union[Callable[..., Any], str]] = fns
        self.fns_set: Set[Union[Callable[..., Any], str]] = set(fns)
        self.users: Union[int, Multiple] = users

    def fns_repr(self) -> str:
        """Short human-readable name for the matched target(s)."""
        first_repr = self.fns[0]
        if not isinstance(first_repr, str):
            first_repr = first_repr.__name__

        if len(self.fns) > 1:
            return f"[{first_repr}, ...]"
        elif self.fns[0] is getattr(torch, first_repr, None):
            return f"torch.{first_repr}"
        elif isinstance(self.fns[0], torch._ops.OpOverload):
            return str(self.fns[0])
        else:
            return first_repr

    def __repr__(self):
        return f"{self.__class__.__name__}({self.fns_repr()})"

    def has_multiple_users(self) -> bool:
        return isinstance(self.users, Multiple) or self.users > 1

    def find_anchor_nodes(self, ctx: MatchContext, searched):
        raise NotImplementedError()

    def _match_fns(self, node: torch.fx.Node):
        # True when `node` is an FX node of our opcode whose target is one
        # of the registered fns.
        return (
            isinstance(node, torch.fx.Node)
            and node.op == self.op
            and extract_target(node) in self.fns_set
        )

    def _match_users(self, node: torch.fx.Node, ctx: MatchContext):
        # The user-count constraint is waived for pattern outputs and for
        # MULTIPLE; otherwise the node must have exactly `self.users` users.
        return (
            self in ctx.outputs
            or self.users is MULTIPLE
            or len(node.users) == self.users
        )
350
+
351
+
352
class _TargetArgsExpr(_TargetExpr):
    """
    Base class for filtering match by node.{target,args,kwargs}
    """

    def __init__(self, fns, *args, _users=1, **kwargs):
        super().__init__(fns, _users)
        self.args = tuple(args)
        self.kwargs = dict(kwargs)
        # Nested containers need a full pytree flatten; otherwise a cheap
        # tuple-based flatten is enough.
        if any(
            isinstance(x, (dict, list, tuple))
            for x in itertools.chain(args, kwargs.values())
        ):
            self.flatten = self.pytree_flatten
        else:
            self.flatten = self.simple_flatten
        self.flat_args_kwargs = self.flatten(self.args, self.kwargs)

    @staticmethod
    def simple_flatten(args, kwargs: Dict[Any, Any]):
        # Returns (flat values, spec); spec = (positional count, *kwarg names).
        return (*args, *kwargs.values()), (len(args), *kwargs.keys())

    @staticmethod
    def pytree_flatten(args, kwargs: Dict[Any, Any]):
        def norm_spec(s: pytree.TreeSpec):
            # Normalize container types so list/tuple and the FX immutable
            # variants produce comparable specs.
            if s.type is None:
                return s
            mapping = {immutable_list: list, tuple: list, immutable_dict: dict}
            return pytree.TreeSpec(
                mapping.get(s.type, s.type),
                s.context,
                list(map(norm_spec, s.children_specs)),
            )

        flat, spec = pytree.tree_flatten([args, kwargs])
        spec = norm_spec(spec)
        return flat, spec

    def __repr__(self):
        args = [
            self.fns_repr(),
            *map(repr, self.args),
            *[f"{k}={v}" for k, v in self.kwargs.items()],
        ]
        return f"{self.__class__.__name__}({', '.join(args)})"

    def pretty_print(self, pp: PatternPrettyPrinter):
        args = [
            self.fns_repr(),
            *(pp.pretty_print(x) for x in self.args),
            *[f"{k}={pp.pretty_print(v)}" for k, v in self.kwargs.items()],
        ]
        if isinstance(self.users, Multiple):
            args.append("_users=MULTIPLE")
        elif self.users > 1:
            args.append(f"_users={self.users}")

        joiner_str = ", "
        return f"{self.__class__.__name__}({joiner_str.join(args)})"

    def _match(self, node: torch.fx.Node, ctx: MatchContext):
        """
        Match `node` against target/args/kwargs, recursing into child
        patterns.  kwargs the pattern does not mention are ignored; if the
        node carries fewer kwargs than the pattern, it is first normalized
        via the op schema.
        """
        if not self._match_fns(node) or len(node.args) != len(self.args):
            return FailedMatch("function_mismatch: node={}, pattern={}", node, self)

        if not self._match_users(node, ctx):
            return FailedMatch("multiple_users {}", self)

        _args = node.args
        _kwargs = node.kwargs
        if len(_kwargs) < len(self.kwargs):
            from torch.fx.operator_schemas import normalize_function

            normalized_args_and_kwargs = normalize_function(
                node.target, node.args, node.kwargs
            )

            if normalized_args_and_kwargs is None:
                return FailedMatch("function_mismatch: node={}, pattern={}", node, self)
            else:
                _args, _kwargs = normalized_args_and_kwargs
                if len(_args) == len(self.args) and len(_kwargs) >= len(self.kwargs):
                    # Drop kwargs the pattern does not constrain.
                    _kwargs = {i: _kwargs[i] for i in _kwargs if i in self.kwargs}
                else:
                    return FailedMatch(
                        "function_mismatch: node={}, pattern={}", node, self
                    )
        else:
            _kwargs = {i: _kwargs[i] for i in _kwargs if i in self.kwargs}

        node_items, node_spec = self.flatten(_args, _kwargs)
        self_items, self_spec = self.flat_args_kwargs
        if node_spec != self_spec:
            return FailedMatch("args_structure {} {}", node_spec, self_spec)
        assert len(node_items) == len(self_items)

        m = Match(self)
        # (dropped an unused `itertools.count()` index from this loop)
        for pattern, child_node in zip(self_items, node_items):
            if isinstance(pattern, PatternExpr):
                child_match = ctx.match(pattern, child_node)
                if not child_match:
                    return child_match
                m.extend(child_match)
            elif isinstance(child_node, torch.fx.Node) or child_node != pattern:
                # Fix: the format string previously used a named field
                # ({pattern!r}) with only positional args, which made
                # str(FailedMatch) raise KeyError('pattern').
                return FailedMatch(
                    "constant_args: {} {!r}!={!r}", node, child_node, pattern
                )
        m.nodes.append(node)
        m.targets[self] = node.target
        return m

    def find_anchor_nodes(self, ctx: MatchContext, searched):
        """
        This is used when we are matching a pattern with multiple outputs.
        There is a partial match (stored in ctx) and we want to walk
        this pattern to find a connection to an already-matched node.

        Yields candidate nodes that `self._match` might like.
        """
        if self in ctx.pattern_to_node:
            yield ctx.pattern_to_node[self]
            return

        for pattern in self.flat_args_kwargs[0]:
            if isinstance(pattern, PatternExpr):
                for other_node in pattern.find_anchor_nodes(ctx, searched):
                    if not isinstance(other_node, torch.fx.Node):
                        continue
                    for node in other_node.users:
                        if node not in searched:
                            if self._match_fns(node):
                                yield node
                            searched.add(node)
484
+
485
+
486
class CallFunction(_TargetArgsExpr):
    """
    Pattern node for an FX ``call_function`` node: ``fns[i](*args, **kwargs)``.
    """

    op = "call_function"
492
+
493
+
494
class CallMethod(_TargetArgsExpr):
    """
    Pattern node for an FX ``call_method`` node: ``fns[i].method(*args, **kwargs)``.
    """

    op = "call_method"
500
+
501
+
502
class CallModule(_TargetArgsExpr):
    """
    Pattern node for an FX ``call_module`` node: ``module(*args, **kwargs)``.
    """

    op = "call_module"
508
+
509
+
510
class _TargetExprVarArgs(_TargetExpr):
    """
    Matches a node by target only (the opcode comes from the subclass);
    all of the node's args/kwargs are captured and passed through to the
    pattern handler unchanged.
    """

    def _match(self, node: torch.fx.Node, ctx: MatchContext):
        if not self._match_fns(node):
            return FailedMatch("function_mismatch")

        if not self._match_users(node, ctx):
            return FailedMatch("multiple_users")

        m = Match(self)
        m.nodes.append(node)
        m.targets[self] = node.target
        # Forward the node's actual args/kwargs to the handler as captures.
        m.args.extend(node.args)
        m.kwargs.update(node.kwargs)
        return m
528
+
529
+
530
class CallFunctionVarArgs(_TargetExprVarArgs):
    """Matches any ``call_function`` node by target, capturing all args/kwargs."""

    op = "call_function"
532
+
533
+
534
class CallMethodVarArgs(_TargetExprVarArgs):
    """Matches any ``call_method`` node by target, capturing all args/kwargs."""

    op = "call_method"
536
+
537
+
538
class CallModuleVarArgs(_TargetExprVarArgs):
    """Matches any ``call_module`` node by target, capturing all args/kwargs."""

    op = "call_module"
540
+
541
+
542
class ListOf(PatternExpr):
    """
    Matches a repeated pattern

    Applied to a list/tuple of nodes: every element must match
    ``self.pattern`` (or, with ``partial=True``, at least one element).
    """

    def __init__(self, pattern: PatternExpr, partial=False):
        super().__init__()
        assert isinstance(pattern, PatternExpr)
        self.pattern = pattern
        # partial=True tolerates non-matching elements as long as one matches.
        self.partial = partial

    def __repr__(self):
        return f"{self.__class__.__name__}({self.pattern})"

    def _match(self, node: List[torch.fx.Node], ctx: MatchContext):  # type: ignore[override]
        if not isinstance(node, (list, tuple)) or len(node) == 0:
            return FailedMatch("non_list")
        m = Match(self)
        # Propagating patterns with multiple users will ensure we don't revisit
        # the same nodes
        pattern_to_node = ctx.filter_multi_user_patterns()
        matched = False
        for i, child_node in enumerate(node):
            # Each element gets its own child context seeded with the
            # multi-user bindings accumulated so far.
            child_ctx = MatchContext(
                ctx.outputs, pattern_to_node, graph=child_node.graph
            )
            child_match = child_ctx.match(self.pattern, child_node)
            # Carry forward any new multi-user bindings to the next element.
            pattern_to_node = child_ctx.filter_multi_user_patterns()
            if not child_match:
                if not self.partial:
                    return FailedMatch("list[{}]: {}", i, child_match)
                continue
            matched = True
            m.extend(child_match.bundle())
        if not matched:
            return FailedMatch("list: no_match")
        return m.bundle()
579
+
580
+
581
class MultiOutputPattern(PatternExpr):
    """
    Matches a subgraph with several output nodes: the first entry of
    ``outputs`` is matched directly, the remaining entries are matched by
    searching anchors connected to already-matched nodes.  ``None`` entries
    are skipped.
    """

    def __init__(self, outputs):
        super().__init__()
        assert all(isinstance(x, (PatternExpr, type(None))) for x in outputs), outputs
        self.outputs: List[Optional[PatternExpr]] = outputs

    @property
    def fns(self):
        # Delegate to the primary output's target fns.
        assert self.outputs[0] and hasattr(self.outputs[0], "fns")
        return self.outputs[0].fns

    def __repr__(self):
        return f"{self.__class__.__name__}({self.outputs})"

    def pretty_print(self, pp: PatternPrettyPrinter):
        args = [pp.pretty_print(x) for x in self.outputs]
        joiner_str = f",\n{' '}"
        str_out = f"{self.__class__.__name__}([{joiner_str.join(args)}"
        str_out = f"{str_out}\n])"
        return str_out

    def _match(self, node: torch.fx.Node, ctx: MatchContext):
        # Match the primary output first; it seeds ctx.pattern_to_node for
        # the anchor search of the remaining outputs.
        m = ctx.match(self.outputs[0], node)
        if not m:
            return m

        for pattern in self.outputs[1:]:
            if pattern is None:
                continue
            child_match = self._match_from_anchors(pattern, ctx)
            if not child_match:
                return child_match
            m.extend(child_match)

        return m

    def _match_from_anchors(self, pattern, ctx):
        # Try each candidate anchor node; on failure, restore the bindings
        # snapshot so a failed attempt leaves no partial state behind.
        prior = dict(ctx.pattern_to_node)
        m = FailedMatch("no anchor found")
        for node in pattern.find_anchor_nodes(ctx, set()):
            m = ctx.match(pattern, node)
            if m:
                return m
            # revert any partial matches
            ctx.pattern_to_node = dict(prior)
        return m

    def match(self, node: torch.fx.Node) -> Union[Match, FailedMatch]:
        try:
            return MatchContext(self.outputs, graph=node.graph).match(self, node)
        except FailedMatch as e:
            return e
633
+
634
+
635
class RepeatedExpr(PatternExpr):
    """
    Checks for a repeated pattern. Useful for repeated operations after a node such as `split` or `unbind`
    """

    def __init__(self, inner_pattern: PatternExpr):
        super().__init__()
        assert hasattr(inner_pattern, "fns")
        self.inner_pattern = inner_pattern

    @property
    def fns(self):
        return self.inner_pattern.fns

    def _match(self, node: torch.fx.Node, ctx: MatchContext):
        m = ctx.match(self.inner_pattern, node)
        if not m:
            return m
        # Drop the binding so the anchor search below re-discovers sibling
        # occurrences of the inner pattern instead of reusing this one.
        ctx.pattern_to_node.pop(
            self.inner_pattern,
        )
        # Check all anchor nodes match the pattern
        for anchor_node in self.inner_pattern.find_anchor_nodes(ctx, set()):
            # Each anchor is matched in a fresh context.
            anchor_m = MatchContext([self], graph=node.graph).match(
                self.inner_pattern, anchor_node
            )
            if not anchor_m:
                return anchor_m
            m.extend(anchor_m)
        return m
665
+
666
+
667
class PatternPrettyPrinter:
    """
    Serializes Patterns to executable python.
    XXX: currently only used and tested for fuse attention patterns. May not cover
    all patterns.
    """

    def __init__(self):
        self.namespace = torch.fx.graph._Namespace()
        self.memoized_objs_names: Dict[PatternExpr, str] = {}
        self.memoized_objs_pp: Dict[PatternExpr, str] = {}

    @staticmethod
    def run(obj: PatternExpr, output_name="output"):
        """
        Serializes obj to python code with obj written out to `output_name`
        """

        pp = PatternPrettyPrinter()
        assert hasattr(obj, "pretty_print")
        out_str = obj.pretty_print(pp=pp)

        # One assignment per memoized sub-pattern (insertion order), then
        # the final assignment to `output_name`.
        lines = [
            f"{name} = {pp.memoized_objs_pp[key]}"
            for key, name in pp.memoized_objs_names.items()
        ]
        lines.append(f"{output_name} = {out_str}")
        return "\n".join(lines)

    def pretty_print(self, obj):
        # Target-args patterns are memoized so shared sub-patterns print once.
        if isinstance(obj, _TargetArgsExpr):
            memoized_name = self.memoized_objs_names.get(obj)
            return memoized_name if memoized_name else self.memoize(obj)
        if hasattr(obj, "pretty_print"):
            return obj.pretty_print(self)

        return repr(obj)

    def memoize(self, obj):
        # Render first (this may recursively memoize children), then mint a
        # fresh variable name derived from the target's short name.
        rendered = obj.pretty_print(self)
        base_name = obj.fns_repr()
        for prefix in ("aten.", "torch.", "prims."):
            base_name = base_name.replace(prefix, "")

        fresh_name = self.namespace.create_name(base_name, None)
        self.memoized_objs_names[obj] = fresh_name
        self.memoized_objs_pp[obj] = rendered
        return fresh_name
718
+
719
+
720
@dataclasses.dataclass
class PatternEntry:
    """A registered pattern plus the predicate that gates its application."""

    pattern: PatternExpr
    extra_check: Callable[[Match], bool]

    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node):
        raise NotImplementedError()

    def register(self, pass_dicts, target=None, prepend=False):
        # No target given: register once per target fn of the pattern.
        if target is None:
            assert hasattr(self.pattern, "fns")
            for fn in self.pattern.fns:
                self.register(pass_dicts, fn, prepend=prepend)
            return
        # A single pass dict: insert this entry under `target`.
        if isinstance(pass_dicts, (dict, PatternMatcherPass)):
            entries = pass_dicts[target]
            if prepend:
                entries.insert(0, self)
            else:
                entries.append(self)
            return
        # Otherwise: a collection of pass dicts; recurse into each.
        for sub in pass_dicts:
            self.register(sub, target, prepend=prepend)
741
+
742
+
743
@dataclasses.dataclass
class LoweringPatternEntry(PatternEntry):
    """
    A pattern that, on match, replaces the matched output node with a call
    to `handler` (with the Match pre-bound); the handler then runs at
    lowering time to produce inductor IR directly.
    """

    handler: Callable[..., Any]

    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node):
        # Bind the match as the first argument and keep the handler's metadata.
        handler = functools.wraps(self.handler)(functools.partial(self.handler, match))
        with graph.inserting_before(node):
            replacement = graph.call_function(handler, tuple(match.args), match.kwargs)
            replacement.meta.update(node.meta)
            node.replace_all_uses_with(replacement)
        # `node` must be the last matched node so erase_nodes removes the
        # whole matched subgraph safely.
        assert match.nodes[-1] is node
        match.erase_nodes(graph)
755
+
756
+
757
@dataclasses.dataclass
class GraphPatternEntry(PatternEntry):
    """
    A pattern that runs a function on the FX graph
    """

    handler: Callable[..., Any]

    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node):
        # The handler performs its own graph mutation; we only position the
        # insertion point at the matched node.
        with graph.inserting_before(node):
            self.handler(match, *match.args, **match.kwargs)
768
+
769
+
770
@dataclasses.dataclass
class ReplacementPatternEntry(PatternEntry):
    """
    A pattern whose match is replaced by inlining a traced replacement
    graph (match.replacement_graph) into the target graph.
    """

    # Maps the match's captured args/kwargs to the positional inputs of the
    # replacement graph.
    normalize_args: Callable[..., List[Any]]

    @staticmethod
    def replace_with_graph(
        match: Match,
        graph: torch.fx.Graph,
        replacement_graph: torch.fx.Graph,
        args: List[Any],
    ):
        output_nodes = match.output_nodes()
        first_node = output_nodes[0]

        class Replacer(torch.fx.Interpreter):
            # Only call_function (plus placeholder/output) is supported when
            # inlining; other opcodes are disabled.
            call_method = None  # type: ignore[assignment]
            call_module = None  # type: ignore[assignment]
            get_attr = None  # type: ignore[assignment]

            def run_node(self, node) -> Any:
                if node.op in ("placeholder", "output"):
                    return super().run_node(node)
                if node.op == "call_function":
                    target = node.target
                    args, kwargs = self.fetch_args_kwargs_from_env(node)
                    # Emit the call into the *target* graph, copying fake-value
                    # metadata from the replacement node when absent.
                    result = graph.call_function(target, args, kwargs)
                    if "val" in node.meta and "val" not in result.meta:
                        result.meta["val"] = node.meta["val"]
                        if isinstance(node.meta["val"], torch.Tensor):
                            assert "tensor_meta" in node.meta
                            result.meta["tensor_meta"] = node.meta["tensor_meta"]
                    return result
                raise NotImplementedError(f"unhandled {node}")

        output_nodes = match.output_nodes()

        if len(output_nodes) == 1:
            last_node = output_nodes[0]
        else:
            # Insert before the earliest output node in graph order.
            assert output_nodes[0]
            nodes = list(output_nodes[0].graph.nodes)
            indices = [
                (nodes.index(n), n)
                for n in output_nodes
                if isinstance(n, torch.fx.Node)
            ]
            last_node = min(indices, key=lambda tup: tup[0])[1]

        def percolate_tags(node, recompute_tag, input_stops):
            # Breadth-unordered walk from `node` back through its inputs,
            # stamping the recompute tag until an input boundary is hit.
            queue = [node]
            visited = set()

            while queue:
                arg = queue.pop()
                if (
                    arg not in visited
                    and arg not in input_stops
                    and hasattr(arg, "meta")
                ):
                    visited.add(arg)
                    arg.meta["recompute"] = recompute_tag
                    queue.extend(arg.all_input_nodes)

        with graph.inserting_before(last_node):
            # Inline the replacement graph; its outputs become the values
            # substituted for the matched output nodes.
            replacement = Replacer(replacement_graph).run(*args)
            if isinstance(replacement, torch.fx.Node):
                replacement = [replacement]

            def maybe_getitem(node):
                # Returns the getitem index if `node` is `operator.getitem`,
                # else None.
                if node.op != "call_function":
                    return None
                if node.target != operator.getitem:
                    return None
                assert len(node.args) == 2
                return node.args[1]

            def replace(old, new):
                if old is None:
                    assert new is None
                    return
                assert isinstance(old, torch.fx.Node)
                if new is None:
                    old.replace_all_uses_with(None)
                    graph.erase_node(old)
                    return
                if isinstance(new, torch.fx.Node):
                    if "val" not in new.meta:
                        new.meta.update(old.meta)

                    # Preserve the recompute tags in the replacement graph. We
                    # look at the recompute tags of the original output node to
                    # propagate the tag from the output all the way to the input
                    # args (named as args in the replace_with_graph).
                    # Note that this is best effort. Since patterns are from
                    # many to many, there is no easy way to correctly map the
                    # recomputable tags. It is possible in some scenarios that we
                    # incorrectly tag some nodes as recomputables.
                    if "recompute" in old.meta:
                        percolate_tags(new, old.meta["recompute"], args)

                    old.replace_all_uses_with(new)
                    graph.erase_node(old)
                    return

                # `new` is not a node: it's a list of nodes.
                #
                # This happens when we want to replace a node that has a single
                # packed return with multiple unpacked returns. We need to do
                # some graph surgery here.
                #
                # Example:
                #   def original_graph(x):
                #      a = op(x)
                #      b = a[0]
                #      c = a[1]
                #      ...
                #
                # Assume that we want to replace op(x) with the graph
                #   def new_op(x):
                #      w = x + 1
                #      z = x + 2
                #      return (w, z)
                #
                # We need to replace `op` with the contents of `new_op`,
                # and then rewrite a[0] to be w and a[1] to be z, as so:
                #   def new_graph(x):
                #     w = x + 1
                #     z = x + 2
                #     b = w
                #     c = z
                #     ...
                old_uses = list(old.users.keys())
                for user in old_uses:
                    idx = maybe_getitem(user)
                    if idx is None:
                        raise AssertionError("can't handle")
                    replace(user, new[idx])
                graph.erase_node(old)

            if len(output_nodes) == len(replacement):
                for old, new in zip(output_nodes, replacement):
                    replace(old, new)
            else:
                assert len(output_nodes) == 1
                replace(output_nodes[0], replacement)

        match.erase_nodes(graph)

    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node):
        self.replace_with_graph(
            match,
            graph,
            match.replacement_graph,  # type: ignore[arg-type]
            self.normalize_args(*match.args, **match.kwargs),
        )
925
+
926
+
927
+ def _return_true(match):
928
+ return True
929
+
930
+
931
def log_trace_failure(search_fn, e):
    # Record (at INFO level) that a replacement pattern could not be
    # re-traced with the real shapes from the user program; the match is
    # then simply skipped by the caller.
    log.info(
        "Replacement pattern %s failed to apply due to shape mismatch: %s",
        search_fn.__name__,
        e,
    )
937
+
938
+
939
def register_replacement(
    search_fn,
    replace_fn,
    example_inputs: Iterable[Any],
    trace_fn: Callable[[Callable[..., Any], Iterable[Any]], torch.fx.GraphModule],
    pass_dicts,
    extra_check=_return_true,
    scalar_workaround=(),
    exclusive_arg_names=(),
    search_fn_pattern=None,
):
    """
    Create a replacement rule based on example functions that get traced
    to create patterns. This supports both training and inference when
    run on a joint forward+backward graph.

    Args:
        search_fn: traced to give original pattern
        replace_fn: traced to give replacement graph
        example_inputs: example inputs for initial trace
        trace_fn: fwd_only or joint_fwd_bwd
        pass_dict: dict of passes to register to
        extra_check: additional check to run on match(using real shapes)
    """
    argnames_static = [*inspect.signature(search_fn).parameters.keys()]

    def check_fn(match: Match):
        """
        Often shapes get burned into the pattern, so our initial match ran with
        `ignore_types=(int, ...)`.

        Recheck the match with the correct shapes.
        """
        # NOTE: this closure reads `requires_grad`, which is assigned further
        # down in register_replacement before any match can run (late binding).
        argnames = list(argnames_static)
        for name in argnames:
            if name not in match.kwargs:
                raise RuntimeError(
                    f"Not all inputs to pattern found in match.kwargs. Perhaps one "
                    f"of the inputs is unused? argnames={argnames}, match.kwargs={match.kwargs}"
                )

        # Pull the fake values recorded on the matched nodes.
        args = list(
            torch.fx.map_arg(
                [match.kwargs[name] for name in argnames], lambda n: n.meta["val"]
            )
        )
        sym_args: List[torch.SymInt] = []
        with torch._dynamo.utils.detect_fake_mode(args):
            for i, grad in enumerate(requires_grad):
                if isinstance(args[i], torch.Tensor):
                    # Integer tensors cannot require grad; bail out early.
                    if grad and is_integer_dtype(args[i].dtype):
                        return False

                    # Rebuild each tensor with the requires_grad flag the
                    # pattern was traced with.
                    args[i] = torch.empty_strided(
                        args[i].size(),
                        args[i].stride(),
                        dtype=args[i].dtype,
                        device=args[i].device,
                        requires_grad=grad,
                    )
                    # Collect distinct symbolic sizes/strides.
                    for v in itertools.chain(args[i].shape, args[i].stride()):
                        if isinstance(v, torch.SymInt) and all(
                            guard_size_oblivious(v != a) for a in sym_args
                        ):
                            sym_args.append(v)

            if sym_args:
                # AOT Autograd and make fx will dedupe symbolic shape size
                # accesses of sym ints that appear as inputs
                # We don't want the sym_size uses to interfere with pattern matching
                # so we provide them as inputs.
                # Later, when we actually do the replacement, the symbolic shape
                # sizes will get re-traced and added to the graph.

                def search_fn_new(*args_new):
                    return search_fn(*args_new[len(args_new) - len(args) :])

                try:
                    specific_graph = trace_fn(search_fn_new, sym_args + args)
                except RuntimeError as e:
                    log_trace_failure(search_fn, e)
                    return False

                # correct argnames in the graph
                sym_arg_names = []
                for i, placeholder in zip(
                    range(len(sym_args) + len(args)),
                    specific_graph.graph.nodes,
                ):
                    if i < len(sym_args):
                        sym_arg_names.append(placeholder.target)
                        continue

                    # Re-create the placeholder under its user-facing name.
                    with specific_graph.graph.inserting_after(placeholder):
                        new_node = specific_graph.graph.placeholder(
                            argnames[i - len(sym_args)]
                        )
                        new_node.target = new_node.name
                    placeholder.replace_all_uses_with(new_node)
                    specific_graph.graph.erase_node(placeholder)

                argnames = sym_arg_names + argnames
            else:
                try:
                    specific_graph = trace_fn(search_fn, args)
                except RuntimeError as e:
                    log_trace_failure(search_fn, e)
                    return False

            # Re-run the match against a pattern traced with the *real* shapes.
            specific_pattern = fx_to_pattern(
                specific_graph,
                argnames=argnames,
                exclusive_arg_names=exclusive_arg_names,
                scalar_workaround=scalar_workaround,
            )
            specific_pattern_match = specific_pattern.match(match.output_nodes()[0])  # type: ignore[arg-type]
            if specific_pattern_match and extra_check(specific_pattern_match):
                # trace the pattern using the shapes from the user program
                match.replacement_graph = trace_fn(replace_fn, args)  # type: ignore[assignment]
                return True
            return False

    def normalize_args(**kwargs):
        # Reorder the matched kwargs into the positional order search_fn
        # expects, followed by any tangents_N inputs (training patterns).
        args = []
        for name in argnames_static:
            args.append(kwargs.pop(name))
        for i in range(1, len(kwargs) + 1):
            if f"tangents_{i}" not in kwargs:
                break
            args.append(kwargs.pop(f"tangents_{i}"))
        assert not kwargs, f"leftover kwargs: {kwargs!r}"
        return args

    if trace_fn is joint_fwd_bwd:
        # If inference mode is enabled during compilation, assume that we don't
        # want to match on any training graph patterns
        if torch.is_inference_mode_enabled():
            return False

    # TODO: Revisit the functionalize_rng_ops for lowmem dropout
    with functorch_config.patch(functionalize_rng_ops=False):
        requires_grad: List[bool] = [
            isinstance(x, torch.Tensor) and x.requires_grad for x in example_inputs
        ]
        if search_fn_pattern is None:
            pattern = gen_pattern(
                search_fn,
                example_inputs,
                trace_fn,
                scalar_workaround,
                exclusive_arg_names,
            )
        else:
            pattern = search_fn_pattern

        # Guard against accidentally registering the same pattern twice.
        pattern_repr = PatternPrettyPrinter.run(pattern)
        assert pattern_repr not in _seen_patterns
        _seen_patterns.add(pattern_repr)
        pattern = ReplacementPatternEntry(
            pattern=pattern,
            extra_check=check_fn,
            normalize_args=normalize_args,
        )
        pattern.register(pass_dicts)
        return pattern.pattern
1104
+
1105
+
1106
@functorch_config.patch(functionalize_rng_ops=False)
def gen_pattern(
    search_fn, example_inputs, trace_fn, scalar_workaround=(), exclusive_arg_names=()
) -> PatternExpr:
    """
    Trace `search_fn` with example inputs and convert the resulting graph
    into a PatternExpr, ignoring burned-in scalar/shape values.
    """
    argnames = [*inspect.signature(search_fn).parameters.keys()]

    if scalar_workaround == ():
        scalar_workaround = {}

    # Build the flat input list: scalars come from scalar_workaround by arg
    # name, everything else is consumed positionally from example_inputs.
    example_iter = iter(example_inputs)
    flat_inputs = [
        scalar_workaround[argname] if argname in scalar_workaround else next(example_iter)
        for argname in argnames
    ]

    search_gm = trace_fn(search_fn, flat_inputs)
    return fx_to_pattern(
        search_gm,
        ignore_types=(int, float, list, torch.device, torch.dtype),
        argnames=argnames,
        scalar_workaround=scalar_workaround,
        exclusive_arg_names=exclusive_arg_names,
    )
1132
+
1133
+
1134
def register_lowering_pattern(
    pattern: PatternExpr, extra_check=_return_true, *, pass_dict, prepend=False
):
    """
    Register an aten to inductor IR replacement pattern. The decorated
    function is saved and then called a lowering time allowing direct
    pattern to inductor IR conversion.
    """

    def decorator(handler):
        assert callable(handler)
        entry = LoweringPatternEntry(
            pattern=pattern, extra_check=extra_check, handler=handler
        )
        entry.register(pass_dict, prepend=prepend)
        # Mark the handler so lowering machinery can recognize it.
        handler._inductor_lowering_function = True
        return handler

    return decorator
1152
+
1153
+
1154
def register_graph_pattern(
    pattern: PatternExpr, extra_check=_return_true, *, pass_dict, prepend=False
):
    """
    Register a pattern that runs a function on the FX graph, allowing
    custom transformation code.
    """

    def decorator(handler):
        assert callable(handler)
        entry = GraphPatternEntry(
            pattern=pattern, extra_check=extra_check, handler=handler
        )
        entry.register(pass_dict, prepend=prepend)
        return handler

    return decorator
1170
+
1171
+
1172
def is_start_of_fx_graph(graph: torch.fx.Graph, node: torch.fx.Node) -> bool:
    """Return True iff `node` is the first node of `graph`."""
    first = next(iter(graph.nodes))
    return node is first
1175
+
1176
+
1177
# match: copy_, relu_, _set_grad_enabled, manual_seed, enter_functional_autocast, etc
# i.e. target names ending in "_", containing "_.", or containing a
# set/enter/exit/seed token -- heuristically treated as mutating ops.
_mutation_op_re = re.compile(r"_$|_[.]|(\b|_)(set|enter|exit|seed)(\b|_)")
1179
+
1180
+
1181
def is_mutation_op(node: torch.fx.Node) -> bool:
    """
    Heuristically classify `node` as mutating: its target name matches
    `_mutation_op_re`, or the call writes through an `out=` kwarg.
    """
    target_name = None
    if node.op == "call_function":
        target_name = node.target.__name__  # type: ignore[union-attr]
    elif node.op == "call_method":
        target_name = node.target  # type: ignore[assignment]
    if target_name is not None and _mutation_op_re.search(target_name):  # type: ignore[arg-type]
        return True
    return node.kwargs.get("out") is not None
1189
+
1190
+
1191
def get_mutation_region_id(graph: torch.fx.Graph, node: torch.fx.Node) -> int:
    """
    Return the mutation-region id of `node`, lazily filling in ids for any
    nodes between the nearest already-annotated predecessor and `node`.
    """
    n = node
    # Walk backwards to the closest node that already has an id (or the
    # start of the graph).
    while "mutation_region_id" not in n.meta and not is_start_of_fx_graph(graph, n):
        n = n.prev
    mutation_region_id = n.meta.get("mutation_region_id", 0)
    # Walk forward again, bumping the id at each mutation op and annotating
    # every node along the way.
    while n is not node:
        n = n.next
        if is_mutation_op(n):
            mutation_region_id += 1
        n.meta["mutation_region_id"] = mutation_region_id
    return mutation_region_id
1202
+
1203
+
1204
def should_compute_mutation_region_ids(graph: torch.fx.GraphModule) -> bool:
    """Return True if mutation-region ids have not yet been annotated on `graph`."""
    first_node = next(iter(graph.nodes))
    return "mutation_region_id" not in first_node.meta
1206
+
1207
+
1208
def compute_mutation_region_ids(graph: torch.fx.GraphModule):
    """
    Annotate every node with a "mutation_region_id": the count of mutation
    ops seen so far (each mutation op starts a new region).
    """
    region = 0
    for node in graph.nodes:
        if is_mutation_op(node):
            region += 1
        node.meta["mutation_region_id"] = region
1214
+
1215
+
1216
+ class PatternMatcherPass:
1217
+ def __init__(
1218
+ self, prevent_match_across_mutations=False, pass_name: Optional[str] = None
1219
+ ):
1220
+ super().__init__()
1221
+ self.patterns: DefaultDict[
1222
+ torch.fx.node.Target, List[PatternEntry]
1223
+ ] = defaultdict(list)
1224
+ self.prevent_match_across_mutations = prevent_match_across_mutations
1225
+ self.pass_name = pass_name
1226
+
1227
+ def __getitem__(self, item: torch.fx.node.Target) -> List[PatternEntry]:
1228
+ return self.patterns[item]
1229
+
1230
    def apply(self, graph: torch.fx.GraphModule) -> int:
        """Run every registered pattern over ``graph``; return the match count.

        Accepts either a GraphModule or a raw Graph.  Nodes are visited in
        reverse order so that replacements performed by earlier matches do
        not invalidate the traversal.
        """
        if not self.patterns:
            return 0
        if isinstance(graph, torch.fx.GraphModule):
            graph = graph.graph
        if self.prevent_match_across_mutations:
            # Mutation region ids are computed lazily and cached on the graph.
            if should_compute_mutation_region_ids(graph):
                compute_mutation_region_ids(graph)
            get_mutation_region_id_partial = functools.partial(
                get_mutation_region_id, graph
            )
        count = 0
        for node in reversed(graph.nodes):
            target = extract_target(node)
            if (
                node.op in ["call_function", "call_method", "call_module"]
                and target in self.patterns
            ):
                # conservatively not applying pattern for cpu input,
                # since some of the patterns induce codegen and split nodes.
                # Note: we will only skip cpu compute if disable_cpp_codegen=True
                if fallback_node_due_to_unsupported_type(node, allow_cpu_inputs=False):
                    continue

                for entry in self.patterns[target]:
                    # A previous entry's replacement may have erased this node.
                    if node._erased:
                        break
                    m = entry.pattern.match(node)
                    # pattern match crosses mutation barrier - discard
                    if (
                        self.prevent_match_across_mutations
                        and is_match(m)
                        and len(set(map(get_mutation_region_id_partial, m.nodes))) != 1  # type: ignore[possibly-undefined]
                    ):
                        continue
                    # Debug hook: set TORCHINDUCTOR_PATTERN_MATCH_DEBUG to a
                    # node name to log every match attempt against it.
                    if os.environ.get("TORCHINDUCTOR_PATTERN_MATCH_DEBUG") == node.name:
                        log.warning("%s%s %s %s", node, node.args, m, entry.pattern)
                    if is_match(m) and entry.extra_check(m):
                        count += 1
                        entry.apply(m, graph, node)  # type: ignore[arg-type]
                        counters["inductor"]["pattern_matcher_count"] += 1
                        counters["inductor"]["pattern_matcher_nodes"] += len(m.nodes)
        return count
1273
+
1274
    def clear(self):
        # Drop every registered pattern, leaving the pass empty.
        self.patterns.clear()
1276
+
1277
+
1278
def _not_implemented(*args, **kwargs) -> NoReturn:
    # Placeholder handler assigned to FX node kinds that fx_to_pattern's
    # Converter deliberately does not support (call_method, call_module,
    # get_attr).
    raise NotImplementedError()
1280
+
1281
+
1282
def fx_to_pattern(
    gm,
    ignore_types=(),
    argnames=(),
    scalar_workaround=(),
    exclusive_arg_names=(),
) -> PatternExpr:
    """
    Convert an FX graph into a PatternExpr. This is useful for simple
    patterns that can only match single functions and fixed-length lists.

    Args:
        gm: the traced graph (module) to convert.
        ignore_types: python types whose argument values become ``Ignored()``
            wildcards in the resulting pattern.
        argnames: names to assign to the first placeholders, in order.
        scalar_workaround: mapping of capture-name -> scalar value; matching
            scalars in the graph are turned back into ``KeywordArg``s.
        exclusive_arg_names: placeholder names that become
            ``ExclusiveKeywordArg`` instead of ``KeywordArg``.
    """
    # scalar_workaround is a hack to capture dropout_p
    # see https://github.com/pytorch/pytorch/issues/97894
    scalar_workaround = scalar_workaround or {}
    inv_scalar_workaround = {v: k for k, v in scalar_workaround.items()}
    # The inversion must be lossless: duplicate scalar values would silently
    # collapse two capture names into one.
    assert len(inv_scalar_workaround) == len(scalar_workaround)

    def process_arg(x):
        # Rewrite a single burned-in argument into its pattern form.
        if isinstance(x, (float, int)) and x in inv_scalar_workaround:
            return KeywordArg(inv_scalar_workaround[x])
        if type(x) in ignore_types:
            return Ignored()
        # A non-empty list made entirely of Ignored collapses to one Ignored.
        if isinstance(x, list) and all(isinstance(y, Ignored) for y in x) and x:
            return Ignored()
        return x

    argnum = itertools.count()

    class Converter(torch.fx.Interpreter):
        # Only placeholder/call_function/output graphs are convertible.
        call_method = _not_implemented
        call_module = _not_implemented
        get_attr = _not_implemented

        def placeholder(self, target, args, kwargs):
            n = next(argnum)
            if n < len(argnames):
                name = argnames[n]
            elif argnames:
                # Extra placeholders past argnames are expected to be
                # autograd tangents (joint graphs) and keep their own name.
                assert target.startswith("tangent")
                name = target
            else:
                target = re.sub(r"_\d+$", "", target)  # de-mangle arg name
                name = target
            if name in exclusive_arg_names:
                return ExclusiveKeywordArg(name)
            else:
                return KeywordArg(name)

        def call_function(self, target, args, kwargs):
            args, kwargs = pytree.tree_map(process_arg, (args, kwargs))
            if list in ignore_types:
                # Handle a burned in tensor size which are now [Ignored(), Ignored(), ...]
                args = [process_arg(a) for a in args]
                kwargs = {k: process_arg(a) for k, a in kwargs.items()}
            return CallFunction(target, *args, **kwargs)

        def run_node(self, n):
            rv = super().run_node(n)
            # Propagate user counts onto the pattern nodes so matching can
            # check fan-out.
            if n.op == "output" and isinstance(rv, tuple):
                assert len(rv) == len(n.args[0])
                for r, arg in zip(rv, n.args[0]):
                    r.users = len(arg.users)
            else:
                rv.users = len(n.users)
            return rv

    pattern = Converter(gm).run()
    if not isinstance(pattern, PatternExpr):
        # Multiple outputs: wrap the flattened leaves.
        return MultiOutputPattern(pytree.tree_leaves(pattern))
    return pattern
1352
+
1353
+
1354
@torch.no_grad()
def fwd_only(fn, args, *, run_dce=True) -> torch.fx.GraphModule:
    """Build a normalized inference graph, for use with fx_to_pattern.

    Traces ``fn(*args)`` under make_fx with the inductor decomposition
    table; tracing mode is symbolic when any arg is symbolic, real
    otherwise.  Dead code is eliminated unless ``run_dce`` is False.
    """
    # TODO - look into using aot autograd, asserting no mutating ops here
    with enable_python_dispatcher():
        mode = (
            "real" if not torch._inductor.utils.any_is_symbolic(*args) else "symbolic"
        )
        gm = make_fx(fn, select_decomp_table(), tracing_mode=mode)(*args)
    if run_dce:
        gm.graph.eliminate_dead_code()
    gm.recompile()
    return gm
1367
+
1368
+
1369
@torch.enable_grad()
def joint_fwd_bwd(fn, args) -> torch.fx.GraphModule:
    """Build a normalized training graph, for use with fx_to_pattern.

    Runs aot_function over ``fn`` and captures the joint forward/backward
    graph via the partition_fn hook, then strips pointless views and the
    pytree in/out codegen so the graph is a plain pattern source.
    """
    gm: Optional[torch.fx.GraphModule] = None

    def record_joint_graph(joint_graph, inputs, **kwargs):
        # aot_function only exposes the joint graph through this hook;
        # clone it before partitioning mutates/consumes it.
        nonlocal gm
        assert not gm
        gm = clone_graph(joint_graph)
        return default_partition(joint_graph, inputs, **kwargs)

    with torch._guards.tracing(None):
        aot_function(
            fn,
            lambda g, i: make_boxed_func(g),
            partition_fn=record_joint_graph,
            decompositions=select_decomp_table(),
            keep_inference_input_mutations=True,
            enable_log=False,
        )(*args)
    assert gm

    from .fx_passes.joint_graph import pointless_view

    matcher_pass = PatternMatcherPass()

    # Remove no-op aten.view calls so traced patterns are not polluted by
    # views that reshape to the same size.
    pattern = CallFunction(
        torch.ops.aten.view.default, KeywordArg("arg"), KeywordArg("size")
    )
    GraphPatternEntry(
        pattern=pattern, handler=pointless_view, extra_check=_return_true
    ).register(matcher_pass.patterns)
    matcher_pass.apply(gm.graph)  # type: ignore[arg-type]

    # remove in/out specs
    gm.graph._codegen = torch.fx.graph.CodeGen()
    gm.graph.eliminate_dead_code()
    gm.recompile()
    return gm
1408
+
1409
+
1410
+ def _args(n: torch.fx.Node) -> List[torch.fx.node.Argument]:
1411
+ args: List[torch.fx.node.Argument] = list()
1412
+ torch.fx.map_arg((n.args, n.kwargs), args.append)
1413
+ return args
1414
+
1415
+
1416
def stable_topological_sort(graph: torch.fx.Graph):
    """Reorder ``graph`` in place into a valid topological order.

    The sort is stable: nodes that are already correctly placed keep their
    relative order, and each node is emitted as soon as all of its inputs
    have been emitted.
    """
    # Nodes are in exactly one of these three collections:

    # - Nodes in `pending` are waiting to be processed (in reverse order):
    pending = list(reversed(graph.nodes))

    # - Nodes in `ready` have been processed and are already in the correct
    #   order.
    ready = set()

    # - `waiting` is a mapping from a dependency to nodes which depend on that
    #   dependency.
    waiting = defaultdict(list)

    # The cursor indicates the last processed node so we can add new nodes
    # after it.
    cursor = None
    while pending:
        node = pending.pop()
        waiting_for = [x for x in _args(node) if x not in ready]
        if waiting_for:
            # We have unprocessed input nodes. Might as well wait for the last
            # arg so an already sorted list will only recheck this node once.
            waiting[waiting_for[-1]].append(node)
        else:
            ready.add(node)
            # Physically move the node right after the cursor when it is not
            # already there.
            if cursor and cursor.next is not node:
                cursor.append(node)
            cursor = node
            # Mark the nodes that have been waiting for this node to finish as
            # ready to check again.
            pending.extend(reversed(waiting.pop(node, ())))

    # Every node must have been emitted and nothing may still be blocked;
    # a failure here means the graph had a cycle or dangling dependency.
    assert not waiting and len(ready) == len(graph.nodes)
1450
+
1451
+
1452
def init_once_fakemode(fn: Callable[..., Any]):
    """Wrapper around lazy init functions in fx_passes/.

    Returns a zero-arg callable that runs ``fn`` exactly once (lru_cache)
    under a fresh FakeTensorMode with tracing disabled, so pattern
    registration can trace example graphs without touching real tensors.
    """

    @functools.lru_cache(None)
    @functools.wraps(fn)
    def lazy_init():
        # Snapshot counters so matches fired while tracing init graphs do
        # not leak into user-visible statistics.
        counters_ref = counters["inductor"].copy()

        with torch._guards.tracing(
            None
        ), maybe_disable_fake_tensor_mode(), FakeTensorMode():
            result = fn()

        # clear view matches encountered during tracing
        counters["inductor"] = counters_ref

        return result

    return lazy_init
1471
+
1472
+
1473
def config_flag(name):
    """Build an ``extra_check`` predicate gating a pattern behind a config flag.

    The returned callable ignores the match it is given and simply reports
    the current value of ``config.<name>`` at call time.
    """

    def flag_check(match):
        return getattr(config, name)

    return flag_check
1480
+
1481
+
1482
def clone_graph(input_graph: torch.fx.GraphModule) -> torch.fx.GraphModule:
    """Deep-copy an FX GraphModule, preserving node metadata and names.

    A plain ``Transformer`` pass would drop ``meta`` and rename nodes; this
    subclass copies ``meta`` across and re-registers each node's original
    name in the new graph's namespace.
    """

    class CopyGraph(Transformer):
        def run_node(self, old_node):
            result = super().run_node(old_node)
            if isinstance(result, torch.fx.Proxy):
                result.node.meta.update(old_node.meta)
                preserved = self.new_graph._graph_namespace.create_name(
                    old_node.name, None
                )
                result.node.name = preserved
            return result

    return CopyGraph(input_graph).transform()
1494
+
1495
+
1496
+ _seen_patterns: Set[str] = set()
1497
+
1498
+
1499
def get_arg_value(
    node: torch.fx.Node, arg_number: int, kwarg_name: Optional[str] = None
):
    """Fetch a call argument by position, falling back to its keyword form.

    Returns ``node.args[arg_number]`` when enough positional args exist,
    otherwise ``node.kwargs.get(kwarg_name)`` (``None`` when absent).
    """
    if arg_number < len(node.args):
        return node.args[arg_number]
    return node.kwargs.get(kwarg_name)  # type: ignore[arg-type]
1507
+
1508
+
1509
def filter_nodes(nodes: Iterable[torch.fx.Node], fn) -> List[torch.fx.Node]:
    """Keep only the nodes whose target is ``fn`` (or one of its overloads).

    When ``fn`` is an ``OpOverloadPacket``, every concrete overload (e.g.
    ``aten.add.Tensor`` for ``aten.add``) is accepted as well.
    """
    accepted = [fn]
    if isinstance(fn, torch._ops.OpOverloadPacket):
        accepted += [getattr(fn, name) for name in fn.overloads()]

    return [candidate for candidate in nodes if candidate.target in accepted]
1515
+
1516
+
1517
def extract_target(node: Node):
    """For call_function and call_method, we directly use the target function;
    For call_module, the target is string, and we treat the module class
    as a function.
    """
    if node.op != "call_module":
        return node.target
    submodule = getattr(node.graph.owning_module, node.target)  # type: ignore[arg-type]
    return submodule.__class__
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/scheduler.py ADDED
@@ -0,0 +1,2445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import dataclasses
3
+ import functools
4
+ import itertools
5
+ import logging
6
+ import math
7
+ import operator
8
+ import os
9
+ import pprint
10
+ import textwrap
11
+ from typing import (
12
+ Any,
13
+ Counter,
14
+ DefaultDict,
15
+ Dict,
16
+ Generic,
17
+ List,
18
+ Optional,
19
+ Sequence,
20
+ Set,
21
+ Tuple,
22
+ TypeVar,
23
+ Union,
24
+ )
25
+
26
+ import sympy
27
+
28
+ import torch
29
+ from torch._dynamo.utils import dynamo_timed
30
+ from torch._inductor.metrics import get_metric_table, is_metric_table_enabled
31
+ from torch.utils._triton import has_triton
32
+
33
+ from . import comms, config, dependencies, ir, metrics
34
+ from .codegen.common import get_scheduling_for_device, Kernel
35
+ from .comm_analysis import estimate_nccl_collective_runtime
36
+ from .dependencies import Dep, MemoryDep, StarDep, WeakDep
37
+ from .ir import ComputedBuffer, MultiOutput, MultiOutputLayout
38
+ from .sizevars import SimplifyIndexing
39
+ from .utils import (
40
+ cache_on_self,
41
+ cmp,
42
+ free_symbol_has,
43
+ get_device_tflops,
44
+ get_dtype_size,
45
+ get_gpu_dram_gbps,
46
+ green_text,
47
+ is_collective,
48
+ is_wait,
49
+ red_text,
50
+ sympy_product,
51
+ )
52
+ from .virtualized import V
53
+
54
+
55
+ log = logging.getLogger(__name__)
56
+ fusion_log = torch._logging.getArtifactLogger(__name__, "fusion")
57
+
58
+
59
class WhyNoFuse:
    """Records and logs why two scheduler nodes could not be fused.

    Construction is cheap; the human-readable message is only formatted
    lazily in ``__str__``, so the fusion hot path pays nothing unless the
    fusion log artifact is enabled.
    """

    # TODO when we drop support for Python < 3.10, we can use
    # @dataclass(slots=True) instead of manually specifying __slots__.
    __slots__ = ["node1", "node2", "reason", "args"]
    reason: str
    args: Tuple[Any, ...]

    def __init__(self, node1: "BaseSchedulerNode", node2: "BaseSchedulerNode"):
        self.node1 = node1
        self.node2 = node2

    def __call__(self, reason, *args):
        # Stash the printf-style reason and emit it at debug level.
        self.reason = reason
        self.args = args
        fusion_log.debug(self)

    def __str__(self):
        lhs = self.node1.get_name()
        rhs = self.node2.get_name()
        detail = self.reason % self.args
        return f"cannot fuse {lhs} with {rhs}: " + detail
79
+
80
+
81
def pformat(obj):
    """Pretty-print ``obj`` for debug strings.

    Sets are first converted to lists sorted by string form (pprint has
    trouble with sets of sympy exprs).  A multi-line result is indented by
    four spaces and prefixed with a newline so it nests inside a larger
    debug printout.
    """
    target = sorted(obj, key=str) if isinstance(obj, set) else obj
    text = pprint.pformat(target, indent=4)
    if "\n" not in text:
        return text
    return "\n" + textwrap.indent(text, " " * 4)
89
+
90
+
91
class OutputNode:
    """Sentinel user representing a graph output.

    Wraps a single dependency so that output buffers always appear to have
    a downstream user; ``can_free`` checks for this type to keep outputs
    alive.
    """

    def __init__(self, dep):
        self.unmet_dependencies = {dep}
        self.inverse_users = []

    def get_name(self):
        return "OUTPUT"

    def is_reduction(self):
        # An output marker never computes anything itself.
        return False

    def get_alias_names(self):
        # Outputs alias nothing.
        return ()

    __repr__ = get_name
106
+
107
+
108
def _prune_redundant_deps(node, name_to_fused_node):
    """
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    """
    # Count, per fused producer, how many *real* (non-weak) deps point at it.
    name_to_dep_count: Counter[str] = collections.Counter()

    for dep in node.unmet_dependencies:
        if not isinstance(dep, WeakDep):
            name_to_dep_count[name_to_fused_node[dep.name].get_name()] += 1

    def should_prune(dep):
        if isinstance(dep, WeakDep):
            # Redundant when a real dep already orders us after the same
            # fused producer.
            is_redundant = (
                name_to_dep_count[name_to_fused_node[dep.name].get_name()] > 0
            )
            # These can occur because fused nodes always gather deps from their snodes
            # If B has a weakdep on A
            # B gets fused with C, then any time BC is fused, the weakdep will reappear
            is_self_dep = name_to_fused_node[dep.name] == node
            return is_redundant or is_self_dep
        else:
            return False

    deps_to_prune = {dep for dep in node.unmet_dependencies if should_prune(dep)}

    if deps_to_prune:
        node.unmet_dependencies = node.unmet_dependencies - deps_to_prune
        node.set_read_writes(node.read_writes.remove_reads(deps_to_prune))
141
+
142
+
143
# TODO(xmfan): reuse an existing mapping for this if it exists, or formalize this into ir.py:ExternKernel
# Maps an extern kernel's python_kernel_name string to the corresponding
# ATen op, so the scheduler can dry-run the op (under fake tensors) to
# estimate its flop count.
kernel_name_to_op = {
    "extern_kernels.convolution": torch.ops.aten.convolution,
    "extern_kernels.mm": torch.ops.aten.mm,
    "extern_kernels.bmm": torch.ops.aten.bmm,
    "extern_kernels.addmm": torch.ops.aten.addmm,
}
150
+
151
+
152
+ class BaseSchedulerNode:
153
    def __init__(self, scheduler: "Scheduler", node: ir.Buffer):
        """Wrap an IR buffer for scheduling.

        Dependencies are seeded from the buffer's reads/writes; user lists,
        ancestors and orders are filled in later by the Scheduler.
        """
        self.scheduler: Scheduler = scheduler
        self.node: ir.Buffer = node
        self.users: List[NodeUser] = []
        self.inverse_users: List[BaseSchedulerNode] = []
        self.node_users: List[BaseSchedulerNode] = []
        self.set_read_writes(node.get_read_writes())
        self.ancestors: Set[str] = set()
        # Topological-order bounds, assigned by the scheduler later
        # (declarations only — intentionally unset here).
        self.min_order: int
        self.max_order: int
        self.last_usage: Set[
            str
        ] = set()  # buffers that won't be used after this kernel
        # Whether origin comments have already been emitted for this node.
        self.written = False
167
+
168
    def __repr__(self):
        # Short form used in logs, e.g. SchedulerNode(name='buf0').
        return f"{type(self).__name__}(name={self.get_name()!r})"

    def debug_str(self) -> str:
        """Longer form printout for trace logs"""
        name = self.get_name()
        lines = [
            f"{name}: {type(self).__name__}({type(getattr(self, 'node', None)).__name__})",
            f"{name}.writes = {pformat(self.read_writes.writes)}",
            f"{name}.unmet_dependencies = {pformat(self.unmet_dependencies)}",
            f"{name}.met_dependencies = {pformat(self.read_writes.reads - self.unmet_dependencies)}",
            f"{name}.users = {self.users}",
        ]
        try:
            lines += [
                self.debug_str_extra(),
            ]
        except Exception:
            # Best-effort: subclass extras must never break trace logging.
            log.warning("Ignoring error in debug_str()", exc_info=True)

        return "\n".join(lines).rstrip()

    def debug_str_extra(self) -> str:
        # Hook for subclasses to append extra debug lines.
        return ""

    def log_details(self):
        # One-line INFO summary of this node's dependency state.
        log.info(
            "%s: unmet_dependencies = %s, writes = %s",
            self,
            self.unmet_dependencies,
            self.read_writes.writes,
        )
200
+
201
    def update_mutated_names(self, renames: Dict[str, str]):
        # Rewrite dependency names after mutations rename their buffers.
        self.set_read_writes(self.read_writes.rename(renames))

    def add_mutation_dep(self, dep):
        # Inject an extra read to order this node after a mutation.
        self.set_read_writes(self.read_writes.with_read(dep))

    def add_fake_dep(self, dep):
        # Same mechanism as add_mutation_dep: an ordering-only read.
        self.set_read_writes(self.read_writes.with_read(dep))

    def set_users(self, users: List["NodeUser"]):
        # deduplicate
        # Keyed by id(node): multiple uses of the same downstream node are
        # merged into one NodeUser.
        result: Dict[int, NodeUser] = {}
        for use in users:
            if id(use.node) in result:
                result[id(use.node)] = use.merge(result[id(use.node)])
            else:
                result[id(use.node)] = use
        self.users = list(result.values())
219
+
220
    def set_last_usage(
        self, future_used_buffers: Set[str], mutation_real_name: Dict[str, str]
    ):
        """Record which buffers die after this kernel.

        ``last_usage`` becomes the buffers this node touches (canonicalized
        through ``mutation_real_name``) that no later node uses.
        """
        used_buffers = self.used_or_aliased_buffer_names()
        used_buffers = {mutation_real_name.get(k, k) for k in used_buffers}
        self.last_usage = used_buffers - future_used_buffers

    def get_aliases(self):
        return self.node.get_alias_names()

    def get_mutations(self):
        return self.node.get_mutation_names()

    def has_aliasing_or_mutation(self):
        # True when the underlying buffer aliases or mutates other buffers.
        return bool(self.get_aliases() or self.get_mutations())

    def set_read_writes(self, rw: dependencies.ReadWrites):
        # All reads start as unmet, then are pruned against buffers the
        # scheduler has already made available.
        self.read_writes: dependencies.ReadWrites = rw
        self.unmet_dependencies = self.read_writes.reads
        self.prune_deps()
240
+
241
    def op_counts(self):
        return self.read_writes.op_counts

    def used_buffer_names(self) -> Set[str]:
        # Names of every buffer this node reads or writes.
        return {
            dep.name
            for dep in itertools.chain(self.read_writes.reads, self.read_writes.writes)
        }

    def used_or_aliased_buffer_names(self) -> Set[str]:
        """Like used_buffer_names, but also includes buffers that the
        accessed buffers alias (via AliasedLayout)."""
        used_names = set()

        for dep in itertools.chain(self.read_writes.reads, self.read_writes.writes):
            used_names.add(dep.name)
            if V.graph.name_to_buffer.get(dep.name):
                layout = V.graph.name_to_buffer[dep.name].get_layout()
                # needed to avoid deallocating aliased buffer
                # if there are still uses of aliases ahead
                if isinstance(layout, ir.AliasedLayout):
                    used_names.add(layout.view.data.get_name())
        return used_names
262
+
263
    def prune_deps(self):
        # Drop deps on buffers the scheduler has already made available.
        self.unmet_dependencies = {
            dep
            for dep in self.unmet_dependencies
            if dep.name not in self.scheduler.available_buffer_names
        }

    def prune_weak_deps(self):
        # Prune weak dependencies on buffers that have been removed
        def should_prune(dep):
            return isinstance(dep, WeakDep) and dep.name in V.graph.removed_buffers

        to_remove = {dep for dep in self.read_writes.reads if should_prune(dep)}
        self.set_read_writes(self.read_writes.remove_reads(to_remove))

    def prune_redundant_deps(self, name_to_fused_node):
        # Delegates to the module-level helper shared with fused nodes.
        _prune_redundant_deps(self, name_to_fused_node)
280
+
281
    def get_name(self) -> str:
        return self.node.get_name()

    def get_first_name(self) -> str:
        # A single-buffer node has exactly one name.
        return self.get_name()

    def get_names(self) -> Set[str]:
        return {self.get_name()}

    def get_nodes(self) -> Sequence["BaseSchedulerNode"]:
        # Fused subclasses return their constituent snodes instead.
        return [self]

    def get_device(self):
        return self.node.get_device()

    # The predicates below are conservative defaults; subclasses override
    # the ones that apply to them.
    def is_reduction(self):
        return False

    def is_split_scan(self):
        return False

    def is_template(self):
        return False

    def is_extern(self):
        return False

    def is_foreach(self):
        return False

    def can_inplace(self, read_dep: dependencies.MemoryDep):
        return False

    def has_side_effects(self):
        return False
316
+
317
    def decide_inplace_update(self):
        """
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        """
        # Nothing to reuse if this node never allocates its own buffer.
        if not self.node.should_allocate():
            return

        # Aliasing/mutating SchedulerNodes manage their storage elsewhere.
        if isinstance(self, (SchedulerNode,)) and (
            self.node.get_alias_names() or self.node.get_mutation_names()
        ):
            return

        if (
            (
                isinstance(self, (SchedulerNode,))
                # o what have i done. lets make this an api
                or (
                    isinstance(self, ExternKernelSchedulerNode)
                    and isinstance(self.node, (ir.AllReduce, ir.InPlaceHint))
                )
            )
            and config.inplace_buffers
            and (
                not isinstance(V.kernel, torch._inductor.codegen.triton.TritonKernel)
                or getattr(V.kernel, "mutations", None) is not None
            )
        ):
            from .codegen.wrapper import buffer_reuse_key

            # Sort for a deterministic choice of which input to reuse.
            ordered_reads = sorted(self.read_writes.reads, key=lambda x: x.name)

            for read in ordered_reads:
                input_node: Optional[
                    BaseSchedulerNode
                ] = self.scheduler.name_to_node.get(read.name)
                if input_node and V.graph.wrapper_code.can_reuse(input_node, self):
                    assert input_node.users is not None
                    remaining_uses = [
                        x
                        for x in input_node.users
                        if x.node.get_name()
                        not in self.scheduler.available_buffer_names
                    ]
                    # Reuse only when we are the sole remaining consumer, the
                    # layouts permit it, and the buffers are interchangeable
                    # per buffer_reuse_key (size/stride/device compatible).
                    if (
                        len(remaining_uses) == 1
                        and remaining_uses[0].can_inplace
                        and remaining_uses[0].node is self
                        and not isinstance(
                            input_node.node.get_layout(),
                            (
                                ir.MultiOutputLayout,
                                ir.MutationLayout,
                                ir.AliasedLayout,
                            ),
                        )
                        and not (
                            isinstance(
                                input_node.node, (ir.FallbackKernel, ir.MultiOutput)
                            )
                            and len(input_node.node.get_alias_names()) > 0
                        )
                        and buffer_reuse_key(input_node.node)
                        == buffer_reuse_key(self.node)
                    ):
                        # hacky check for if V.kernel is a real kernel or NullHandler
                        if hasattr(V.kernel, "args"):
                            # if there isn't a triton kernel, then we don't need to call triton-specific things.
                            # but TODO this might be a convenient place to signal to the Collective kernels to inplace
                            # (and, can we make "kernel" less generic of a name?)
                            V.kernel.args.make_inplace(
                                input_node.get_name(), self.get_name()
                            )
                            # mutations not tracked in cpp kernels
                            if isinstance(
                                V.kernel, torch._inductor.codegen.triton.TritonKernel
                            ):
                                V.kernel.mutations.add(input_node.get_name())
                                V.kernel.mutations.add(self.get_name())

                            # update last usage of reused node
                            self.last_usage.discard(input_node.get_name())

                            V.kernel.inplace_update_buffers[
                                self.get_name()
                            ] = input_node.get_name()
                        # One reuse target is enough.
                        break
404
+
405
    def allocate(self):
        """Emit allocation (or inplace-reuse) wrapper code for this node."""
        if not self.node.should_allocate():
            return

        # Aliasing/mutating nodes always take the plain allocation path.
        if isinstance(self, (SchedulerNode,)) and (
            self.node.get_alias_names() or self.node.get_mutation_names()
        ):
            V.graph.wrapper_code.codegen_allocation(self.node)
            return

        # hacky check for if V.kernel is a real kernel or NullHandler
        if (
            hasattr(V.kernel, "args")
            and self.get_name() in V.kernel.inplace_update_buffers
        ):
            # decide_inplace_update() chose a buffer to reuse; emit the
            # reuse instead of a fresh allocation.
            V.graph.wrapper_code.codegen_inplace_reuse(
                self.scheduler.name_to_node[
                    V.kernel.inplace_update_buffers[self.get_name()]
                ].node,
                self.node,
            )
        else:
            V.graph.wrapper_code.codegen_allocation(self.node)
428
+
429
    def can_free(self):
        """Whether this node's buffer may be freed after its last use."""
        # There's no real allocated buffer, no need to free it
        if isinstance(self.node.layout, ir.NoneLayout):
            return False
        for use in self.users:
            # Graph outputs must stay alive past the end of the program.
            if isinstance(use.node, OutputNode):
                return False
        return True
437
+
438
    def codegen_originating_info(self, buffer, only_once=True):
        """Write ``#pragma CMT`` origin comments for this node into ``buffer``.

        Emits the originating FX op, seq_nr, and the last stack-trace line
        for each origin.  No-op unless ``config.comment_origin`` is set;
        with ``only_once`` the comments are emitted at most once per node.
        """
        if not config.comment_origin:
            return

        if only_once and self.written:
            return
        origins = self.node.origins
        out_lines = []

        for o in origins:
            if o.op == "output":
                # These are boring and samey
                continue

            out_lines.append("")
            # TODO(voz): Should the pragma be constant somewhere?
            out_lines.append("#pragma CMT ORIGIN:")
            op_info_str = f"#pragma CMT {o.op} {o.target}"
            if "seq_nr" in o.meta:
                op_info_str = op_info_str + f" seq_nr:{o.meta['seq_nr']}"
            out_lines.append(op_info_str)
            if "stack_trace" in o.meta:
                stack_trace = f"{o.meta['stack_trace']}"
                stack_trace_last_line = stack_trace.split("|")[-1]
                # Escape braces/newlines so the comment survives later
                # format-string processing of the generated code.
                out_lines.append(
                    "#pragma CMT "
                    + stack_trace_last_line.replace("{", "{{")
                    .replace("}", "}}")
                    .replace("\n", "\\")
                )
            out_lines.append("#pragma CMT END ORIGIN")
            out_lines.append("")

        if len(out_lines) == 0:
            return

        # TODO(voz): Ostensibly, we should not need this. But there are cases where C++ codegen does
        # not use BracesBuffer, so we have no good indicator of a C++ buffer atm.
        buffer.writelines(out_lines)
        self.written = True
478
+
479
    def get_read_write_buffers_sizes(self) -> int:
        """
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size
        """
        # Nop / MultiOutput wrappers move no data themselves.
        if isinstance(self, NopKernelSchedulerNode):
            return 0
        if isinstance(self, ExternKernelSchedulerNode) and isinstance(
            self.node, MultiOutput
        ):
            return 0

        if isinstance(self, SchedulerNode):
            node_numel = V.graph.sizevars.size_hint(
                sympy_product(self.get_ranges()[0])
                * sympy_product(self.get_ranges()[1])
            )
        else:
            # Unknown iteration space: use a huge numel so the per-buffer
            # min() below always picks the buffer size instead.
            node_numel = int(1e9)
        buf_accesses = collections.defaultdict(list)
        for dep in self.read_writes.reads | self.read_writes.writes:
            buf_accesses[dep.name].append(dep)

        reads = {dep.name for dep in self.read_writes.reads}
        writes = {dep.name for dep in self.read_writes.writes}

        def is_materialized(buf, snodes):
            # A buffer written inside a fused group is only materialized if
            # some user outside the group still needs it.
            users = self.scheduler.name_to_node[buf].users
            buf_uses = {user.node for user in users}
            return len(buf_uses - set(snodes)) > 0

        if isinstance(self, FusedSchedulerNode):
            removed_buffers = {
                dep for dep in writes if not is_materialized(dep, self.snodes)
            }
            writes = writes - removed_buffers
            reads = reads - removed_buffers
        node_bytes = 0

        for buf_name in reads | writes:
            buf_accessed_elems = sum([node_numel for dep in buf_accesses[buf_name]])
            buf: Union[ir.Buffer, ir.TensorBox]
            if buf_name in V.graph.name_to_buffer:
                buf = V.graph.name_to_buffer[buf_name]
            elif buf_name in V.graph.graph_inputs:
                buf = V.graph.graph_inputs[buf_name]
            else:
                # Not a known buffer or graph input; skip it.
                continue

            def get_buf_elems(buf):
                return V.graph.sizevars.size_hint(sympy_product(buf.get_size()))

            # Kind of a lazy way to get the MultiOutput nodes corresponding to
            # a MultiOutputLayout
            if isinstance(buf.layout, MultiOutputLayout):
                users = self.scheduler.name_to_node[buf.get_name()].users
                buf_elems = sum(get_buf_elems(user.node.node) for user in users)
            else:
                buf_elems = get_buf_elems(buf)

            # Worst-case input, best-case optimization: the cheaper of
            # "every dep touches every element" and "whole buffer once".
            node_bytes += min(buf_elems, buf_accessed_elems) * get_dtype_size(
                buf.get_dtype()
            )

        return node_bytes
562
+
563
+ def get_estimated_runtime(self) -> float:
564
+ """
565
+ Returns estimated op runtime in nanoseconds (ns)
566
+ """
567
+ layout = None
568
+ dtype = None
569
+ if not hasattr(self, "node") or not self.node:
570
+ assert isinstance(
571
+ self, (FusedSchedulerNode, ForeachKernelSchedulerNode)
572
+ ), f"{type(self)=}"
573
+ assert self.snodes
574
+ if not self.snodes[0].node:
575
+ return 0
576
+ layout = self.snodes[0].node.get_layout()
577
+ dtype = self.snodes[0].node.get_dtype()
578
+ else:
579
+ layout = self.node.get_layout()
580
+ dtype = self.node.get_dtype()
581
+
582
+ if "cuda" != layout.device.type:
583
+ # default to no reordering based on runtime
584
+ return 0
585
+
586
+ # Collective kernels
587
+ if is_collective(self.node):
588
+ return estimate_nccl_collective_runtime(self.node)
589
+ elif is_wait(self.node):
590
+ # ir.Wait is only used for collective ops.
591
+ # The time needed for the collective op is already estimated and considered
592
+ # when we are processing the collective op IR node, so ir.Wait takes 0 time
593
+ # since it doesn't take extra time to get the result after the collective is completed.
594
+ return 0
595
+
596
+ try:
597
+ gpu_memory_bandwidth = get_gpu_dram_gbps()
598
+ gpu_flops = get_device_tflops(dtype) * 10**12
599
+ except Exception:
600
+ return 0
601
+
602
+ if isinstance(self, ExternKernelSchedulerNode):
603
+ assert isinstance(self.node, ir.ExternKernel), f"{type(self.node)=}"
604
+ op = kernel_name_to_op.get(
605
+ getattr(self.node, "python_kernel_name", ""), None
606
+ )
607
+
608
+ # if there is a resolved op, dry-run using fake mode and record flop count
609
+ if op is not None:
610
+ from torch._subclasses.fake_tensor import FakeTensorMode
611
+ from torch.utils.flop_counter import FlopCounterMode
612
+
613
+ with FakeTensorMode(), FlopCounterMode(
614
+ display=False
615
+ ) as flop_counter_mode:
616
+ from .ir import ir_node_to_tensor
617
+
618
+ fake_inputs = [
619
+ ir_node_to_tensor(input, guard_shape=False)
620
+ for input in self.node.inputs
621
+ ]
622
+ cls = self.node.__class__
623
+ cls.process_kernel(op, *fake_inputs, **self.node.kwargs)
624
+
625
+ # TODO(xmfan): find a better heuristic to model FLOPS/latency relationship
626
+ factor = 1.0
627
+ counted_flops = flop_counter_mode.get_total_flops()
628
+ counted_bytes = self.get_read_write_buffers_sizes()
629
+ compute_time = (factor * counted_flops / gpu_flops) * 1e9
630
+ transfer_time = counted_bytes / gpu_memory_bandwidth
631
+
632
+ # Return estimated runtime in nanoseconds
633
+ return max(compute_time, transfer_time)
634
+
635
+ elif isinstance(self, FusedSchedulerNode) or isinstance(
636
+ self.node, ComputedBuffer
637
+ ):
638
+ # Return estimated runtime in nanoseconds (bytes / gbps)
639
+ return self.get_read_write_buffers_sizes() / gpu_memory_bandwidth
640
+
641
+ return 0
642
+
643
+
644
+ class ExternKernelSchedulerNode(BaseSchedulerNode):
645
+ def debug_str_extra(self) -> str:
646
+ return f"{self.get_name()}.node.kernel = {getattr(self.node, 'python_kernel_name', None)}"
647
+
648
+ def is_extern(self):
649
+ return True
650
+
651
+ def has_side_effects(self):
652
+ return hasattr(self.node, "has_side_effects") and self.node.has_side_effects()
653
+
654
+ def can_inplace(self, read_dep: dependencies.MemoryDep):
655
+ if self.get_aliases() or self.is_template():
656
+ return False
657
+
658
+ if read_dep.name not in self.scheduler.name_to_node:
659
+ # don't allow reuse of an 'input' buffer, we don't own it
660
+ # (would this have been fixed if I tracked mutations properly above?)
661
+ return False
662
+ if not isinstance(
663
+ self.node, (torch._inductor.ir.AllReduce, torch._inductor.ir.InPlaceHint)
664
+ ):
665
+ # TODO make this a property of the IR
666
+ return False
667
+
668
+ if len(self.read_writes.writes) == 1:
669
+ write_dep = next(iter(self.read_writes.writes))
670
+ numel_diff = read_dep.get_numel() - write_dep.get_numel()
671
+ return V.graph.sizevars.simplify(numel_diff) == 0
672
+
673
+ return False
674
+
675
+
676
+ class NopKernelSchedulerNode(BaseSchedulerNode):
677
+ pass
678
+
679
+
680
+ class SchedulerNode(BaseSchedulerNode):
681
+ def __init__(
682
+ self,
683
+ scheduler: "Scheduler",
684
+ node: Union[ir.ComputedBuffer, ir.TemplateBuffer],
685
+ ):
686
+ super().__init__(scheduler, node)
687
+ self._compute_attrs()
688
+
689
+ def _compute_attrs(
690
+ self,
691
+ extra_indexing_constraints: Optional[Tuple[Dict[Any, Any], List[Any]]] = None,
692
+ ):
693
+ assert isinstance(self.node, (ir.ComputedBuffer, ir.TemplateBuffer))
694
+ self._sizes, self._body = self.node.simplify_and_reorder(
695
+ extra_indexing_constraints=extra_indexing_constraints
696
+ )
697
+
698
+ group_fn = self.scheduler.get_backend(self.node.get_device()).group_fn
699
+ self.group = (self.node.get_device(), group_fn(self._sizes))
700
+
701
+ if isinstance(self.node, ir.TemplateBuffer):
702
+ self.set_read_writes(self.node.normalized_read_writes())
703
+ else:
704
+ self.set_read_writes(
705
+ dependencies.extract_read_writes(
706
+ self._body, *self._sizes, normalize=True
707
+ )
708
+ )
709
+
710
+ def recompute_size_and_body(
711
+ self, extra_indexing_constraints: Tuple[Dict[Any, Any], List[Any]]
712
+ ):
713
+ self._compute_attrs(extra_indexing_constraints=extra_indexing_constraints)
714
+
715
+ def debug_str_extra(self) -> str:
716
+ name = self.get_name()
717
+ lines = [
718
+ f"{name}.group.device = {self.group[0]}",
719
+ f"{name}.group.iteration = {self.group[1]}",
720
+ f"{name}.sizes = {self._sizes}",
721
+ ]
722
+ if self.get_aliases():
723
+ lines.append(f"{name}.aliases = {pformat(self.get_aliases())}")
724
+ if self.get_mutations():
725
+ lines.append(f"{name}.mutations = {pformat(self.get_mutations())}")
726
+ if isinstance(self._body, ir.LoopBody):
727
+ lines.append(f"class {name}_loop_body:")
728
+ lines.append(textwrap.indent(self._body.debug_str(), " "))
729
+ return "\n".join(lines)
730
+
731
+ def get_ranges(self):
732
+ return self._sizes
733
+
734
+ def is_reduction(self):
735
+ assert isinstance(
736
+ self.node, (ir.ComputedBuffer, ir.TemplateBuffer)
737
+ ), f"{type(self.node)=}"
738
+ return bool(self.node.get_reduction_type())
739
+
740
+ def is_split_scan(self):
741
+ assert isinstance(
742
+ self.node, (ir.ComputedBuffer, ir.TemplateBuffer)
743
+ ), f"{type(self.node)=}"
744
+ return isinstance(self.node, ir.ComputedBuffer) and isinstance(
745
+ self.node.data, ir.SplitScan
746
+ )
747
+
748
+ def is_template(self):
749
+ return isinstance(self.node, ir.TemplateBuffer)
750
+
751
+ def get_template_node(self):
752
+ return self.node if self.is_template() else None
753
+
754
+ def run(self, *index_vars):
755
+ self.decide_inplace_update()
756
+ self.mark_run()
757
+ self.codegen(index_vars)
758
+
759
+ def mark_run(self):
760
+ self.allocate()
761
+
762
+ def ranges_from_index_vars(self, index_vars):
763
+ sizes = self._sizes
764
+ assert sum(map(len, sizes)) == sum(map(len, index_vars))
765
+ var_ranges = dict(
766
+ zip(
767
+ itertools.chain.from_iterable(index_vars),
768
+ itertools.chain.from_iterable(sizes),
769
+ )
770
+ )
771
+ return var_ranges
772
+
773
+ def codegen(self, index_vars):
774
+ var_ranges = self.ranges_from_index_vars(index_vars)
775
+ try:
776
+ with V.set_ops_handler(
777
+ SimplifyIndexing(V.get_ops_handler(), var_ranges)
778
+ ), V.kernel.set_current_node(self):
779
+ self._body(*index_vars)
780
+ except Exception:
781
+ log.fatal("Error in codegen for %s", self.node)
782
+ raise
783
+
784
+ def pointwise_read_writes(self):
785
+ """
786
+ Get the memory dependencies in the non-reduction axis.
787
+ """
788
+ sizes, reduction_sizes = self._sizes
789
+
790
+ def fn(index):
791
+ return self._body(index, [sympy.Integer(0) for _ in reduction_sizes])
792
+
793
+ return dependencies.extract_read_writes(fn, sizes)
794
+
795
+ def can_inplace(self, read_dep: dependencies.MemoryDep):
796
+ if self.get_aliases() or self.is_template():
797
+ return False
798
+ if len(self.read_writes.writes) == 1 and isinstance(
799
+ read_dep, dependencies.MemoryDep
800
+ ):
801
+ write_dep = next(iter(self.read_writes.writes))
802
+ assert isinstance(write_dep, dependencies.MemoryDep), f"{type(write_dep)=}"
803
+ return read_dep.index == write_dep.index and read_dep.size == write_dep.size
804
+ return False
805
+
806
+ @cache_on_self
807
+ def _get_atomic_add_buffers(self) -> Set[str]:
808
+ buffers_store_as_atomic_add = set()
809
+ if isinstance(self._body, ir.LoopBody):
810
+ for node in self._body.get_nodes():
811
+ if (
812
+ node.op == "call_method"
813
+ and node.target == "store"
814
+ and (
815
+ ("mode" in node.kwargs and node.kwargs["mode"] == "atomic_add")
816
+ or (len(node.args) == 5 and node.args[4] == "atomic_add")
817
+ )
818
+ ):
819
+ buffers_store_as_atomic_add.add(
820
+ node.kwargs["name"]
821
+ if "name" in node.kwargs
822
+ else (node.args[1] if len(node.args) >= 2 else "")
823
+ )
824
+ return buffers_store_as_atomic_add
825
+
826
+ def has_atomic_add(self, check_buf):
827
+ return check_buf in self._get_atomic_add_buffers()
828
+
829
+
830
+ class FusedSchedulerNode(BaseSchedulerNode):
831
+ """
832
+ This is a "fake" scheduler node that represents a group of scheduler nodes
833
+ that are meant to be fused together. The way it does this is by maintaining
834
+ its unmet dependencies as the union of its constituent nodes.
835
+ """
836
+
837
+ @classmethod
838
+ def fuse(cls, node1: BaseSchedulerNode, node2: BaseSchedulerNode):
839
+ assert node1.scheduler is node2.scheduler
840
+ assert isinstance(node1, (SchedulerNode, FusedSchedulerNode)) and isinstance(
841
+ node2, (SchedulerNode, FusedSchedulerNode)
842
+ )
843
+ return cls(node1.scheduler, list(node1.get_nodes()) + list(node2.get_nodes())) # type: ignore[arg-type]
844
+
845
+ def __init__(self, scheduler: "Scheduler", snodes: List[SchedulerNode]):
846
+ # NB: No need to call super().__init__() because we don't need to re-use any of its logic.
847
+ self.snodes = snodes
848
+ self.scheduler = scheduler
849
+ self.node: ir.Buffer = None # type: ignore[assignment]
850
+ self.users: List[NodeUser] = []
851
+ self.inverse_users = []
852
+ self.node_users = []
853
+ self.group = max(snodes, key=lambda x: int(x.is_reduction())).group
854
+ self.ancestors = set.union(
855
+ *[x.ancestors for x in snodes if x.ancestors is not None]
856
+ )
857
+
858
+ self.set_read_writes(
859
+ dependencies.ReadWrites.merge_list([x.read_writes for x in snodes])
860
+ )
861
+
862
+ self.unmet_dependencies = {
863
+ dep
864
+ for dep in set.union(*[x.unmet_dependencies for x in snodes])
865
+ if dep.name not in self.get_names()
866
+ } - self.read_writes.writes
867
+ self.min_order = min([x.min_order for x in self.snodes])
868
+ self.max_order = max([x.max_order for x in self.snodes])
869
+
870
+ @cache_on_self
871
+ def get_name(self) -> str:
872
+ return "_".join([x.get_name() for x in self.snodes])
873
+
874
+ def get_first_name(self) -> str:
875
+ return self.snodes[0].get_name()
876
+
877
+ @cache_on_self
878
+ def get_names(self) -> Set[str]:
879
+ return set.union(*[x.get_names() for x in self.snodes])
880
+
881
+ def debug_str_extra(self) -> str:
882
+ lines = [
883
+ f"{self.get_name()}.snodes[{i}] =\n{node.debug_str()}"
884
+ for i, node in enumerate(self.snodes)
885
+ ]
886
+ return textwrap.indent("\n".join(lines).rstrip(), " ")
887
+
888
+ def set_last_usage(
889
+ self, future_used_buffers: Set[str], mutation_real_name: Dict[str, str]
890
+ ):
891
+ # Set self.last_usage using the global information
892
+ # This will be used for inter-kernel optimisations
893
+ super().set_last_usage(future_used_buffers, mutation_real_name)
894
+ # Set self.last_usage on the snodes
895
+ # This will be used for optimisations within the kernel
896
+ future_used_buffers: Set[str] = set()
897
+ for node in reversed(self.snodes):
898
+ node.set_last_usage(future_used_buffers, mutation_real_name)
899
+ future_used_buffers.update(node.last_usage) # type: ignore[arg-type]
900
+
901
+ @cache_on_self
902
+ def used_buffer_names(self) -> Set[str]:
903
+ return set.union(*[x.used_buffer_names() for x in self.snodes])
904
+
905
+ @cache_on_self
906
+ def used_or_aliased_buffer_names(self) -> Set[str]:
907
+ return set.union(*[x.used_or_aliased_buffer_names() for x in self.snodes])
908
+
909
+ def get_nodes(self) -> List[SchedulerNode]:
910
+ return self.snodes
911
+
912
+ def __repr__(self):
913
+ return f"{type(self).__name__}(nodes={self.get_name()})"
914
+
915
+ @cache_on_self
916
+ def is_reduction(self):
917
+ return any(x.is_reduction() for x in self.snodes)
918
+
919
+ @cache_on_self
920
+ def is_split_scan(self):
921
+ return any(x.is_split_scan() for x in self.snodes)
922
+
923
+ @cache_on_self
924
+ def is_template(self):
925
+ return any(x.is_template() for x in self.snodes)
926
+
927
+ @cache_on_self
928
+ def get_template_node(self):
929
+ for node in self.snodes:
930
+ if node.is_template():
931
+ return node
932
+ return None
933
+
934
+ def get_device(self):
935
+ return self.group[0]
936
+
937
+ @cache_on_self
938
+ def has_aliasing_or_mutation(self):
939
+ return any(x.has_aliasing_or_mutation() for x in self.snodes)
940
+
941
+ @cache_on_self
942
+ def op_counts(self):
943
+ op_counts: Counter[str] = collections.Counter()
944
+ for node in self.snodes:
945
+ op_counts.update(node.op_counts())
946
+ return op_counts
947
+
948
+ def has_atomic_add(self, check_buf):
949
+ return any(
950
+ (
951
+ isinstance(sub_schedule_node1, SchedulerNode)
952
+ and sub_schedule_node1.has_atomic_add(check_buf)
953
+ )
954
+ for sub_schedule_node1 in self.get_nodes()
955
+ )
956
+
957
+ # None of these need to be implemented, as a FusedSchedulerNode is just an
958
+ # abstraction for scheduling purposes
959
+ def update_mutated_names(self, renames: Dict[str, str]):
960
+ raise NotImplementedError
961
+
962
+ def add_mutation_dep(self, name):
963
+ raise NotImplementedError
964
+
965
+ def set_users(self, users: List["NodeUser"]):
966
+ raise NotImplementedError
967
+
968
+ def get_aliases(self):
969
+ raise NotImplementedError
970
+
971
+ def get_mutations(self):
972
+ raise NotImplementedError
973
+
974
+ def can_inplace(self, read_dep: dependencies.MemoryDep):
975
+ raise NotImplementedError
976
+
977
+ def allocate(self):
978
+ raise NotImplementedError
979
+
980
+ def can_free(self):
981
+ raise NotImplementedError
982
+
983
+ def debug_str(self) -> str:
984
+ """Longer form printout for trace logs"""
985
+ name = self.get_name()
986
+ node_typestr = ",".join(type(n).__name__ for n in self.snodes)
987
+ lines = [
988
+ f"{name}: {type(self).__name__}({node_typestr})",
989
+ f"{name}.writes = {pformat(self.read_writes.writes)}",
990
+ f"{name}.unmet_dependencies = {pformat(self.unmet_dependencies)}",
991
+ f"{name}.met_dependencies = {pformat(self.read_writes.reads - self.unmet_dependencies)}",
992
+ f"{name}.users = {self.users}",
993
+ ]
994
+ try:
995
+ lines += [
996
+ self.debug_str_extra(),
997
+ ]
998
+ except Exception:
999
+ log.warning("Ignoring error in debug_str()", exc_info=True)
1000
+
1001
+ return "\n".join(lines).rstrip()
1002
+
1003
+
1004
+ class ForeachKernelSchedulerNode(FusedSchedulerNode):
1005
+ """Scheduler node which consists of a list of scheduler nodes that each operate on a
1006
+ distinct tensor in a list of tensors."""
1007
+
1008
+ def get_consumer_subnode_for(self, producer):
1009
+ if producer.get_name() in self.read_to_node:
1010
+ return self.read_to_node[producer.get_name()]
1011
+
1012
+ return None
1013
+
1014
+ def get_producer_subnode_for(self, consumer):
1015
+ for rd in consumer.read_writes.reads:
1016
+ if rd.name in self.name_to_node:
1017
+ return self.name_to_node[rd.name]
1018
+
1019
+ return None
1020
+
1021
+ @classmethod
1022
+ def can_fuse(cls, producer, consumer):
1023
+ why = WhyNoFuse(producer, consumer)
1024
+ if producer.is_foreach() and consumer.is_foreach():
1025
+ foreach_match = len(producer.snodes) == len(consumer.snodes)
1026
+ if not foreach_match:
1027
+ why("foreach do not have same length")
1028
+ return foreach_match and all(
1029
+ producer.scheduler.can_fuse(l, r)
1030
+ for l, r in zip(producer.snodes, consumer.snodes)
1031
+ )
1032
+ elif consumer.is_foreach():
1033
+ consumer_subnode = consumer.get_consumer_subnode_for(producer)
1034
+ if consumer_subnode is not None:
1035
+ return consumer.scheduler.can_fuse(producer, consumer_subnode)
1036
+
1037
+ why("candidate producer is not dep of any foreach consumer")
1038
+ return False
1039
+
1040
+ elif producer.is_foreach():
1041
+ producer_subnode = producer.get_producer_subnode_for(consumer)
1042
+ if producer_subnode is not None:
1043
+ return producer.scheduler.can_fuse(producer_subnode, consumer)
1044
+
1045
+ why("candidate consumer has no dep in any foreach producer")
1046
+ return False
1047
+
1048
+ raise AssertionError(
1049
+ "At least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node"
1050
+ )
1051
+
1052
+ @classmethod
1053
+ def fuse(cls, producer, consumer):
1054
+ assert producer.is_foreach() or consumer.is_foreach()
1055
+ prev_node_1 = None
1056
+ prev_node_2 = None
1057
+ if producer.is_foreach() and consumer.is_foreach():
1058
+ fused_nodes = [
1059
+ FusedSchedulerNode.fuse(l, r)
1060
+ for l, r in zip(producer.snodes, consumer.snodes)
1061
+ ]
1062
+ elif producer.is_foreach():
1063
+ producer_subnode = producer.get_producer_subnode_for(consumer)
1064
+ fused_nodes = []
1065
+ prev_node_1 = producer
1066
+ prev_node_2 = None
1067
+ for node in producer.snodes:
1068
+ if node is producer_subnode:
1069
+ new_node = FusedSchedulerNode.fuse(node, consumer)
1070
+ prev_node_2 = new_node
1071
+ fused_nodes.append(new_node)
1072
+ else:
1073
+ fused_nodes.append(node)
1074
+
1075
+ elif consumer.is_foreach():
1076
+ consumer_subnode = consumer.get_consumer_subnode_for(producer)
1077
+ fused_nodes = []
1078
+ prev_node_1 = consumer
1079
+ prev_node_2 = None
1080
+
1081
+ for node in consumer.snodes:
1082
+ if node is consumer_subnode:
1083
+ new_node = FusedSchedulerNode.fuse(producer, node)
1084
+ prev_node_2 = new_node
1085
+ fused_nodes.append(new_node)
1086
+ else:
1087
+ fused_nodes.append(node)
1088
+
1089
+ return cls(producer.scheduler, fused_nodes, prev_node_1, prev_node_2) # type: ignore[possibly-undefined]
1090
+
1091
+ def __init__(
1092
+ self,
1093
+ scheduler: "Scheduler",
1094
+ nodes: List[SchedulerNode],
1095
+ prev_node_1=None,
1096
+ prev_node_2=None,
1097
+ ):
1098
+ self.read_to_node = {}
1099
+ self.name_to_node = {}
1100
+
1101
+ if prev_node_1 is None or prev_node_2 is None:
1102
+ super().__init__(scheduler, nodes)
1103
+
1104
+ for node in nodes:
1105
+ for read in node.read_writes.reads:
1106
+ self.read_to_node[read.name] = node
1107
+
1108
+ for name in node.get_names():
1109
+ self.name_to_node[name] = node
1110
+ else:
1111
+ self.scheduler = scheduler
1112
+ self.snodes = nodes
1113
+ self.node: ir.Buffer = None # type: ignore[assignment]
1114
+ self.users: List[NodeUser] = []
1115
+
1116
+ self.set_read_writes(
1117
+ dependencies.ReadWrites.merge_list(
1118
+ [prev_node_1.read_writes, prev_node_2.read_writes]
1119
+ )
1120
+ )
1121
+
1122
+ self.unmet_dependencies = {
1123
+ dep
1124
+ for dep in set.union(
1125
+ prev_node_1.unmet_dependencies, prev_node_2.unmet_dependencies
1126
+ )
1127
+ if dep.name not in self.get_names()
1128
+ } - self.read_writes.writes
1129
+
1130
+ self.min_order = min([prev_node_1.min_order, prev_node_2.min_order])
1131
+ self.max_order = max([prev_node_1.max_order, prev_node_2.max_order])
1132
+
1133
+ foreach_node = prev_node_1 if prev_node_1.is_foreach() else prev_node_2
1134
+ other_node = prev_node_2 if prev_node_1.is_foreach() else prev_node_1
1135
+
1136
+ self.ancestors = foreach_node.ancestors
1137
+ self.ancestors.update(other_node.ancestors)
1138
+
1139
+ self.name_to_node = foreach_node.name_to_node
1140
+ for name in other_node.get_names():
1141
+ self.name_to_node[name] = other_node
1142
+
1143
+ self.group = (nodes[0].get_device(), "foreach")
1144
+
1145
+ self.origins: Set[torch.fx.Node] = set()
1146
+
1147
+ def mark_run(self):
1148
+ raise NotImplementedError
1149
+
1150
+ def codegen(self):
1151
+ assert isinstance(self.node, ir.ComputedBuffer), f"{type(self.node)=}"
1152
+ self.node.get_store_function()(self.node.make_loader()())
1153
+
1154
+ def can_free(self):
1155
+ return NotImplementedError
1156
+
1157
+ def is_foreach(self):
1158
+ return True
1159
+
1160
+ def get_subkernel_nodes(self):
1161
+ """Returns a list of nodes which comprise the foreach kernel, operating on corresponding elements of our input lists.
1162
+ These nodes may be vertically fused."""
1163
+ return list(self.snodes)
1164
+
1165
+ def get_nodes(self):
1166
+ """Returns all nodes contained in this kernel, unpacking fused nodes into their constituent scheduler nodes."""
1167
+ return list(itertools.chain.from_iterable(x.get_nodes() for x in self.snodes))
1168
+
1169
+ def get_first_name(self):
1170
+ return self.snodes[0].get_first_name()
1171
+
1172
+ def prune_redundant_deps(self, name_to_fused_node):
1173
+ _prune_redundant_deps(self, name_to_fused_node)
1174
+
1175
+ for node in self.snodes:
1176
+ node.prune_redundant_deps(name_to_fused_node)
1177
+
1178
+
1179
+ def pick_loop_order(stride_lengths, sizes, priority_idx=()):
1180
+ """
1181
+ A heuristic to decide loop iteration orders. This has not been well
1182
+ tuned and may be something we should autotune.
1183
+ """
1184
+
1185
+ @functools.cmp_to_key
1186
+ def index_cmp(a, b):
1187
+ if sizes[a] == 1 or sizes[b] == 1:
1188
+ # 1-sizes don't matter, just move them to the end
1189
+ return cmp(sizes[a] == 1, sizes[b] == 1)
1190
+
1191
+ stride_len_a = [sl[a] for sl in stride_lengths]
1192
+ stride_len_b = [sl[b] for sl in stride_lengths]
1193
+
1194
+ # equivalent to
1195
+ # np.logical_or(stride_lengths[:, b] == 0, stride_lengths[:, a] < stride_lengths[:, b]).all()
1196
+ a_first = sum(
1197
+ sl_b == 0 or sl_a < sl_b for sl_a, sl_b in zip(stride_len_a, stride_len_b)
1198
+ )
1199
+ b_first = sum(
1200
+ sl_a == 0 or sl_b < sl_a for sl_a, sl_b in zip(stride_len_a, stride_len_b)
1201
+ )
1202
+ if a_first > b_first:
1203
+ return -1
1204
+ if b_first > a_first:
1205
+ return 1
1206
+
1207
+ # otherwise contiguous
1208
+ return cmp(b, a)
1209
+
1210
+ order = list(reversed(range(len(stride_lengths[0]))))
1211
+ if len(priority_idx) > 0:
1212
+ # if we have priority node, only use that node's order
1213
+ stride_lengths = [stride_lengths[pi] for pi in priority_idx]
1214
+ if config.pick_loop_orders:
1215
+ order.sort(key=index_cmp)
1216
+ return order
1217
+
1218
+
1219
+ @dataclasses.dataclass
1220
+ class NodeUser:
1221
+ node: BaseSchedulerNode
1222
+ can_inplace: bool = False
1223
+
1224
+ # A weak user must be scheduled after a given node, but doesn't actually
1225
+ # use the result
1226
+ is_weak: bool = False
1227
+
1228
+ def __hash__(self):
1229
+ return hash((self.node.get_name(), self.can_inplace, self.is_weak))
1230
+
1231
+ def __eq__(self, other):
1232
+ return (
1233
+ self.get_name() == other.get_name()
1234
+ and self.can_inplace == other.can_inplace
1235
+ and self.is_weak == other.is_weak
1236
+ )
1237
+
1238
+ def get_name(self):
1239
+ return self.node.get_name()
1240
+
1241
+ def merge(self, other: "NodeUser") -> "NodeUser":
1242
+ assert self.node is other.node
1243
+ return NodeUser(
1244
+ self.node,
1245
+ self.can_inplace and other.can_inplace,
1246
+ self.is_weak and other.is_weak,
1247
+ )
1248
+
1249
+
1250
+ _post_grad_graph_counter = itertools.count()
1251
+
1252
+
1253
+ class Scheduler:
1254
+ @dynamo_timed
1255
+ def __init__(self, nodes):
1256
+ super().__init__()
1257
+ self.backends = {}
1258
+ self.fuse_cache = {}
1259
+ self.post_grad_graph_id = next(_post_grad_graph_counter)
1260
+
1261
+ self.nodes = []
1262
+ self.available_buffer_names = {
1263
+ *V.graph.graph_inputs.keys(),
1264
+ *V.graph.constants.keys(),
1265
+ }
1266
+
1267
+ self.nodes = [self.create_scheduler_node(n) for n in nodes]
1268
+
1269
+ # some new constants could have been created above
1270
+ self.available_buffer_names.update(V.graph.constants.keys())
1271
+ for node in self.nodes:
1272
+ node.prune_deps()
1273
+
1274
+ self.name_to_node: Dict[str, BaseSchedulerNode] = {
1275
+ n.get_name(): n for n in self.nodes
1276
+ }
1277
+ self.name_to_fused_node: Dict[
1278
+ str, BaseSchedulerNode
1279
+ ] = dict() # set in fuse_nodes()
1280
+
1281
+ # mutation_real_name: Maps back to the original name for codegen
1282
+ # Example:
1283
+ # If you mutate buf0 inside of buf1's kernel, then:
1284
+ # mutation_real_name = {"buf0" : "buf1"}
1285
+ # all subsequent uses of buf0 become buf1's usage in dependency graph
1286
+ self.mutation_real_name = {}
1287
+
1288
+ # We handle mutation by renaming modified versions of the same
1289
+ # buffer in the dependency graph to prevent cycles.
1290
+ # mutation_renames: tracks the current name for a given buffer
1291
+ # (changed once per mutation)
1292
+ # Example:
1293
+ # If you mutate buf0 inside of buf1's kernel, then:
1294
+ # mutation_renames = {"buf1" : "buf0"}
1295
+ # in codegen we only use buf0, never buf1
1296
+ self.mutation_renames = {}
1297
+
1298
+ self.compute_dependencies()
1299
+ self.topological_sort_schedule()
1300
+ self.dead_node_elimination()
1301
+ if config.reorder_for_compute_comm_overlap:
1302
+ comms.decide_global_ordering_of_comms(self.nodes)
1303
+ self.compute_ancestors()
1304
+
1305
+ metrics.ir_nodes_pre_fusion += len(self.nodes)
1306
+ V.debug.ir_pre_fusion(self.nodes)
1307
+ self.num_orig_nodes = len(self.nodes)
1308
+ self.name_to_fused_node = {n.get_name(): n for n in self.nodes}
1309
+ self.create_foreach_nodes()
1310
+ self.topological_sort_schedule()
1311
+ self.logged_slow_fusion = set()
1312
+ self.fuse_nodes()
1313
+ if config.reorder_for_compute_comm_overlap:
1314
+ # Refresh node_users and inverse_users to reflect fused nodes
1315
+ self.compute_node_users()
1316
+ self.nodes = comms.reorder_compute_and_comm_for_overlap(self.nodes)
1317
+ self.compute_last_usage()
1318
+ V.debug.ir_post_fusion(self.nodes)
1319
+ V.debug.graph_diagram(self.nodes)
1320
+ self.debug_draw_graph()
1321
+
1322
+ # used during codegen:
1323
+ self.current_device: torch.device = None # type: ignore[assignment]
1324
+ self.buffer_names_to_free = set()
1325
+
1326
+ # fx graph node to the position it appears in the graph
1327
+ # for debug attribution
1328
+ self.origin_to_index = {}
1329
+
1330
+ get_metric_table("graph_stats").add_row(
1331
+ lambda: {
1332
+ "graph_id": self.post_grad_graph_id,
1333
+ "num_nodes_before_fusion": self.num_orig_nodes,
1334
+ "num_nodes_after_fusion": len(self.nodes),
1335
+ }
1336
+ )
1337
+
1338
+ def debug_draw_graph(self):
1339
+ """Generate an image of the graph for debugging"""
1340
+ if os.environ.get("INDUCTOR_WRITE_SCHEDULER_GRAPH", None) == "1":
1341
+ from .debug import draw_buffers
1342
+
1343
+ draw_buffers(self.nodes, print_graph=True)
1344
+
1345
+ def debug_print_nodes(self, label):
1346
+ if log.isEnabledFor(logging.INFO):
1347
+ log.info("%s:", label)
1348
+ for node in self.nodes:
1349
+ node.log_details()
1350
+
1351
+ def create_scheduler_node(self, node):
1352
+ assert (
1353
+ node.origins is not None
1354
+ ), "All nodes passed to scheduling must have an origin"
1355
+ if node.is_no_op():
1356
+ return NopKernelSchedulerNode(self, node)
1357
+ elif isinstance(node, (ir.ComputedBuffer, ir.TemplateBuffer)):
1358
+ return SchedulerNode(self, node)
1359
+ elif isinstance(node, ir.ExternKernel):
1360
+ return ExternKernelSchedulerNode(self, node)
1361
+ else:
1362
+ raise NotImplementedError(node)
1363
+
1364
+ def create_foreach_nodes(self):
1365
+ removed_node_names = set()
1366
+ fe_nodes = []
1367
+ kept_node_names = self.name_to_fused_node.keys()
1368
+
1369
+ for names in V.graph.lists.values():
1370
+ names = [
1371
+ name
1372
+ for name in names
1373
+ if name in kept_node_names
1374
+ and not isinstance(self.name_to_node[name], NopKernelSchedulerNode)
1375
+ ]
1376
+ if not names:
1377
+ # All nodes eliminated
1378
+ continue
1379
+
1380
+ removed_node_names.update(names)
1381
+ snodes = [self.name_to_node[name] for name in names]
1382
+
1383
+ fe_node = ForeachKernelSchedulerNode(self, snodes) # type: ignore[arg-type]
1384
+
1385
+ fe_nodes.append(fe_node)
1386
+
1387
+ for name in names:
1388
+ self.name_to_fused_node[name] = fe_node
1389
+
1390
+ self.nodes = [
1391
+ node for node in self.nodes if node.get_name() not in removed_node_names
1392
+ ] + fe_nodes
1393
+
1394
+ def compute_dependencies(self):
1395
+ """
1396
+ Create dependency edges between nodes, handling aliasing and
1397
+ mutation properly.
1398
+ """
1399
+
1400
+ T = TypeVar("T")
1401
+
1402
+ class DedupList(Generic[T]):
1403
+ """
1404
+ This data structure behaves like a list except it makes sure the
1405
+ elements remain unique.
1406
+ Normally one could use a set/dict for this purpose however
1407
+ the list in question gets elements appended as it is being
1408
+ iterated over which means that we need to keep the list
1409
+ semantics.
1410
+ """
1411
+
1412
+ def __init__(self, items=None, membership=None):
1413
+ self.items = items or list()
1414
+ self.membership = membership or set()
1415
+
1416
+ def append(self, node_user: T) -> None:
1417
+ if node_user in self.membership:
1418
+ return
1419
+ self.items.append(node_user)
1420
+ self.membership.add(node_user)
1421
+
1422
+ def __add__(self, other: "DedupList[T]") -> "DedupList[T]":
1423
+ new_membership = set.union(self.membership, other.membership)
1424
+ new_items = self.items + [
1425
+ x for x in other.items if x not in self.membership
1426
+ ]
1427
+ return DedupList(new_items, new_membership)
1428
+
1429
+ name_to_users: DefaultDict[str, DedupList[NodeUser]] = collections.defaultdict(
1430
+ DedupList
1431
+ )
1432
+
1433
+ # handle aliasing by using python aliasing in name_to_users
1434
+ # if foo aliases bar then we will make name_to_users["foo"] point
1435
+ # to the same python list as name_to_users["bar"]
1436
+ for node1 in self.nodes:
1437
+ node1_name = node1.get_name()
1438
+ for node2_name in node1.get_aliases():
1439
+ if node1_name in name_to_users and node2_name in name_to_users:
1440
+ # merge the two
1441
+ list1 = name_to_users[node1_name]
1442
+ list2 = name_to_users[node2_name]
1443
+ combined = list1 + list2
1444
+ for key in name_to_users.keys():
1445
+ if name_to_users[key] is list1 or name_to_users[key] is list2:
1446
+ name_to_users[key] = combined
1447
+ elif node1_name in name_to_users:
1448
+ name_to_users[node2_name] = name_to_users[node1_name]
1449
+ else:
1450
+ name_to_users[node1_name] = name_to_users[node2_name]
1451
+
1452
+ def rename(n):
1453
+ if n in self.mutation_renames:
1454
+ return rename(self.mutation_renames[n])
1455
+ return n
1456
+
1457
+ def dep_closure(node_name):
1458
+ reachable_names = {node_name}
1459
+ node = self.name_to_node[node_name]
1460
+ write_dep = next(iter(node.read_writes.writes))
1461
+ for read_dep in node.read_writes.reads:
1462
+ if (
1463
+ read_dep.name in self.name_to_node
1464
+ and isinstance(read_dep, dependencies.MemoryDep)
1465
+ and isinstance(write_dep, dependencies.MemoryDep)
1466
+ and read_dep.index == write_dep.index
1467
+ and read_dep.size == write_dep.size
1468
+ ):
1469
+ reachable_names.update(dep_closure(read_dep.name))
1470
+ return reachable_names
1471
+
1472
+ def add_user(used_by_name, user_node, can_inplace=False, is_weak=False):
1473
+ name_to_users[rename(used_by_name)].append(
1474
+ NodeUser(user_node, can_inplace, is_weak)
1475
+ )
1476
+
1477
+ unbacked_symbol_to_origin_node = {}
1478
+
1479
+ for node in self.nodes:
1480
+ log.debug("scheduling %s", node.node)
1481
+
1482
+ # unbacked symbols don't follow ordinary buffer dependencies, so
1483
+ # we track their def/uses separately
1484
+ unbacked_symbol_defs = sorted(
1485
+ node.node.get_unbacked_symbol_defs(), key=lambda x: x.name
1486
+ )
1487
+ for s in unbacked_symbol_defs:
1488
+ assert isinstance(s, sympy.Symbol)
1489
+ # Pick the first definer as canonical. There may be multiple
1490
+ # because if a MultiOutputLayout buffer propagates an unbacked
1491
+ # symint to multiple outputs, they will all claim to def it.
1492
+ if s not in unbacked_symbol_to_origin_node:
1493
+ unbacked_symbol_to_origin_node[s] = node
1494
+
1495
+ unbacked_symbol_uses = sorted(
1496
+ node.node.get_unbacked_symbol_uses(), key=lambda x: x.name
1497
+ )
1498
+ # if a kernel takes unbacked symints, register dependencies
1499
+ for s in unbacked_symbol_uses:
1500
+ assert (
1501
+ s in unbacked_symbol_to_origin_node
1502
+ ), f"{s} not in {unbacked_symbol_to_origin_node}"
1503
+ node.add_fake_dep(StarDep(unbacked_symbol_to_origin_node[s].get_name()))
1504
+
1505
+ # a node will mutate either 0 or 1 buffers
1506
+ assert len(node.get_mutations()) <= 1
1507
+ for alt_name in node.get_mutations():
1508
+ alt_name = rename(alt_name)
1509
+ # this node must run after the prior writer
1510
+ add_user(alt_name, node)
1511
+ node.add_mutation_dep(StarDep(alt_name))
1512
+ for other_node in name_to_users[alt_name].items:
1513
+ # this node must run after all prior readers
1514
+ other_name = rename(other_node.get_name())
1515
+ known_dep_node_names = dep_closure(node.get_name())
1516
+ if other_name not in known_dep_node_names:
1517
+ # If this node already directly or indirectly depends on other_node,
1518
+ # we don't need to insert an extra dep.
1519
+ node.add_mutation_dep(WeakDep(other_name))
1520
+ add_user(other_name, node, is_weak=True)
1521
+
1522
+ # add normal non-mutation dependencies
1523
+ for read in node.read_writes.reads:
1524
+ is_weak = isinstance(read, WeakDep)
1525
+ add_user(read.name, node, node.can_inplace(read), is_weak)
1526
+
1527
+ node.update_mutated_names(self.mutation_renames)
1528
+
1529
+ # update our renaming scheme for the next iteration
1530
+ for alt_name in node.get_mutations():
1531
+ self.mutation_renames[rename(alt_name)] = node.get_name()
1532
+ self.mutation_renames[alt_name] = node.get_name()
1533
+ self.mutation_real_name[node.get_name()] = self.mutation_real_name.get(
1534
+ alt_name, alt_name
1535
+ )
1536
+
1537
+ # make sure outputs aren't dead-code-eliminated
1538
+ for node_name in V.graph.get_output_names():
1539
+ log.debug("scheduling output %s", node_name)
1540
+ add_user(node_name, OutputNode(StarDep(node_name)))
1541
+
1542
+ # make sure unbacked symints aren't dead-code-eliminated
1543
+ for node in V.graph.graph_outputs:
1544
+ for s in node.get_unbacked_symbol_uses():
1545
+ assert (
1546
+ s in unbacked_symbol_to_origin_node
1547
+ ), f"{s} not in {unbacked_symbol_to_origin_node.keys()}"
1548
+ node_name = unbacked_symbol_to_origin_node[s].node.name
1549
+ log.debug("scheduling output %s for unbacked symint %s", node_name, s)
1550
+ add_user(node_name, OutputNode(StarDep(node_name)))
1551
+
1552
+ # make sure input mutation isn't dead-code-eliminated
1553
+ for name in self.mutation_renames:
1554
+ if name in V.graph.graph_inputs:
1555
+ add_user(name, OutputNode(StarDep(name)))
1556
+ V.graph.mutated_inputs.add(name)
1557
+
1558
+ inp_names = {
1559
+ name: index for index, name in enumerate(V.graph.graph_inputs.keys())
1560
+ }
1561
+ V.graph.mutated_input_idxs = [
1562
+ inp_names[name] for name in V.graph.mutated_inputs
1563
+ ]
1564
+
1565
+ # copy users information onto the nodes
1566
+ for node in self.nodes:
1567
+ node.set_users(name_to_users[node.get_name()].items)
1568
+
1569
+ # populate inverse_users
1570
+ for node in self.nodes:
1571
+ for user in node.users:
1572
+ user.node.inverse_users.append(node)
1573
+
1574
+ def compute_node_users(self):
1575
+ # set up buffer name to (fused)snode mapping
1576
+ buf_to_snode = {}
1577
+ for node in self.nodes:
1578
+ if isinstance(node, FusedSchedulerNode):
1579
+ for x in node.snodes:
1580
+ buf_to_snode[x.get_name()] = node
1581
+ buf_to_snode[node.get_name()] = node
1582
+
1583
+ for node in self.nodes:
1584
+ node.node_users = []
1585
+ node.inverse_users = []
1586
+
1587
+ # compute inverse_users
1588
+ for node in self.nodes:
1589
+ inverse_users = []
1590
+ for dep in node.unmet_dependencies:
1591
+ assert dep.name in buf_to_snode
1592
+ dep_node = buf_to_snode[dep.name]
1593
+ inverse_users.append(dep_node)
1594
+ node.inverse_users = inverse_users
1595
+
1596
+ # compute node_users
1597
+ # TODO: ideally, we should deduplicate .users and .node_users,
1598
+ # but currently .users contains extra information that's difficult to
1599
+ # extract into a standalone container.
1600
+ node_to_users: Dict[BaseSchedulerNode, List[BaseSchedulerNode]] = {}
1601
+ for node in self.nodes:
1602
+ for inverse_user in node.inverse_users:
1603
+ node_to_users.setdefault(inverse_user, []).append(node)
1604
+ for node, users in node_to_users.items():
1605
+ node.node_users = users
1606
+
1607
    def dead_node_elimination(self):
        """
        Remove any nodes without users
        """
        again = True  # repeat until a fixed point
        while again:
            updated_nodes = []
            for node in self.nodes:

                def can_eliminate_user(user: NodeUser):
                    # weak users only enforce ordering, and users that were
                    # already removed no longer keep this node alive
                    return user.is_weak or user.get_name() in V.graph.removed_buffers

                can_eliminate = not node.has_side_effects() and all(
                    can_eliminate_user(u) for u in node.users
                )

                if not can_eliminate:
                    updated_nodes.append(node)
                else:
                    # dead code
                    log.debug("removed dead node: %s", node.get_name())
                    V.graph.removed_buffers.add(node.get_name())

            # removing a node may make its producers dead too, so repeat
            # until no node was removed in a pass
            again = len(self.nodes) > len(updated_nodes)
            self.nodes = updated_nodes

        # Prune any WeakDeps no longer needed
        for node in self.nodes:
            node.prune_weak_deps()
1636
+
1637
    def topological_sort_schedule(self):
        """
        Ensure self.nodes is in topologically sorted order
        """
        # NOTE: these containers hold scheduler nodes (not ir.Buffer);
        # annotations corrected accordingly.
        seen: Set[BaseSchedulerNode] = set()
        name_to_node: Dict[str, BaseSchedulerNode] = dict()
        result: List[BaseSchedulerNode] = []

        def visit(n):
            # Post-order DFS: a node is emitted only after the producers of
            # all its unmet dependencies. Deps are visited in name order so
            # the resulting schedule is deterministic.
            if n not in seen:
                seen.add(n)
                for dep in sorted(n.unmet_dependencies, key=lambda d: d.name):
                    visit(name_to_node[dep.name])
                result.append(n)

        for node in self.nodes:
            for name in node.get_names():
                name_to_node[name] = node
        for node in self.nodes:
            visit(node)
        self.nodes = result
1658
+
1659
+ def compute_ancestors(self):
1660
+ """
1661
+ Populate each node.ancestors
1662
+ """
1663
+ # note self.nodes is topologically sorted
1664
+ name_to_ancestors: Dict[str, Set[str]] = {}
1665
+ for node in self.nodes:
1666
+ ancestors = set()
1667
+ for dep in node.unmet_dependencies:
1668
+ ancestors.add(dep.name)
1669
+ ancestors |= name_to_ancestors[dep.name]
1670
+ name_to_ancestors[node.get_name()] = ancestors
1671
+ node.ancestors = ancestors
1672
+
1673
+ for order, node in enumerate(self.nodes):
1674
+ node.min_order = order
1675
+ node.max_order = order
1676
+
1677
+ def fuse_nodes(self):
1678
+ """
1679
+ Mutates self.nodes to combine nodes into FusedSchedulerNodes.
1680
+ """
1681
+ for i in range(10):
1682
+ old_len = len(self.nodes)
1683
+ fusion_log.debug(
1684
+ "===== attempting fusion (%d/10): %d nodes =====", i + 1, old_len
1685
+ )
1686
+ self.fuse_nodes_once()
1687
+ new_len = len(self.nodes)
1688
+ fusion_log.debug(
1689
+ "completed fusion round (%d/10): fused %d nodes into %d nodes\n",
1690
+ i + 1,
1691
+ old_len,
1692
+ new_len,
1693
+ )
1694
+ if new_len == old_len or new_len == 1:
1695
+ fusion_log.debug("===== fusion complete (%d iterations) =====", i + 1)
1696
+ break
1697
+
1698
    def benchmark_fused_nodes(self, nodes):
        """
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        """
        assert len(nodes) > 0
        device = nodes[0].get_device()
        # benchmarking may trigger codegen, which requires the scheduler and
        # the current device to be installed as globals first
        V.graph.scheduler = self
        self.current_device = device
        backend = self.get_backend(device)
        return backend.benchmark_fused_nodes(nodes)
1709
+
1710
    def speedup_by_fusion(self, node1, node2):
        """
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        """
        if not config.benchmark_fusion:
            return True

        if (
            node1.is_template()
            and not isinstance(node1.get_template_node(), ir.TritonTemplateBuffer)
            or node1.is_foreach()
            or node2.is_foreach()
        ):
            # TODO support benchmarking epilogue fusion
            return True

        node_list_1 = node1.get_nodes()
        device = node_list_1[0].get_device()

        # don't support benchmark fusion for CPU right now.
        if device.type == "cpu":
            return True

        node_list_2 = node2.get_nodes()
        node_list_fused = node_list_1 + node_list_2

        # We can not accurately benchmark kernel using atomic_add
        # due to how we generate random integer inputs.
        # Skip benchmarking them by allowing fusion.
        if any(
            hasattr(n.node, "data")
            and hasattr(n.node.data, "scatter_mode")
            and n.node.data.scatter_mode == "atomic_add"
            for n in node_list_fused
        ):
            return True

        from triton.compiler.errors import CompilationError

        why = WhyNoFuse(node1, node2)

        try:
            # benchmark each kernel separately, then the fused one;
            # inf signals register spilling, which we never fuse into
            ms1, path1 = self.benchmark_fused_nodes(node_list_1)
            if math.isinf(ms1):
                why("register spilling of the first kernel")
                return False
            ms2, path2 = self.benchmark_fused_nodes(node_list_2)
            if math.isinf(ms2):
                why("register spilling of the second kernel")
                return False
            ms_fused, path_fused = self.benchmark_fused_nodes(node_list_fused)
            if math.isinf(ms_fused):
                why("register spilling of the fused kernel")
                return False
        except CompilationError as e:
            # workaround triton issue: https://github.com/openai/triton/issues/2151
            if "Loop-carried variable" in str(e):
                return True  # allow fusion
            else:
                raise

        if fusion_log.isEnabledFor(logging.DEBUG):
            if ms_fused < ms1 + ms2:
                fusion_log.debug(
                    "can fuse (benchmark): fusing %s with %s cause %sx speedup",
                    node1.get_names(),
                    node2.get_names(),
                    green_text(f"{(ms1 + ms2) / ms_fused:.3f}"),
                )
            else:
                fusion_log.debug(
                    "cannot fuse (benchmark): fusing %s with %s cause %sx slowdown",
                    node1.get_names(),
                    node2.get_names(),
                    red_text(f"{ms_fused / (ms1 + ms2):.3f}"),
                )

        # record slow fusions once per kernel pair for offline analysis
        if (
            is_metric_table_enabled("slow_fusion")
            and ms_fused >= ms1 + ms2
            and (path1, path2) not in self.logged_slow_fusion
        ):
            self.logged_slow_fusion.add((path1, path2))
            get_metric_table("slow_fusion").add_row(
                lambda: {
                    "kernel1_path": path1,
                    "kernel1_latency": ms1,
                    "kernel2_path": path2,
                    "kernel2_latency": ms2,
                    "fused_kernel_path": path_fused,
                    "fused_kernel_latency": ms_fused,
                    "slow_down_ratio": ms_fused / (ms1 + ms2),
                }
            )
        return ms_fused < ms1 + ms2
1806
+
1807
    def fuse_nodes_once(self):
        """
        Mutates self.nodes to combine nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        """
        fused_nodes = set(self.nodes)
        for node1, node2 in self.get_possible_fusions():
            # a candidate may already have been fused earlier in this loop;
            # resolve each to its current (possibly fused) node
            node1 = self.name_to_fused_node[node1.get_first_name()]
            node2 = self.name_to_fused_node[node2.get_first_name()]
            if self.can_fuse(node1, node2) and not self.will_fusion_create_cycle(
                node1, node2
            ):
                if not self.speedup_by_fusion(node1, node2):
                    continue
                fusion_log.debug(
                    "fusing %s with %s", node1.get_name(), node2.get_name()
                )

                # above can_fuse asserts that node2 has the same device
                device = node1.get_device()
                node3 = self.get_backend(device).fuse(node1, node2)
                fused_nodes.remove(node1)
                fused_nodes.remove(node2)
                fused_nodes.add(node3)
                # every constituent buffer now maps to the new fused node
                self.name_to_fused_node.update(
                    {n.get_name(): node3 for n in node3.get_nodes()}
                )
        # restore a deterministic, dependency-respecting schedule order
        self.nodes = sorted(fused_nodes, key=lambda x: x.min_order)
        self.topological_sort_schedule()
        self.prune_redundant_deps()
1840
+
1841
+ def prune_redundant_deps(self):
1842
+ for node in self.nodes:
1843
+ node.prune_redundant_deps(self.name_to_fused_node)
1844
+
1845
    def get_possible_fusions(self):
        """
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        """
        possible_fusions = []
        seen = set()

        def check_all_pairs(nodes):
            # try every ordered pair (node1 earlier in the list than node2)
            for node1_index, node1 in enumerate(nodes):
                for node2 in nodes[node1_index + 1 :]:
                    key = (node1, node2)
                    if key in seen:
                        continue
                    seen.add(key)

                    if self.can_fuse(node1, node2):
                        possible_fusions.append(key)
                    elif (node2.is_template() or node2.is_foreach()) and self.can_fuse(
                        node2, node1
                    ):
                        # foreach fusions and epilogue fusions are order dependent
                        possible_fusions.append((node2, node1))

        # only nodes touching a common buffer can save memory traffic by
        # fusing, so group candidates by the buffers they use
        buffer_names_grouping = collections.defaultdict(list)
        for node in self.nodes:
            for buf in node.used_buffer_names():
                buffer_names_grouping[buf].append(node)
        for node_grouping in buffer_names_grouping.values():
            check_all_pairs(node_grouping)

        if config.aggressive_fusion:
            # additionally consider pairs that share the same iteration group
            group_grouping = collections.defaultdict(list)
            for node in self.nodes:
                group = getattr(node, "group", None)
                if group:
                    group_grouping[group].append(node)
            for node_grouping in group_grouping.values():
                check_all_pairs(node_grouping)

        possible_fusions.sort(key=self.score_fusion_key, reverse=True)
        fusion_log.debug("found %d possible fusions", len(possible_fusions))
        return possible_fusions
1887
+
1888
    def will_fusion_create_cycle(self, node1, node2):
        """
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.

        Returns True when fusing node1 and node2 would create a cycle in
        the dependency graph.
        """

        def found_path(node):
            # only fused nodes can introduce new ancestors.
            if isinstance(node, FusedSchedulerNode) and node not in visited:
                visited.add(node)
                if node.get_names().issubset(combined_ancestors):
                    # All fusion outputs are in ancestors of node1 and node2, thus
                    # cannot introduce new path:
                    #
                    # 1. if output is neither descendent of node1 or node2, the
                    #    output cannot introduce a path
                    # 2. due to [can_fuse]: if WLOG output is descendent of node1, it cannot be
                    #    on path(node1->node2), hence it cannot be ancestor of node2
                    # 3. due to [acyclic]: if WLOG output is descendent of node1, it cannot be
                    #    ancestor of node1
                    return False
                else:
                    # continue DFS of new ancestors introduced by the fusion
                    return bool(combined_names & node.ancestors) or any(
                        found_path(self.name_to_fused_node[n])
                        for n in node.ancestors - combined_ancestors
                    )
            return False

        visited = set()
        combined_names = node1.get_names() | node2.get_names()
        combined_ancestors = (node1.ancestors | node2.ancestors) - combined_names
        cycle = any(found_path(self.name_to_fused_node[n]) for n in combined_ancestors)
        if cycle:
            WhyNoFuse(node1, node2)("will create cycle")
        return cycle
1924
+
1925
+ def can_fusion_increase_peak_memory(
1926
+ self, node1: BaseSchedulerNode, node2: BaseSchedulerNode
1927
+ ):
1928
+ """
1929
+ This function prevents fusion for nodes that can increase memory
1930
+ footprint. This problem is more common in horizontal fusion, where nodes
1931
+ that are far apart in the original order get fused, lengthening the live
1932
+ intervals of tensors. This is very evident in models with activation
1933
+ checkpointing, where the recomputed nodes from different checkpointed
1934
+ regions get fused and significantly increase the memory footprint.
1935
+
1936
+ The current attempt is a quick, possibly hacky, heuristic to prevent the
1937
+ fusion of nodes that are far away in the original order.
1938
+
1939
+ A better but difficult to implement heurisitic would be to use live
1940
+ intervals of the buffers, find region of peak pressure in the original
1941
+ program and prevent fusion that crosses that peak region. We might need
1942
+ special care or good approximation in this implementation, as fusion of
1943
+ node changes live intervals, and re-computing live intervals and peak
1944
+ memory after each fusion can introduce large compilation overhead.
1945
+ """
1946
+ proximity_score = max(
1947
+ abs(node1.min_order - node2.max_order),
1948
+ abs(node2.min_order - node1.max_order),
1949
+ )
1950
+ return proximity_score > 64
1951
+
1952
    def can_fuse(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode):
        """
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        """

        if node1 is node2:
            return False

        # records a human-readable reason whenever we bail out
        why = WhyNoFuse(node1, node2)

        if (
            isinstance(node1, (ExternKernelSchedulerNode, NopKernelSchedulerNode))
            and not node1.is_template()
        ):
            why("node1 is extern or nop")
            return False
        if (
            isinstance(node2, (ExternKernelSchedulerNode, NopKernelSchedulerNode))
            and not node2.is_template()
        ):
            why("node2 is extern or nop")
            return False

        if node2.get_names() & node1.ancestors:
            why("node1 must go before node2")
            return False

        if (
            isinstance(node1, (FusedSchedulerNode, SchedulerNode))
            and isinstance(node2, SchedulerNode)
            and isinstance(node2._body, ir.LoopBody)
        ):
            # Fix issue: https://github.com/pytorch/pytorch/issues/108963
            # Check:
            # If node2 reads a buf which is a mutation buf of node1(SchedulerNode) or among nodes in node1(FusedSchedulerNode),
            # we will get the corresponding mutation buf and check if this mutation buf is stored by atomic_add mode.
            # If True, we will disable the fusion of node1 and node2.
            if any(
                (
                    node2_used_buf in self.mutation_renames
                    and node1.has_atomic_add(self.mutation_renames[node2_used_buf])
                )
                for node2_used_buf in node2._body.reads_name2expr.keys()
            ):
                return False

        if node2.is_template():
            why("templates can only fuse epilogues")
            return False
        if node1.is_template() and (
            node2.has_aliasing_or_mutation()
            or node2.is_reduction()
            or not config.epilogue_fusion
        ):
            why("template epilogue not satisfied")
            return False

        device = node1.get_device()
        device2 = node2.get_device()
        if device != device2:
            why("device mismatch (%s vs %s)", device, device2)
            return False
        del device2

        no_shared_data = self.score_fusion_memory(node1, node2) == 0
        if no_shared_data and (
            not config.aggressive_fusion or node1.is_reduction() or node2.is_reduction()
        ):
            why("no shared data")
            return False  # heuristic not needed for correctness

        if (
            not node1.is_foreach()
            and not node2.is_foreach()
            and len(node1.get_nodes()) + len(node2.get_nodes()) > config.max_fusion_size
        ):
            why("exceeds max fusion")
            return False  # heuristic not needed for correctness

        if node1.get_names() & node2.ancestors:
            # node2 depends on node1 outputs
            if not self.can_fuse_vertical(node1, node2):
                return False
            return self.get_backend(device).can_fuse_vertical(node1, node2)
        else:  # nodes don't depend on each other, but may have common reads
            if self.can_fusion_increase_peak_memory(node1, node2):
                why("will increase peak memory")
                return False
            return self.get_backend(device).can_fuse_horizontal(node1, node2)
2042
+
2043
    def can_fuse_vertical(self, node1, node2):
        """
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.

        We also disable fusion of a write subsequent to a read if the reads
        and writes do not align.
        """
        node1_names = node1.get_names()
        computed_deps = set()
        why = WhyNoFuse(node1, node2)

        # StarDep doesn't match MemoryDep, different indices don't match
        # However, broadcasting sometimes strips dimensions, and if that's the case
        # we still can match unmet dep
        # if there's indirect indexing, don't match it
        def fusable_read_and_write(read: Dep, write: Dep):
            return (
                self.mutation_renames.get(read.name, read.name) == write.name
                and (isinstance(read, MemoryDep) and isinstance(write, MemoryDep))
                and not free_symbol_has(read.index, "tmp")
                and not free_symbol_has(write.index, "tmp")
                and read.index == write.index
                and len(read.size) >= len(write.size)
                and read.size[: len(write.size)] == write.size
            )

        # collect node2 reads that are satisfied by node1 writes
        for rd in node2.unmet_dependencies:
            for cd in node1.read_writes.writes:
                if fusable_read_and_write(rd, cd):
                    computed_deps.add(rd)

        remaining_deps = {dep.name for dep in node2.unmet_dependencies - computed_deps}
        if remaining_deps & node1_names:
            # MemoryDeps didn't match and read different locations of the same buffer.
            # Examples here include:
            #   - MemoryDep("foo", x) != MemoryDep("foo", x + 1)
            #   - MemoryDep("foo", x) != StarDep("foo")
            why("memory deps did not match")
            return False
        for name in remaining_deps:
            if node1_names & self.name_to_fused_node[name].ancestors:
                why("intermediate nodes between node1 & node2")
                return False

        # similar to can_inplace, if we are going to fuse a write subsequent to a read
        # require that the indexing and size is the same
        for write in node2.read_writes.writes:
            for read in node1.read_writes.reads:
                if write.name != self.mutation_renames.get(read.name, read.name):
                    continue

                # bail on StarDep
                if not fusable_read_and_write(read=read, write=write):
                    why("fusing a write into a read with different indexing formula")
                    return False

        return True
2104
+
2105
+ def score_fusion(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode):
2106
+ """
2107
+ Assign a score (higher comes first) to the fusion of node1
2108
+ and node2. When different fusions conflict with each other,
2109
+ this is the way we decide what order to run them in.
2110
+
2111
+ Our current score is based on:
2112
+ - Estimate of the saved memory operations
2113
+ - Fusions closer together in original order
2114
+ """
2115
+ memory_score = self.score_fusion_memory(node1, node2)
2116
+ proximity_score = -max(
2117
+ abs(node1.min_order - node2.max_order),
2118
+ abs(node2.min_order - node1.max_order),
2119
+ )
2120
+ return (
2121
+ node1.is_template() == config.epilogue_fusion_first and memory_score > 0,
2122
+ node1.is_reduction() == node2.is_reduction() and memory_score > 0,
2123
+ memory_score,
2124
+ proximity_score,
2125
+ )
2126
+
2127
+ def score_fusion_memory(self, node1, node2):
2128
+ """
2129
+ The first term in our fusion score that estimates number of saved memory operations.
2130
+ """
2131
+ common_memory_deps = (node1.read_writes.reads | node1.read_writes.writes) & (
2132
+ node2.read_writes.reads | node2.read_writes.writes
2133
+ )
2134
+ common_memory_deps = {
2135
+ dep for dep in common_memory_deps if not dep.has_unbacked_symbols()
2136
+ }
2137
+ return sum(dep.numbytes_hint() for dep in common_memory_deps)
2138
+
2139
+ def score_fusion_key(self, nodes):
2140
+ """
2141
+ Shim for list.sort(key=...)
2142
+ """
2143
+ node1, node2 = nodes
2144
+ return self.score_fusion(node1, node2)
2145
+
2146
+ def compute_last_usage(self):
2147
+ """
2148
+ Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
2149
+ """
2150
+
2151
+ future_used_buffers = set()
2152
+ for node_name in V.graph.get_output_names():
2153
+ future_used_buffers.add(node_name)
2154
+
2155
+ for node in reversed(self.nodes):
2156
+ node.set_last_usage(future_used_buffers, self.mutation_real_name)
2157
+ future_used_buffers.update(node.last_usage)
2158
+
2159
    def free_buffers(self):
        """Free any buffers that are no longer needed"""
        # sorted() keeps the generated code deterministic across runs
        for name in sorted(
            self.buffer_names_to_free
            - V.graph.removed_buffers
            - V.graph.wrapper_code.freed
        ):
            if name in self.name_to_node:
                node = self.name_to_node[name]
                if node.can_free():
                    V.graph.wrapper_code.codegen_free(node.node)
            elif name in V.graph.graph_inputs:
                # graph inputs are freed through their underlying storage
                storage = V.graph.graph_inputs[name].data
                assert isinstance(storage, ir.StorageBox) and storage.is_input_buffer()
                V.graph.wrapper_code.codegen_free(storage.data)

        self.buffer_names_to_free.clear()
2176
+
2177
    def remove_kernel_local_buffers(self):
        """
        Any buffers that are both created and have a last use in the
        same kernel can be removed.
        """

        # V.kernel.store_buffer_names should represent the set of nodes
        # get fused
        fused_node_names = V.kernel.store_buffer_names
        names_to_remove = []
        for out_buf in V.kernel.store_buffer_names:
            users = self.name_to_node[out_buf].users
            assert users is not None
            users = {user.get_name() for user in users if not user.is_weak}
            # a buffer is removable only when every (strong) user is fused
            # into this same kernel
            if users.issubset(fused_node_names):
                names_to_remove.append(out_buf)

        def remove_filter(n):
            # never remove buffers the kernel must keep, kernel inputs, or
            # buffers participating in mutation renaming
            return (
                n not in V.kernel.must_keep_buffers
                and n not in V.kernel.args.input_buffers
                and n not in self.mutation_renames
                and n not in self.mutation_real_name
            )

        names_to_remove = list(filter(remove_filter, names_to_remove))

        for name in names_to_remove:
            if name in V.kernel.args.inplace_buffers:
                buf = V.kernel.args.inplace_buffers[name]
                if isinstance(buf, str) and buf.startswith("REMOVED"):
                    continue
                # an inplace buffer can only go once all of its aliases can
                remove = all(n in names_to_remove for n in buf.other_names)
                if remove:
                    self.remove_inplace_buffer(name)
                V.kernel.inplaced_to_remove.add(name)
            else:
                self.remove_buffer(name)
2216
+ def remove_buffer(self, name):
2217
+ # Assign a special value instead of deleting the entry
2218
+ # because we still rely on output_buffers's length to
2219
+ # generate unique arg name.
2220
+ log.debug("remove_buffer(%r)", name)
2221
+ V.kernel.args.output_buffers[name] = "REMOVED"
2222
+ V.kernel.removed_buffers.add(name)
2223
+
2224
+ def remove_inplace_buffer(self, name):
2225
+ log.debug("removing_inplace_buffer(%r)", name)
2226
+ inner_name = V.kernel.args.inplace_buffers[name].inner_name
2227
+ V.kernel.args.inplace_buffers[name] = inner_name.replace(
2228
+ "in_out_ptr", "REMOVED"
2229
+ )
2230
+ V.kernel.removed_buffers.add(name)
2231
+
2232
+ def flush(self):
2233
+ for backend in self.backends.values():
2234
+ backend.flush()
2235
+ self.free_buffers()
2236
+
2237
    def codegen_extern_call(self, scheduler_node: ExternKernelSchedulerNode):
        """Emit the wrapper-code call for a single extern kernel node."""
        assert isinstance(scheduler_node, ExternKernelSchedulerNode)
        # 'decide_inplace_update' stores the inplace update decisions in
        # the current kernel from where 'allocate' retrieve those decisions.
        # We have to make sure there is a non-NULL kernel handler to store
        # those inplace update decisions.
        with V.set_kernel_handler(Kernel(increase_kernel_count=False)):
            scheduler_node.decide_inplace_update()
            scheduler_node.allocate()
        node = scheduler_node.node
        assert isinstance(node, ir.ExternKernel), f"{type(node)=}"
        node.codegen(V.graph.wrapper_code)
        self.free_buffers()
2250
+
2251
    def create_backend(self, device: torch.device):
        """Instantiate the codegen backend for `device`, validating support."""
        assert (
            device.type != "cuda" or device.index is not None
        ), f"{device} should have been normalized in lowering"
        V.graph.add_device_info(device)

        device_scheduling = get_scheduling_for_device(device.type)
        if device_scheduling is None:
            raise RuntimeError(f"Unsupported device type: {device.type}")

        if device.type == "cuda" and not has_triton():
            # give a targeted error: GPU too old vs. triton missing entirely
            device_props = torch.cuda.get_device_properties(device)
            if device_props.major < 7:
                raise RuntimeError(
                    f"Found {device_props.name} which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability {device_props.major}.{device_props.minor}"  # noqa: B950
                )
            else:
                raise RuntimeError(
                    "Cannot find a working triton installation. More information on installing Triton can be found at https://github.com/openai/triton"  # noqa: B950
                )

        return device_scheduling(self)
2273
+
2274
+ def get_backend(self, device: torch.device):
2275
+ if device not in self.backends:
2276
+ self.backends[device] = self.create_backend(device)
2277
+ return self.backends[device]
2278
+
2279
    def enter_context(self, node):
        """Enter the wrapper-code context of the latest FX origin of `node`."""

        def get_order(n):
            # lazily build an fx-node -> position index, one graph at a time
            if n not in self.origin_to_index:
                self.origin_to_index.update({n: i for i, n in enumerate(n.graph.nodes)})
            return self.origin_to_index[n]

        # Use a dict to have ordering
        origins = {
            (get_order(e), e): None for n in node.get_nodes() for e in n.node.origins
        }
        origins = list(origins.keys())
        if origins:
            # the origin appearing latest in its fx graph wins
            _, last = max(origins, key=operator.itemgetter(0))
            V.graph.wrapper_code.enter_context(last)
2293
+
2294
+ @dynamo_timed
2295
+ def codegen(self):
2296
+ for node in self.nodes:
2297
+ try:
2298
+ log.debug(
2299
+ "Generating code for node %s with estimated runtime %f",
2300
+ node.get_name(),
2301
+ node.get_estimated_runtime(),
2302
+ )
2303
+ except Exception as e:
2304
+ log.debug(
2305
+ "Generating code for node %s with estimated runtime 0.0",
2306
+ node.get_name(),
2307
+ )
2308
+
2309
+ self.enter_context(node)
2310
+
2311
+ if not isinstance(node, NopKernelSchedulerNode):
2312
+ device = node.get_device()
2313
+ if (
2314
+ device != self.current_device
2315
+ or node.is_extern()
2316
+ or node.is_template()
2317
+ ):
2318
+ self.flush()
2319
+ if device != self.current_device:
2320
+ if device.type == "cuda":
2321
+ if self.current_device and self.current_device.type == "cuda":
2322
+ V.graph.wrapper_code.codegen_device_guard_exit()
2323
+ assert device.index is not None, "device should have an index"
2324
+ V.graph.wrapper_code.codegen_device_guard_enter(device.index)
2325
+ elif self.current_device and self.current_device.type == "cuda":
2326
+ V.graph.wrapper_code.codegen_device_guard_exit()
2327
+ self.current_device = device
2328
+
2329
+ self.buffer_names_to_free.update(node.last_usage)
2330
+
2331
+ if node.is_template():
2332
+ node, *epilogue = node.get_nodes()
2333
+ self.get_backend(device).codegen_template(node, epilogue) # type: ignore[possibly-undefined]
2334
+ elif node.is_extern():
2335
+ self.codegen_extern_call(node)
2336
+ elif node.is_foreach():
2337
+ self.get_backend(device).codegen_foreach(node) # type: ignore[possibly-undefined]
2338
+ elif isinstance(node, (FusedSchedulerNode, SchedulerNode)):
2339
+ self.get_backend(device).codegen_nodes(node.get_nodes()) # type: ignore[possibly-undefined]
2340
+ else:
2341
+ assert isinstance(node, NopKernelSchedulerNode)
2342
+ node.allocate()
2343
+
2344
+ if config.debug_check_inf_and_nan:
2345
+ V.graph.wrapper_code.generate_inf_and_nan_checker(node)
2346
+
2347
+ if config.triton.debug_sync_kernel:
2348
+ self.get_backend(device).codegen_sync() # type: ignore[possibly-undefined]
2349
+
2350
+ self.available_buffer_names.update(node.get_names())
2351
+
2352
+ if not isinstance(node, NopKernelSchedulerNode):
2353
+ device = node.get_device()
2354
+ if self.get_backend(device).ready_to_flush():
2355
+ self.flush()
2356
+
2357
+ if self.current_device and self.current_device.type == "cuda":
2358
+ # exit the outermost CUDA device guard. this is
2359
+ # important for nested indentation codegen-ing.
2360
+ V.graph.wrapper_code.codegen_device_guard_exit()
2361
+
2362
+ self.flush()
2363
+
2364
+ def is_unaligned_buffer(self, buf_name):
2365
+ if buf_name in V.graph.graph_inputs or buf_name in V.graph.constants:
2366
+ # all graph inputs or constants are assumed to be aligned
2367
+ return False
2368
+ node = self.name_to_node[buf_name]
2369
+ layout = node.node.get_layout()
2370
+ if isinstance(layout, ir.AliasedLayout):
2371
+ return not layout.maybe_guard_aligned()
2372
+ else:
2373
+ return False
2374
+
2375
+
2376
class BaseScheduling:
    """
    Abstract interface that per-device codegen backends implement for the
    Scheduler. Subclasses decide fusion legality and generate kernels.
    """

    def can_fuse_vertical(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode):
        """
        Check whether node1 and node2 can be vertically fused or not.
        """
        raise NotImplementedError()

    def can_fuse_horizontal(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode):
        """
        Check whether node1 and node2 can be horizontally fused or not.
        """
        raise NotImplementedError()

    def fuse(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode):
        """
        Fuse two nodes
        """
        if node1.is_foreach() or node2.is_foreach():
            return ForeachKernelSchedulerNode.fuse(node1, node2)
        else:
            return FusedSchedulerNode.fuse(node1, node2)

    def group_fn(self, sizes):
        """
        Process the iteration sizes in case a transformation needs to be applied.
        """
        raise NotImplementedError()

    def codegen_template(
        self, template_node: SchedulerNode, epilogue_nodes: List[SchedulerNode]
    ):
        """
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        """
        raise NotImplementedError()

    def codegen_nodes(self, nodes: List[SchedulerNode]):
        """
        Generate a kernel given a list of pre-fused nodes.
        """
        raise NotImplementedError()

    def codegen_sync(self):
        """
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        """
        raise NotImplementedError()

    def ready_to_flush(self) -> bool:
        """
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        """
        return False

    def flush(self):
        """
        Flush the generated kernel and python wrapper code to the source code file.
        """
        raise NotImplementedError()

    def benchmark_fused_nodes(self, nodes):
        """
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        """
        raise NotImplementedError()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/select_algorithm.py ADDED
@@ -0,0 +1,1156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import builtins
2
+ import functools
3
+ import inspect
4
+ import itertools
5
+ import logging
6
+ import operator
7
+ import sys
8
+ import textwrap
9
+ import time
10
+ from concurrent.futures import ThreadPoolExecutor
11
+ from io import StringIO
12
+
13
+ from typing import Any, Callable, Dict, List, Optional, Union
14
+ from unittest.mock import patch
15
+
16
+ import sympy
17
+
18
+ import torch
19
+ from torch._dynamo.testing import rand_strided
20
+ from torch._dynamo.utils import counters, identity, preserve_rng_state
21
+
22
+ from . import config, ir
23
+ from .autotune_process import TensorMeta, TritonBenchmarkRequest
24
+ from .codecache import code_hash, PersistentCache, PyCodeCache
25
+ from .codegen.common import (
26
+ ChoiceCaller,
27
+ IndentedBuffer,
28
+ KernelTemplate,
29
+ PrimitiveInfoType,
30
+ )
31
+ from .codegen.triton import (
32
+ gen_common_triton_imports,
33
+ texpr,
34
+ TritonKernel,
35
+ TritonPrinter,
36
+ TritonScheduling,
37
+ )
38
+ from .codegen.triton_utils import config_of, signature_to_meta
39
+ from .exc import CUDACompileError
40
+ from .utils import (
41
+ do_bench,
42
+ get_dtype_size,
43
+ Placeholder,
44
+ sympy_dot,
45
+ sympy_product,
46
+ unique,
47
+ )
48
+ from .virtualized import V
49
+
50
+ log = logging.getLogger(__name__)
51
+
52
+ # correctness checks struggle with fp16/tf32
53
+ VERIFY: Dict[str, Any] = dict()
54
+ PRINT_AUTOTUNE = True
55
+ DEBUG = False
56
+
57
+
58
+ class KernelNamespace:
59
+ pass
60
+
61
+
62
+ # these objects are imported from the generated wrapper code
63
+ extern_kernels = KernelNamespace()
64
+
65
+
66
+ class PartialRender:
67
+ """
68
+ Some parts of a template need to be generated at the end, but
69
+ inserted into the template at the start. This allows doing a bunch
70
+ of replacements after the initial render.
71
+ """
72
+
73
+ def __init__(self, code, replacement_hooks):
74
+ super().__init__()
75
+ self.code = code
76
+ self.replacement_hooks = replacement_hooks
77
+
78
+ def finalize(self):
79
+ code = self.code
80
+ assert code is not None, "can only be called once"
81
+ self.code = None
82
+ for key, fn in self.replacement_hooks.items():
83
+ code = code.replace(key, fn())
84
+ return code
85
+
86
+
87
+ class TritonTemplateKernel(TritonKernel):
88
+ def __init__(
89
+ self,
90
+ kernel_name,
91
+ input_nodes,
92
+ output_node,
93
+ defines,
94
+ num_stages,
95
+ num_warps,
96
+ grid_fn,
97
+ meta,
98
+ call_sizes,
99
+ use_jit=True,
100
+ prefix_args=0,
101
+ suffix_args=0,
102
+ epilogue_fn=identity,
103
+ *,
104
+ index_dtype,
105
+ ):
106
+ super().__init__(
107
+ sympy_product(output_node.get_size()),
108
+ sympy.Integer(1),
109
+ index_dtype=index_dtype,
110
+ )
111
+ self.input_nodes = input_nodes
112
+ self.output_node = output_node
113
+ self.named_input_nodes = {}
114
+ self.defines = defines
115
+ self.kernel_name = kernel_name
116
+ self.template_mask = None
117
+ self.use_jit = use_jit
118
+ self.num_stages = num_stages
119
+ self.num_warps = num_warps
120
+ self.grid_fn = grid_fn
121
+ self.meta = meta
122
+ self.call_sizes = call_sizes
123
+ # for templates with fixed epilogues
124
+ self.prefix_args = prefix_args
125
+ self.suffix_args = suffix_args
126
+ self.epilogue_fn = epilogue_fn
127
+ self.render_hooks = dict()
128
+ self.triton_meta: Optional[Dict[str, object]] = None
129
+
130
+ def need_numel_args(self):
131
+ return False
132
+
133
+ def estimate_kernel_num_bytes(self):
134
+ """
135
+ Estimate the total number of bytes this kernel takes.
136
+ For in/out nodes, sizes are counted twice: once for reading and
137
+ once for writing.
138
+ """
139
+ ninplace_args = len(unique(self.args.inplace_buffers.values()))
140
+ num_bytes = []
141
+ for i, inp in enumerate(itertools.chain(self.input_nodes, (self.output_node,))):
142
+ size = V.graph.sizevars.size_hints(inp.get_size())
143
+ numel = functools.reduce(operator.mul, size)
144
+ dtype_size = get_dtype_size(inp.get_dtype())
145
+ num_bytes.append(numel * dtype_size * (1 + int(i < ninplace_args)))
146
+ return sum(num_bytes)
147
+
148
+ def jit_lines(self):
149
+ if self.use_jit:
150
+ return "@triton.jit"
151
+
152
+ argdefs, _, signature = self.args.python_argdefs()
153
+ triton_meta = {
154
+ "signature": signature_to_meta(signature, size_dtype=self.index_dtype),
155
+ "device": V.graph.scheduler.current_device.index,
156
+ "device_type": V.graph.scheduler.current_device.type,
157
+ "constants": {},
158
+ }
159
+ triton_meta["configs"] = [config_of(signature)]
160
+ for arg_num in triton_meta["configs"][0].equal_to_1: # type: ignore[index]
161
+ triton_meta["constants"][arg_num] = 1 # type: ignore[index]
162
+ self.triton_meta = triton_meta
163
+
164
+ inductor_meta = {
165
+ "kernel_name": str(Placeholder.DESCRIPTIVE_NAME),
166
+ "backend_hash": torch.utils._triton.triton_hash_with_backend(),
167
+ }
168
+ if config.profile_bandwidth or config.benchmark_kernel:
169
+ num_gb = self.estimate_kernel_num_bytes() / 1e9
170
+ inductor_meta["kernel_num_gb"] = num_gb
171
+ return f"""
172
+ @triton_heuristics.template(
173
+ num_stages={self.num_stages},
174
+ num_warps={self.num_warps},
175
+ triton_meta={triton_meta!r},
176
+ inductor_meta={inductor_meta!r},
177
+ )
178
+ @triton.jit
179
+ """
180
+
181
+ def def_kernel(self, *argnames):
182
+ """
183
+ Hook called from template code to generate function def and
184
+ needed args.
185
+ """
186
+ assert all(isinstance(x, str) for x in argnames)
187
+ renames = IndentedBuffer(initial_indent=1)
188
+
189
+ named_args = self.input_nodes[
190
+ self.prefix_args : len(self.input_nodes) - self.suffix_args
191
+ ]
192
+
193
+ assert len(argnames) == len(named_args), (
194
+ len(argnames),
195
+ len(named_args),
196
+ self.prefix_args,
197
+ len(self.input_nodes),
198
+ )
199
+
200
+ for input_node in self.input_nodes[: self.prefix_args]:
201
+ # get args in correct order
202
+ self.args.input(input_node.get_name())
203
+
204
+ for name, input_node in zip(argnames, named_args):
205
+ arg_name = f"arg_{name}"
206
+ self.named_input_nodes[name] = input_node
207
+ self.args.input_buffers[input_node.get_name()] = arg_name
208
+
209
+ # The args may be duplicated, so renaming must be after args are de-duplicated.
210
+ for name in argnames:
211
+ input_node = self.named_input_nodes[name]
212
+ arg_name = self.args.input_buffers[input_node.get_name()]
213
+ if input_node.get_layout().offset == 0:
214
+ renames.writeline(f"{name} = {arg_name}")
215
+ else:
216
+ offset = texpr(self.rename_indexing(input_node.get_layout().offset))
217
+ renames.writeline(f"{name} = {arg_name} + {offset}")
218
+
219
+ for input_node in self.input_nodes[len(self.input_nodes) - self.suffix_args :]:
220
+ # get args in correct order
221
+ self.args.input(input_node.get_name())
222
+
223
+ def hook():
224
+ # python_argdefs() cannot be run until after the rest of the template lazily adds more args
225
+ arg_defs, *_ = self.args.python_argdefs()
226
+ code = IndentedBuffer()
227
+ code.splice(gen_common_triton_imports())
228
+ code.splice(self.jit_lines())
229
+ code.writeline(f"def {self.kernel_name}({', '.join(arg_defs)}):")
230
+ with code.indent():
231
+ code.splice(self.defines)
232
+ code.splice(renames.getvalue())
233
+ return code.getvalue()
234
+
235
+ assert "<DEF_KERNEL>" not in self.render_hooks
236
+ self.render_hooks["<DEF_KERNEL>"] = hook
237
+ return "<DEF_KERNEL>"
238
+
239
+ def size(self, name: str, index: int):
240
+ """
241
+ Hook called from template code to get the size of an arg.
242
+ Will add needed args to pass it in if it is dynamic.
243
+ """
244
+ assert isinstance(index, int)
245
+ if name is None:
246
+ val = self.output_node.get_size()[index]
247
+ else:
248
+ assert isinstance(name, str)
249
+ val = self.named_input_nodes[name].get_size()[index]
250
+ return texpr(self.rename_indexing(val))
251
+
252
+ def stride(self, name, index):
253
+ """
254
+ Hook called from template code to get the stride of an arg.
255
+ Will add needed args to pass it in if it is dynamic.
256
+ """
257
+ assert isinstance(index, int)
258
+ if name is None:
259
+ val = self.output_node.get_stride()[index]
260
+ else:
261
+ assert isinstance(name, str)
262
+ val = self.named_input_nodes[name].get_stride()[index]
263
+ return texpr(self.rename_indexing(val))
264
+
265
+ def store_output(self, indices, val, mask):
266
+ """
267
+ Hook called from template code to store the final output
268
+ (if the buffer hasn't been optimized away), then append any
269
+ epilogue fusions.
270
+ """
271
+ assert isinstance(indices, (list, tuple))
272
+ assert isinstance(val, str)
273
+ assert isinstance(mask, str)
274
+ assert self.template_mask is None
275
+ indices = list(map(TritonPrinter.paren, indices))
276
+ index_symbols = [sympy.Symbol(x) for x in indices]
277
+ lengths = [V.graph.sizevars.simplify(s) for s in self.output_node.get_size()]
278
+ assert len(indices) == len(lengths)
279
+
280
+ # glue to make generated code use same indexing from template
281
+ for name, range_tree_entry in zip(
282
+ indices, self.range_trees[0].construct_entries(lengths)
283
+ ):
284
+ range_tree_entry.set_name(name)
285
+ contiguous_index = sympy_dot(
286
+ ir.FlexibleLayout.contiguous_strides(lengths), index_symbols
287
+ )
288
+ contiguous_index = self.rename_indexing(contiguous_index)
289
+ self.body.writeline("xindex = " + texpr(contiguous_index))
290
+ self.range_trees[0].lookup(sympy.Integer(1), sympy_product(lengths)).set_name(
291
+ "xindex"
292
+ )
293
+ self.template_mask = mask
294
+ self.template_indices = indices
295
+ output_index = self.output_node.get_layout().make_indexer()(index_symbols)
296
+ output_index = self.rename_indexing(output_index)
297
+ if output_index == contiguous_index:
298
+ output_index = sympy.Symbol("xindex")
299
+
300
+ epilogue_args = [val]
301
+ for input_node in itertools.chain(
302
+ self.input_nodes[: self.prefix_args],
303
+ self.input_nodes[len(self.input_nodes) - self.suffix_args :],
304
+ ):
305
+ input_node.freeze_layout()
306
+ epilogue_args.append(input_node.make_loader()(index_symbols))
307
+
308
+ V.ops.store(
309
+ self.output_node.get_name(),
310
+ output_index,
311
+ self.epilogue_fn(*epilogue_args),
312
+ )
313
+ self.codegen_body()
314
+
315
+ def hook():
316
+ # more stuff might have been added since the codegen_body above
317
+ self.codegen_body()
318
+ return textwrap.indent(self.body.getvalue(), " ").strip()
319
+
320
+ assert "<STORE_OUTPUT>" not in self.render_hooks
321
+ self.render_hooks["<STORE_OUTPUT>"] = hook
322
+ return "<STORE_OUTPUT>"
323
+
324
+ def render(self, template, kwargs):
325
+ return PartialRender(
326
+ template.render(**self.template_env(), **kwargs),
327
+ self.render_hooks,
328
+ )
329
+
330
+ def make_load(self, name, indices, mask):
331
+ """
332
+ Optional helper called from template code to generate the code
333
+ needed to load from an tensor.
334
+ """
335
+ assert isinstance(indices, (list, tuple))
336
+ assert isinstance(name, str)
337
+ assert isinstance(mask, str)
338
+ stride = self.named_input_nodes[name].get_stride()
339
+ indices = list(map(TritonPrinter.paren, indices))
340
+ assert len(indices) == len(stride)
341
+ index = " + ".join(
342
+ f"{texpr(self.rename_indexing(s))} * {i}" for s, i in zip(stride, indices)
343
+ )
344
+ return f"tl.load({name} + ({index}), {mask})"
345
+
346
+ def template_env(self):
347
+ """
348
+ Generate the namespace visible in the template.
349
+ """
350
+ return {
351
+ fn.__name__: fn
352
+ for fn in [
353
+ self.def_kernel,
354
+ self.size,
355
+ self.stride,
356
+ self.store_output,
357
+ self.make_load,
358
+ ]
359
+ }
360
+
361
+ def indexing(
362
+ self,
363
+ index: sympy.Expr,
364
+ *,
365
+ dense_indexing=False,
366
+ copy_shape=None,
367
+ override_mask=None,
368
+ block_ptr=False,
369
+ ):
370
+ """
371
+ Override the default indexing to use our custom mask and force
372
+ dense indexing.
373
+ """
374
+ return super().indexing(
375
+ index,
376
+ dense_indexing=False,
377
+ copy_shape=self.template_mask,
378
+ override_mask=self.template_mask,
379
+ block_ptr=block_ptr,
380
+ )
381
+
382
+ def initialize_range_tree(self, pid_cache):
383
+ super().initialize_range_tree(pid_cache)
384
+ # ignore default codegen
385
+ self.body.clear()
386
+ self.indexing_code.clear()
387
+
388
+ def call_kernel(self, name: str, node: Optional[ir.IRNode] = None):
389
+ wrapper = V.graph.wrapper_code
390
+ _, call_args, _ = self.args.python_argdefs()
391
+ call_args = [str(a) for a in call_args]
392
+
393
+ for i in range(len(call_args)):
394
+ if V.graph.is_unspec_arg(call_args[i]):
395
+ call_args[i] = call_args[i] + ".item()"
396
+ if isinstance(call_args[i], sympy.Symbol):
397
+ call_args[i] = texpr(call_args[i])
398
+
399
+ if V.graph.cpp_wrapper:
400
+ # In the cpp_wrapper case, we have to compute CUDA launch grid at runtime
401
+ # if any dynamic dimension is involved. We rely on the Python version
402
+ # of the grid function to generate those grid configs, which may contain
403
+ # symbolic values. The wrapper will use cexpr to print out C++ code
404
+ # appropriately for the grid configs.
405
+ grid_args = [V.graph.sizevars.simplify(s) for s in self.call_sizes] + [
406
+ self.meta
407
+ ]
408
+ grid = self.grid_fn(*grid_args)
409
+
410
+ wrapper.generate_kernel_call(
411
+ name,
412
+ call_args,
413
+ device_index=V.graph.scheduler.current_device.index,
414
+ grid=grid,
415
+ triton_meta=self.triton_meta,
416
+ )
417
+ else:
418
+ stream_name = wrapper.write_get_raw_stream(
419
+ V.graph.scheduler.current_device.index
420
+ )
421
+
422
+ wrapper.add_import_once(f"import {self.grid_fn.__module__}")
423
+ meta = wrapper.add_meta_once(self.meta)
424
+
425
+ grid_call = [
426
+ texpr(V.graph.sizevars.simplify(s)) for s in self.call_sizes
427
+ ] + [meta]
428
+ grid_call = f"{self.grid_fn.__module__}.{self.grid_fn.__name__}({', '.join(grid_call)})"
429
+ wrapper.writeline(
430
+ f"{name}.run({', '.join(call_args)}, grid={grid_call}, stream={stream_name})"
431
+ )
432
+
433
+
434
+ @functools.lru_cache(None)
435
+ def _jinja2_env():
436
+ try:
437
+ import jinja2
438
+
439
+ return jinja2.Environment(
440
+ undefined=jinja2.StrictUndefined,
441
+ )
442
+ except ImportError:
443
+ return None
444
+
445
+
446
+ class TritonTemplate(KernelTemplate):
447
+ index_counter = itertools.count()
448
+ all_templates: Dict[str, "TritonTemplate"] = dict()
449
+
450
+ def __init__(self, name: str, grid: Any, source: str, debug=False):
451
+ super().__init__(name)
452
+ self.grid = grid
453
+ self.template = self._template_from_string(source)
454
+ assert name not in self.all_templates, "duplicate template name"
455
+ self.all_templates[name] = self
456
+ self.debug = debug
457
+
458
+ def generate(
459
+ self,
460
+ input_nodes,
461
+ layout,
462
+ num_stages,
463
+ num_warps,
464
+ prefix_args=0,
465
+ suffix_args=0,
466
+ epilogue_fn=identity,
467
+ **kwargs,
468
+ ):
469
+ assert self.template, "requires jinja2"
470
+ defines = StringIO()
471
+ for name, val in kwargs.items():
472
+ defines.write(f" {name} : tl.constexpr = {val}\n")
473
+ defines = defines.getvalue()
474
+
475
+ fake_out = ir.Buffer("buf_out", layout)
476
+ kernel_name = f"triton_{self.name}"
477
+
478
+ numel = sympy_product(layout.size)
479
+ buffers = itertools.chain(input_nodes, (fake_out,))
480
+ if not TritonScheduling.can_use_32bit_indexing(numel, buffers):
481
+ raise NotImplementedError(
482
+ "64-bit indexing is not yet implemented for triton templates"
483
+ )
484
+
485
+ kernel_options = dict(
486
+ input_nodes=input_nodes,
487
+ defines=defines,
488
+ num_stages=num_stages,
489
+ num_warps=num_warps,
490
+ grid_fn=self.grid,
491
+ meta=kwargs,
492
+ call_sizes=layout.size,
493
+ prefix_args=prefix_args,
494
+ suffix_args=suffix_args,
495
+ epilogue_fn=epilogue_fn,
496
+ index_dtype="tl.int32",
497
+ )
498
+ with patch.object(
499
+ V.graph, "get_dtype", self._fake_get_dtype(fake_out)
500
+ ), TritonTemplateKernel(
501
+ kernel_name=kernel_name,
502
+ output_node=fake_out,
503
+ use_jit=True,
504
+ **kernel_options,
505
+ ) as kernel:
506
+ try:
507
+ code = kernel.render(self.template, kwargs).finalize()
508
+ except ZeroDivisionError:
509
+ # TODO(nmacchioni): fix sympy division by zero
510
+ return None
511
+ if self.debug:
512
+ print("Generated Code:\n", code)
513
+ extra = (
514
+ "-".join(
515
+ [
516
+ *[
517
+ f"{kwarg}={repr(kwargs[kwarg])}"
518
+ for kwarg in sorted(kwargs.keys())
519
+ ],
520
+ f"num_stages={num_stages}",
521
+ f"num_warps={num_warps}",
522
+ ]
523
+ )
524
+ + "-"
525
+ )
526
+ mod = PyCodeCache.load(code, extra)
527
+ _, call_args, _ = kernel.args.python_argdefs()
528
+
529
+ expected_args = list(unique(x.get_name() for x in input_nodes))
530
+ expected_args.extend([fake_out.get_name()])
531
+ assert list(call_args)[: len(expected_args)] == expected_args, (
532
+ call_args,
533
+ expected_args,
534
+ )
535
+ extra_args = V.graph.sizevars.size_hints(
536
+ map(sympy.expand, call_args[len(expected_args) :]),
537
+ fallback=config.unbacked_symint_fallback,
538
+ )
539
+
540
+ kernel_hash_name = f"triton_{self.name}_{next(self.index_counter)}"
541
+
542
+ def make_kernel_render(out_node):
543
+ kernel = TritonTemplateKernel(
544
+ kernel_name=str(Placeholder.KERNEL_NAME),
545
+ output_node=out_node,
546
+ use_jit=False,
547
+ **kernel_options,
548
+ )
549
+ render = functools.partial(
550
+ kernel.render,
551
+ self.template,
552
+ kwargs,
553
+ )
554
+ return kernel, render
555
+
556
+ # create the BenchmarkRequest
557
+ assert mod.__file__ is not None
558
+ grid = self.grid(
559
+ *V.graph.sizevars.size_hints(
560
+ layout.size,
561
+ fallback=config.unbacked_symint_fallback,
562
+ ),
563
+ kwargs,
564
+ )
565
+ bmreq = TritonBenchmarkRequest(
566
+ module_path=mod.__file__,
567
+ module_cache_key=mod.key,
568
+ kernel_name=kernel_name,
569
+ grid=grid,
570
+ extra_args=extra_args,
571
+ num_stages=num_stages,
572
+ num_warps=num_warps,
573
+ matrix_instr_nonkdim=kwargs.get("matrix_instr_nonkdim", 0),
574
+ input_tensor_meta=TensorMeta.from_irnodes(input_nodes),
575
+ output_tensor_meta=TensorMeta.from_irnodes(layout),
576
+ )
577
+
578
+ return TritonTemplateCaller(
579
+ kernel_hash_name,
580
+ input_nodes,
581
+ layout,
582
+ make_kernel_render,
583
+ extra.strip("-").replace("-", ", "),
584
+ bmreq,
585
+ log_info={
586
+ "tile_shape": str(
587
+ (
588
+ kwargs.get("BLOCK_M", -1),
589
+ kwargs.get("BLOCK_K", -1),
590
+ kwargs.get("BLOCK_N", -1),
591
+ )
592
+ ),
593
+ "num_stages": num_stages,
594
+ "num_warps": num_warps,
595
+ "allow_tf32": str(kwargs.get("ALLOW_TF32", None)),
596
+ "acc_type": str(kwargs.get("ACC_TYPE", None)),
597
+ },
598
+ )
599
+
600
+
601
+ class ExternKernelChoice:
602
+ def __init__(
603
+ self,
604
+ kernel,
605
+ cpp_kernel=None,
606
+ *,
607
+ name=None,
608
+ has_out_variant=True,
609
+ op_overload=None,
610
+ use_fallback_kernel=False,
611
+ ):
612
+ super().__init__()
613
+ name = name or kernel.__name__
614
+ assert callable(kernel)
615
+ assert not hasattr(extern_kernels, name), "duplicate extern kernel"
616
+ self.name = name
617
+ self.cpp_kernel_name = cpp_kernel
618
+ self.has_out_variant = has_out_variant
619
+ setattr(extern_kernels, name, kernel)
620
+ self.op_overload = op_overload
621
+ self.use_fallback_kernel = use_fallback_kernel
622
+
623
+ def to_callable(self):
624
+ return getattr(extern_kernels, self.name)
625
+
626
+ def call_name(self):
627
+ return f"extern_kernels.{self.name}"
628
+
629
+ @functools.lru_cache(None)
630
+ def hash_key(self):
631
+ fn = self.to_callable()
632
+ parts = [
633
+ self.name,
634
+ getattr(fn, "__name__", ""),
635
+ getattr(fn, "__module__", ""),
636
+ ]
637
+ try:
638
+ parts.append(inspect.getsource(fn))
639
+ except Exception:
640
+ pass
641
+ return code_hash("-".join(parts))
642
+
643
+ def bind(
644
+ self,
645
+ input_nodes,
646
+ layout,
647
+ ordered_kwargs_for_cpp_kernel=(),
648
+ **kwargs,
649
+ ):
650
+ self.ordered_kwargs_for_cpp_kernel = ordered_kwargs_for_cpp_kernel
651
+ return ExternKernelCaller(
652
+ self, input_nodes, layout, kwargs, has_out_variant=self.has_out_variant
653
+ )
654
+
655
+
656
+ class TritonTemplateCaller(ChoiceCaller):
657
+ def __init__(
658
+ self,
659
+ name,
660
+ input_nodes,
661
+ layout,
662
+ make_kernel_render,
663
+ debug_extra,
664
+ bmreq,
665
+ log_info: Optional[
666
+ Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]
667
+ ] = None,
668
+ ):
669
+ super().__init__(name, input_nodes, layout)
670
+ self.make_kernel_render = make_kernel_render
671
+ self.debug_extra = debug_extra
672
+ self.bmreq: TritonBenchmarkRequest = bmreq
673
+ if log_info is None:
674
+ log_info = {}
675
+ self.log_info: Dict[str, Any] = log_info
676
+ self.log_info.update(
677
+ {
678
+ "backend": "Triton",
679
+ "grid": str(self.bmreq.grid),
680
+ "num_stages": self.bmreq.num_stages,
681
+ "num_warps": self.bmreq.num_warps,
682
+ }
683
+ )
684
+
685
+ def benchmark(self, *args, out):
686
+ assert self.bmreq is not None
687
+ return self.bmreq.benchmark(*args, output_tensor=out)
688
+
689
+ def __str__(self):
690
+ return f"TritonTemplateCaller({self.bmreq.module_path}, {self.debug_extra})"
691
+
692
+ def call_name(self):
693
+ return f"template_kernels.{self.name}"
694
+
695
+ def hash_key(self):
696
+ return "-".join(
697
+ [
698
+ self.name.rsplit("_", 1)[0],
699
+ self.bmreq.module_cache_key,
700
+ ]
701
+ )
702
+
703
+ def output_node(self):
704
+ return ir.TensorBox.create(
705
+ ir.TritonTemplateBuffer(
706
+ layout=self.layout,
707
+ inputs=self.input_nodes,
708
+ make_kernel_render=self.make_kernel_render,
709
+ )
710
+ )
711
+
712
+ def info_dict(self) -> Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]:
713
+ """Information returned here is logged to the autotune log file when that is enabled."""
714
+ return self.log_info
715
+
716
+
717
+ class ExternKernelCaller(ChoiceCaller):
718
+ def __init__(
719
+ self,
720
+ choice: ExternKernelChoice,
721
+ input_nodes,
722
+ layout,
723
+ kwargs=None,
724
+ *,
725
+ has_out_variant=True,
726
+ ):
727
+ super().__init__(choice.name, input_nodes, layout)
728
+ self.choice = choice
729
+ self.kwargs = kwargs or {}
730
+ self.has_out_variant = has_out_variant
731
+
732
+ def __str__(self):
733
+ return f"ExternKernelCaller({self.choice.call_name()})"
734
+
735
+ def benchmark(self, *args, out):
736
+ if self.has_out_variant:
737
+ return super().benchmark(*args, out=out)
738
+ else:
739
+ algo = self.to_callable()
740
+ out_new = algo(*args)
741
+ torch._C._dynamo.guards.assert_size_stride(
742
+ out_new, tuple(out.size()), tuple(out.stride())
743
+ )
744
+ out.copy_(out_new) # for correctness checking
745
+ return do_bench(lambda: algo(*args))
746
+
747
+ def to_callable(self):
748
+ fn = self.choice.to_callable()
749
+ if self.kwargs:
750
+ return functools.partial(fn, **self.kwargs)
751
+ else:
752
+ return fn
753
+
754
+ def hash_key(self):
755
+ return "-".join(
756
+ [
757
+ self.choice.name,
758
+ *[
759
+ f"{kwarg}={repr(self.kwargs[kwarg])}"
760
+ for kwarg in sorted(self.kwargs.keys())
761
+ ],
762
+ self.choice.hash_key(),
763
+ ]
764
+ )
765
+
766
+ def output_node(self):
767
+ if config.abi_compatible and self.choice.use_fallback_kernel:
768
+ assert (
769
+ self.choice.op_overload is not None
770
+ ), "Please provide an op_overload to use ir.FallbackKernel"
771
+ inner = ir.FallbackKernel.create(
772
+ self.choice.op_overload, *self.input_nodes, **self.kwargs
773
+ )
774
+ else:
775
+ cls = ir.ExternKernelOut if self.has_out_variant else ir.ExternKernelAlloc
776
+ inner = cls(
777
+ layout=self.layout,
778
+ inputs=self.input_nodes,
779
+ python_kernel_name=self.choice.call_name(),
780
+ cpp_kernel_name=self.choice.cpp_kernel_name,
781
+ ordered_kwargs_for_cpp_kernel=self.choice.ordered_kwargs_for_cpp_kernel,
782
+ op_overload=self.choice.op_overload,
783
+ kwargs=self.kwargs,
784
+ )
785
+
786
+ return ir.TensorBox.create(inner)
787
+
788
+ def info_dict(self) -> Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]:
789
+ """Information returned here is logged to the autotune log file when that is enabled."""
790
+ return {
791
+ "backend": "extern",
792
+ "kernel_call_name": self.choice.call_name(),
793
+ }
794
+
795
+
796
+ class ErrorFromChoice(RuntimeError):
797
+ def __init__(self, msg, choice: ChoiceCaller, inputs_str):
798
+ msg += f"\nFrom choice {choice}\n{inputs_str}"
799
+ super().__init__(msg)
800
+ self.choice = choice
801
+
802
+
803
+ class AlgorithmSelectorCache(PersistentCache):
804
+ def __call__(
805
+ self,
806
+ name,
807
+ choices: List[ChoiceCaller],
808
+ input_nodes,
809
+ layout,
810
+ # optional dict mapping arg indices to the functions
811
+ # generating a torch.Tensor for that input from the
812
+ # corresponding ir.Buffer. if passed for a given
813
+ # arg, the function will be called instead of
814
+ # generating a random torch.Tensor for benchmarking.
815
+ input_gen_fns: Optional[Dict[int, Callable[[ir.Buffer], torch.Tensor]]] = None,
816
+ precompilation_timeout_seconds: int = 60 * 60,
817
+ ):
818
+ from .codegen.cuda.cuda_kernel import CUDATemplateCaller
819
+
820
+ # TODO(nmacchioni): remove once CI tests are fixed
821
+ choices = [choice for choice in choices if choice is not None]
822
+ if len(choices) == 0:
823
+ raise RuntimeError(
824
+ "No choices to select, please consider adding ATEN into max_autotune_gemm_backends "
825
+ "config (defined in torch/_inductor/config.py) to allow at least one choice. "
826
+ )
827
+ log.debug("Max autotune selects from %s choices.", str(len(choices)))
828
+
829
+ if len(choices) == 1:
830
+ if not isinstance(choices[0], CUDATemplateCaller):
831
+ # CUDATemplateCaller still needs to go through autotuning process to retrieve workspace size.
832
+ return choices[0].output_node()
833
+
834
+ @functools.lru_cache(None)
835
+ def make_benchmark_fn():
836
+ return self.make_benchmark_fn(choices, input_nodes, layout, input_gen_fns)
837
+
838
+ def precompile(choices):
839
+ if (
840
+ precompilation_timeout_seconds is None
841
+ or precompilation_timeout_seconds <= 0
842
+ ):
843
+ return
844
+ num_workers = min(
845
+ config.compile_threads,
846
+ torch.get_num_threads(),
847
+ len(choices),
848
+ )
849
+ if num_workers <= 0:
850
+ return
851
+ log.info(
852
+ "Multithreaded precompilation for %d choices using %d worker threads",
853
+ len(choices),
854
+ num_workers,
855
+ )
856
+ with ThreadPoolExecutor(max_workers=num_workers) as executor:
857
+ futures = executor.map(
858
+ lambda c: c.precompile(),
859
+ [c for c in choices if hasattr(c, "precompile")],
860
+ timeout=precompilation_timeout_seconds,
861
+ )
862
+ try:
863
+ iterator = iter(futures)
864
+ while True:
865
+ try:
866
+ next(iterator)
867
+ except CUDACompileError:
868
+ log.error( # noqa: G201
869
+ "CUDA Compilation error", exc_info=True
870
+ )
871
+ except TimeoutError:
872
+ log.warning(
873
+ f"Precompilation timed out after {precompilation_timeout_seconds} seconds." # noqa: G004
874
+ )
875
+ except StopIteration:
876
+ pass
877
+ executor.shutdown(wait=True)
878
+
879
+ def autotune(choices):
880
+ try:
881
+ precompile(choices)
882
+ except TimeoutError:
883
+ log.warning(
884
+ "Precompilation phase took longer than timeout allowed. Continuing"
885
+ )
886
+ pass
887
+ return make_benchmark_fn()(choices)
888
+
889
+ if config.autotune_in_subproc:
890
+ from .autotune_process import tuning_pool
891
+
892
+ # do the optional warmup
893
+ tuning_pool.initialize()
894
+
895
+ autotune_start_ts = time.time()
896
+ timings = self.lookup(
897
+ choices,
898
+ name,
899
+ repr([self.key_of(x) for x in input_nodes]),
900
+ autotune,
901
+ )
902
+ autotune_elapse = time.time() - autotune_start_ts
903
+ if timings == {} or choices[0] not in timings:
904
+ return choices[0].output_node()
905
+
906
+ if make_benchmark_fn.cache_info().currsize:
907
+ counters["inductor"]["select_algorithm_autotune"] += 1
908
+ if (
909
+ make_benchmark_fn.cache_info().currsize
910
+ or log.getEffectiveLevel() == logging.DEBUG
911
+ or config.trace.log_autotuning_results
912
+ ):
913
+ self.log_results(name, input_nodes, timings, autotune_elapse)
914
+ selected_choice = builtins.min(timings, key=timings.__getitem__).output_node()
915
+ log.debug("selected choice: %s", str(selected_choice))
916
+ return selected_choice
917
+
918
    @classmethod
    def make_benchmark_fn(
        cls,
        choices,
        input_nodes,
        layout,
        input_gen_fns=None,
    ):
        """
        Build the benchmarking callable used to time autotuning candidates.

        Creates concrete example tensors for ``input_nodes`` (via
        ``input_gen_fns`` overrides or ``benchmark_example_value``), an output
        buffer from ``layout``, and returns a function mapping a list of
        choices to a ``{choice: time}`` dict. Depending on
        ``config.autotune_in_subproc``, Triton candidates may be benchmarked
        in a subprocess while extern/ATen candidates always run in-process.
        """
        if input_gen_fns is None:
            input_gen_fns = {}

        # de-duplicate args
        # Keyed by buffer name so that the same IR buffer appearing multiple
        # times among input_nodes materializes only one example tensor.
        unique_example_inputs = {
            x.get_name(): input_gen_fns.get(i, cls.benchmark_example_value)(x)
            for i, x in enumerate(input_nodes)
        }
        example_inputs = list(unique_example_inputs.values())
        # Re-view each base tensor with the node's exact size/stride/offset;
        # extern (ATen) kernels expect the sliced view, not the base storage.
        example_inputs_extern = [
            torch.as_strided(
                unique_example_inputs[input_node.get_name()],
                V.graph.sizevars.size_hints(
                    input_node.get_size(),
                    fallback=config.unbacked_symint_fallback,
                ),
                V.graph.sizevars.size_hints(
                    input_node.get_stride(),
                    fallback=config.unbacked_symint_fallback,
                ),
                V.graph.sizevars.size_hint(
                    input_node.get_layout().offset,
                    fallback=config.unbacked_symint_fallback,
                ),
            )
            for input_node in input_nodes
        ]

        out = cls.benchmark_example_value(layout)
        out_extern = torch.as_strided(
            out, out.size(), out.stride(), V.graph.sizevars.size_hint(layout.offset)
        )
        if VERIFY:
            # Use the first choice as the reference implementation for
            # numerical verification of all subsequent choices.
            choices[0].benchmark(*example_inputs_extern, out=out_extern)
            expected = out_extern.clone()

        if DEBUG:
            print(f"{len(choices)} tuning requests:")

        def debug_str():
            # Render the example inputs/output as reproducible
            # torch.empty_strided(...) source lines for error reports.
            def tensor_repr(x):
                return (
                    f"torch.empty_strided({tuple(x.size())!r}, {tuple(x.stride())!r}, "
                    f"dtype={x.dtype!r}, device={x.device.type!r})"
                )

            lines = [
                "inputs = [",
            ]
            for x in example_inputs:
                lines.append(f" {tensor_repr(x)},")
            lines += ["]", f"out = {tensor_repr(out)}", ""]
            return "\n".join(lines)

        def benchmark_choice_in_current_process(choice):
            # Zero the output so VERIFY comparisons are not polluted by a
            # previous candidate's result.
            out.zero_()
            if isinstance(choice, ExternKernelCaller):
                # aten kernels want the offset baked in for sliced tensors
                result = choice.benchmark(*example_inputs_extern, out=out_extern)
            else:
                # triton templates want the base pointer for sliced tensors
                result = choice.benchmark(*example_inputs, out=out)
            if VERIFY:
                torch.testing.assert_close(out_extern, expected, **VERIFY)
            torch.cuda.synchronize()  # shake out any CUDA errors
            return result

        def benchmark_in_current_process(choices):
            # Time every choice; compile failures and GPU-capacity errors are
            # soft failures (inf time) so autotuning can continue, while
            # other errors are escalated with a reproducer string.
            timings = {}
            for choice in choices:
                try:
                    timing = benchmark_choice_in_current_process(choice)
                except CUDACompileError as e:
                    log.warning(
                        "CUDA compilation error: \n%s. \nIgnore this choice.", str(e)
                    )
                    timing = float("inf")
                except RuntimeError as e:
                    msg = str(e)
                    if "invalid argument" in msg:
                        msg += "\n\nThis may mean this GPU is too small for max_autotune mode.\n\n"
                        log.warning(msg)
                        timing = float("inf")
                    else:
                        if "illegal memory access" in msg:
                            msg += "\n\nEither error in template or triton bug.\n"
                        raise ErrorFromChoice(msg, choice, debug_str())  # noqa: TRY200
                except AssertionError as e:
                    raise AssertionError(  # noqa: TRY200
                        f"Incorrect result from choice {choice}\n\n{e}"
                    )

                timings[choice] = timing

            return timings

        def benchmark_in_sub_process(choices):
            from . import autotune_process

            # only benchmark triton kernel in sub process for now.
            # ATen/Extern kernel are still benchmarked in the current process.
            extern = [c for c in choices if isinstance(c, ExternKernelCaller)]
            triton = [c for c in choices if not isinstance(c, ExternKernelCaller)]

            timings = benchmark_in_current_process(extern)
            timings.update(autotune_process.benchmark_in_sub_process(triton))
            return timings

        benchmark = (
            benchmark_in_sub_process
            if config.autotune_in_subproc
            else benchmark_in_current_process
        )

        return benchmark
1041
+
1042
    @staticmethod
    def log_results(
        name: str,
        input_nodes: List[ir.IRNode],
        timings: Dict[ChoiceCaller, float],
        elapse: float,
    ):
        """
        Record and optionally print autotuning results.

        Always forwards the results to the debug tracer; additionally prints
        a human-readable leaderboard to stderr when max-autotune is enabled
        and PRINT_AUTOTUNE is set.

        Args:
            name: kernel/op name being autotuned.
            input_nodes: IR inputs, used only to render their sizes.
            timings: per-choice benchmark time in milliseconds.
            elapse: total wall-clock seconds spent autotuning.
        """
        V.debug.log_autotuning_results(name, input_nodes, timings, elapse)
        if not (config.max_autotune or config.max_autotune_gemm) or not PRINT_AUTOTUNE:
            return
        # e.g. "64x128, 128x32" — one "AxBxC" entry per input node.
        sizes = ", ".join(
            [
                "x".join(
                    map(
                        str,
                        V.graph.sizevars.size_hints(
                            n.get_size(), fallback=config.unbacked_symint_fallback
                        ),
                    )
                )
                for n in input_nodes
            ]
        )
        # Show all entries at DEBUG level, otherwise only the top 10.
        n = None if log.getEffectiveLevel() == logging.DEBUG else 10
        top_k = sorted(timings, key=timings.__getitem__)[:n]
        best = top_k[0]
        best_time = timings[best]
        sys.stderr.write(f"AUTOTUNE {name}({sizes})\n")
        for choice in top_k:
            result = timings[choice]
            if result:
                # Relative column: best_time/result, 100% == the winner.
                sys.stderr.write(
                    f" {choice.name} {result:.4f} ms {best_time/result:.1%}\n"
                )
            else:
                # Guard against a zero timing (would divide by zero above).
                sys.stderr.write(
                    f" {choice.name} {result:.4f} ms <DIVIDED BY ZERO ERROR>\n"
                )

        autotune_type_str = (
            "SubProcess" if config.autotune_in_subproc else "SingleProcess"
        )
        sys.stderr.write(f"{autotune_type_str} AUTOTUNE takes {elapse:.4f} seconds\n")
1085
+
1086
    @staticmethod
    def benchmark_example_value(node):
        """
        Convert an ir.Buffer into a concrete torch.Tensor we can use for
        benchmarking.
        """
        # A bare Layout may be passed (e.g. for the output); wrap it in a
        # throwaway buffer so the size/stride accessors below work uniformly.
        if isinstance(node, ir.Layout):
            node = ir.Buffer("fake", node)
        # triton templates want the base tensor.
        if isinstance(node, ir.BaseView):
            node = node.unwrap_view()
        # preserve rng states to avoid the rand_strided call below changes
        # the rng states for the real model code.
        with preserve_rng_state():
            return rand_strided(
                V.graph.sizevars.size_hints(
                    node.get_size(),
                    fallback=config.unbacked_symint_fallback,
                ),
                V.graph.sizevars.size_hints(
                    node.get_stride(),
                    fallback=config.unbacked_symint_fallback,
                ),
                device=node.get_device(),
                dtype=node.get_dtype(),
                # Allocate extra room so the layout's storage offset is valid.
                extra_size=node.layout.offset,
            )
1113
+
1114
    @staticmethod
    def key_of(node):
        """
        Extract the pieces of an ir.Buffer that we should invalidate cached
        autotuning results on.
        """
        sizevars = V.graph.sizevars
        # Flat tuple: device type, dtype, then concrete size hints, stride
        # hints, and storage offset.  Any change to these invalidates a
        # previously cached autotune selection.
        return (
            node.get_device().type,
            str(node.get_dtype()),
            *sizevars.size_hints(
                node.get_size(),
                fallback=config.unbacked_symint_fallback,
            ),
            *sizevars.size_hints(
                node.get_stride(),
                fallback=config.unbacked_symint_fallback,
            ),
            sizevars.size_hint(
                node.get_layout().offset,
                fallback=config.unbacked_symint_fallback,
            ),
        )
1137
+
1138
+
1139
# Process-wide singleton; created lazily on the first autotune request.
_ALGORITHM_SELECTOR_CACHE: Optional[AlgorithmSelectorCache] = None


def autotune_select_algorithm(*args, **kwargs):
    """Module-level entry point for algorithm selection.

    Lazily instantiates a single shared AlgorithmSelectorCache and forwards
    all positional/keyword arguments to it, so repeated autotune calls reuse
    previously benchmarked results.
    """
    global _ALGORITHM_SELECTOR_CACHE
    cache = _ALGORITHM_SELECTOR_CACHE
    if cache is None:
        cache = AlgorithmSelectorCache()
        _ALGORITHM_SELECTOR_CACHE = cache
    return cache(*args, **kwargs)
1147
+
1148
+
1149
def realize_inputs(*args):
    """Realize IR inputs and force a stride-1 innermost dimension.

    A single argument is realized and returned directly; multiple arguments
    are processed element-wise, yielding a list of realized nodes.
    """
    if len(args) != 1:
        return [realize_inputs(x) for x in args]
    (node,) = args
    return ir.ExternKernel.require_stride1(ir.ExternKernel.realize_input(node))
1153
+
1154
+
1155
+ # ensure lowering is imported so that `extern_kernels.*` is populated
1156
+ from . import lowering # noqa: F401
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/sizevars.py ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import itertools
3
+ import logging
4
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
5
+
6
+ import sympy
7
+ from sympy import Expr
8
+
9
+ from torch.fx.experimental.symbolic_shapes import ShapeEnv
10
+ from torch.utils._sympy.functions import FloorDiv, ModularIndexing
11
+ from torch.utils._sympy.value_ranges import bound_sympy
12
+
13
+ from .utils import sympy_index_symbol, sympy_subs, VarRanges
14
+ from .virtualized import V
15
+
16
# Module-level logger used for size-var simplification diagnostics.
log = logging.getLogger(__name__)
17
+
18
+
19
+ # This class is a little awkward, because ShapeEnv is doing most of the heavy
20
+ # lifting and in some cases we should be directly passing through to ShapeEnv,
21
+ # but there is some extra inductor logic that needs to be handled here
22
class SizeVarAllocator:
    """Inductor-side wrapper over ShapeEnv for symbolic size reasoning.

    Adds caching, index-expression simplification, precomputed-size ("ps")
    replacement bookkeeping, and guard/size-hint helpers on top of the
    underlying ShapeEnv.
    """

    def __init__(self, shape_env=None):
        super().__init__()
        if shape_env is None:
            shape_env = ShapeEnv()
        self.shape_env = shape_env
        # Live views into the ShapeEnv's state (shared, not copied).
        self.var_to_val = self.shape_env.var_to_val
        self.replacements: Dict[sympy.Symbol, Expr] = self.shape_env.replacements
        # Maps of dynamic sizes that have to be precomputed on the host to the kernel args.
        # The basic idea is if we have some complicated sympy expression
        # f(s0), we may choose to precompute it on the host and then replace
        # all occurrences of that sympy expression with ps0, so that when we
        # codegen we simply reference ps0 directly without repeating
        # f(s0). Unlike regular size variables, ps variables cannot be
        # guarded upon; so if we are asked to guard on a Sympy expression
        # which potentially could have already had a precomputed replacement
        # on it, we are obligated to invert the precomputed replacements
        # (inv_precomputed_replacements).
        self.precomputed_replacements: Dict[Expr, sympy.Symbol] = dict()
        self.inv_precomputed_replacements: Dict[sympy.Symbol, Expr] = dict()
        # Memoized entry points (caches invalidate when replacements change).
        self.stride_vars = self.make_stride_vars_cache()
        self.simplify_with_ranges = self.make_simplify_with_ranges_cache()
        self._simplify_loops = self.make_simplify_loops_cache()

    def simplify(self, expr: Expr):
        # Expand then apply all known symbol replacements.
        return sympy.expand(expr).xreplace(self.replacements)

    def make_simplify_with_ranges_cache(self) -> Callable[[Expr, VarRanges], Expr]:
        """
        self._simplify_with_ranges() can be expensive, cache its results
        """
        cache: Dict[Tuple[Any, ...], Expr] = dict()
        replacement_count = len(self.replacements)

        def simplify_with_ranges(expr: Expr, var_ranges: VarRanges) -> Expr:
            nonlocal replacement_count
            if replacement_count != len(self.replacements):
                # new replacements invalidates cached results
                cache.clear()
                replacement_count = len(self.replacements)
            key = (expr, *var_ranges.items())
            result = cache.get(key, None)
            if result is None:
                result = self._simplify_with_ranges(expr, var_ranges)
                cache[key] = result
            return result

        return simplify_with_ranges

    def make_simplify_loops_cache(self):
        """
        self._simplify_with_ranges() can be expensive, cache its results
        """
        cache: Dict[Tuple[Any, ...], Any] = dict()
        replacement_count = len(self.replacements)

        def simplify_loops(index_vars, sizes, index_formulas):
            nonlocal replacement_count
            if replacement_count != len(self.replacements):
                # new replacements invalidates cached results
                cache.clear()
                replacement_count = len(self.replacements)
            key = (*index_vars, *sizes, *index_formulas)
            result = cache.get(key, None)
            if result is None:
                result = self._simplify_loops_impl(index_vars, sizes, index_formulas)
                cache[key] = result
            return result

        return simplify_loops

    def _simplify_with_ranges(self, expr: Expr, var_ranges: VarRanges) -> Expr:
        """
        Simplify indexing expression with knowledge of the ranges of
        iteration variables.
        """

        expr = join_dimensions(self.simplify(expr))
        original_expr = expr

        def remove_zero_terms(base, divisor):
            """Symbols smaller than the divisor are zero"""
            for v in base.free_symbols:
                if v in var_ranges:
                    # var smaller than divisor can be removed
                    # if the rest is guaranteed to be multiple of divisor
                    rest = sympy.Wild("_rest", exclude=[v])
                    m = base.match(v + rest)
                    if m and v not in m[rest].free_symbols:
                        gcd = sympy.gcd(m[rest], divisor)
                        if gcd == divisor:
                            if self.statically_known_leq(var_ranges[v], divisor):
                                base = m[rest]
            return base

        def visit_indexing_div(base, divisor):
            return FloorDiv(remove_zero_terms(base, divisor), divisor)

        def visit_modular_indexing(base, divisor, modulus):
            # Try to prove the modulus never triggers so ModularIndexing can
            # be replaced by a plain FloorDiv.
            base = remove_zero_terms(base, divisor)
            base_pos = True
            if isinstance(base, ModularIndexing):
                # for modular indexing, biggest values from the ranges don't necessarily result in
                # the biggest result, the biggest result is modulus - 1
                base_s = base.args[2] - 1
            elif not base.has(ModularIndexing):
                # actual iteration range is to size-1
                iter_ranges_zero = {k: 0 for k, v in var_ranges.items()}
                base_lowest = sympy_subs(base, iter_ranges_zero)
                if self.statically_known_leq(0, base_lowest):  # type: ignore[arg-type]
                    # can't replace with indexing div if base can be negative
                    base_pos = True
                else:
                    base_pos = False
                iter_ranges = {k: v - 1 for k, v in var_ranges.items()}
                base_s = sympy_subs(base, iter_ranges)
            else:
                base_s = base
            if self.statically_known_lt(base_s, modulus * divisor) and base_pos:
                return FloorDiv(base, divisor)
            return ModularIndexing(base, divisor, modulus)

        if expr.has(ModularIndexing):
            expr = expr.replace(
                ModularIndexing(
                    sympy.Wild("base"),
                    sympy.Wild("divisor"),
                    sympy.Wild("modulus"),
                ),
                visit_modular_indexing,
            )

        if expr.has(FloorDiv):
            expr = expr.replace(
                FloorDiv(
                    sympy.Wild("base"),
                    sympy.Wild("divisor"),
                ),
                visit_indexing_div,
            )

        # Iterate to a fixed point: a rewrite may expose further rewrites.
        if expr != original_expr:
            return self._simplify_with_ranges(expr, var_ranges)
        return expr

    def _simplify_loops_impl(
        self, index_vars: List[sympy.Symbol], sizes, index_formulas
    ):
        """
        Try to remove as many axis from loop iterations as possible, by:
        1) removing size==1 dimensions
        2) fuse contiguous dimensions into a single loop

        Returns (new_sizes, reindex, prune) where removed dims are marked
        with None internally, `reindex` maps new index vars back to the old
        rank, and `prune` drops removed dims from an old-rank index list.
        """
        sizes = list(map(self.simplify, sizes))

        strides = [self.stride_vars(x, index_vars) for x in index_formulas]
        assert len(sizes) == len(strides[0]), (len(sizes), len(strides[0]))

        for i in range(len(sizes)):
            if sizes[i] == 1:
                # remove dim
                sizes[i] = None

        def can_merge_dims(a, b):
            # Dims a and b are fusable iff, in every index formula, dim b is
            # exactly "dim a scaled by size[a]" (i.e. they are contiguous).
            for k in range(len(strides)):
                if self.simplify(strides[k][a] * sizes[a]) == self.simplify(
                    strides[k][b]
                ):
                    # approximate test passed, try sound version
                    va = index_vars[a]
                    vb = index_vars[b]
                    v = sympy_index_symbol("_merge_tester")
                    expr1 = sympy_subs(index_formulas[k], {va: v * sizes[a], vb: 0})
                    expr2 = sympy_subs(index_formulas[k], {va: 0, vb: v})
                    if self.simplify(expr1) == self.simplify(expr2):
                        continue
                return False
            return True

        changed = True
        while changed:
            changed = False
            for i, j in itertools.product(
                reversed(range(len(sizes))), reversed(range(len(sizes)))
            ):
                if i == j or sizes[i] is None or sizes[j] is None:
                    continue
                if can_merge_dims(i, j):
                    changed = True
                    sizes[i] = sizes[i] * sizes[j]
                    sizes[j] = None

        def reindex(index):
            # Map an index over the surviving dims back to the original
            # rank, inserting 0 for removed dims.
            it = list(reversed(index))
            new_index = []
            for size in sizes:
                if size is None:
                    new_index.append(sympy.Integer(0))
                else:
                    new_index.append(it.pop())
            assert not it
            return new_index

        def prune(index):
            # Drop entries corresponding to removed dims.
            assert len(index) == len(sizes)
            return [i for i, s in zip(index, sizes) if s is not None]

        return [x for x in sizes if x is not None], reindex, prune

    # Note - [On Statically Known]
    #
    # The statically_known_* family of functions below replaces a prior system, called maybe_guard_*. The prior system
    # operated by providing essentially a question, where the size hinted values were evaluated. If the condition was
    # true, we add a guard and return True, otherwise, False.
    #
    # def maybe_guard_foo(args):
    #   if size_hinted_check(args):
    #     return False # No guard, no optim
    #   guard(args) # Make a guard
    #   return True # Safe to apply optimization
    #
    # The prior system incurred a guard, and green lit an optimization.
    #
    # The new system works in reverse - in the new system, if we know that the inputs are static, and evaluate the
    # condition as true, we green light the optimization, and we do not incur a guard. If we cannot prove that, we
    # return False.
    #
    # def maybe_guard_foo(args):
    #   if all_static(args):
    #     return True # Safe to apply optimization
    #   else:
    #     return False # No guard, no optim

    # See Note - [On Statically Known]

    def is_expr_static_and_true(self, expr: Union[Expr, int]) -> bool:
        # True only when the expression statically evaluates to a truthy
        # value; never adds a guard.
        if expr in (True, False):
            return bool(expr)

        try:
            simplified = self.shape_env._maybe_evaluate_static(expr)
            if simplified is not None:
                return bool(simplified)
        except Exception:
            log.debug("Could not simplify %s", expr)

        return False

    def statically_known_equals(self, left: Expr, right: Expr) -> bool:
        """
        Returns a bool indicating if it is sound to optimize as if left and right are equal.
        """
        return self.is_expr_static_and_true(sympy.Eq(left, right))  # type: ignore[arg-type]

    # See Note - [On Statically Known]
    def statically_known_list_equals(self, left: List[Expr], right: List[Expr]) -> bool:
        """
        Returns a bool indicating if it is sound to optimize as if left and right lists are equal.
        """
        if len(left) != len(right):
            return False
        if all(self.statically_known_equals(l, r) for l, r in zip(left, right)):
            return True
        return False

    # See Note - [On Statically Known]
    def statically_known_leq(self, left: Expr, right: Expr) -> bool:
        """
        Returns a bool indicating if it is sound to optimize as if left is less than or equal to right.
        """
        expr = left <= right
        return self.is_expr_static_and_true(expr)

    # See Note - [On Statically Known]
    def statically_known_lt(self, left: Expr, right: Expr) -> bool:
        """
        Returns a bool indicating if it is sound to optimize as if left is less than right.
        """
        expr = left < right
        return self.is_expr_static_and_true(expr)

    # See Note - [On Statically Known]
    def statically_known_multiple_of(self, numerator: Expr, denominator: Expr) -> bool:
        """
        Return a bool indicating if it is sound to optimize for the numerator being a multiple of the denominator.
        """
        expr = sympy.Eq(numerator % denominator, 0)
        return self.is_expr_static_and_true(expr)  # type: ignore[arg-type]

    # The guard functions require you to ALREADY KNOW that a particular
    # condition holds. If you don't know (you want to guard on an expression
    # being a particular value, and then get access to that value), use
    # the evaluate functions.

    def guard_equals(self, left: Expr, right: Expr) -> Expr:
        # ps variables cannot be guarded on, so invert them first.
        if isinstance(left, Expr):
            left = sympy_subs(left, self.inv_precomputed_replacements)  # type: ignore[arg-type]
        if isinstance(right, Expr):
            right = sympy_subs(right, self.inv_precomputed_replacements)  # type: ignore[arg-type]
        assert self.shape_env.evaluate_expr(sympy.Eq(left, right))
        return left

    def guard_leq(self, left: Expr, right: Expr) -> None:
        # left <= right  <=>  left < right + 1 (integer domain assumed).
        return self.guard_lt(left, right + 1)

    def guard_lt(self, left: Expr, right: Expr) -> None:
        assert self.shape_env.evaluate_expr(sympy.Lt(left, right))

    def expect_true(self, expr: Expr, *, msg: str) -> None:
        # Defer the check to a runtime assert instead of a compile-time guard.
        expr = sympy_subs(expr, self.inv_precomputed_replacements)  # type: ignore[arg-type]
        self.shape_env.defer_runtime_assert(expr, msg, fx_node=None)

    def expect_equals(self, left: Expr, right: Expr, *, msg: str) -> Expr:
        # Prefer returning the expression without unbacked symints
        if self.shape_env.is_unbacked_symint(left):
            self.expect_true(sympy.Eq(left, right), msg=msg)  # type: ignore[arg-type]
            return right
        elif self.shape_env.is_unbacked_symint(right):
            self.expect_true(sympy.Eq(left, right), msg=msg)  # type: ignore[arg-type]
            return left
        else:
            return self.guard_equals(left, right)

    def guarded_order(self, seq):
        """
        Return the order of a sequence as a permutation of range(len(seq)) and guard on that order not changing.
        Used for generating block_ptrs.
        """
        seq = [*map(self.remove_precomputed_replacements, seq)]
        # Sort by concrete hint, remembering each element's original slot.
        seq = [(self.size_hint(var), orig_idx, var) for orig_idx, var in enumerate(seq)]
        seq.sort()
        order = [-1] * len(seq)
        last_var = None
        for new_index, (_, orig_index, var) in enumerate(seq):
            order[orig_index] = new_index
            if last_var is not None:
                # Guard that consecutive sorted elements keep their ordering.
                self.guard_leq(last_var, var)
            last_var = var
        return order

    # The evaluate functions evaluate some symbolic sympy expression
    # (NB: not necessarily an Expr) and return what the concrete result
    # is, guarding on the expression being that result

    # NB: write evaluate_expr(sympy.Lt(a, b)) rather than evaluate_expr(a < b)
    # as this will ensure that you actually have a sympy'ified expression,
    # and will prevent you from incorrectly writing evaluate_expr(a == b)
    # which does the wrong thing if a or b is a sympy expression
    def evaluate_expr(self, left: Union[Expr, sympy.logic.boolalg.Boolean]) -> bool:
        assert isinstance(left, (Expr, sympy.logic.boolalg.Boolean)), type(left)
        return self.shape_env.evaluate_expr(sympy.sympify(left))

    def evaluate_min(self, left: Expr, right: Expr) -> Expr:
        """return the smaller of left and right, and guard on that choice"""
        lv = self.size_hint(left)
        rv = self.size_hint(right)
        if lv <= rv:
            self.guard_leq(left, right)
            return left
        else:
            self.guard_leq(right, left)
            return right

    def evaluate_max(self, left: Expr, right: Expr) -> Expr:
        """return the larger of left and right, and guard on that choice"""
        # Always choose the opposite of eval min for consistency
        # This means min(a, b) and max(a, b) produce the same guards
        min_val = self.evaluate_min(left, right)
        return right if min_val is left else left

    def evaluate_static_shape(self, left: Expr) -> int:
        # Pin the symbolic size to its current hint, guarding on equality.
        right = self.size_hint(left)
        self.guard_equals(left, sympy.Integer(right))
        return int(right)

    def evaluate_static_shapes(self, left: List[Expr]) -> List[int]:
        return [self.evaluate_static_shape(x) for x in left]

    def remove_precomputed_replacements(self, expr: Expr) -> Expr:
        # Replace ps* symbols with the host expressions they stand for.
        if any(s.name.startswith("ps") for s in expr.free_symbols):  # type: ignore[attr-defined]
            return sympy_subs(expr, self.inv_precomputed_replacements)  # type: ignore[arg-type]
        return expr

    def symbolic_hint(self, expr: Expr) -> Expr:
        # Substitute all hints into expr, but leave unbacked symints alone
        if not isinstance(expr, Expr):
            assert isinstance(expr, int)
            return expr
        free_symbols = expr.free_symbols
        if not free_symbols:
            return int(expr)  # type: ignore[return-value]
        expr = self.remove_precomputed_replacements(expr)
        return sympy_subs(expr, self.var_to_val)

    def size_hint(self, expr: Expr, *, fallback: Optional[int] = None) -> int:
        """Return a concrete int for expr; for unbacked symints use
        `fallback` clamped into the expression's known value range."""
        out = self.symbolic_hint(expr)
        if not isinstance(out, (int, sympy.Integer)) and fallback is not None:
            # Use the provided heuristic fallback hint
            sym_vrs = {
                s: self.shape_env.var_to_range.get(s, None) for s in expr.free_symbols
            }
            if all(vr is not None for vr in sym_vrs.values()):
                expr_vr = bound_sympy(expr, sym_vrs)  # type: ignore[arg-type]
                lower = self.size_hint(expr_vr.lower)  # type: ignore[arg-type]
                upper = self.size_hint(expr_vr.upper)  # type: ignore[arg-type]
                fallback = min(max(fallback, lower), upper)
            return fallback
        try:
            return int(out)
        except Exception:
            log.debug("failed on: %s", out)
            raise

    def size_hints(
        self,
        exprs: Iterable[Expr],
        *,
        fallback: Optional[int] = None,
    ) -> Tuple[int, ...]:
        return tuple(self.size_hint(x, fallback=fallback) for x in exprs)

    def _lru_cache(self, fn, maxsize=None):
        """
        Wrapper around functools.lru_cache that clears when replacements
        has been invalidated.
        """
        fn_cache = functools.lru_cache(maxsize)(fn)
        prior_len = len(self.replacements)

        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            nonlocal prior_len
            if prior_len != len(self.replacements):
                prior_len = len(self.replacements)
                fn_cache.cache_clear()
            return fn_cache(*args, **kwargs)

        return wrapper

    def make_stride_vars_cache(self):
        cache = self._lru_cache(self._stride_vars)

        def stride_vars(
            index: Expr,
            vars: List[sympy.Symbol],
            support_vars: Optional[List[sympy.Symbol]] = None,
        ) -> List[Expr]:
            # Tuples make the args hashable for the lru cache.
            if not support_vars:
                support_vars = vars
            return cache(index, tuple(vars), tuple(support_vars))

        return stride_vars

    def _stride_vars(
        self, index: Expr, vars: List[sympy.Symbol], support_vars: List[sympy.Symbol]
    ) -> List[Expr]:
        """Convert an indexing expression back into strides

        NOTE: This is only valid if the index is a standard strided offset
        calculation. e.g. 10 * ModularIndexing(i0 + 1, 1, 2) would give a
        stride of -10 because the index wraps around after the first element

        """
        strides = []
        index = self.simplify(index)
        # remove any offset
        index = index - sympy_subs(
            index, {v: sympy.Integer(0) for v in support_vars if v != 0}
        )
        for i in range(len(vars)):
            # drop all the other dims
            index_dim = sympy_subs(
                index,
                {
                    support_vars[j]: sympy.Integer(0)
                    for j in range(len(support_vars))
                    if vars[i] != support_vars[j] and support_vars[j] != 0
                },
            )
            v = vars[i]
            if v == 0:
                strides.append(sympy.Integer(0))
            else:
                # TODO(jansel): should we use sympy.diff here?
                strides.append(
                    sympy_subs(index_dim, {v: sympy.Integer(1)})
                    - sympy_subs(index_dim, {v: sympy.Integer(0)})
                )
        return strides

    def offset_var(self, index: Expr, vars: List[sympy.Symbol]) -> Expr:
        """Extract offset part of an indexing expression"""
        index = self.simplify(index)
        return sympy_subs(index, {v: sympy.Integer(0) for v in vars if v != 0})

    def stride_hints(
        self,
        index: Expr,
        vars: List[sympy.Symbol],
        support_vars: Optional[List[sympy.Symbol]] = None,
    ) -> List[int]:
        # Indirect-indexing symbols have no meaningful stride; zero them out.
        for v in index.free_symbols:
            if v.name.startswith("indirect"):  # type: ignore[attr-defined]
                index = sympy_subs(index, {v: 0})  # type: ignore[dict-item]
        result = []
        for s in self.stride_vars(index, vars, support_vars):
            try:
                result.append(self.size_hint(s))
            except TypeError:
                # Non-integer (symbolic) stride: report 0 as "unknown".
                result.append(0)
        return result

    def stride_order(self, index: Expr, vars: List[sympy.Symbol]) -> List[int]:
        # Order dims by |stride|, placing zero strides last.
        strides = tuple(map(abs, self.stride_hints(index, vars)))
        order = list(range(len(strides)))
        order.sort(key=lambda x: (strides[x] == 0, strides[x]))
        return order

    def lookup_precomputed_size(self, expr: Expr) -> Expr:
        # Trivial expressions don't need precomputation.
        if (
            isinstance(expr, (int, sympy.Symbol, sympy.Number))
            or expr.is_number
            or expr.is_symbol
        ):
            return expr
        expr = self.remove_precomputed_replacements(expr)
        if expr not in self.precomputed_replacements:
            sym = sympy_index_symbol(f"ps{len(self.precomputed_replacements)}")
            self.precomputed_replacements[expr] = sym
            self.inv_precomputed_replacements[sym] = expr
        return self.precomputed_replacements[expr]

    def free_symbols(self) -> Set[sympy.Symbol]:
        # Symbols with hints, minus the ones already eliminated by
        # replacements.
        return set(self.var_to_val.keys()) - set(self.replacements.keys())
557
+
558
+
559
def join_dimensions(expr: Expr) -> Expr:
    """Fuse split ModularIndexing/FloorDiv terms back into one dimension.

    Cheap pre-filter: only a sympy.Add containing ModularIndexing can match
    the fusion patterns, so everything else is returned untouched without
    paying for the cached rewrite.
    """
    if isinstance(expr, sympy.Add) and expr.has(ModularIndexing):
        return _join_dimensions_cached(expr)
    return expr  # fast exit path
563
+
564
+
565
@functools.lru_cache(256)
def _join_dimensions_cached(expr: Expr) -> Expr:
    """
    ModularIndexing(i0, 1, 32) + 32 * ModularIndexing(i0, 32, 4)
    becomes
    ModularIndexing(i0, 1, 128)
    ModularIndexing(i0, 1, 32) + 32 * FloorDiv(i0, 32)
    becomes i0


    This type of pattern can come from view operations
    """
    assert isinstance(expr, sympy.Add)

    # Wildcards for the pattern matching below; scale excludes 0 so a zero
    # coefficient never matches.
    scale = sympy.Wild("scale", exclude=[0])
    base = sympy.Wild("base")
    divisor = sympy.Wild("divisor")
    mod1 = sympy.Wild("modulus")
    mod2 = sympy.Wild("modulus2")
    # Pass 1: fuse two ModularIndexing terms covering adjacent ranges into a
    # single ModularIndexing with the combined modulus.
    for term1 in expr.args:
        m1 = term1.match(scale * ModularIndexing(base, divisor, mod1))
        if m1:
            for term2 in expr.args:
                m2 = term2.match(
                    m1[scale]
                    * m1[mod1]
                    * ModularIndexing(m1[base], m1[divisor] * m1[mod1], mod2)
                )
                if m2 and term1 != term2:
                    # Recurse on the rewritten sum: more fusions may now apply.
                    expr = join_dimensions(
                        expr
                        - term1
                        - term2
                        + m1[scale]
                        * ModularIndexing(m1[base], m1[divisor], m1[mod1] * m2[mod2])
                    )
                    return expr
    # Pass 2: fuse a ModularIndexing low part with its FloorDiv high part
    # into a plain FloorDiv (i.e. drop the now-redundant modulus).
    for term1 in expr.args:
        m1 = term1.match(scale * ModularIndexing(base, divisor, mod1))
        if m1:
            for term2 in expr.args:
                m2 = term2.match(
                    m1[scale] * m1[mod1] * FloorDiv(m1[base], m1[divisor] * m1[mod1])
                )
                if m2 is not None:  # in case of success we get an empty dict here
                    expr = join_dimensions(
                        expr
                        - term1
                        - term2
                        + m1[scale] * FloorDiv(m1[base], m1[divisor])
                    )
                    return expr
    return expr
618
+
619
+
620
class SimplifyIndexing(V.WrapperHandler):  # type: ignore[name-defined]
    """
    Ops-handler wrapper that simplifies every indexing expression
    (ModularIndexing/FloorDiv) using the known iteration-variable ranges
    before delegating to the wrapped handler.
    """

    def __init__(self, inner, var_ranges: VarRanges):
        super().__init__(inner)
        self.name = "SimplifyIndexing"

        def _simplify(index: Expr) -> Expr:
            # Capture var_ranges once; all ops below route through here.
            return V.graph.sizevars.simplify_with_ranges(index, var_ranges)

        self._simplify = _simplify

    def load(self, name: str, index: sympy.Expr):
        simplified = self._simplify(index)
        return self._inner.load(name, simplified)

    def store(self, name, index, value, mode=None):
        simplified = self._simplify(index)
        return self._inner.store(name, simplified, value, mode=mode)

    def store_reduction(self, name, index, value):
        simplified = self._simplify(index)
        return self._inner.store_reduction(name, simplified, value)

    def index_expr(self, index, dtype):
        simplified = self._simplify(index)
        return self._inner.index_expr(simplified, dtype)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/utils.py ADDED
@@ -0,0 +1,1428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import collections
4
+ import contextlib
5
+ import dataclasses
6
+ import enum
7
+ import functools
8
+ import getpass
9
+ import inspect
10
+ import io
11
+ import itertools
12
+ import logging
13
+ import math
14
+ import operator
15
+ import os
16
+ import platform
17
+ import re
18
+ import shutil
19
+ import sys
20
+ import tempfile
21
+ import textwrap
22
+ import time
23
+ import unittest
24
+ from dataclasses import fields
25
+ from datetime import datetime
26
+ from io import StringIO
27
+ from typing import (
28
+ Any,
29
+ Callable,
30
+ Dict,
31
+ Generic,
32
+ Iterable,
33
+ List,
34
+ NamedTuple,
35
+ Optional,
36
+ Protocol,
37
+ Set,
38
+ TypeVar,
39
+ Union,
40
+ ValuesView,
41
+ )
42
+ from unittest import mock
43
+
44
+ import sympy
45
+ from typing_extensions import Concatenate, ParamSpec
46
+
47
+ import torch
48
+ from torch._dynamo.device_interface import get_interface_for_device
49
+ from torch.autograd import DeviceType
50
+ from torch.autograd.profiler_util import EventList
51
+ from torch.utils._sympy.functions import CeilDiv, CleanDiv, FloorDiv, ModularIndexing
52
+ from . import config
53
+
54
+ log = logging.getLogger(__name__)
55
+
56
+ _T = TypeVar("_T")
57
+ VarRanges = Dict[sympy.Expr, sympy.Expr]
58
+
59
+
60
def do_bench_using_profiling(fn: Callable[[], Any], warmup=25, rep=100) -> float:
    """
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.

    Args:
        fn: zero-argument callable that launches the CUDA work to measure.
        warmup: target warmup time, in milliseconds.
        rep: target total measurement time, in milliseconds.

    Returns:
        Mean CUDA time per invocation of ``fn``, in milliseconds.
    """

    fn()
    torch.cuda.synchronize()
    # ~256MB buffer zeroed between runs to flush the L2 cache.
    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")

    # Estimate the runtime of the function
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(5):
        cache.zero_()
        fn()
    end_event.record()
    torch.cuda.synchronize()
    estimate_ms = start_event.elapsed_time(end_event) / 5

    # compute number of warmup and repeat
    n_warmup = max(1, int(warmup / estimate_ms))
    n_repeat = max(1, int(rep / estimate_ms))

    # Warm-up
    for _ in range(n_warmup):
        fn()

    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CUDA,
        ]
    ) as p:
        # Benchmark
        for i in range(n_repeat):
            # we clear the L2 cache before each run
            cache.zero_()
            # record time of `fn`
            fn()
        # Record clocks
        torch.cuda.synchronize()

    log.debug("raw events")
    log.debug(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))

    filtered_events = EventList(
        [
            event
            for event in p.events()
            if event.device_type == DeviceType.CUDA and event.name != "Context Sync"
        ]
    )
    if len(filtered_events) % n_repeat != 0:
        # BUGFIX: the message was previously passed logging-style
        # (format string plus bare args) to RuntimeError, which does not
        # interpolate; format it explicitly instead.
        raise RuntimeError(
            "Failed to divide all profiling events into #repeat groups. "
            f"#CUDA events: {len(filtered_events)}, #repeats: {n_repeat}"
        )
    # Exact integer division (divisibility checked above); avoids the float
    # modulo that true division would force in the filter below.
    num_event_per_group = len(filtered_events) // n_repeat
    actual_events = EventList(
        [
            event
            for i, event in enumerate(filtered_events)
            if i % num_event_per_group != 0
        ]
    )
    actual_events._build_tree()
    actual_events = actual_events.key_averages()

    log.debug("profiling time breakdown")
    log.debug(actual_events.table(row_limit=-1))

    res = sum(event.cuda_time_total for event in actual_events) / 1000.0 / n_repeat
    log.debug("profiling results: %s ms", res)
    return res
140
+
141
+
142
def do_bench(*args, **kwargs):
    """Wrapper over triton's do_bench that normalizes the quantile API."""

    @functools.lru_cache(None)
    def load_triton():
        try:
            # NB: Lazily load triton, as importing triton is slow
            # see https://github.com/openai/triton/issues/1599
            from triton.testing import do_bench as triton_do_bench
        except ImportError as exc:
            raise NotImplementedError("requires Triton") from exc

        # triton PR https://github.com/openai/triton/pull/1513 change the
        # quantile fields name from 'percentiles' to 'quantiles'
        # and change the default value from (0.5, 0.2, 0.8) to None.
        # This may break inductor since a caller expects a tuple may get a item.
        #
        # Add a wrapper to maintain the same behavior for inductor.
        # Maybe we should have own implementation of this function?
        params = inspect.signature(triton_do_bench).parameters
        field = "quantiles" if params.get("quantiles") is not None else "percentiles"
        return triton_do_bench, field

    triton_do_bench, quantile_field_name = load_triton()

    kwargs.setdefault(quantile_field_name, (0.5, 0.2, 0.8))
    return triton_do_bench(*args, **kwargs)[0]
171
+
172
+
173
@functools.lru_cache(None)
def has_torchvision_roi_align() -> bool:
    """True iff torchvision is importable and registers its roi_align op."""
    try:
        from torchvision.ops import roi_align  # noqa: F401
    except ImportError:
        return False
    torchvision_ns = getattr(torch.ops, "torchvision", None)
    return roi_align is not None and hasattr(torchvision_ns, "roi_align")
183
+
184
+
185
def conditional_product(*args):
    """Product of all truthy arguments (falsy values are skipped)."""
    factors = (value for value in args if value)
    return functools.reduce(operator.mul, factors)
187
+
188
+
189
+ def decode_device(device: Union[Optional[torch.device], str]) -> torch.device:
190
+ if device is None:
191
+ return torch.tensor(0.0).device # default device
192
+ if isinstance(device, str):
193
+ device = torch.device(device)
194
+ if device.type != "cpu" and device.index is None:
195
+ device_interface = get_interface_for_device(device.type)
196
+ return torch.device(device.type, index=device_interface.Worker.current_device())
197
+ return device
198
+
199
+
200
def sympy_product(it):
    """Multiply a sequence of sympy expressions; 1 for an empty sequence."""
    result = sympy.Integer(1)
    for term in it:
        result = result * term
    return result
202
+
203
+
204
def sympy_dot(seq1, seq2):
    """Expanded inner product of two equal-length sequences."""
    assert len(seq1) == len(seq2)
    total = sum(a * b for a, b in zip(seq1, seq2))
    return sympy.expand(total)
207
+
208
+
209
def unique(it: Iterable[_T]) -> ValuesView[_T]:
    """Deduplicate by object identity, preserving first-seen order."""
    seen = {}
    for obj in it:
        seen.setdefault(id(obj), obj)
    return seen.values()
211
+
212
+
213
+ def ceildiv(
214
+ numer: Union[int, sympy.Expr], denom: Union[int, sympy.Expr]
215
+ ) -> Union[int, sympy.Expr]:
216
+ if isinstance(numer, sympy.Expr) or isinstance(denom, sympy.Expr):
217
+ return CeilDiv(numer, denom)
218
+ # TODO: There is a bug in a call to this function, to repro:
219
+ # python benchmarks/dynamo/huggingface.py --inductor -d cuda --accuracy
220
+ # --amp --only YituTechConvBert --dynamic-shapes
221
+ assert isinstance(numer, int) and isinstance(
222
+ denom, int
223
+ ), f"{numer}: {type(numer)}, {denom}: {type(denom)}"
224
+ return -(numer // -denom)
225
+
226
+
227
def next_power_of_2(n: int) -> int:
    """Return the smallest power of 2 greater than or equal to n"""
    n -= 1
    # Smear the highest set bit into every lower position, then add one.
    for shift in (1, 2, 4, 8, 16, 32):
        n |= n >> shift
    return n + 1
238
+
239
+
240
+ def _type_of(key):
241
+ # Use the function here to get rid of dependencies on the Triton during the codegen.
242
+ # Refer to Triton implementation here:
243
+ # https://github.com/openai/triton/blob/98b5945d2aef679e00ebca8e07c35c3658ec76de/python/triton/runtime/jit.py#L238
244
+ # `None` is nullptr. Implicitly convert to *i8.
245
+ if key is None:
246
+ return "*i8"
247
+ dtype_str = str(key).split(".")[-1]
248
+ tys = {
249
+ "bool": "i1",
250
+ "float8e4nv": "fp8e4nv",
251
+ "float8e5": "fp8e5",
252
+ "float8e4b15": "fp8e4b15",
253
+ "float8e4b15x4": "fp8e4b15x4",
254
+ "float8_e4m3fn": "fp8e4nv",
255
+ "float8_e5m2": "fp8e5",
256
+ "float16": "fp16",
257
+ "bfloat16": "bf16",
258
+ "float32": "fp32",
259
+ "float64": "fp64",
260
+ "int8": "i8",
261
+ "int16": "i16",
262
+ "int32": "i32",
263
+ "int64": "i64",
264
+ "uint8": "u8",
265
+ "uint16": "u16",
266
+ "uint32": "u32",
267
+ "uint64": "u64",
268
+ }
269
+ # reinterpret can create triton type
270
+ for v in list(tys.values()):
271
+ tys[v] = v
272
+ return key if isinstance(key, str) else f"*{tys[dtype_str]}"
273
+
274
+
275
def convert_shape_to_inductor(
    lst: Iterable[Union[int, torch.SymInt]]
) -> List[sympy.Expr]:
    """
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    """
    result = []
    for i in lst:
        if isinstance(i, torch.SymInt):
            result.append(i.node.expr)
        else:
            result.append(sympy.Integer(i))
    return result
286
+
287
+
288
def convert_shape_to_symint(
    lst: Iterable[Union[int, sympy.Expr]]
) -> List[Union[int, torch.SymInt]]:
    """
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    """
    from .virtualized import V

    result: List[Union[int, torch.SymInt]] = []
    for i in lst:
        if isinstance(i, int):
            result.append(i)
        elif isinstance(i, sympy.Integer):
            result.append(int(i))
        else:
            # Symbolic: rebuild a SymInt through the graph's shape env.
            result.append(V.graph.sizevars.shape_env.create_symintnode(i, hint=None))
    return result
305
+
306
+
307
def is_view(op: torch._ops.OpOverload):
    """
    Does this op overload have aliasing
    """
    assert isinstance(op, torch._ops.OpOverload)
    for arg in op._schema.arguments:
        if arg.alias_info is not None:
            return True
    return False
313
+
314
+
315
def is_pointwise_use(use):
    """Recursively decide whether an FX use is (transitively) pointwise."""
    if use.op != "call_function":
        return False

    target = use.target
    is_op_overload = isinstance(target, torch._ops.OpOverload)
    if not (is_op_overload or target is operator.getitem):
        return False

    # getitem and view ops are transparent: recurse into their users.
    if target is operator.getitem or is_view(target):
        return all(is_pointwise_use(u) for u in use.users)

    return torch.Tag.pointwise in target.tags
328
+
329
+
330
def gen_gm_and_inputs(target, args, kwargs):
    """Build a one-node GraphModule that calls `target`, plus its tensor args."""
    graph = torch.fx.Graph()
    graph_args = []
    tensor_args = []
    for idx, arg in enumerate(args):
        if not isinstance(arg, torch.Tensor):
            graph_args.append(arg)
            continue
        graph_args.append(graph.placeholder(f"arg{idx}"))
        tensor_args.append(arg)
    assert all(not isinstance(v, torch.Tensor) for v in kwargs.values())
    node = graph.call_function(target, tuple(graph_args), kwargs)
    returns = target._schema.returns
    if len(returns) == 1 and str(returns[0].type) == "Tensor":
        # Normalize a single-tensor return into a 1-tuple output.
        node = (node,)
    graph.output(node)

    return torch.fx.GraphModule({}, graph), tensor_args
351
+
352
+
353
+ def synchronize(device: str = "cuda"):
354
+ if device == "cpu":
355
+ return
356
+ device_interface = get_interface_for_device(device)
357
+ if device_interface.is_available():
358
+ device_interface.synchronize()
359
+
360
+
361
def timed(
    model: Callable[..., Any], example_inputs, times: int = 1, device: str = "cuda"
) -> float:
    """Wall-clock seconds for `times` calls of model(*example_inputs)."""
    synchronize(device)
    torch.manual_seed(1337)
    start = time.perf_counter()
    for _ in range(times):
        result = model(*example_inputs)
        synchronize(device)  # include kernel completion in the measurement
    end = time.perf_counter()
    # GC the result after timing
    assert result is not None  # type: ignore[possibly-undefined]
    return end - start
374
+
375
+
376
def print_performance(
    fn, args=(), times=10, repeat=10, baseline=1.0, device: str = "cuda"
):
    """Print median per-call runtime of `fn` relative to `baseline`; return it."""
    samples = [timed(fn, args, times, device) for _ in range(repeat)]
    took = torch.median(torch.tensor(samples)) / times
    print(f"{took/baseline:.6f}")
    return took
383
+
384
+
385
def precompute_method(obj: Any, method: str):
    """Replace obj.method() with a new method that returns a precomputed constant."""
    value = getattr(obj, method)()
    setattr(obj, method, lambda: value)
389
+
390
+
391
def precompute_methods(obj: Any, methods: List[str]):
    """Replace each named zero-arg method with a precomputed-constant stub."""
    for name in methods:
        precompute_method(obj, name)
395
+
396
+
397
def cmp(a, b) -> int:
    """Three-way comparison: -1 if a < b, 1 if a > b, else 0."""
    if a < b:
        return -1
    if a > b:
        return 1
    return 0
399
+
400
+
401
def pad_listlike(x, size):
    """Broadcast a length-1 list-like to `size` elements; pass through otherwise."""
    if len(x) != 1:
        return x
    return type(x)([x[0]]) * size
406
+
407
+
408
+ # Used to ensure that iterating over a set is deterministic
409
def tuple_sorted(x):
    """Deterministically sort strings or scheduler nodes (used for set iteration)."""
    if not x:
        return []

    def key_of(elem):
        if isinstance(elem, str):
            return elem
        # We expect `elem` to be `scheduler.BaseSchedulerNode` type here,
        # but we are not able to do isinstance assert because of circular dependency
        return elem.get_name()

    return sorted(x, key=key_of)
422
+
423
+
424
+ P = ParamSpec("P")
425
+ RV = TypeVar("RV", covariant=True)
426
+
427
+
428
class CachedMethod(Generic[P, RV], Protocol):
    # Protocol describing what `cache_on_self` returns: a callable method
    # that also exposes `clear_cache(self)` for dropping the memoized value.
    # NOTE(review): `clear_cache` is declared @staticmethod yet takes `self`;
    # this mirrors how cache_on_self attaches a plain function as an
    # attribute on the wrapper — callers invoke it as `obj.method.clear_cache(obj)`.
    @staticmethod
    def clear_cache(self) -> None:
        ...

    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> RV:
        ...
435
+
436
+
437
# See https://github.com/python/mypy/issues/13222#issuecomment-1193073470 to understand the type signature
def cache_on_self(fn: Callable[Concatenate[Any, P], RV]) -> CachedMethod[P, RV]:
    """Memoize a zero-arg method's result on the instance itself."""
    attr = f"__{fn.__name__}_cache"

    @functools.wraps(fn)
    def wrapper(self):
        try:
            return getattr(self, attr)
        except AttributeError:
            value = fn(self)
            setattr(self, attr, value)
            return value

    def clear_cache(self):
        # Drop the memoized value so the next call recomputes.
        if hasattr(self, attr):
            delattr(self, attr)

    wrapper.clear_cache = clear_cache  # type: ignore[attr-defined]
    return wrapper  # type: ignore[return-value]
453
+
454
+
455
def aggregate_origins(node_schedule):
    """Union the FX origin nodes of everything in a node schedule."""
    from . import ir

    if isinstance(node_schedule, ir.ExternKernel):
        return node_schedule.origins
    if not isinstance(node_schedule, list):
        return set()
    origin_sets = [
        n.node.origins for n in node_schedule if hasattr(n, "node") and n.node
    ]
    return functools.reduce(operator.or_, origin_sets, set())
472
+
473
+
474
def get_fused_kernel_name(node_schedule, descriptive_names):
    """
    Build a descriptive name ("fused_<src>_<src>...") for a fused kernel.

    Args:
        node_schedule: schedule whose origins are aggregated for naming.
        descriptive_names: one of "original_aten", "torch", "inductor_node",
            selecting which metadata level supplies the name parts.

    Raises:
        NotImplementedError: for any other `descriptive_names` value.
    """
    all_origins = aggregate_origins(node_schedule)
    if descriptive_names == "original_aten":
        # Bases the kernel name off of the top-level aten operator (i.e. pre-decompositions)
        sources = [
            origin.meta["original_aten"]._overloadpacket.__name__
            for origin in all_origins
            if origin.op == "call_function"
            and "original_aten" in origin.meta
            and origin.meta["original_aten"] is not None
        ]
        sources = sorted(set(sources))
    elif descriptive_names == "torch":
        # Bases the kernel name off of the top-level "torch" operator (i.e. post-dynamo graph)
        sources = []
        for origin in all_origins:
            if origin.op == "call_function" and "source_fn_stack" in origin.meta:
                source_fn = origin.meta["source_fn_stack"][-1]
                if isinstance(source_fn[1], str):
                    sources.append(source_fn[1])
                else:
                    sources.append(source_fn[1].__name__)
        sources = sorted(set(sources))
    elif descriptive_names == "inductor_node":
        sources = [
            origin.name for origin in all_origins if origin.op == "call_function"
        ]
    else:
        raise NotImplementedError
    # NOTE: removed dead no-op statement `sources = sources` from the original.
    return "_".join(["fused"] + sources)
505
+
506
+
507
def get_kernel_metadata(node_schedule, wrapper):
    """Build a summary comment plus per-node detail lines for a kernel."""
    all_origins = aggregate_origins(node_schedule)
    inductor_nodes = [o for o in all_origins if o.op == "call_function"]

    from_node_dict = collections.defaultdict(list)
    original_aten_dict = collections.defaultdict(list)
    for node in inductor_nodes:
        meta = node.meta
        if meta.get("original_aten") is not None:
            key = str(meta["original_aten"]._overloadpacket)
            original_aten_dict[key].append(node.name)
        if "from_node" in meta:
            from_node_dict[meta["from_node"][0][0]].append(node.name)
    metadata = (
        f"{wrapper.comment} Source Nodes: [{', '.join(sorted(from_node_dict.keys()))}], "
        f"Original ATen: [{', '.join(sorted(original_aten_dict.keys()))}]"
    )
    # trace back to original node here
    detailed_metadata = [
        f"{wrapper.comment} {original_node} => {', '.join(sorted(nodes))}"
        for original_node, nodes in sorted(from_node_dict.items())
    ]
    return metadata, "\n".join(detailed_metadata)
531
+
532
+
533
def dominated_nodes(
    initial_queue: Iterable[torch.fx.Node], skip_filter=None
) -> Set[torch.fx.Node]:
    """Returns the set of nodes whose values depend on those within initial_queue"""
    worklist = list(initial_queue)
    dominated = set(worklist)

    while worklist:
        current = worklist.pop()
        for user in current.users:
            if skip_filter is not None and skip_filter(user):
                continue
            if user in dominated:
                continue
            dominated.add(user)
            worklist.append(user)

    return dominated
550
+
551
+
552
def gather_origins(args, kwargs):
    """
    Collect the FX origins of all not-yet-realized (Pointwise) IR arguments.

    NOTE: removed the redundant function-local `import itertools`; the module
    already imports itertools at top level.
    """
    from . import ir

    def is_unrealized_node(n):
        # Unwrap TensorBox/StorageBox wrappers down to the underlying IR node.
        if isinstance(n, ir.TensorBox):
            return is_unrealized_node(n.data)
        if isinstance(n, ir.StorageBox):
            return is_unrealized_node(n.data)
        return isinstance(n, ir.IRNode) and isinstance(n, ir.Pointwise)

    kwarg_origins = [val.origins for val in kwargs.values() if is_unrealized_node(val)]
    arg_origins = [arg.origins for arg in args if is_unrealized_node(arg)]
    return set(itertools.chain(*arg_origins, *kwarg_origins))
567
+
568
+
569
+ def sympy_str(expr: sympy.Expr) -> str:
570
+ """
571
+ Normal sympy str is very slow, this is a lot faster. The result are
572
+ somewhat worse, as it doesn't do as much simplification. So don't
573
+ use this for final codegen.
574
+ """
575
+ if isinstance(expr, sympy.Symbol):
576
+ return expr.name
577
+ if isinstance(expr, sympy.Add):
578
+ return " + ".join(map(sympy_str, expr.args))
579
+ if isinstance(expr, sympy.Mul):
580
+ return " * ".join(map(sympy_str, expr.args))
581
+
582
+ if isinstance(expr, (ModularIndexing, CleanDiv, FloorDiv)):
583
+ return f"{expr.func.__name__}({', '.join(map(sympy_str, expr.args))})"
584
+ return str(expr)
585
+
586
+
587
def sympy_index_symbol(name: str) -> sympy.Symbol:
    """
    Used to generate an integer-nonnegative symbol.
    """
    # Shape/stride symbols ("s...") are allocated before Inductor, never here.
    assert name[0] != "s"
    # NOTE: shape symbols are positive (> 0), but index variables are only
    # non-negative (>= 0).
    return sympy.Symbol(name, integer=True, nonnegative=True)
597
+
598
+
599
def sympy_subs(expr: sympy.Expr, replacements: Dict[sympy.Expr, Any]) -> sympy.Expr:
    """
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    """

    def as_replacement(old, new):
        assert isinstance(old, sympy.Expr)
        if not isinstance(new, str):
            return new
        # String replacements become symbols inheriting integrality /
        # nonnegativity from the expression they replace.
        return sympy.Symbol(
            new,
            integer=old.is_integer,  # type: ignore[attr-defined]
            nonnegative=old.is_nonnegative,  # type: ignore[attr-defined]
        )

    # xreplace is faster than subs, but is way more picky
    mapping = {k: as_replacement(k, v) for k, v in replacements.items()}
    return sympy.sympify(expr).xreplace(mapping)
620
+
621
+
622
def free_symbol_startswith(index: sympy.Expr, prefix: str):
    """True if any free symbol of `index` has a name starting with `prefix`."""
    for sym in index.free_symbols:
        if sym.name.startswith(prefix):  # type: ignore[attr-defined]
            return True
    return False
624
+
625
+
626
def free_symbol_has(index: sympy.Expr, pattern: str):
    """True if any free symbol of `index` has `pattern` in its name."""
    for sym in index.free_symbols:
        if pattern in sym.name:  # type: ignore[attr-defined]
            return True
    return False
628
+
629
+
630
def is_symbolic(a: Any) -> bool:
    """True for a SymInt, or a tensor with any symbolic size/stride."""
    if isinstance(a, torch.SymInt):
        return True
    if not isinstance(a, torch.Tensor):
        return False
    dims = itertools.chain(a.size(), a.stride())
    return any(is_symbolic(d) for d in dims)
635
+
636
+
637
def any_is_symbolic(*args: Any) -> bool:
    """True if at least one argument is symbolic (see is_symbolic)."""
    return any(map(is_symbolic, args))
639
+
640
+
641
+ def has_incompatible_cudagraph_ops(gm):
642
+ from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols
643
+
644
+ forbidden_set = {
645
+ "aten._fused_moving_avg_obs_fq_helper.default",
646
+ "aten._fused_moving_avg_obs_fq_helper_functional.default",
647
+ "aten.multinomial.default",
648
+ "fbgemm.dense_to_jagged.default",
649
+ "fbgemm.jagged_to_padded_dense.default",
650
+ "run_and_save_rng_state",
651
+ "run_with_rng_state",
652
+ "aten._local_scalar_dense",
653
+ # Technically, it's not necessary to ban this, because an
654
+ # assert_scalar with constant arguments can be validly run
655
+ # with CUDA graphs, but the operator is also pointless with
656
+ # constant arguments, so might as well ban
657
+ "aten._assert_scalar",
658
+ }
659
+ if torch.are_deterministic_algorithms_enabled():
660
+ forbidden_set.update(
661
+ {
662
+ "aten._unsafe_index_put.default",
663
+ "aten.index_put.default",
664
+ "aten.index_put_.default",
665
+ "aten.scatter.src",
666
+ "aten.scatter.reduce",
667
+ "aten.scatter.value_reduce",
668
+ "aten.scatter_add_",
669
+ "aten.scatter_add.default",
670
+ "aten.scatter_reduce.two",
671
+ "aten.scatter_reduce_.two",
672
+ "aten.scatter_reduce.two_out",
673
+ }
674
+ )
675
+ for node in gm.graph.nodes:
676
+ if str(node.target) in forbidden_set:
677
+ return True
678
+ if (val := node.meta.get("val")) is not None and free_unbacked_symbols(val):
679
+ return True
680
+ return False
681
+
682
+
683
def output_node(gm: torch.fx.GraphModule):
    """Get the output node from an FX graph"""
    *_, last_node = gm.graph.nodes
    assert last_node.op == "output"
    return last_node
688
+
689
+
690
# Attempt to import AttrsDescriptor from Triton
# (the class and its fields vary across triton versions, so feature-detect).
try:
    from triton.compiler.compiler import AttrsDescriptor

    attrs_descriptor_available = True
    # Determine if 'ids_of_folded_args' is a valid field for AttrsDescriptor
    attr_desc_fields = {f.name for f in fields(AttrsDescriptor)}
    ids_of_folded_args_available = "ids_of_folded_args" in attr_desc_fields
    divisible_by_8_available = "divisible_by_8" in attr_desc_fields
except ImportError:
    attrs_descriptor_available = False

# Define `instance_descriptor` function with clear conditional handling
if attrs_descriptor_available:

    def instance_descriptor(
        divisible_by_16=None,
        equal_to_1=None,
        ids_of_folded_args=None,
        divisible_by_8=None,
    ):
        # Prepare the arguments for AttrsDescriptor
        kwargs = {
            "divisible_by_16": divisible_by_16,
            "equal_to_1": equal_to_1,
        }

        # Conditionally add 'ids_of_folded_args' if it's available in AttrsDescriptor
        if ids_of_folded_args_available:
            kwargs["ids_of_folded_args"] = ids_of_folded_args
        if divisible_by_8_available:
            kwargs["divisible_by_8"] = divisible_by_8

        # Instantiate AttrsDescriptor with the prepared arguments
        return AttrsDescriptor(**kwargs)

else:
    # Define a namedtuple as a fallback when AttrsDescriptor is not available
    # (keeps the same four fields, all defaulting to empty tuples).
    instance_descriptor = collections.namedtuple(  # type: ignore[no-redef]
        "instance_descriptor",
        ["divisible_by_16", "equal_to_1", "ids_of_folded_args", "divisible_by_8"],
        defaults=[tuple(), tuple(), tuple(), tuple()],
    )
733
+
734
+
735
@functools.lru_cache(None)
def cache_dir() -> str:
    """Return (creating if needed) the inductor on-disk cache directory."""
    path = os.environ.get("TORCHINDUCTOR_CACHE_DIR")
    if path is None:
        # Per-user temp directory; sanitize the username for filesystem safety.
        sanitized_username = re.sub(r'[\\/:*?"<>|]', "_", getpass.getuser())
        path = os.path.join(
            tempfile.gettempdir(),
            "torchinductor_" + sanitized_username,
        )
    os.makedirs(path, exist_ok=True)
    return path
746
+
747
+
748
@contextlib.contextmanager
def fresh_inductor_cache(cache_entries=None):
    """
    Contextmanager that provides a clean tmp cachedir for inductor.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    """
    with tempfile.TemporaryDirectory() as inductor_cache_dir:
        triton_cache_dir = os.path.join(inductor_cache_dir, "triton")
        inductor_patch = {"TORCHINDUCTOR_CACHE_DIR": inductor_cache_dir}
        triton_patch = {"TRITON_CACHE_DIR": triton_cache_dir}
        with mock.patch.dict(os.environ, inductor_patch):
            with mock.patch.dict(os.environ, triton_patch):
                yield
                if isinstance(cache_entries, dict):
                    assert len(cache_entries) == 0, "expected empty cache_entries dict"
                    if os.path.exists(triton_cache_dir):
                        # Report every generated file (ignoring lock files).
                        sizes = {
                            f: os.path.getsize(os.path.join(triton_cache_dir, f))
                            for f in os.listdir(triton_cache_dir)
                            if ".lock" not in f
                        }
                        cache_entries.update(sizes)
774
+
775
+
776
def argsort(seq) -> List[int]:
    """Indices that sort `seq` ascending."""
    # preserve original order for equal strides
    indices = range(len(seq))
    descending = sorted(indices, key=seq.__getitem__, reverse=True)
    return list(reversed(descending))  # noqa: C413
781
+
782
+
783
@functools.lru_cache(8)
def get_dtype_size(dtype):
    """Size in bytes of one element of `dtype`."""
    return torch.empty(0, dtype=dtype).element_size()
786
+
787
+
788
class LineContext(NamedTuple):
    # Marker stored among IndentedBuffer._lines: never rendered as text, but
    # recorded in the line map so output lines can be traced back to `context`.
    context: Any
790
+
791
+
792
class IndentedBuffer:
    """Accumulates generated source lines, tracking an indentation level.

    Lines may be plain strings, DeferredLineBase instances (resolved lazily,
    possibly to nothing), or LineContext markers (emitted only into the line
    map, never into the text).
    """

    tabwidth = 4  # spaces per indentation level

    def __init__(self, initial_indent=0):
        self._lines = []
        self._indent = initial_indent

    def getvaluewithlinemap(self) -> tuple[str, list[tuple[int, LineContext]]]:
        # Render to text while collecting (line-number, context) pairs for
        # every LineContext marker encountered.
        buf = StringIO()
        p = 1
        linemap = []
        for line in self._lines:
            if isinstance(line, DeferredLineBase):
                line = line()
                if line is None:
                    # Deferred line was "unwritten": emit nothing.
                    continue
            elif isinstance(line, LineContext):
                linemap.append((p, line.context))
                continue
            assert isinstance(line, str)
            buf.write(line)
            buf.write("\n")
            # Account for embedded newlines so the line map stays accurate.
            p += 1 + line.count("\n")
        return buf.getvalue(), linemap

    def getvalue(self) -> str:
        # Rendered text only, discarding the line map.
        v, _ = self.getvaluewithlinemap()
        return v

    def getrawvalue(self) -> str:
        # Like getvalue(), but honors trailing-backslash line continuations
        # and drops LineContext markers entirely.
        buf = StringIO()
        for line in self._lines:
            if isinstance(line, DeferredLineBase):
                line = line()
                if line is None:
                    continue
            elif isinstance(line, LineContext):
                continue
            assert isinstance(line, str)
            # backslash implies line continuation
            if line.endswith("\\"):
                buf.write(line[:-1])
            else:
                buf.write(line)
                buf.write("\n")
        return buf.getvalue()

    def clear(self):
        # Drop all buffered lines (indent level is kept).
        self._lines.clear()

    def __bool__(self):
        # Truthy iff any line has been buffered.
        return bool(self._lines)

    def prefix(self):
        # Whitespace prefix for the current indent level.
        return " " * (self._indent * self.tabwidth)

    def newline(self):
        self.writeline("\n")

    def writeline(self, line):
        # Append one line, applying the current indentation to text lines;
        # LineContext passes through untouched and deferred lines capture
        # the prefix for later resolution.
        if isinstance(line, LineContext):
            self._lines.append(line)
        elif isinstance(line, DeferredLineBase):
            self._lines.append(line.with_prefix(self.prefix()))
        elif line.strip():
            self._lines.append(f"{self.prefix()}{line}")
        else:
            # Whitespace-only lines are normalized to empty.
            self._lines.append("")

    def writelines(self, lines):
        for line in lines:
            self.writeline(line)

    def indent(self, offset=1):
        # Context manager that raises (or lowers, for negative offset) the
        # indent level for the duration of the `with` block.
        @contextlib.contextmanager
        def ctx():
            self._indent += offset
            try:
                yield
            finally:
                self._indent -= offset

        return ctx()

    def do_indent(self, offset=1):
        self._indent += offset

    def do_unindent(self, offset=1):
        self._indent -= offset

    def splice(self, other_code, strip=False):
        # Append another buffer's (or string's) lines, re-indenting them to
        # this buffer's current level.
        if isinstance(other_code, IndentedBuffer):
            # Strip the other buffer's common leading indentation first.
            dedent = float("inf")
            for line in other_code._lines:
                if not isinstance(line, LineContext) and line:
                    dedent = min(dedent, len(line) - len(line.lstrip()))
            if math.isinf(dedent):
                dedent = 0
            for line in other_code._lines:
                if isinstance(line, LineContext):
                    self._lines.append(line)
                else:
                    IndentedBuffer.writeline(self, line[int(dedent) :])
        else:
            other_code = textwrap.dedent(other_code)
            if strip:
                other_code = other_code.lstrip()
            if not other_code:
                return
            other_code = other_code.rstrip()
            for line in other_code.split("\n"):
                self.writeline(line)

    def __repr__(self):
        return f"{type(self)}({self.getvalue()})"
907
+
908
+
909
class DeferredLineBase:
    """A line that can be 'unwritten' at a later time"""

    def __init__(self, line):
        # Whitespace-only lines collapse to the empty string.
        self.line = line if line.strip() else ""

    def __call__(self) -> Optional[str]:
        """Returns either self.line or None to indicate the line has been 'unwritten'"""
        raise NotImplementedError()

    def _new_line(self, line: str) -> DeferredLineBase:
        """Returns a new deferred line with the same condition"""
        raise NotImplementedError()

    def with_prefix(self, prefix):
        return self._new_line(f"{prefix}{self.line}")

    def lstrip(self):
        return self._new_line(self.line.lstrip())

    def __getitem__(self, index):
        return self._new_line(self.line[index])

    def __bool__(self):
        return bool(self.line)

    def __len__(self):
        return len(self.line)
939
+
940
+
941
@functools.lru_cache(None)
def is_big_gpu(index):
    # A GPU is "big" when it has at least 80 streaming multiprocessors
    # (the V100's count); smaller devices are rejected for
    # max_autotune_gemm.  Cached per device index — device properties do
    # not change within a process.
    sms = torch.cuda.get_device_properties(index).multi_processor_count
    if sms < 80:  # V100
        log.warning("not enough SMs to use max_autotune_gemm mode")
        return False
    return True
948
+
949
+
950
def use_max_autotune() -> bool:
    # True when any of the autotune-enabling config flags is set.  Note the
    # ``or`` chain returns the first truthy flag value itself (the flags are
    # presumably booleans, matching the annotation — confirm in config).
    return (
        config.max_autotune or config.max_autotune_gemm or config.search_autotune_cache
    )
954
+
955
+
956
def _use_template_for_cuda(layout, allowed_layout_dtypes: List[torch.dtype]) -> bool:
    # A codegen template may be used only when autotuning is enabled, the
    # output layout lives on a CUDA device with an allowed dtype, and the
    # device is large enough (see is_big_gpu).  ``layout.device.index or 0``
    # maps an unset (None) device index to device 0.
    return (
        use_max_autotune()
        and layout.device.type == "cuda"
        and layout.dtype in allowed_layout_dtypes
        and is_big_gpu(layout.device.index or 0)
    )
963
+
964
+
965
def _use_autotune_backend(backend: str) -> bool:
    """Whether ``backend`` appears in config.max_autotune_gemm_backends,
    a comma-separated, case-insensitive backend list."""
    enabled = [
        name.strip() for name in config.max_autotune_gemm_backends.upper().split(",")
    ]
    return backend.upper() in enabled
969
+
970
+
971
def use_triton_template(layout, *, enable_int32=False):
    """Whether the Triton GEMM template may be used for ``layout``
    (optionally also allowing int32 outputs)."""
    layout_dtypes = [torch.float16, torch.bfloat16, torch.float32]
    if enable_int32:
        layout_dtypes.append(torch.int32)
    if not _use_template_for_cuda(layout, layout_dtypes):
        return False
    return _use_autotune_backend("TRITON")
978
+
979
+
980
def use_cutlass_template(layout):
    """Whether the CUTLASS GEMM template may be used for ``layout``."""
    from .codegen.cuda.cutlass_utils import try_import_cutlass

    # Do not use cutlass template on ROCm
    if torch.version.hip:
        return False

    layout_dtypes = [torch.float16, torch.bfloat16, torch.float32]
    res = _use_template_for_cuda(layout, layout_dtypes) and _use_autotune_backend(
        "CUTLASS"
    )

    if res:
        # CUTLASS is an optional dependency: degrade gracefully with a
        # warning when its Python package cannot be imported.
        if not try_import_cutlass():
            log.warning(
                "Failed to import CUTLASS lib. Please check whether "
                "_inductor.config.cuda.cutlass_dir is set correctly. "
                "Skipping CUTLASS backend for now."
            )
            return False
    return res
1001
+
1002
+
1003
def use_aten_gemm_kernels():
    # ATen GEMM kernels are always available unless autotuning restricts
    # the backend list and "ATEN" is not in it.
    return not use_max_autotune() or _use_autotune_backend("ATEN")
1005
+
1006
+
1007
class DebugDirManager:
    """Context manager that redirects torch._dynamo's debug output into a
    unique temporary directory and, on exit, removes that directory and
    restores the previous setting.

    Each instance draws a distinct id from a shared counter so concurrent
    or nested uses get non-colliding directory names.
    """

    counter = itertools.count(0)
    # Previous value of torch._dynamo.config.debug_dir_root, captured on enter.
    prev_debug_name: str

    def __init__(self):
        self.id = next(DebugDirManager.counter)

    def __enter__(self):
        self.prev_debug_name = torch._dynamo.config.debug_dir_root
        self.new_name = f"{self.prev_debug_name}_tmp_{self.id}"
        torch._dynamo.config.debug_dir_root = self.new_name
        return self

    def __exit__(self, *args):
        # ignore_errors: the directory is only created if something actually
        # wrote debug output, so it may legitimately not exist on exit
        # (previously this raised FileNotFoundError in that case).
        shutil.rmtree(self.new_name, ignore_errors=True)
        torch._dynamo.config.debug_dir_root = self.prev_debug_name
1022
+
1023
+
1024
def run_and_get_code(fn, *args, **kwargs):
    """Run ``fn`` and capture the generated source of every module compiled
    by inductor during the call, returning ``(result, source_codes)``."""
    from .graph import GraphLowering

    compile_to_module = GraphLowering.compile_to_module
    source_codes = []

    def patched_compile_to_module(self):
        # Compile as usual, then read back the on-disk source of the module
        # that was just produced.
        mod = compile_to_module(self)
        with open(mod.__file__) as f:
            source_codes.append(f.read())
        return mod

    # If FX code caching is enabled, a hit prevents getting the code.
    with config.patch({"fx_graph_cache": False}):
        with mock.patch.object(
            GraphLowering, "compile_to_module", patched_compile_to_module
        ):
            # Reset dynamo so fn is recompiled (and its code captured)
            # rather than served from a previous compilation.
            torch._dynamo.reset()
            result = fn(*args, **kwargs)
    return result, source_codes
1044
+
1045
+
1046
def run_and_get_triton_code(fn, *args, **kwargs):
    # Convenience wrapper around run_and_get_code that returns only the
    # first generated module's source.
    _, source_codes = run_and_get_code(fn, *args, **kwargs)
    # Can have two outputs if backwards was eagerly compiled
    assert (
        1 <= len(source_codes) <= 2
    ), f"expected one or two code outputs got {len(source_codes)}"
    return source_codes[0]
1053
+
1054
+
1055
@contextlib.contextmanager
def override_lowering(aten_op, override_fn):
    """
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    """
    from torch._inductor import lowering

    orig_fn = lowering.lowerings[aten_op]
    try:
        # partial binds the original lowering as override_fn's first argument.
        lowering.lowerings[aten_op] = functools.partial(override_fn, orig_fn)
        yield
    finally:
        # Always restore the original lowering, even if the body raised.
        lowering.lowerings[aten_op] = orig_fn
1069
+
1070
+
1071
def add_scheduler_init_hook(pre_fn, post_fn=None):
    """
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.

    Returns the (not yet started) mock.patch object; use it as a context
    manager or call .start()/.stop().
    """
    from torch._inductor.scheduler import Scheduler

    orig_fn = Scheduler.__init__

    def wrapper(scheduler, nodes):
        # pre_fn runs before the real __init__; post_fn (if given) after.
        pre_fn(scheduler, nodes)
        out = orig_fn(scheduler, nodes)
        if post_fn:
            post_fn(scheduler, nodes)
        return out

    return unittest.mock.patch.object(Scheduler, "__init__", wrapper)
1088
+
1089
+
1090
def developer_warning(msg):
    """
    Warnings that will be actionable for PyTorch developers, but not
    end users. Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    """
    # config.developer_warnings toggles between warning and info severity.
    if config.developer_warnings:
        log.warning(msg)
    else:
        log.info(msg)
1100
+
1101
+
1102
def get_num_bytes(*args: torch.Tensor, num_in_out_args: int = 0) -> int:
    """
    Return the total number of bytes the arguments of tensor type takes.

    For in/out args, tensor sizes are counted twice: once for reading and
    once for writing.

    The first num_in_out_args arguments are in out tensors.
    """
    total = 0
    for position, arg in enumerate(args):
        if not isinstance(arg, torch.Tensor):
            # Non-tensor arguments contribute nothing.
            continue
        multiplier = 2 if position < num_in_out_args else 1
        total += arg.numel() * arg.element_size() * multiplier
    return total
1116
+
1117
+
1118
def create_bandwidth_info_str(ms, num_gb, gb_per_s, prefix="", suffix="", color=True):
    """Format one benchmark result line; when ``color`` is on, lines that
    look slow (>0.012ms and <650 GB/s) are rendered in red."""
    info_str = f"{prefix}{ms:.3f}ms    \t{num_gb:.3f} GB \t {gb_per_s:7.2f}GB/s{suffix}"
    is_slow = ms > 0.012 and gb_per_s < 650
    if color and is_slow:
        return red_text(info_str)
    return info_str
1122
+
1123
+
1124
def get_benchmark_name():
    """
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    """
    # Flavor 1: "--only model_name".
    try:
        only_idx = sys.argv.index("--only")
    except ValueError:
        only_idx = -1
    if only_idx >= 0 and only_idx + 1 < len(sys.argv):
        candidate = sys.argv[only_idx + 1]
        # The following token must be a real value, not another flag.
        if candidate and not candidate.startswith("-"):
            return candidate

    # Flavor 2: "--only=model_name".
    for token in sys.argv:
        if token.startswith("--only="):
            return token[len("--only=") :]
1153
+
1154
+
1155
def is_ones(items):
    """True when every element equals 1 (vacuously true for empty input)."""
    return not any(x != 1 for x in items)
1157
+
1158
+
1159
def is_zeros(items):
    """True when every element equals 0 (vacuously true for empty input)."""
    return not any(x != 0 for x in items)
1161
+
1162
+
1163
def is_cpu_device(inputs):
    """True when every tensor in ``inputs`` lives on the CPU device;
    non-tensor entries are ignored."""
    cpu = torch.device("cpu")
    for item in inputs:
        if isinstance(item, torch.Tensor) and item.device != cpu:
            return False
    return True
1169
+
1170
+
1171
def get_sympy_Expr_dtype(val: sympy.Expr) -> torch.dtype:
    """Map a sympy expression to a torch dtype: int64 for expressions known
    to be integer, float64 otherwise (including "unknown")."""
    assert isinstance(
        val, sympy.Expr
    ), "only support sympy.Expr as input to get_sympy_Expr_dtype"
    return torch.int64 if val.is_integer else torch.float64  # type: ignore[attr-defined]
1179
+
1180
+
1181
@contextlib.contextmanager
def maybe_profile(should_profile, *args, **kwargs):
    # Yield a torch.profiler profile object when should_profile is truthy,
    # otherwise yield None with no profiling overhead; extra args are
    # forwarded to torch.profiler.profile.
    if should_profile:
        with torch.profiler.profile(*args, **kwargs) as p:
            yield p
    else:
        yield
1188
+
1189
+
1190
def triton_config_to_hashable(cfg):
    """
    Convert triton config to a tuple that can uniquely identify it. We can use
    the return value as a dictionary key.
    """
    # kwargs are sorted for a deterministic order; num_warps/num_stages are
    # appended last.
    key = sorted(cfg.kwargs.items())
    key += [("num_warps", cfg.num_warps), ("num_stages", cfg.num_stages)]
    return tuple(key)
1199
+
1200
+
1201
def parallel_num_threads():
    """Thread count for CPP kernels: config.cpp.threads when set (>= 1),
    otherwise torch's current intra-op thread count."""
    requested = config.cpp.threads
    if requested >= 1:
        return requested
    return torch.get_num_threads()
1206
+
1207
+
1208
+ HAS_COLORAMA = True
1209
+ try:
1210
+ import colorama
1211
+ except ImportError:
1212
+ HAS_COLORAMA = False
1213
+
1214
+
1215
def _color_text(msg, color):
    """Wrap ``msg`` in the named colorama foreground color; return it
    unchanged when colorama is not installed."""
    if not HAS_COLORAMA:
        return msg
    ansi = getattr(colorama.Fore, color.upper())
    return ansi + msg + colorama.Fore.RESET
1220
+
1221
+
1222
# Convenience wrappers over _color_text for the standard ANSI colors.
def green_text(msg):
    return _color_text(msg, "green")


def yellow_text(msg):
    return _color_text(msg, "yellow")


def red_text(msg):
    return _color_text(msg, "red")


def blue_text(msg):
    return _color_text(msg, "blue")
1236
+
1237
+
1238
@functools.lru_cache(None)
def get_device_tflops(dtype):
    """Peak TFLOPS of the current GPU for ``dtype`` via triton's helpers:
    tensor-core rate for fp16/bf16 (and for fp32 when TF32 is allowed),
    SIMD rate otherwise.  Cached per dtype."""
    from triton.testing import get_max_simd_tflops, get_max_tensorcore_tflops

    assert dtype in (torch.float16, torch.bfloat16, torch.float32)

    if inspect.signature(get_max_simd_tflops).parameters.get("clock_rate"):
        # Triton API change in https://github.com/openai/triton/pull/2293
        from torch._utils_internal import max_clock_rate

        sm_clock = max_clock_rate()
        if dtype in (torch.float16, torch.bfloat16):
            return get_max_tensorcore_tflops(dtype, sm_clock)

        if torch.backends.cuda.matmul.allow_tf32:
            return get_max_tensorcore_tflops(torch.float32, sm_clock)
        else:
            return get_max_simd_tflops(torch.float32, sm_clock)
    else:
        # Older triton API: the helpers take no clock_rate argument.
        if dtype in (torch.float16, torch.bfloat16):
            return get_max_tensorcore_tflops(dtype)

        if torch.backends.cuda.matmul.allow_tf32:
            return get_max_tensorcore_tflops(torch.float32)
        else:
            return get_max_simd_tflops(torch.float32)
1264
+
1265
+
1266
@functools.lru_cache(None)
def get_gpu_dram_gbps():
    # Peak DRAM bandwidth (GB/s) of the current GPU via triton; cached
    # since it cannot change within a process.
    from triton.testing import get_dram_gbps

    return get_dram_gbps()
1271
+
1272
+
1273
def is_welford_reduction(reduction_type):
    """Whether this reduction type is one of the "welford*" variants."""
    return reduction_type.startswith("welford")


def reduction_num_outputs(reduction_type):
    """Welford reductions produce 3 outputs; every other reduction produces 1."""
    if is_welford_reduction(reduction_type):
        return 3
    return 1
1279
+
1280
+
1281
def get_max_y_grid():
    # Maximum launch-grid size along the Y dimension.  65535 matches CUDA's
    # documented gridDim.y limit — presumably why this constant was chosen;
    # the code itself only returns the number.
    return 65535
1283
+
1284
+
1285
def is_linux() -> bool:
    # True when the current platform reports itself as Linux.
    return platform.system() == "Linux"
1287
+
1288
+
1289
def has_free_symbols(itr: Iterable[Any]):
    # True if any element is a symbolic sympy expression (a sympy.Expr that
    # is not a concrete number) — i.e. the sizes/strides are dynamic.
    return any(isinstance(x, sympy.Expr) and not x.is_number for x in itr)
1291
+
1292
+
1293
def is_dynamic(*args):
    """Whether any argument has a symbolic (dynamic) size or stride.
    Non-IRNode values are ignored; IRNode subtypes not handled below raise
    TypeError."""
    from . import ir

    for t in args:
        if isinstance(t, ir.TensorBox):
            # TensorBox wraps its data; some wrapped nodes have no
            # get_stride, hence the hasattr guard.
            if has_free_symbols(t.data.get_size()) or (
                hasattr(t.data, "get_stride") and has_free_symbols(t.data.get_stride())
            ):
                return True
        elif isinstance(t, (ir.StorageBox, ir.BaseView, ir.ComputedBuffer)):
            assert hasattr(t, "get_size") and hasattr(t, "get_stride")
            if has_free_symbols(t.get_size()) or has_free_symbols(t.get_stride()):
                return True
        elif not isinstance(t, ir.IRNode):
            # Plain (non-IR) values can never be dynamic.
            continue
        else:
            raise TypeError(f"unexpected type for is_dynamic {type(t)}")

    return False
1312
+
1313
+
1314
# Placeholder strings used in triton codegen.
class Placeholder(enum.Enum):
    """Sentinel strings substituted into generated Triton source before the
    final kernel name/description is known."""

    # The placeholder for the actual name of a triton kernel.
    # e.g. for "def triton_" it would be "triton_"
    KERNEL_NAME = "KERNEL_NAME"

    # The descriptive name of the triton kernel; when unique_kernel_names = False, this
    # placeholder will be replaced with a string with more information.
    DESCRIPTIVE_NAME = "DESCRIPTIVE_NAME"
1323
+
1324
+
1325
def pass_execution_and_save(func, gm, msg):
    """Run graph pass ``func`` on ``gm.graph``, dump the before/after graph
    text to a temp file, recompile ``gm``, and log whether the pass changed
    the graph plus how long it took."""
    from .pattern_matcher import stable_topological_sort

    # delete=False so the dump survives for post-mortem inspection; its path
    # is reported in the log line below.
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        delete=False,
    ) as f:
        before_io = io.StringIO()
        after_io = io.StringIO()
        print(f"Before:\n{gm.graph}", file=f)
        print(gm.graph, file=before_io)
        start_time = datetime.now()
        func(gm.graph)
        time_elapsed = datetime.now() - start_time
        # recompile graph
        stable_topological_sort(gm.graph)
        gm.graph.lint()
        gm.recompile()

        print(f"After:\n{gm.graph}", file=f)
        print(gm.graph, file=after_io)
        # Compare textual dumps to detect whether the pass was a no-op.
        t = before_io.getvalue() == after_io.getvalue()
        log.info(
            "%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s",
            msg,
            f.name,
            t,
            time_elapsed,
        )
1355
+
1356
+
1357
def is_collective(node):
    # Whether the IR node is a collective-communication kernel: either a
    # CollectiveKernel subclass or exactly the newer _CollectiveKernel type.
    from . import ir

    return isinstance(node, ir.CollectiveKernel) or type(node) == ir._CollectiveKernel


def is_wait(node):
    # Whether the IR node waits on a collective: either an ir.Wait subclass
    # or exactly the newer _WaitKernel type.
    from . import ir

    return isinstance(node, ir.Wait) or type(node) == ir._WaitKernel
1367
+
1368
+
1369
+ def num_fw_fixed_arguments(dynamo_gm_num_inputs: int, aot_fw_gm_num_inputs: int):
1370
+ "Computes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)"
1371
+ num_rng_seed_offset_inputs = (
1372
+ 2 if torch._functorch.config.functionalize_rng_ops else 0
1373
+ )
1374
+ return aot_fw_gm_num_inputs - dynamo_gm_num_inputs - num_rng_seed_offset_inputs
1375
+
1376
+
1377
def count_tangents(fx_g: torch.fx.GraphModule):
    """
    Infers which inputs are static for a backwards graph
    """

    def is_saved_tensor(x):
        # "Static" inputs are everything except tangents and the RNG
        # seed/offset placeholders, identified by placeholder name.
        return (
            "tangents" not in x.name
            and "bwd_seed" not in x.name
            and "bwd_base_offset" not in x.name
        )

    arg_count = 0
    static_arg_idxs = []
    for n in fx_g.graph.nodes:
        if n.op == "placeholder":
            if is_saved_tensor(n):
                static_arg_idxs.append(arg_count)
            arg_count += 1

    # All static inputs are expected to form a contiguous prefix of the
    # placeholder list.
    assert static_arg_idxs == list(range(len(static_arg_idxs)))
    return len(static_arg_idxs)
1399
+
1400
+
1401
@dataclasses.dataclass
class BoxedBool:
    """A mutable boolean box: truthiness follows ``value``, and ``disable``
    flips an existing box to False in place."""

    value: bool

    def __bool__(self):
        return self.value

    @staticmethod
    def disable(obj):
        """Set a BoxedBool to False in place and return it; any non-box
        argument simply maps to False."""
        if not isinstance(obj, BoxedBool):
            return False
        obj.value = False
        return obj
1414
+
1415
+
1416
@contextlib.contextmanager
def collect_defined_kernels(kernel_list):
    """Context manager that appends the source code of every kernel defined
    via WrapperCodeGen.define_kernel inside the block to ``kernel_list``."""
    from .codegen.wrapper import WrapperCodeGen

    orig_define_kernel = WrapperCodeGen.define_kernel

    def new_define_kernel(wrapper, name, kernel_code, metadata, *args, **kwargs):
        # Record the kernel source, then delegate to the real define_kernel.
        nonlocal kernel_list
        kernel_list.append(kernel_code)
        return orig_define_kernel(wrapper, name, kernel_code, metadata, *args, **kwargs)

    with unittest.mock.patch.object(WrapperCodeGen, "define_kernel", new_define_kernel):
        yield
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/wrapper_benchmark.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import tempfile
3
+ from collections import defaultdict
4
+
5
+ import torch
6
+ from torch.autograd import DeviceType
7
+ from .utils import create_bandwidth_info_str, do_bench, get_num_bytes
8
+
9
# Names of the triton_heuristics decorators that mark a kernel's category.
_kernel_category_choices = [
    "foreach",
    "persistent_reduction",
    "pointwise",
    "reduction",
    "split_scan",
    "template",
]


def get_kernel_category_by_source_code(src_code):
    """
    Similar to get_kernel_category but use the source code. Call this API
    if we have not compile the src_code to module yet.
    """
    matched = [
        ch for ch in _kernel_category_choices if f"@triton_heuristics.{ch}" in src_code
    ]
    # Exactly one decorator match pins the category; anything else is unknown.
    return matched[0] if len(matched) == 1 else "unknown"
31
+
32
+
33
def get_kernel_category(kernel_mod):
    """
    Given the module defining a triton kernel, return the category of the kernel.
    Category can be one of:
    - pointwise
    - reduction
    - persistent_reduction

    Currently we simply decide the category depending on what decorator is imported
    by the kernel.
    """
    matched = [name for name in _kernel_category_choices if name in kernel_mod.__dict__]
    return matched[0] if len(matched) == 1 else "unknown"
49
+
50
+
51
def get_triton_kernel(mod):
    # Return the single CachingAutotuner defined in a compiled kernel
    # module; asserts there is exactly one "triton_*" kernel object.
    from torch._inductor.triton_heuristics import CachingAutotuner

    cand_list = [
        v
        for k, v in mod.__dict__.items()
        if k.startswith("triton_") and isinstance(v, CachingAutotuner)
    ]
    assert len(cand_list) == 1
    return cand_list[0]
61
+
62
+
63
def benchmark_all_kernels(benchmark_name, benchmark_all_configs):
    """
    An experimental API used only when config.benchmark_kernel is true.

    Run the kernel benchmarks for all the kernels cached in PyCodeCache.
    Used in the compiled modules.

    Put this method here rather than codegen it for convenience since its implementation
    does not change based on different graph modules being compiled.
    """
    from torch._inductor.codecache import PyCodeCache

    nfound = 0
    for kernel_key, kernel_mod in PyCodeCache.cache.items():
        # Only compiled kernel modules expose get_args/call; skip the rest.
        if not hasattr(kernel_mod, "get_args") or not hasattr(kernel_mod, "call"):
            continue

        triton_kernel = get_triton_kernel(kernel_mod)
        kernel_category = get_kernel_category(kernel_mod)
        args = kernel_mod.get_args()
        # in_out_ptr arguments are both read and written, so they count
        # twice toward moved bytes (see get_num_bytes).
        num_in_out_ptrs = len(
            [
                arg_name
                for arg_name in triton_kernel.fn.arg_names
                if arg_name.startswith("in_out_ptr")
            ]
        )
        # Prefer the precomputed GB count from codegen metadata; otherwise
        # derive it from the actual argument tensors.
        num_gb = triton_kernel.inductor_meta.get("kernel_num_gb", None)
        if num_gb is None:
            num_gb = get_num_bytes(*args, num_in_out_args=num_in_out_ptrs) / 1e9

        def get_info_str(ms, n_regs, n_spills, shared, prefix=""):
            # Format one result line; register/spill/shared-memory details
            # are appended only when all three are known.
            if not any(x is None for x in [n_regs, n_spills, shared]):
                kernel_detail_str = (
                    f"  {n_regs:3} regs  {n_spills:3} spills  {shared:8} shared mem"
                )
            else:
                kernel_detail_str = ""

            gb_per_s = num_gb / (ms / 1e3)
            return create_bandwidth_info_str(
                ms, num_gb, gb_per_s, prefix=prefix, suffix=kernel_detail_str
            )

        kernel_desc = (
            f"{benchmark_name:20} {kernel_category[:3].upper()}  {kernel_key[:10]}"
        )
        if benchmark_all_configs:
            # Benchmark every autotune config, one line per launcher.
            assert hasattr(kernel_mod, "benchmark_all_configs")
            bench_result = kernel_mod.benchmark_all_configs(args)
            print(kernel_desc)
            for launcher, ms in bench_result.items():
                print(
                    f"  {get_info_str(ms, launcher.n_regs, launcher.n_spills, launcher.shared)} @ {launcher.config}"
                )
        else:
            # Benchmark only the autotuned winner.
            ms = do_bench(lambda: kernel_mod.call(args), rep=40, fast_flush=True)
            assert (
                len(triton_kernel.launchers) == 1
            ), "Autotuner should have selected the best config"
            launcher = triton_kernel.launchers[0]
            print(
                get_info_str(
                    ms,
                    launcher.n_regs,
                    launcher.n_spills,
                    launcher.shared,
                    prefix=f"{kernel_desc} ",
                )
            )

        nfound += 1
    if nfound == 0:
        print(
            "No kernel with benchmark functionality found. Make sure you run inductor with config.benchmark_kernel being True"
        )
139
+
140
+
141
@dataclasses.dataclass
class ProfileEvent:
    """One aggregated profiler row for a GPU-side event, with times and
    counts normalized per benchmark run."""

    category: str
    key: str
    self_cuda_time_ms: float
    # the benchmark is run multiple times and we average the count across all the
    # runs. It should be an integer but define a float just in case.
    count: float
149
+
150
+
151
def parse_profile_event_list(benchmark_name, event_list, wall_time_ms, nruns):
    """Aggregate a profiler event list into per-category GPU-time tables,
    print a per-category report plus a machine-tabulatable summary line."""

    def get_self_cuda_time(ev):
        """
        ev.self_cuda_time_total is in microsecond. Convert to millisecond.
        """
        return ev.self_cuda_time_total / 1000 / nruns

    all_events = defaultdict(list)

    def add_event(ev, category):
        profile_ev = ProfileEvent(
            category=category,
            key=ev.key,
            self_cuda_time_ms=get_self_cuda_time(ev),
            count=ev.count / nruns,  # average across all runs
        )
        all_events[category].append(profile_ev)

    for ev in event_list:
        assert not ev.is_legacy, "Don't support the legacy profiler"
        if ev.device_type == DeviceType.CPU:
            # ignore the event on CPU side
            continue

        # Categorize GPU events by the inductor kernel-name prefix
        # (triton_poi/red/per); anything else triton is "triton_unknown",
        # non-triton kernels are "unknown".
        category = "unknown"
        if ev.key.startswith("triton_"):
            if ev.key.startswith("triton_poi"):
                category = "triton_pointwise"
            elif ev.key.startswith("triton_red"):
                category = "triton_reduction"
            elif ev.key.startswith("triton_per"):
                category = "triton_persistent_reduction"
            else:
                category = "triton_unknown"

        add_event(ev, category)

    def report_category(category, profile_events):
        # Print one table for a category (sorted by self CUDA time,
        # descending) and return that category's total time in ms.
        from tabulate import tabulate

        profile_events.sort(key=lambda ev: ev.self_cuda_time_ms, reverse=True)

        rows = []
        total_time = 0.0
        print(f"\n  == {category} category kernels == ")
        for ev in profile_events:
            total_time += ev.self_cuda_time_ms
            percent = f"{ev.self_cuda_time_ms / wall_time_ms * 100:.2f}%"
            rows.append([ev.key[:120], ev.self_cuda_time_ms, ev.count, percent])
        rows.append(
            ["Total", total_time, "", f"{total_time / wall_time_ms * 100:.2f}%"]
        )
        print(
            tabulate(
                rows, headers=["Kernel", "Self CUDA TIME (ms)", "Count", "Percent"]
            )
        )
        return total_time

    def report():
        category_list = [
            "triton_pointwise",
            "triton_reduction",
            "triton_persistent_reduction",
            "triton_unknown",
            "unknown",
        ]
        # Every observed category must be one we know how to report.
        assert set(all_events.keys()).issubset(
            set(category_list)
        ), f"{list(all_events.keys())}"

        per_category_wall_time = {}
        total_cuda_ms = 0.0
        for category in category_list:
            if category in all_events:
                _time = report_category(category, all_events[category])
                per_category_wall_time[category] = _time
                total_cuda_ms += _time

        gpu_busy_percent = f"{total_cuda_ms / wall_time_ms * 100:.2f}%"
        print(f"\nPercent of time when GPU is busy: {gpu_busy_percent}")
        print(f"Total wall time {wall_time_ms:.3f} ms")

        # output such a line so we can gather such line from all compiled modules from all
        # benchmarks and tabulate it!
        # Columns: benchmark_name, pointwise_percent, reduction_percent, persistent_reduction_percent,
        # unknown_category_percent, GPU_busy_percent, wall_time_ms
        tabulate_line = f"Output for tabulate: {benchmark_name}"
        for category in category_list:
            percent = (
                f"{per_category_wall_time.get(category, 0.0) / wall_time_ms * 100:.2f}%"
            )
            tabulate_line += f", {percent}"
        tabulate_line += f", {gpu_busy_percent}, {wall_time_ms:.3f}ms"

        print(tabulate_line)

    report()
249
+
250
+
251
def compiled_module_main(benchmark_name, benchmark_compiled_module_fn):
    """
    This is the function called in __main__ block of a compiled module.

    Depending on CLI flags it either benchmarks each cached kernel
    individually (optionally every config), or times the whole module and
    optionally captures and summarizes a profiler trace.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--benchmark-kernels",
        "-k",
        action="store_true",
        help="Whether to benchmark each individual kernels",
    )
    parser.add_argument(
        "--benchmark-all-configs",
        "-c",
        action="store_true",
        help="Whether to benchmark each individual config for a kernel",
    )
    parser.add_argument(
        "--profile",
        "-p",
        action="store_true",
        help="Whether to profile the compiled module",
    )
    args = parser.parse_args()

    if args.benchmark_kernels:
        benchmark_all_kernels(benchmark_name, args.benchmark_all_configs)
    else:
        times = 10
        repeat = 10
        # benchmark_compiled_module_fn's result is scaled by 1000 —
        # presumably it returns seconds and this converts to ms; confirm
        # against the codegen'd benchmark function.
        wall_time_ms = benchmark_compiled_module_fn(times=times, repeat=repeat) * 1000

        if not args.profile:
            return

        # Re-run under the profiler and dump a Chrome trace plus a summary.
        with torch.profiler.profile(record_shapes=True) as p:
            benchmark_compiled_module_fn(times=times, repeat=repeat)

        path = f"{tempfile.gettempdir()}/compiled_module_profile.json"
        p.export_chrome_trace(path)
        print(f"Profiling result for a compiled module of benchmark {benchmark_name}:")
        print(f"Chrome trace for the profile is written to {path}")
        event_list = p.key_averages(group_by_input_shape=True)
        print(event_list.table(sort_by="self_cuda_time_total", row_limit=10))
        parse_profile_event_list(
            benchmark_name, event_list, wall_time_ms, times * repeat
        )
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/DimVector.h ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/DimVector.h>
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Dimname.h ADDED
@@ -0,0 +1 @@
 
 
1
+ #include <ATen/core/Dimname.h>
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/DynamicLibrary.h ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/Utils.h>
4
+ #include <c10/macros/Export.h>
5
+ #include <c10/util/Exception.h>
6
+
7
+ namespace c10 {
8
+
9
// Error type for dynamic-library failures; inherits all of c10::Error's
// constructors unchanged.
class DynamicLibraryError : public Error {
  using Error::Error;
};
12
+
13
+ } // namespace c10
14
+
15
+ namespace at {
16
+
17
// RAII handle for a dynamically loaded shared library: the constructor loads
// the library, sym() resolves symbols from it, and the destructor releases
// the handle (presumably skipped when leak_handle is set — the
// implementation lives in the .cpp, confirm there).  Copying is disallowed.
struct DynamicLibrary {
  AT_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary);

  // Load the library `name`; `alt_name` looks like a fallback name and
  // leak_handle requests that the handle never be closed — NOTE(review):
  // exact semantics are defined in the .cpp, verify before relying on them.
  TORCH_API DynamicLibrary(
      const char* name,
      const char* alt_name = nullptr,
      bool leak_handle = false);

  // Resolve the address of symbol `name` from the loaded library.
  TORCH_API void* sym(const char* name);

  TORCH_API ~DynamicLibrary();

 private:
  bool leak_handle;
  void* handle = nullptr;  // platform library handle; null until loaded
};
33
+
34
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Formatting.h ADDED
@@ -0,0 +1 @@
 
 
1
+ #include <ATen/core/Formatting.h>
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MetaFunctions_inl.h ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ #if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
12
+ #error This change adds a dependency on all pytorch operators, meaning the \
13
+ file will need to be re-compiled every time an operator is changed or added. \
14
+ Consider including a specific operator from \
15
+ <ATen/ops/{my_operator}_meta_dispatch.h>. \
16
+ See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
17
+ #endif
18
+
19
+ #include <ATen/ops/_add_relu_meta_dispatch.h>
20
+ #include <ATen/ops/_addmm_activation_meta_dispatch.h>
21
+ #include <ATen/ops/_amp_update_scale_meta_dispatch.h>
22
+ #include <ATen/ops/_coalesced_meta_dispatch.h>
23
+ #include <ATen/ops/_convert_indices_from_coo_to_csr_meta_dispatch.h>
24
+ #include <ATen/ops/_convert_indices_from_csr_to_coo_meta_dispatch.h>
25
+ #include <ATen/ops/_ctc_loss_meta_dispatch.h>
26
+ #include <ATen/ops/_efficientzerotensor_meta_dispatch.h>
27
+ #include <ATen/ops/_fill_mem_eff_dropout_mask_meta_dispatch.h>
28
+ #include <ATen/ops/_fused_sdp_choice_meta_dispatch.h>
29
+ #include <ATen/ops/_index_put_impl_meta_dispatch.h>
30
+ #include <ATen/ops/_linalg_det_meta_dispatch.h>
31
+ #include <ATen/ops/_linalg_eigh_meta_dispatch.h>
32
+ #include <ATen/ops/_linalg_slogdet_meta_dispatch.h>
33
+ #include <ATen/ops/_linalg_solve_ex_meta_dispatch.h>
34
+ #include <ATen/ops/_linalg_svd_meta_dispatch.h>
35
+ #include <ATen/ops/_log_softmax_meta_dispatch.h>
36
+ #include <ATen/ops/_log_softmax_backward_data_meta_dispatch.h>
37
+ #include <ATen/ops/_mkldnn_transpose_meta_dispatch.h>
38
+ #include <ATen/ops/_reshape_alias_meta_dispatch.h>
39
+ #include <ATen/ops/_resize_output_meta_dispatch.h>
40
+ #include <ATen/ops/_softmax_meta_dispatch.h>
41
+ #include <ATen/ops/_softmax_backward_data_meta_dispatch.h>
42
+ #include <ATen/ops/_sparse_coo_tensor_with_dims_meta_dispatch.h>
43
+ #include <ATen/ops/_sparse_coo_tensor_with_dims_and_tensors_meta_dispatch.h>
44
+ #include <ATen/ops/_upsample_bicubic2d_aa_meta_dispatch.h>
45
+ #include <ATen/ops/_upsample_bicubic2d_aa_backward_meta_dispatch.h>
46
+ #include <ATen/ops/_upsample_bilinear2d_aa_meta_dispatch.h>
47
+ #include <ATen/ops/_upsample_bilinear2d_aa_backward_meta_dispatch.h>
48
+ #include <ATen/ops/_upsample_nearest_exact1d_meta_dispatch.h>
49
+ #include <ATen/ops/_upsample_nearest_exact1d_backward_meta_dispatch.h>
50
+ #include <ATen/ops/_upsample_nearest_exact2d_meta_dispatch.h>
51
+ #include <ATen/ops/_upsample_nearest_exact2d_backward_meta_dispatch.h>
52
+ #include <ATen/ops/_upsample_nearest_exact3d_meta_dispatch.h>
53
+ #include <ATen/ops/_upsample_nearest_exact3d_backward_meta_dispatch.h>
54
+ #include <ATen/ops/acos_meta_dispatch.h>
55
+ #include <ATen/ops/acosh_meta_dispatch.h>
56
+ #include <ATen/ops/adaptive_max_pool2d_meta_dispatch.h>
57
+ #include <ATen/ops/adaptive_max_pool2d_backward_meta_dispatch.h>
58
+ #include <ATen/ops/adaptive_max_pool3d_meta_dispatch.h>
59
+ #include <ATen/ops/adaptive_max_pool3d_backward_meta_dispatch.h>
60
+ #include <ATen/ops/add_meta_dispatch.h>
61
+ #include <ATen/ops/addbmm_meta_dispatch.h>
62
+ #include <ATen/ops/addcdiv_meta_dispatch.h>
63
+ #include <ATen/ops/addcmul_meta_dispatch.h>
64
+ #include <ATen/ops/addmm_meta_dispatch.h>
65
+ #include <ATen/ops/addmv_meta_dispatch.h>
66
+ #include <ATen/ops/all_meta_dispatch.h>
67
+ #include <ATen/ops/amax_meta_dispatch.h>
68
+ #include <ATen/ops/amin_meta_dispatch.h>
69
+ #include <ATen/ops/aminmax_meta_dispatch.h>
70
+ #include <ATen/ops/any_meta_dispatch.h>
71
+ #include <ATen/ops/arange_meta_dispatch.h>
72
+ #include <ATen/ops/argmax_meta_dispatch.h>
73
+ #include <ATen/ops/argmin_meta_dispatch.h>
74
+ #include <ATen/ops/as_strided_meta_dispatch.h>
75
+ #include <ATen/ops/asin_meta_dispatch.h>
76
+ #include <ATen/ops/asinh_meta_dispatch.h>
77
+ #include <ATen/ops/atan_meta_dispatch.h>
78
+ #include <ATen/ops/atan2_meta_dispatch.h>
79
+ #include <ATen/ops/atanh_meta_dispatch.h>
80
+ #include <ATen/ops/avg_pool2d_meta_dispatch.h>
81
+ #include <ATen/ops/avg_pool2d_backward_meta_dispatch.h>
82
+ #include <ATen/ops/avg_pool3d_meta_dispatch.h>
83
+ #include <ATen/ops/avg_pool3d_backward_meta_dispatch.h>
84
+ #include <ATen/ops/baddbmm_meta_dispatch.h>
85
+ #include <ATen/ops/bernoulli_meta_dispatch.h>
86
+ #include <ATen/ops/bitwise_and_meta_dispatch.h>
87
+ #include <ATen/ops/bitwise_left_shift_meta_dispatch.h>
88
+ #include <ATen/ops/bitwise_not_meta_dispatch.h>
89
+ #include <ATen/ops/bitwise_or_meta_dispatch.h>
90
+ #include <ATen/ops/bitwise_right_shift_meta_dispatch.h>
91
+ #include <ATen/ops/bitwise_xor_meta_dispatch.h>
92
+ #include <ATen/ops/bmm_meta_dispatch.h>
93
+ #include <ATen/ops/cat_meta_dispatch.h>
94
+ #include <ATen/ops/cauchy_meta_dispatch.h>
95
+ #include <ATen/ops/ceil_meta_dispatch.h>
96
+ #include <ATen/ops/clamp_meta_dispatch.h>
97
+ #include <ATen/ops/clamp_max_meta_dispatch.h>
98
+ #include <ATen/ops/clamp_min_meta_dispatch.h>
99
+ #include <ATen/ops/copy_sparse_to_sparse_meta_dispatch.h>
100
+ #include <ATen/ops/copysign_meta_dispatch.h>
101
+ #include <ATen/ops/cos_meta_dispatch.h>
102
+ #include <ATen/ops/cosh_meta_dispatch.h>
103
+ #include <ATen/ops/cumprod_meta_dispatch.h>
104
+ #include <ATen/ops/cumsum_meta_dispatch.h>
105
+ #include <ATen/ops/digamma_meta_dispatch.h>
106
+ #include <ATen/ops/div_meta_dispatch.h>
107
+ #include <ATen/ops/elu_meta_dispatch.h>
108
+ #include <ATen/ops/elu_backward_meta_dispatch.h>
109
+ #include <ATen/ops/embedding_renorm_meta_dispatch.h>
110
+ #include <ATen/ops/empty_meta_dispatch.h>
111
+ #include <ATen/ops/empty_strided_meta_dispatch.h>
112
+ #include <ATen/ops/eq_meta_dispatch.h>
113
+ #include <ATen/ops/erf_meta_dispatch.h>
114
+ #include <ATen/ops/erfc_meta_dispatch.h>
115
+ #include <ATen/ops/erfinv_meta_dispatch.h>
116
+ #include <ATen/ops/exp_meta_dispatch.h>
117
+ #include <ATen/ops/exp2_meta_dispatch.h>
118
+ #include <ATen/ops/expm1_meta_dispatch.h>
119
+ #include <ATen/ops/exponential_meta_dispatch.h>
120
+ #include <ATen/ops/eye_meta_dispatch.h>
121
+ #include <ATen/ops/fill_meta_dispatch.h>
122
+ #include <ATen/ops/floor_meta_dispatch.h>
123
+ #include <ATen/ops/floor_divide_meta_dispatch.h>
124
+ #include <ATen/ops/fmax_meta_dispatch.h>
125
+ #include <ATen/ops/fmin_meta_dispatch.h>
126
+ #include <ATen/ops/fmod_meta_dispatch.h>
127
+ #include <ATen/ops/frac_meta_dispatch.h>
128
+ #include <ATen/ops/fractional_max_pool2d_meta_dispatch.h>
129
+ #include <ATen/ops/fractional_max_pool2d_backward_meta_dispatch.h>
130
+ #include <ATen/ops/fractional_max_pool3d_meta_dispatch.h>
131
+ #include <ATen/ops/gather_meta_dispatch.h>
132
+ #include <ATen/ops/gcd_meta_dispatch.h>
133
+ #include <ATen/ops/ge_meta_dispatch.h>
134
+ #include <ATen/ops/gelu_meta_dispatch.h>
135
+ #include <ATen/ops/gelu_backward_meta_dispatch.h>
136
+ #include <ATen/ops/geometric_meta_dispatch.h>
137
+ #include <ATen/ops/glu_meta_dispatch.h>
138
+ #include <ATen/ops/gt_meta_dispatch.h>
139
+ #include <ATen/ops/hardshrink_meta_dispatch.h>
140
+ #include <ATen/ops/hardshrink_backward_meta_dispatch.h>
141
+ #include <ATen/ops/hardsigmoid_meta_dispatch.h>
142
+ #include <ATen/ops/hardsigmoid_backward_meta_dispatch.h>
143
+ #include <ATen/ops/hardswish_meta_dispatch.h>
144
+ #include <ATen/ops/hardtanh_meta_dispatch.h>
145
+ #include <ATen/ops/heaviside_meta_dispatch.h>
146
+ #include <ATen/ops/hypot_meta_dispatch.h>
147
+ #include <ATen/ops/i0_meta_dispatch.h>
148
+ #include <ATen/ops/igamma_meta_dispatch.h>
149
+ #include <ATen/ops/igammac_meta_dispatch.h>
150
+ #include <ATen/ops/index_meta_dispatch.h>
151
+ #include <ATen/ops/index_add_meta_dispatch.h>
152
+ #include <ATen/ops/index_copy_meta_dispatch.h>
153
+ #include <ATen/ops/index_fill_meta_dispatch.h>
154
+ #include <ATen/ops/index_reduce_meta_dispatch.h>
155
+ #include <ATen/ops/isin_meta_dispatch.h>
156
+ #include <ATen/ops/isneginf_meta_dispatch.h>
157
+ #include <ATen/ops/isposinf_meta_dispatch.h>
158
+ #include <ATen/ops/lcm_meta_dispatch.h>
159
+ #include <ATen/ops/le_meta_dispatch.h>
160
+ #include <ATen/ops/leaky_relu_meta_dispatch.h>
161
+ #include <ATen/ops/leaky_relu_backward_meta_dispatch.h>
162
+ #include <ATen/ops/lerp_meta_dispatch.h>
163
+ #include <ATen/ops/lgamma_meta_dispatch.h>
164
+ #include <ATen/ops/linalg_cholesky_ex_meta_dispatch.h>
165
+ #include <ATen/ops/linalg_cross_meta_dispatch.h>
166
+ #include <ATen/ops/linalg_inv_ex_meta_dispatch.h>
167
+ #include <ATen/ops/linalg_ldl_factor_ex_meta_dispatch.h>
168
+ #include <ATen/ops/linalg_ldl_solve_meta_dispatch.h>
169
+ #include <ATen/ops/linalg_lu_meta_dispatch.h>
170
+ #include <ATen/ops/linalg_lu_factor_ex_meta_dispatch.h>
171
+ #include <ATen/ops/linalg_lu_solve_meta_dispatch.h>
172
+ #include <ATen/ops/linalg_qr_meta_dispatch.h>
173
+ #include <ATen/ops/linalg_vector_norm_meta_dispatch.h>
174
+ #include <ATen/ops/linspace_meta_dispatch.h>
175
+ #include <ATen/ops/log_meta_dispatch.h>
176
+ #include <ATen/ops/log10_meta_dispatch.h>
177
+ #include <ATen/ops/log1p_meta_dispatch.h>
178
+ #include <ATen/ops/log2_meta_dispatch.h>
179
+ #include <ATen/ops/log_normal_meta_dispatch.h>
180
+ #include <ATen/ops/logaddexp_meta_dispatch.h>
181
+ #include <ATen/ops/logaddexp2_meta_dispatch.h>
182
+ #include <ATen/ops/logit_meta_dispatch.h>
183
+ #include <ATen/ops/logit_backward_meta_dispatch.h>
184
+ #include <ATen/ops/logspace_meta_dispatch.h>
185
+ #include <ATen/ops/lshift_meta_dispatch.h>
186
+ #include <ATen/ops/lt_meta_dispatch.h>
187
+ #include <ATen/ops/lu_unpack_meta_dispatch.h>
188
+ #include <ATen/ops/masked_fill_meta_dispatch.h>
189
+ #include <ATen/ops/masked_scatter_meta_dispatch.h>
190
+ #include <ATen/ops/max_meta_dispatch.h>
191
+ #include <ATen/ops/max_pool2d_with_indices_meta_dispatch.h>
192
+ #include <ATen/ops/max_pool2d_with_indices_backward_meta_dispatch.h>
193
+ #include <ATen/ops/maximum_meta_dispatch.h>
194
+ #include <ATen/ops/mean_meta_dispatch.h>
195
+ #include <ATen/ops/min_meta_dispatch.h>
196
+ #include <ATen/ops/minimum_meta_dispatch.h>
197
+ #include <ATen/ops/mish_meta_dispatch.h>
198
+ #include <ATen/ops/mm_meta_dispatch.h>
199
+ #include <ATen/ops/mse_loss_meta_dispatch.h>
200
+ #include <ATen/ops/mul_meta_dispatch.h>
201
+ #include <ATen/ops/ne_meta_dispatch.h>
202
+ #include <ATen/ops/neg_meta_dispatch.h>
203
+ #include <ATen/ops/nextafter_meta_dispatch.h>
204
+ #include <ATen/ops/nll_loss_backward_meta_dispatch.h>
205
+ #include <ATen/ops/nll_loss_forward_meta_dispatch.h>
206
+ #include <ATen/ops/norm_meta_dispatch.h>
207
+ #include <ATen/ops/normal_meta_dispatch.h>
208
+ #include <ATen/ops/polygamma_meta_dispatch.h>
209
+ #include <ATen/ops/pow_meta_dispatch.h>
210
+ #include <ATen/ops/prod_meta_dispatch.h>
211
+ #include <ATen/ops/put_meta_dispatch.h>
212
+ #include <ATen/ops/random_meta_dispatch.h>
213
+ #include <ATen/ops/range_meta_dispatch.h>
214
+ #include <ATen/ops/reciprocal_meta_dispatch.h>
215
+ #include <ATen/ops/reflection_pad1d_meta_dispatch.h>
216
+ #include <ATen/ops/reflection_pad1d_backward_meta_dispatch.h>
217
+ #include <ATen/ops/reflection_pad3d_meta_dispatch.h>
218
+ #include <ATen/ops/reflection_pad3d_backward_meta_dispatch.h>
219
+ #include <ATen/ops/relu_meta_dispatch.h>
220
+ #include <ATen/ops/remainder_meta_dispatch.h>
221
+ #include <ATen/ops/renorm_meta_dispatch.h>
222
+ #include <ATen/ops/replication_pad1d_meta_dispatch.h>
223
+ #include <ATen/ops/replication_pad1d_backward_meta_dispatch.h>
224
+ #include <ATen/ops/replication_pad2d_meta_dispatch.h>
225
+ #include <ATen/ops/replication_pad3d_meta_dispatch.h>
226
+ #include <ATen/ops/resize_meta_dispatch.h>
227
+ #include <ATen/ops/resize_as_sparse_meta_dispatch.h>
228
+ #include <ATen/ops/round_meta_dispatch.h>
229
+ #include <ATen/ops/rrelu_with_noise_meta_dispatch.h>
230
+ #include <ATen/ops/rshift_meta_dispatch.h>
231
+ #include <ATen/ops/rsqrt_meta_dispatch.h>
232
+ #include <ATen/ops/scatter_meta_dispatch.h>
233
+ #include <ATen/ops/scatter_add_meta_dispatch.h>
234
+ #include <ATen/ops/scatter_reduce_meta_dispatch.h>
235
+ #include <ATen/ops/set_meta_dispatch.h>
236
+ #include <ATen/ops/sgn_meta_dispatch.h>
237
+ #include <ATen/ops/sigmoid_meta_dispatch.h>
238
+ #include <ATen/ops/sigmoid_backward_meta_dispatch.h>
239
+ #include <ATen/ops/sign_meta_dispatch.h>
240
+ #include <ATen/ops/signbit_meta_dispatch.h>
241
+ #include <ATen/ops/silu_meta_dispatch.h>
242
+ #include <ATen/ops/silu_backward_meta_dispatch.h>
243
+ #include <ATen/ops/sin_meta_dispatch.h>
244
+ #include <ATen/ops/sinc_meta_dispatch.h>
245
+ #include <ATen/ops/sinh_meta_dispatch.h>
246
+ #include <ATen/ops/slow_conv_transpose2d_meta_dispatch.h>
247
+ #include <ATen/ops/smooth_l1_loss_meta_dispatch.h>
248
+ #include <ATen/ops/softplus_meta_dispatch.h>
249
+ #include <ATen/ops/softplus_backward_meta_dispatch.h>
250
+ #include <ATen/ops/softshrink_meta_dispatch.h>
251
+ #include <ATen/ops/softshrink_backward_meta_dispatch.h>
252
+ #include <ATen/ops/sort_meta_dispatch.h>
253
+ #include <ATen/ops/sparse_resize_meta_dispatch.h>
254
+ #include <ATen/ops/sparse_resize_and_clear_meta_dispatch.h>
255
+ #include <ATen/ops/special_airy_ai_meta_dispatch.h>
256
+ #include <ATen/ops/special_bessel_j0_meta_dispatch.h>
257
+ #include <ATen/ops/special_bessel_j1_meta_dispatch.h>
258
+ #include <ATen/ops/special_bessel_y0_meta_dispatch.h>
259
+ #include <ATen/ops/special_bessel_y1_meta_dispatch.h>
260
+ #include <ATen/ops/special_chebyshev_polynomial_t_meta_dispatch.h>
261
+ #include <ATen/ops/special_chebyshev_polynomial_u_meta_dispatch.h>
262
+ #include <ATen/ops/special_chebyshev_polynomial_v_meta_dispatch.h>
263
+ #include <ATen/ops/special_chebyshev_polynomial_w_meta_dispatch.h>
264
+ #include <ATen/ops/special_entr_meta_dispatch.h>
265
+ #include <ATen/ops/special_erfcx_meta_dispatch.h>
266
+ #include <ATen/ops/special_hermite_polynomial_h_meta_dispatch.h>
267
+ #include <ATen/ops/special_hermite_polynomial_he_meta_dispatch.h>
268
+ #include <ATen/ops/special_i0e_meta_dispatch.h>
269
+ #include <ATen/ops/special_i1_meta_dispatch.h>
270
+ #include <ATen/ops/special_i1e_meta_dispatch.h>
271
+ #include <ATen/ops/special_laguerre_polynomial_l_meta_dispatch.h>
272
+ #include <ATen/ops/special_legendre_polynomial_p_meta_dispatch.h>
273
+ #include <ATen/ops/special_log_ndtr_meta_dispatch.h>
274
+ #include <ATen/ops/special_modified_bessel_i0_meta_dispatch.h>
275
+ #include <ATen/ops/special_modified_bessel_i1_meta_dispatch.h>
276
+ #include <ATen/ops/special_modified_bessel_k0_meta_dispatch.h>
277
+ #include <ATen/ops/special_modified_bessel_k1_meta_dispatch.h>
278
+ #include <ATen/ops/special_ndtri_meta_dispatch.h>
279
+ #include <ATen/ops/special_scaled_modified_bessel_k0_meta_dispatch.h>
280
+ #include <ATen/ops/special_scaled_modified_bessel_k1_meta_dispatch.h>
281
+ #include <ATen/ops/special_shifted_chebyshev_polynomial_t_meta_dispatch.h>
282
+ #include <ATen/ops/special_shifted_chebyshev_polynomial_u_meta_dispatch.h>
283
+ #include <ATen/ops/special_shifted_chebyshev_polynomial_v_meta_dispatch.h>
284
+ #include <ATen/ops/special_shifted_chebyshev_polynomial_w_meta_dispatch.h>
285
+ #include <ATen/ops/special_spherical_bessel_j0_meta_dispatch.h>
286
+ #include <ATen/ops/special_xlog1py_meta_dispatch.h>
287
+ #include <ATen/ops/special_zeta_meta_dispatch.h>
288
+ #include <ATen/ops/sqrt_meta_dispatch.h>
289
+ #include <ATen/ops/sub_meta_dispatch.h>
290
+ #include <ATen/ops/sum_meta_dispatch.h>
291
+ #include <ATen/ops/tan_meta_dispatch.h>
292
+ #include <ATen/ops/tanh_meta_dispatch.h>
293
+ #include <ATen/ops/tanh_backward_meta_dispatch.h>
294
+ #include <ATen/ops/threshold_meta_dispatch.h>
295
+ #include <ATen/ops/threshold_backward_meta_dispatch.h>
296
+ #include <ATen/ops/topk_meta_dispatch.h>
297
+ #include <ATen/ops/triangular_solve_meta_dispatch.h>
298
+ #include <ATen/ops/tril_meta_dispatch.h>
299
+ #include <ATen/ops/triu_meta_dispatch.h>
300
+ #include <ATen/ops/trunc_meta_dispatch.h>
301
+ #include <ATen/ops/unfold_meta_dispatch.h>
302
+ #include <ATen/ops/uniform_meta_dispatch.h>
303
+ #include <ATen/ops/upsample_bicubic2d_meta_dispatch.h>
304
+ #include <ATen/ops/upsample_bicubic2d_backward_meta_dispatch.h>
305
+ #include <ATen/ops/upsample_bilinear2d_meta_dispatch.h>
306
+ #include <ATen/ops/upsample_bilinear2d_backward_meta_dispatch.h>
307
+ #include <ATen/ops/upsample_linear1d_meta_dispatch.h>
308
+ #include <ATen/ops/upsample_linear1d_backward_meta_dispatch.h>
309
+ #include <ATen/ops/upsample_nearest1d_meta_dispatch.h>
310
+ #include <ATen/ops/upsample_nearest1d_backward_meta_dispatch.h>
311
+ #include <ATen/ops/upsample_nearest2d_meta_dispatch.h>
312
+ #include <ATen/ops/upsample_nearest2d_backward_meta_dispatch.h>
313
+ #include <ATen/ops/upsample_nearest3d_meta_dispatch.h>
314
+ #include <ATen/ops/upsample_nearest3d_backward_meta_dispatch.h>
315
+ #include <ATen/ops/upsample_trilinear3d_meta_dispatch.h>
316
+ #include <ATen/ops/upsample_trilinear3d_backward_meta_dispatch.h>
317
+ #include <ATen/ops/view_meta_dispatch.h>
318
+ #include <ATen/ops/view_as_complex_meta_dispatch.h>
319
+ #include <ATen/ops/view_as_real_meta_dispatch.h>
320
+ #include <ATen/ops/xlogy_meta_dispatch.h>
321
+ #include <ATen/ops/zero_meta_dispatch.h>
322
+
323
+
324
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorSubclassLikeUtils.h ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/List.h>
3
+ #include <ATen/core/Tensor.h>
4
+ #include <c10/core/impl/TorchDispatchModeTLS.h>
5
+
6
+ #ifndef AT_PER_OPERATOR_HEADERS
7
+ #include <ATen/Functions.h>
8
+ #else
9
+ #include <ATen/ops/equal.h>
10
+ #endif
11
+
12
+ namespace at {
13
+
14
+ // Note [Tensor-subclass-like Tensors]
15
+ // Tensor-subclass-like is defined as:
16
+ // - a Tensor subclass (via __torch_dispatch__ in Python or extending
17
+ // TensorImpl in C++)
18
+ // - anything else that shares the same perils as Tensor subclasses.
19
+ // For example, many Tensor subclasses do not have storage and meta Tensors
20
+ // do not have storage either, so meta Tensors belong here.
21
+ //
22
+ // We should ensure that PyTorch internals supports Tensor-subclass-like
23
+ // objects. In particular, Tensor-subclass-like objects struggle with two
24
+ // classes of operations that are problematic for Tensor subclasses:
25
+ // 1. Because some Tensor subclasses do not have storage, .item() or
26
+ // .data_ptr() calls are not good.
27
+ // 2. Certain in-place operations can eliminate the typing of the Tensor
28
+ // subclass. For example:
29
+ // >>> torch.zeros(input.sizes(), grad.options()).diag().copy_(input)
30
+ // If input is a Tensor subclass, then the above ends up either erroring out
31
+ // or returning a regular non-Tensor-subclass Tensor!
32
+
33
+ constexpr auto kFunctorchWrappedTensors = DispatchKeySet(
34
+ {DispatchKey::FuncTorchGradWrapper,
35
+ DispatchKey::FuncTorchBatched,
36
+ DispatchKey::Functionalize});
37
+
38
+ constexpr auto kTensorSubclassLike =
39
+ kFunctorchWrappedTensors |
40
+ DispatchKeySet(
41
+ {// WARNING: DO NOT put combined backend component + functionality keys
42
+ // here, you will incorrectly always match on the functionality key
43
+ // no matter the backend component
44
+ DispatchKey::Batched,
45
+ DispatchKey::Sparse,
46
+ DispatchKey::SparseCsr,
47
+ DispatchKey::Python}) |
48
+ DispatchKeySet(BackendComponent::MetaBit);
49
+
50
+ inline bool isTensorSubclassLike(const Tensor& tensor) {
51
+ if (c10::impl::dispatch_mode_enabled())
52
+ return true;
53
+ auto key_set = tensor.unsafeGetTensorImpl()->key_set();
54
+ return !(key_set & kTensorSubclassLike).empty();
55
+ }
56
+
57
+ inline bool areAnyTensorSubclassLike(TensorList tensors) {
58
+ if (c10::impl::dispatch_mode_enabled())
59
+ return true;
60
+ return std::any_of(tensors.begin(), tensors.end(), isTensorSubclassLike);
61
+ }
62
+
63
+ inline bool areAnyOptionalTensorSubclassLike(
64
+ const c10::List<c10::optional<Tensor>>& tensors) {
65
+ if (c10::impl::dispatch_mode_enabled())
66
+ return true;
67
+ return std::any_of(
68
+ tensors.begin(), tensors.end(), [](const optional<Tensor>& opt_tensor) {
69
+ return (
70
+ opt_tensor.has_value() && isTensorSubclassLike(opt_tensor.value()));
71
+ });
72
+ }
73
+
74
+ // Helper function to deal testing truthfulness of a scalar tensor
75
+ // in a Composite Compliant manner.
76
+ // NOTE: This function expects a scalar tensor of boolean dtype.
77
+ // Eg.
78
+ // Non-Composite Compliant Pattern : (t == 0).all().item<bool>()
79
+ // Composite Compliant Patter : is_salar_tensor_true((t == 0).all())
80
+ inline bool is_scalar_tensor_true(const Tensor& t) {
81
+ TORCH_INTERNAL_ASSERT(t.dim() == 0)
82
+ TORCH_INTERNAL_ASSERT(t.scalar_type() == kBool)
83
+ return at::equal(t, t.new_ones({}, t.options()));
84
+ }
85
+
86
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/ATenCUDAGeneral.h ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <cuda.h>
4
+ #include <cuda_runtime.h>
5
+ #include <cuda_fp16.h>
6
+
7
+ #include <c10/macros/Export.h>
8
+
9
+ // Use TORCH_CUDA_CPP_API or TORCH_CUDA_CU_API for exports from this folder
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAContext.h ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/cuda/CUDAContextLight.h>
4
+
5
+ // Preserved for BC, as many files depend on these includes
6
+ #include <ATen/Context.h>
7
+ #include <c10/cuda/CUDAStream.h>
8
+ #include <c10/util/Logging.h>
9
+ #include <ATen/cuda/Exceptions.h>
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDADataType.h ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <c10/core/ScalarType.h>
4
+
5
+ #include <cuda.h>
6
+ #include <library_types.h>
7
+
8
+ namespace at::cuda {
9
+
10
+ template <typename scalar_t>
11
+ cudaDataType getCudaDataType() {
12
+ TORCH_INTERNAL_ASSERT(false, "Cannot convert type ", typeid(scalar_t).name(), " to cudaDataType.")
13
+ }
14
+
15
+ template<> inline cudaDataType getCudaDataType<at::Half>() {
16
+ return CUDA_R_16F;
17
+ }
18
+ template<> inline cudaDataType getCudaDataType<float>() {
19
+ return CUDA_R_32F;
20
+ }
21
+ template<> inline cudaDataType getCudaDataType<double>() {
22
+ return CUDA_R_64F;
23
+ }
24
+ template<> inline cudaDataType getCudaDataType<c10::complex<c10::Half>>() {
25
+ return CUDA_C_16F;
26
+ }
27
+ template<> inline cudaDataType getCudaDataType<c10::complex<float>>() {
28
+ return CUDA_C_32F;
29
+ }
30
+ template<> inline cudaDataType getCudaDataType<c10::complex<double>>() {
31
+ return CUDA_C_64F;
32
+ }
33
+
34
+ // HIP doesn't define integral types
35
+ #ifndef USE_ROCM
36
+ template<> inline cudaDataType getCudaDataType<uint8_t>() {
37
+ return CUDA_R_8U;
38
+ }
39
+ template<> inline cudaDataType getCudaDataType<int8_t>() {
40
+ return CUDA_R_8I;
41
+ }
42
+ template<> inline cudaDataType getCudaDataType<int>() {
43
+ return CUDA_R_32I;
44
+ }
45
+ #endif
46
+
47
+ #if !defined(USE_ROCM)
48
+ template<> inline cudaDataType getCudaDataType<int16_t>() {
49
+ return CUDA_R_16I;
50
+ }
51
+ template<> inline cudaDataType getCudaDataType<int64_t>() {
52
+ return CUDA_R_64I;
53
+ }
54
+ template<> inline cudaDataType getCudaDataType<at::BFloat16>() {
55
+ return CUDA_R_16BF;
56
+ }
57
+ #endif
58
+
59
+ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) {
60
+ switch (scalar_type) {
61
+ // HIP doesn't define integral types
62
+ #ifndef USE_ROCM
63
+ case c10::ScalarType::Byte:
64
+ return CUDA_R_8U;
65
+ case c10::ScalarType::Char:
66
+ return CUDA_R_8I;
67
+ case c10::ScalarType::Int:
68
+ return CUDA_R_32I;
69
+ #endif
70
+ case c10::ScalarType::Half:
71
+ return CUDA_R_16F;
72
+ case c10::ScalarType::Float:
73
+ return CUDA_R_32F;
74
+ case c10::ScalarType::Double:
75
+ return CUDA_R_64F;
76
+ case c10::ScalarType::ComplexHalf:
77
+ return CUDA_C_16F;
78
+ case c10::ScalarType::ComplexFloat:
79
+ return CUDA_C_32F;
80
+ case c10::ScalarType::ComplexDouble:
81
+ return CUDA_C_64F;
82
+ #if !defined(USE_ROCM)
83
+ case c10::ScalarType::Short:
84
+ return CUDA_R_16I;
85
+ case c10::ScalarType::Long:
86
+ return CUDA_R_64I;
87
+ case c10::ScalarType::BFloat16:
88
+ return CUDA_R_16BF;
89
+ #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
90
+ case c10::ScalarType::Float8_e4m3fn:
91
+ return CUDA_R_8F_E4M3;
92
+ case c10::ScalarType::Float8_e5m2:
93
+ return CUDA_R_8F_E5M2;
94
+ #endif
95
+ #else // USE_ROCM
96
+ case c10::ScalarType::BFloat16:
97
+ return CUDA_R_16BF;
98
+ #if defined(HIP_NEW_TYPE_ENUMS)
99
+ case c10::ScalarType::Float8_e4m3fnuz:
100
+ return HIP_R_8F_E4M3_FNUZ;
101
+ case c10::ScalarType::Float8_e5m2fnuz:
102
+ return HIP_R_8F_E5M2_FNUZ;
103
+ #else
104
+ case c10::ScalarType::Float8_e4m3fnuz:
105
+ return static_cast<hipDataType>(1000);
106
+ case c10::ScalarType::Float8_e5m2fnuz:
107
+ return static_cast<hipDataType>(1001);
108
+ #endif
109
+ #endif
110
+ default:
111
+ TORCH_INTERNAL_ASSERT(false, "Cannot convert ScalarType ", scalar_type, " to cudaDataType.")
112
+ }
113
+ }
114
+
115
+ } // namespace at::cuda
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/Exceptions.h ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <cublas_v2.h>
4
+ #include <cusparse.h>
5
+ #include <c10/macros/Export.h>
6
+
7
+ #ifdef CUDART_VERSION
8
+ #include <cusolver_common.h>
9
+ #endif
10
+
11
+ #include <ATen/Context.h>
12
+ #include <c10/util/Exception.h>
13
+ #include <c10/cuda/CUDAException.h>
14
+
15
+
16
+ namespace c10 {
17
+
18
+ class CuDNNError : public c10::Error {
19
+ using Error::Error;
20
+ };
21
+
22
+ } // namespace c10
23
+
24
+ #define AT_CUDNN_FRONTEND_CHECK(EXPR, ...) \
25
+ do { \
26
+ auto error_object = EXPR; \
27
+ if (!error_object.is_good()) { \
28
+ TORCH_CHECK_WITH(CuDNNError, false, \
29
+ "cuDNN Frontend error: ", error_object.get_message()); \
30
+ } \
31
+ } while (0) \
32
+
33
+ #define AT_CUDNN_CHECK_WITH_SHAPES(EXPR, ...) AT_CUDNN_CHECK(EXPR, "\n", ##__VA_ARGS__)
34
+
35
+ // See Note [CHECK macro]
36
+ #define AT_CUDNN_CHECK(EXPR, ...) \
37
+ do { \
38
+ cudnnStatus_t status = EXPR; \
39
+ if (status != CUDNN_STATUS_SUCCESS) { \
40
+ if (status == CUDNN_STATUS_NOT_SUPPORTED) { \
41
+ TORCH_CHECK_WITH(CuDNNError, false, \
42
+ "cuDNN error: ", \
43
+ cudnnGetErrorString(status), \
44
+ ". This error may appear if you passed in a non-contiguous input.", ##__VA_ARGS__); \
45
+ } else { \
46
+ TORCH_CHECK_WITH(CuDNNError, false, \
47
+ "cuDNN error: ", cudnnGetErrorString(status), ##__VA_ARGS__); \
48
+ } \
49
+ } \
50
+ } while (0)
51
+
52
+ namespace at::cuda::blas {
53
+ C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error);
54
+ } // namespace at::cuda::blas
55
+
56
+ #define TORCH_CUDABLAS_CHECK(EXPR) \
57
+ do { \
58
+ cublasStatus_t __err = EXPR; \
59
+ TORCH_CHECK(__err == CUBLAS_STATUS_SUCCESS, \
60
+ "CUDA error: ", \
61
+ at::cuda::blas::_cublasGetErrorEnum(__err), \
62
+ " when calling `" #EXPR "`"); \
63
+ } while (0)
64
+
65
+ const char *cusparseGetErrorString(cusparseStatus_t status);
66
+
67
+ #define TORCH_CUDASPARSE_CHECK(EXPR) \
68
+ do { \
69
+ cusparseStatus_t __err = EXPR; \
70
+ TORCH_CHECK(__err == CUSPARSE_STATUS_SUCCESS, \
71
+ "CUDA error: ", \
72
+ cusparseGetErrorString(__err), \
73
+ " when calling `" #EXPR "`"); \
74
+ } while (0)
75
+
76
+ // cusolver related headers are only supported on cuda now
77
+ #ifdef CUDART_VERSION
78
+
79
+ namespace at::cuda::solver {
80
+ C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status);
81
+
82
+ constexpr const char* _cusolver_backend_suggestion = \
83
+ "If you keep seeing this error, you may use " \
84
+ "`torch.backends.cuda.preferred_linalg_library()` to try " \
85
+ "linear algebra operators with other supported backends. " \
86
+ "See https://pytorch.org/docs/stable/backends.html#torch.backends.cuda.preferred_linalg_library";
87
+
88
+ } // namespace at::cuda::solver
89
+
90
+ // When cuda < 11.5, cusolver raises CUSOLVER_STATUS_EXECUTION_FAILED when input contains nan.
91
+ // When cuda >= 11.5, cusolver normally finishes execution and sets info array indicating convergence issue.
92
+ #define TORCH_CUSOLVER_CHECK(EXPR) \
93
+ do { \
94
+ cusolverStatus_t __err = EXPR; \
95
+ if ((CUDA_VERSION < 11500 && \
96
+ __err == CUSOLVER_STATUS_EXECUTION_FAILED) || \
97
+ (CUDA_VERSION >= 11500 && \
98
+ __err == CUSOLVER_STATUS_INVALID_VALUE)) { \
99
+ TORCH_CHECK_LINALG( \
100
+ false, \
101
+ "cusolver error: ", \
102
+ at::cuda::solver::cusolverGetErrorMessage(__err), \
103
+ ", when calling `" #EXPR "`", \
104
+ ". This error may appear if the input matrix contains NaN. ", \
105
+ at::cuda::solver::_cusolver_backend_suggestion); \
106
+ } else { \
107
+ TORCH_CHECK( \
108
+ __err == CUSOLVER_STATUS_SUCCESS, \
109
+ "cusolver error: ", \
110
+ at::cuda::solver::cusolverGetErrorMessage(__err), \
111
+ ", when calling `" #EXPR "`. ", \
112
+ at::cuda::solver::_cusolver_backend_suggestion); \
113
+ } \
114
+ } while (0)
115
+
116
+ #else
117
+ #define TORCH_CUSOLVER_CHECK(EXPR) EXPR
118
+ #endif
119
+
120
+ #define AT_CUDA_CHECK(EXPR) C10_CUDA_CHECK(EXPR)
121
+
122
+ // For CUDA Driver API
123
+ //
124
+ // This is here instead of in c10 because NVRTC is loaded dynamically via a stub
125
+ // in ATen, and we need to use its nvrtcGetErrorString.
126
+ // See NOTE [ USE OF NVRTC AND DRIVER API ].
127
+ #if !defined(USE_ROCM)
128
+
129
+ #define AT_CUDA_DRIVER_CHECK(EXPR) \
130
+ do { \
131
+ CUresult __err = EXPR; \
132
+ if (__err != CUDA_SUCCESS) { \
133
+ const char* err_str; \
134
+ CUresult get_error_str_err C10_UNUSED = at::globalContext().getNVRTC().cuGetErrorString(__err, &err_str); \
135
+ if (get_error_str_err != CUDA_SUCCESS) { \
136
+ AT_ERROR("CUDA driver error: unknown error"); \
137
+ } else { \
138
+ AT_ERROR("CUDA driver error: ", err_str); \
139
+ } \
140
+ } \
141
+ } while (0)
142
+
143
+ #else
144
+
145
+ #define AT_CUDA_DRIVER_CHECK(EXPR) \
146
+ do { \
147
+ CUresult __err = EXPR; \
148
+ if (__err != CUDA_SUCCESS) { \
149
+ AT_ERROR("CUDA driver error: ", static_cast<int>(__err)); \
150
+ } \
151
+ } while (0)
152
+
153
+ #endif
154
+
155
+ // For CUDA NVRTC
156
+ //
157
+ // Note: As of CUDA 10, nvrtc error code 7, NVRTC_ERROR_BUILTIN_OPERATION_FAILURE,
158
+ // incorrectly produces the error string "NVRTC unknown error."
159
+ // The following maps it correctly.
160
+ //
161
+ // This is here instead of in c10 because NVRTC is loaded dynamically via a stub
162
+ // in ATen, and we need to use its nvrtcGetErrorString.
163
+ // See NOTE [ USE OF NVRTC AND DRIVER API ].
164
+ #define AT_CUDA_NVRTC_CHECK(EXPR) \
165
+ do { \
166
+ nvrtcResult __err = EXPR; \
167
+ if (__err != NVRTC_SUCCESS) { \
168
+ if (static_cast<int>(__err) != 7) { \
169
+ AT_ERROR("CUDA NVRTC error: ", at::globalContext().getNVRTC().nvrtcGetErrorString(__err)); \
170
+ } else { \
171
+ AT_ERROR("CUDA NVRTC error: NVRTC_ERROR_BUILTIN_OPERATION_FAILURE"); \
172
+ } \
173
+ } \
174
+ } while (0)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSDevice.h ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2022 Apple Inc.
2
+
3
+ #pragma once
4
+ #include <c10/core/Allocator.h>
5
+ #include <c10/macros/Macros.h>
6
+ #include <c10/util/Exception.h>
7
+
8
+
9
+ #ifdef __OBJC__
10
+ #include <Foundation/Foundation.h>
11
+ #include <Metal/Metal.h>
12
+ #include <MetalPerformanceShaders/MetalPerformanceShaders.h>
13
+ typedef id<MTLDevice> MTLDevice_t;
14
+ typedef id<MTLLibrary> MTLLibrary_t;
15
+ typedef id<MTLComputePipelineState> MTLComputePipelineState_t;
16
+ typedef id<MTLLibrary> MTLLibrary_t;
17
+ #else
18
+ typedef void* MTLDevice;
19
+ typedef void* MTLDevice_t;
20
+ typedef void* MTLLibrary_t;
21
+ typedef void* MTLComputePipelineState_t;
22
+ typedef void* MTLLibrary_t;
23
+ #endif
24
+
25
+ using namespace std;
26
+
27
+ namespace at::mps {
28
+
29
+ // Helper enum to check if a MPSGraph op is supported in a given macOS version
30
+ enum class MacOSVersion : uint32_t {
31
+ MACOS_VER_13_0_PLUS = 0,
32
+ MACOS_VER_13_1_PLUS,
33
+ MACOS_VER_13_2_PLUS,
34
+ MACOS_VER_13_3_PLUS,
35
+ MACOS_VER_14_0_PLUS,
36
+ };
37
+
38
+ //-----------------------------------------------------------------
39
+ // MPSDevice
40
+ //
41
+ // MPSDevice is a singleton class that returns the default device
42
+ //-----------------------------------------------------------------
43
+
44
+ class TORCH_API MPSDevice {
45
+ public:
46
+ /**
47
+ * MPSDevice should not be cloneable.
48
+ */
49
+ MPSDevice(MPSDevice& other) = delete;
50
+ /**
51
+ * MPSDevice should not be assignable.
52
+ */
53
+ void operator=(const MPSDevice&) = delete;
54
+ /**
55
+ * Gets single instance of the Device.
56
+ */
57
+ static MPSDevice* getInstance();
58
+ /**
59
+ * Returns the single device.
60
+ */
61
+ MTLDevice_t device() {
62
+ return _mtl_device;
63
+ }
64
+ /**
65
+ * Returns whether running on Ventura or newer
66
+ */
67
+ bool isMacOS13Plus(MacOSVersion version) const;
68
+
69
+ MTLComputePipelineState_t metalIndexingPSO(const std::string &kernel);
70
+ MTLLibrary_t getMetalIndexingLibrary();
71
+
72
+ ~MPSDevice();
73
+
74
+ private:
75
+ static MPSDevice* _device;
76
+ MTLDevice_t _mtl_device;
77
+ MTLLibrary_t _mtl_indexing_library;
78
+ MPSDevice();
79
+ };
80
+
81
+ TORCH_API bool is_available();
82
+ TORCH_API bool is_macos_13_or_newer(MacOSVersion version = MacOSVersion::MACOS_VER_13_0_PLUS);
83
+ TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);
84
+
85
+ } // namespace at::mps
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AmpKernels.h ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+ #include <ATen/core/ATen_fwd.h>
5
+
6
+ namespace at {
7
+ class Tensor;
8
+
9
+ namespace native {
10
+
11
+ using _amp_foreach_non_finite_check_and_unscale_cpu__fn = void (*)(
12
+ TensorList,
13
+ Tensor&,
14
+ const Tensor&);
15
+
16
+ using _amp_update_scale_cpu__fn = Tensor& (*)(
17
+ Tensor&,
18
+ Tensor&,
19
+ const Tensor&,
20
+ double,
21
+ double,
22
+ int64_t);
23
+
24
+ DECLARE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu__fn, _amp_foreach_non_finite_check_and_unscale_cpu_stub);
25
+ DECLARE_DISPATCH(_amp_update_scale_cpu__fn, _amp_update_scale_cpu_stub);
26
+
27
+ } // namespace native
28
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUBlas.h ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/OpMathType.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+ #include <ATen/native/TransposeType.h>
6
+ #include <c10/util/complex.h>
7
+ #include <c10/core/ScalarType.h>
8
+ #include <c10/core/Scalar.h>
9
+
10
+ namespace at::native::cpublas {
11
+
12
+ namespace internal {
13
+ void normalize_last_dims(
14
+ TransposeType transa, TransposeType transb,
15
+ int64_t m, int64_t n, int64_t k,
16
+ int64_t *lda, int64_t *ldb, int64_t *ldc);
17
+ } // namespace internal
18
+
19
+ using gemm_fn = void(*)(
20
+ at::ScalarType type,
21
+ TransposeType transa, TransposeType transb,
22
+ int64_t m, int64_t n, int64_t k,
23
+ const Scalar& alpha,
24
+ const void *a, int64_t lda,
25
+ const void *b, int64_t ldb,
26
+ const Scalar& beta,
27
+ void *c, int64_t ldc);
28
+
29
+ DECLARE_DISPATCH(gemm_fn, gemm_stub);
30
+
31
+ template <typename scalar_t>
32
+ void gemm(
33
+ TransposeType transa, TransposeType transb,
34
+ int64_t m, int64_t n, int64_t k,
35
+ at::opmath_type<scalar_t> alpha,
36
+ const scalar_t *a, int64_t lda,
37
+ const scalar_t *b, int64_t ldb,
38
+ at::opmath_type<scalar_t> beta,
39
+ scalar_t *c, int64_t ldc) {
40
+ internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc);
41
+ gemm_stub(
42
+ kCPU, c10::CppTypeToScalarType<scalar_t>::value,
43
+ transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
44
+ }
45
+
46
+ void gemm(
47
+ TransposeType transa, TransposeType transb,
48
+ int64_t m, int64_t n, int64_t k,
49
+ double alpha,
50
+ const double *a, int64_t lda,
51
+ const double *b, int64_t ldb,
52
+ double beta,
53
+ double *c, int64_t ldc);
54
+
55
+ void gemm(
56
+ TransposeType transa, TransposeType transb,
57
+ int64_t m, int64_t n, int64_t k,
58
+ float alpha,
59
+ const float *a, int64_t lda,
60
+ const float *b, int64_t ldb,
61
+ float beta,
62
+ float *c, int64_t ldc);
63
+
64
+ void gemm(
65
+ TransposeType transa, TransposeType transb,
66
+ int64_t m, int64_t n, int64_t k,
67
+ float alpha,
68
+ const at::BFloat16 *a, int64_t lda,
69
+ const at::BFloat16 *b, int64_t ldb,
70
+ float beta,
71
+ at::BFloat16 *c, int64_t ldc);
72
+
73
+ void gemm(
74
+ TransposeType transa, TransposeType transb,
75
+ int64_t m, int64_t n, int64_t k,
76
+ const float alpha,
77
+ const at::BFloat16 *a, int64_t lda,
78
+ const at::BFloat16 *b, int64_t ldb,
79
+ const float beta,
80
+ float *c, int64_t ldc);
81
+
82
+ void gemm(
83
+ TransposeType transa, TransposeType transb,
84
+ int64_t m, int64_t n, int64_t k,
85
+ float alpha,
86
+ const at::Half *a, int64_t lda,
87
+ const at::Half *b, int64_t ldb,
88
+ float beta,
89
+ at::Half *c, int64_t ldc);
90
+
91
+ void gemm(
92
+ TransposeType transa, TransposeType transb,
93
+ int64_t m, int64_t n, int64_t k,
94
+ const float alpha,
95
+ const at::Half *a, int64_t lda,
96
+ const at::Half *b, int64_t ldb,
97
+ const float beta,
98
+ float *c, int64_t ldc);
99
+
100
+ void gemm(
101
+ TransposeType transa, TransposeType transb,
102
+ int64_t m, int64_t n, int64_t k,
103
+ c10::complex<double> alpha,
104
+ const c10::complex<double> *a, int64_t lda,
105
+ const c10::complex<double> *b, int64_t ldb,
106
+ c10::complex<double> beta,
107
+ c10::complex<double> *c, int64_t ldc);
108
+
109
+ void gemm(
110
+ TransposeType transa, TransposeType transb,
111
+ int64_t m, int64_t n, int64_t k,
112
+ c10::complex<float> alpha,
113
+ const c10::complex<float> *a, int64_t lda,
114
+ const c10::complex<float> *b, int64_t ldb,
115
+ c10::complex<float> beta,
116
+ c10::complex<float> *c, int64_t ldc);
117
+
118
+ void gemm(
119
+ TransposeType transa, TransposeType transb,
120
+ int64_t m, int64_t n, int64_t k,
121
+ int64_t alpha,
122
+ const int64_t *a, int64_t lda,
123
+ const int64_t *b, int64_t ldb,
124
+ int64_t beta,
125
+ int64_t *c, int64_t ldc);
126
+
127
+ template <typename scalar_t>
128
+ void gemm_batched(
129
+ TransposeType transa, TransposeType transb,
130
+ int64_t batch_size, int64_t m, int64_t n, int64_t k,
131
+ scalar_t alpha,
132
+ const scalar_t * const *a, int64_t lda,
133
+ const scalar_t * const *b, int64_t ldb,
134
+ const scalar_t beta,
135
+ scalar_t * const *c, int64_t ldc);
136
+
137
+ template <typename scalar_t>
138
+ void gemm_batched_with_stride(
139
+ TransposeType transa, TransposeType transb,
140
+ int64_t batch_size, int64_t m, int64_t n, int64_t k,
141
+ scalar_t alpha,
142
+ const scalar_t *a, int64_t lda, int64_t batch_stride_a,
143
+ const scalar_t *b, int64_t ldb, int64_t batch_stride_b,
144
+ scalar_t beta,
145
+ scalar_t *c, int64_t ldc, int64_t batch_stride_c);
146
+
147
+ using axpy_fn = void(*)(at::ScalarType type, int64_t n, const Scalar& a, const void *x, int64_t incx, void *y, int64_t incy);
148
+
149
+ DECLARE_DISPATCH(axpy_fn, axpy_stub);
150
+
151
+ template<typename scalar_t>
152
+ void axpy(int64_t n, scalar_t a, const scalar_t *x, int64_t incx, scalar_t *y, int64_t incy){
153
+ if(n == 1)
154
+ {
155
+ incx = 1;
156
+ incy = 1;
157
+ }
158
+ axpy_stub(
159
+ kCPU, c10::CppTypeToScalarType<scalar_t>::value,
160
+ n, a, x, incx, y, incy);
161
+ }
162
+
163
+ void axpy(int64_t n, double a, const double *x, int64_t incx, double *y, int64_t incy);
164
+ void axpy(int64_t n, float a, const float *x, int64_t incx, float *y, int64_t incy);
165
+ void axpy(int64_t n, c10::complex<double> a, const c10::complex<double> *x, int64_t incx, c10::complex<double> *y, int64_t incy);
166
+ void axpy(int64_t n, c10::complex<float> a, const c10::complex<float> *x, int64_t incx, c10::complex<float> *y, int64_t incy);
167
+
168
+ using copy_fn = void(*)(at::ScalarType type, int64_t n, const void *x, int64_t incx, void *y, int64_t incy);
169
+
170
+ DECLARE_DISPATCH(copy_fn, copy_stub);
171
+
172
+ template<typename scalar_t>
173
+ void copy(int64_t n, const scalar_t *x, int64_t incx, scalar_t *y, int64_t incy) {
174
+ if(n == 1)
175
+ {
176
+ incx = 1;
177
+ incy = 1;
178
+ }
179
+ copy_stub(
180
+ kCPU, c10::CppTypeToScalarType<scalar_t>::value,
181
+ n, x, incx, y, incy);
182
+ }
183
+
184
+ void copy(int64_t n, const double *x, int64_t incx, double *y, int64_t incy);
185
+ void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy);
186
+ void copy(int64_t n, const c10::complex<double> *x, int64_t incx, c10::complex<double> *y, int64_t incy);
187
+ void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<float> *y, int64_t incy);
188
+
189
+ } // namespace at::native::cpublas
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <utility>
2
+
3
+ #pragma once
4
+
5
+ namespace at::native {
6
+
7
+ namespace {
8
+
9
+ // operator_brackets_proxy is used in
10
+ // CompositeRandomAccessor in place of operator[].
11
+ // For some iterators, references returned by operator[]
12
+ // could become invalid, operator_brackets_proxy tries to
13
+ // resolve that by making accessor[n] to be equivalent to
14
+ // *(accessor + n).
15
+ template <typename Accessor>
16
+ class operator_brackets_proxy {
17
+ using reference = typename std::iterator_traits<Accessor>::reference;
18
+ using value_type = typename std::iterator_traits<Accessor>::value_type;
19
+
20
+ public:
21
+ C10_HOST_DEVICE
22
+ operator_brackets_proxy(Accessor const& accessor)
23
+ : accessor(accessor)
24
+ {}
25
+
26
+ C10_HOST_DEVICE
27
+ operator reference() {
28
+ return *accessor;
29
+ }
30
+
31
+ C10_HOST_DEVICE
32
+ reference operator*() {
33
+ return *accessor;
34
+ }
35
+
36
+ C10_HOST_DEVICE
37
+ operator_brackets_proxy& operator=(value_type const& val) {
38
+ *accessor = val;
39
+ return *this;
40
+ }
41
+
42
+ private:
43
+ Accessor accessor;
44
+ };
45
+
46
+ }
47
+
48
+ // references_holder is used as a surrogate for the
49
+ // references type from std::iterator_traits in CompositeRandomAccessor.
50
+ // It is assumed in CompositeRandomAccessor that
51
+ // References = tuple<Types&...>,
52
+ // Values = tuple<Types...> by default,
53
+ // but they could be anything as long as References could be
54
+ // cast to Values.
55
+ // If you plan to use it with STL, for example, you will need to
56
+ // define 'swap` and `get`(aka std::get) methods.
57
+ template <typename Values, typename References>
58
+ class references_holder {
59
+ public:
60
+ using values = Values;
61
+ using references = References;
62
+
63
+ C10_HOST_DEVICE
64
+ references_holder(references refs)
65
+ : refs{std::move(refs)}
66
+ {}
67
+
68
+ C10_HOST_DEVICE
69
+ operator references() {
70
+ return refs;
71
+ }
72
+
73
+ C10_HOST_DEVICE
74
+ operator values() {
75
+ return refs;
76
+ }
77
+
78
+ C10_HOST_DEVICE
79
+ references_holder& operator=(values vals) {
80
+ refs = vals;
81
+ return *this;
82
+ }
83
+
84
+ C10_HOST_DEVICE
85
+ references& data() {
86
+ return refs;
87
+ }
88
+
89
+ protected:
90
+ references refs;
91
+ };
92
+
93
+ // CompositeRandomAccessor is essentially a simplified version of
94
+ // a random access iterator over two random access iterators.
95
+ // TupleInfo should contain a variadic type `tuple`, and a method `tie`,
96
+ // which constructs a tuple of references from a variadic list of arguments.
97
+ template <typename KeyAccessor, typename ValueAccessor, typename TupleInfo>
98
+ class CompositeRandomAccessor {
99
+ using self_type = CompositeRandomAccessor<KeyAccessor, ValueAccessor, TupleInfo>;
100
+
101
+ using key_accessor_value_type =
102
+ typename std::iterator_traits<KeyAccessor>::value_type;
103
+ using value_accessor_value_type =
104
+ typename std::iterator_traits<ValueAccessor>::value_type;
105
+ using key_accessor_reference_type =
106
+ typename std::iterator_traits<KeyAccessor>::reference;
107
+ using value_accessor_reference_type =
108
+ typename std::iterator_traits<ValueAccessor>::reference;
109
+
110
+ using composite_value_type = typename TupleInfo::template tuple<
111
+ key_accessor_value_type,
112
+ value_accessor_value_type>;
113
+ using composite_reference = typename TupleInfo::template tuple<
114
+ key_accessor_reference_type,
115
+ value_accessor_reference_type>;
116
+
117
+ public:
118
+ using value_type = composite_value_type;
119
+ using reference = references_holder<composite_value_type, composite_reference>;
120
+ // Note that CompositeRandomAccessor does not hold key and values
121
+ // in a specific datastructure, which means that a pointer to a (key, value)
122
+ // is not defined. Hence we just use a pointer type of the KeyAccessor.
123
+ using pointer = typename std::iterator_traits<KeyAccessor>::pointer;
124
+ using difference_type = typename std::iterator_traits<KeyAccessor>::difference_type;
125
+ using iterator_category = std::random_access_iterator_tag;
126
+
127
+ C10_HOST_DEVICE
128
+ CompositeRandomAccessor() = default;
129
+
130
+ C10_HOST_DEVICE
131
+ CompositeRandomAccessor(KeyAccessor keys, ValueAccessor values)
132
+ : keys(keys), values(values)
133
+ {}
134
+
135
+ // Pointer-like operations {
136
+ C10_HOST_DEVICE
137
+ reference operator*() const {
138
+ return TupleInfo::tie(*keys, *values);
139
+ }
140
+
141
+ // operator->() is supposed to return a pointer type.
142
+ // Since CompositeRandomAccessor does not hold pointers to pairs,
143
+ // we just return a pointer to a key.
144
+ C10_HOST_DEVICE
145
+ auto* operator->() const {
146
+ return keys.operator->();
147
+ }
148
+
149
+ C10_HOST_DEVICE
150
+ reference operator[](difference_type idx) {
151
+ return operator_brackets_proxy<self_type>(
152
+ CompositeRandomAccessor(keys + idx, values + idx)
153
+ );
154
+ }
155
+ // }
156
+
157
+ // Prefix/postfix increment/decrement {
158
+ C10_HOST_DEVICE
159
+ CompositeRandomAccessor& operator++() {
160
+ ++keys;
161
+ ++values;
162
+ return *this;
163
+ }
164
+
165
+ C10_HOST_DEVICE
166
+ CompositeRandomAccessor operator++(int) {
167
+ CompositeRandomAccessor copy(*this);
168
+ ++*this;
169
+ return copy;
170
+ }
171
+
172
+ C10_HOST_DEVICE
173
+ CompositeRandomAccessor& operator--() {
174
+ --keys;
175
+ --values;
176
+ return *this;
177
+ }
178
+
179
+ C10_HOST_DEVICE
180
+ CompositeRandomAccessor operator--(int) {
181
+ CompositeRandomAccessor copy(*this);
182
+ --*this;
183
+ return copy;
184
+ }
185
+ // }
186
+
187
+ // Arithmetic operations {
188
+ C10_HOST_DEVICE
189
+ CompositeRandomAccessor& operator+=(difference_type offset) {
190
+ keys += offset;
191
+ values += offset;
192
+ return *this;
193
+ }
194
+
195
+ C10_HOST_DEVICE
196
+ CompositeRandomAccessor operator+(difference_type offset) const {
197
+ return CompositeRandomAccessor(keys + offset, values + offset);
198
+ }
199
+
200
+ C10_HOST_DEVICE
201
+ friend CompositeRandomAccessor operator+(
202
+ difference_type offset,
203
+ const CompositeRandomAccessor& accessor
204
+ ) {
205
+ return accessor + offset;
206
+ }
207
+
208
+ C10_HOST_DEVICE
209
+ CompositeRandomAccessor& operator-=(difference_type offset) {
210
+ keys -= offset;
211
+ values -= offset;
212
+ return *this;
213
+ }
214
+
215
+ C10_HOST_DEVICE
216
+ CompositeRandomAccessor operator-(difference_type offset) const {
217
+ return CompositeRandomAccessor(keys - offset, values - offset);
218
+ }
219
+
220
+ C10_HOST_DEVICE
221
+ difference_type operator-(const CompositeRandomAccessor& other) const {
222
+ return keys - other.keys;
223
+ }
224
+ // }
225
+
226
+ // Comparison operators {
227
+ C10_HOST_DEVICE
228
+ bool operator==(const CompositeRandomAccessor& other) const {
229
+ return keys == other.keys;
230
+ }
231
+
232
+ C10_HOST_DEVICE
233
+ bool operator!=(const CompositeRandomAccessor& other) const {
234
+ return keys != other.keys;
235
+ }
236
+
237
+ C10_HOST_DEVICE
238
+ bool operator<(const CompositeRandomAccessor& other) const {
239
+ return keys < other.keys;
240
+ }
241
+
242
+ C10_HOST_DEVICE
243
+ bool operator<=(const CompositeRandomAccessor& other) const {
244
+ return keys <= other.keys;
245
+ }
246
+
247
+ C10_HOST_DEVICE
248
+ bool operator>(const CompositeRandomAccessor& other) const {
249
+ return keys > other.keys;
250
+ }
251
+
252
+ C10_HOST_DEVICE
253
+ bool operator>=(const CompositeRandomAccessor& other) const {
254
+ return keys >= other.keys;
255
+ }
256
+ // }
257
+
258
+ protected:
259
+ KeyAccessor keys;
260
+ ValueAccessor values;
261
+ };
262
+
263
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvolutionMM3d.h ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/core/Tensor.h>
2
+
3
+ namespace at::native {
4
+
5
+ std::tuple<Tensor, Tensor, Tensor> slow_conv3d_backward_cpu(
6
+ const Tensor& grad_output,
7
+ const Tensor& self,
8
+ const Tensor& weight,
9
+ IntArrayRef kernel_size,
10
+ IntArrayRef stride,
11
+ IntArrayRef padding,
12
+ std::array<bool, 3> output_mask);
13
+
14
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Copy.h ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+
5
+ namespace at {
6
+
7
+ class Tensor;
8
+ struct TensorIterator;
9
+ class TensorBase;
10
+
11
+ namespace native {
12
+
13
+ using copy_fn = void (*)(TensorIterator&, bool non_blocking);
14
+
15
+ DECLARE_DISPATCH(copy_fn, copy_stub);
16
+
17
+ TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src);
18
+
19
+ } // namespace native
20
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <algorithm>
4
+ #include <vector>
5
+
6
+ #include <ATen/div_rtn.h>
7
+ #include <ATen/core/Tensor.h>
8
+ #include <c10/util/irange.h>
9
+
10
+ #define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
11
+ TORCH_CHECK( \
12
+ T.dim() == DIM && T.size(DIM_SIZE) == SIZE, \
13
+ "Need " #T " of dimension ", \
14
+ DIM, \
15
+ " and " #T ".size[", \
16
+ DIM_SIZE, \
17
+ "] == ", \
18
+ SIZE, \
19
+ " but got input to be of shape ", \
20
+ T.sizes())
21
+
22
+ namespace at::native::internal {
23
+ namespace {
24
+ inline bool all_positive(IntArrayRef& arr) {
25
+ return std::all_of(
26
+ arr.begin(), arr.end(), [](int64_t item) { return item > 0; });
27
+ }
28
+
29
+ inline bool all_nonnegative(std::vector<int64_t>& arr) {
30
+ return std::all_of(
31
+ arr.begin(), arr.end(), [](int64_t item) { return item >= 0; });
32
+ }
33
+
34
+ } // namespace
35
+
36
+ // calculate the rear part of output tensor sizes
37
+ template <int64_t dim>
38
+ std::vector<int64_t> get_output_size(
39
+ const Tensor& input,
40
+ IntArrayRef kernel_size,
41
+ IntArrayRef stride_size,
42
+ IntArrayRef pad_size,
43
+ IntArrayRef dilation_size) {
44
+ std::vector<int64_t> sizes;
45
+ for (const auto index : c10::irange(dim)) {
46
+ sizes.push_back(
47
+ div_rtn<int64_t>(
48
+ input.size(index + input.dim() - dim) + 2 * pad_size[index] -
49
+ (dilation_size[index] * (kernel_size[index] - 1) + 1),
50
+ stride_size[index]) +
51
+ 1);
52
+ }
53
+ return sizes;
54
+ }
55
+
56
+ // calculate the sizes of output tensor
57
+ template <int64_t dim>
58
+ std::vector<int64_t> get_output_size(
59
+ const Tensor& input,
60
+ const Tensor& weight,
61
+ IntArrayRef kernel_size,
62
+ IntArrayRef stride_size,
63
+ IntArrayRef pad_size,
64
+ IntArrayRef dilation_size) {
65
+ auto output_size = get_output_size<dim>(
66
+ input, kernel_size, stride_size, pad_size, dilation_size);
67
+ output_size.insert(output_size.begin(), weight.size(0));
68
+ if (input.dim() == dim + 2) {
69
+ output_size.insert(output_size.begin(), input.size(0));
70
+ }
71
+ return output_size;
72
+ }
73
+ /*
74
+ slow_conv_dilated_shape_check - check user-input to dilated convolution
75
+ forward and backward functions.
76
+ */
77
+ template <int64_t dim>
78
+ void slow_conv_dilated_shape_check(
79
+ const Tensor& input,
80
+ const Tensor& weight,
81
+ const Tensor& bias,
82
+ const Tensor& grad_output,
83
+ IntArrayRef kernel_size,
84
+ IntArrayRef stride_size,
85
+ IntArrayRef pad_size,
86
+ IntArrayRef dilation_size) {
87
+ /*
88
+ When the following tensors are defined:
89
+
90
+ bias, grad_weight, grad_output
91
+
92
+ then these are assumed to be contiguous without checking
93
+ because of these tensors are made contiguous by calling
94
+ .contiguous() method or by resizing of zero-sized tensors in
95
+ forward/backward functions.
96
+
97
+ When grad_weight is defined then it is assumed without
98
+ checking to have the same shape as weight, see backward
99
+ functions.
100
+ */
101
+ // Check size arguments
102
+ TORCH_CHECK(
103
+ kernel_size.size() == dim,
104
+ "kernel sizes length should be ",
105
+ dim,
106
+ ", but got ",
107
+ kernel_size.size());
108
+ TORCH_CHECK(
109
+ stride_size.size() == dim,
110
+ "strides length should be ",
111
+ dim,
112
+ ", but got ",
113
+ stride_size.size());
114
+ TORCH_CHECK(
115
+ dilation_size.size() == dim,
116
+ "dilations length should be ",
117
+ dim,
118
+ ", but got ",
119
+ dilation_size.size());
120
+ TORCH_CHECK(
121
+ pad_size.size() == dim,
122
+ "pads length should be ",
123
+ dim,
124
+ ", but got ",
125
+ pad_size.size());
126
+
127
+ TORCH_CHECK(
128
+ all_positive(kernel_size),
129
+ "kernel size should be greater than zero, but got ",
130
+ kernel_size);
131
+ TORCH_CHECK(
132
+ all_positive(stride_size),
133
+ "stride should be greater than zero, but got ",
134
+ stride_size);
135
+ TORCH_CHECK(
136
+ all_positive(dilation_size),
137
+ "dilation should be greater than zero, but got ",
138
+ dilation_size);
139
+
140
+ // check input
141
+ TORCH_CHECK(input.defined(), "input must be defined");
142
+ bool is_batch = input.dim() == dim + 2;
143
+ int64_t n = (is_batch ? 2 : 1);
144
+ int64_t ndim = n + dim;
145
+ if (!is_batch) {
146
+ // input dim has to be dim + 1 if not batched
147
+ TORCH_CHECK(
148
+ input.dim() == dim + 1,
149
+ "input must be 4D or 5D tensor but got ",
150
+ input.dim(),
151
+ "D tensor");
152
+ }
153
+
154
+ // check output sizes
155
+ auto output_size = get_output_size<dim>(
156
+ input, kernel_size, stride_size, pad_size, dilation_size);
157
+
158
+ TORCH_CHECK(
159
+ all_nonnegative(output_size),
160
+ "calculated output size ",
161
+ output_size,
162
+ " is too small (all sizes must be non-negative)");
163
+
164
+ // check weight
165
+ TORCH_CHECK(weight.defined(), "weight must be defined");
166
+ TORCH_CHECK(
167
+ weight.dim() == dim + 2,
168
+ "weight must be ",
169
+ dim + 2,
170
+ "D tensor but got ",
171
+ weight.dim(),
172
+ "D tensor dim=",
173
+ dim);
174
+ TORCH_CHECK(
175
+ weight.sizes().slice(2) == kernel_size,
176
+ "weight[2:] shape ",
177
+ weight.sizes().slice(2),
178
+ " must be equal to kernel_size ",
179
+ kernel_size);
180
+
181
+ TORCH_CHECK_DIM_SIZE(input, input.dim(), (is_batch ? 1 : 0), weight.size(1));
182
+
183
+ // check bias when present
184
+ if (bias.defined()) {
185
+ TORCH_CHECK(
186
+ bias.dim() == 1,
187
+ "bias must be 1D tensor but got ",
188
+ bias.dim(),
189
+ "D tensor");
190
+ TORCH_CHECK_DIM_SIZE(bias, 1, 0, weight.size(0));
191
+ }
192
+
193
+ // check grad_output when present
194
+ if (grad_output.defined()) {
195
+ TORCH_CHECK(
196
+ grad_output.dim() == ndim,
197
+ "grad_output must be ",
198
+ ndim,
199
+ "D tensor but got ",
200
+ grad_output.dim(),
201
+ "D tensor");
202
+ if (is_batch) {
203
+ TORCH_CHECK(
204
+ grad_output.size(0) == input.size(0),
205
+ "grad_output.size(0)=",
206
+ grad_output.size(0),
207
+ " must be input.size(0)=",
208
+ input.size(0));
209
+ }
210
+ TORCH_CHECK(
211
+ grad_output.size(n - 1) == weight.size(0),
212
+ "grad_output.size(",
213
+ n - 1,
214
+ ")=",
215
+ grad_output.size(n - 1),
216
+ " must be weight.size(0)=",
217
+ weight.size(0));
218
+ TORCH_CHECK(
219
+ grad_output.sizes().slice(n) == output_size,
220
+ "grad_output[",
221
+ n,
222
+ ":] shape",
223
+ grad_output.sizes().slice(n),
224
+ " must be equal to output size ",
225
+ output_size);
226
+ }
227
+ }
228
+
229
+ } // namespace at::native::internal
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ForeachUtils.h ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/Device.h>
4
+ #include <ATen/Dispatch.h>
5
+ #include <ATen/ScalarType.h>
6
+ #include <ATen/core/Tensor.h>
7
+ #include <ATen/native/utils/ParamsHash.h>
8
+ #include <c10/util/Exception.h>
9
+ #include <c10/util/irange.h>
10
+
11
+ #ifndef AT_PER_OPERATOR_HEADERS
12
+ #include <ATen/NativeFunctions.h>
13
+ #else
14
+ #include <ATen/ops/result_type_native.h>
15
+ #endif
16
+
17
+ #include <unordered_map>
18
+ #include <vector>
19
+
20
+ namespace at::native {
21
+ namespace {
22
+ // Check if tensor list has either a boolean tensor or a integer tensor
23
+ inline bool has_integral_tensor(TensorList tensors, const bool includeBool) {
24
+ return std::any_of(
25
+ tensors.begin(), tensors.end(), [&includeBool](const auto& t) {
26
+ return at::isIntegralType(t.scalar_type(), includeBool);
27
+ });
28
+ }
29
+ // check if tensor list has bool tensors
30
+ inline bool has_bool_tensor(TensorList tensors) {
31
+ return std::any_of(tensors.begin(), tensors.end(), [](const auto& t) -> bool {
32
+ return t.scalar_type() == ScalarType::Bool;
33
+ });
34
+ }
35
+
36
+ // Check foreach API restrictions
37
+ // - Tensor lists must be non-empty.
38
+ // - All TensorLists and ScalarLists must have the same number of elements.
39
+ // - Corresponding tensors must have the same size.
40
+ inline void check_foreach_api_restrictions(TensorList tensors) {
41
+ TORCH_CHECK(!tensors.empty(), "Tensor list must have at least one tensor.");
42
+ }
43
+
44
+ inline void check_foreach_api_restrictions(
45
+ TensorList tensors,
46
+ ArrayRef<Scalar> scalars) {
47
+ check_foreach_api_restrictions(tensors);
48
+ TORCH_CHECK(
49
+ tensors.size() == scalars.size(),
50
+ "Tensor list must have same number of elements as scalar list.");
51
+ }
52
+
53
+ inline void check_foreach_api_restrictions(
54
+ TensorList tensors1,
55
+ TensorList tensors2) {
56
+ TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
57
+ TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
58
+ TORCH_CHECK(
59
+ tensors1.size() == tensors2.size(),
60
+ "Tensor lists must have the same number of tensors, got ",
61
+ tensors1.size(),
62
+ " and ",
63
+ tensors2.size());
64
+ }
65
+
66
+ inline void check_foreach_api_restrictions(
67
+ TensorList tensors1,
68
+ TensorList tensors2,
69
+ TensorList tensors3) {
70
+ TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
71
+ TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
72
+ TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor.");
73
+ TORCH_CHECK(
74
+ tensors1.size() == tensors2.size(),
75
+ "Tensor lists must have the same number of tensors, got ",
76
+ tensors1.size(),
77
+ " and ",
78
+ tensors2.size());
79
+ TORCH_CHECK(
80
+ tensors1.size() == tensors3.size(),
81
+ "Tensor lists must have the same number of tensors, got ",
82
+ tensors1.size(),
83
+ " and ",
84
+ tensors3.size());
85
+ }
86
+
87
+ inline void check_foreach_api_restrictions(
88
+ TensorList tensors1,
89
+ TensorList tensors2,
90
+ TensorList tensors3,
91
+ ArrayRef<Scalar> scalars) {
92
+ check_foreach_api_restrictions(tensors1, tensors2, tensors3);
93
+ TORCH_CHECK(
94
+ tensors1.size() == scalars.size(),
95
+ "Tensor list must have same number of elements as scalar list, got ",
96
+ tensors1.size(),
97
+ " and ",
98
+ scalars.size());
99
+ }
100
+
101
+ // Helper function called in check_fast_path_restrictions to check whether all
102
+ // corresponding tensors (aligning in index across the tensorLists) share the
103
+ // same device and dtype.
104
+ inline bool _check_tensors_share_device_and_dtype(
105
+ ArrayRef<TensorList> tensorLists) {
106
+ const auto expected_dtype = tensorLists[0][0].dtype();
107
+ const auto expected_device = tensorLists[0][0].device();
108
+
109
+ auto is_tensor_okay = [&](const Tensor& tensor) {
110
+ return tensor.dtype() == expected_dtype &&
111
+ tensor.device() == expected_device && tensor.layout() == at::kStrided &&
112
+ tensor.is_non_overlapping_and_dense();
113
+ };
114
+
115
+ for (const auto& tensorList : tensorLists) {
116
+ for (const auto& tensor : tensorList) {
117
+ if (!is_tensor_okay(tensor)) {
118
+ return false;
119
+ }
120
+ }
121
+ }
122
+
123
+ return true;
124
+ }
125
+
126
+ // Helper function called in check_fast_path_restrictions to check if
127
+ // corresponding tensors in tensor lists have the same sizes and strides.
128
+ inline bool _check_tensors_share_sizes_and_strides(
129
+ ArrayRef<TensorList> tensorLists) {
130
+ for (const auto i : c10::irange(1, tensorLists.size())) {
131
+ for (const auto j : c10::irange(tensorLists[0].size())) {
132
+ if (tensorLists[0][j].sizes() != tensorLists[i][j].sizes() ||
133
+ tensorLists[0][j].strides() != tensorLists[i][j].strides()) {
134
+ return false;
135
+ }
136
+ }
137
+ }
138
+
139
+ return true;
140
+ }
141
+
142
+ // Helper function called in check_fast_path_restrictions to check whether
143
+ // all tensors type promote properly with the scalars in scalarList. This
144
+ // function assumes that _check_tensors_share_device_and_dtype has already been
145
+ // called so that all corresponding tensors in tensorLists have the same dtype.
146
+ // Then, it is sufficient to check the type promotion with just one tensorList.
147
+ inline bool _check_tensors_do_type_promotion_with_scalars(
148
+ TensorList tensorList,
149
+ ArrayRef<Scalar> scalarList = {},
150
+ bool does_op_promote_integer_inputs_to_float = false) {
151
+ for (const auto i : c10::irange(tensorList.size())) {
152
+ // For division, integer inputs will result in float.
153
+ if (does_op_promote_integer_inputs_to_float) {
154
+ if (at::isIntegralType(
155
+ tensorList[i].scalar_type(), /*includeBool*/ true)) {
156
+ return false;
157
+ }
158
+ }
159
+ if (!scalarList.empty()) {
160
+ const auto& scalar =
161
+ scalarList.size() == 1 ? scalarList[0] : scalarList[i];
162
+ const auto& tensor = tensorList[i];
163
+ // note(mkozuki): This check might be responsible for
164
+ // `_foreach_add(bool_tensors, bool_tensors)` being pushed to slow path.
165
+ if (tensor.scalar_type() != at::native::result_type(scalar, tensor)) {
166
+ return false;
167
+ }
168
+ }
169
+ }
170
+
171
+ return true;
172
+ }
173
+
174
+ // To go via 'fast' path, several conditions must be satisfied
175
+ // - All tensors in all lists must have the same dtype.
176
+ // - All tensors must be on the same device
177
+ // - All tensors must have strided layout
178
+ // - All tensors must be non-overlapping and dense
179
+ // - Resulting tensor must have the same dtype as the input one
180
+
181
+ // Please, make sure to call check_foreach_api_restrictions before calling this
182
+ // method. There is a set of preconditions that have to be satisfied.
183
+ inline bool check_fast_path_restrictions(
184
+ ArrayRef<TensorList> tensorLists,
185
+ ArrayRef<Scalar> scalarList = {},
186
+ bool does_op_promote_integer_inputs_to_float = false) {
187
+ return _check_tensors_share_device_and_dtype(tensorLists) &&
188
+ _check_tensors_share_sizes_and_strides(tensorLists) &&
189
+ _check_tensors_do_type_promotion_with_scalars(
190
+ tensorLists[0],
191
+ scalarList,
192
+ does_op_promote_integer_inputs_to_float);
193
+ }
194
+
195
// Unpacks a 1-D CPU tensor of scalars into a std::vector<c10::Scalar>.
// Requirements (enforced via TORCH_CHECK): the tensor is on CPU, contiguous,
// 1-dimensional, and has exactly `expect_length` elements. Dispatches over
// all dtypes (incl. half/bfloat16/bool/complex-half) to read the raw data.
inline std::vector<c10::Scalar> convert_tensor_to_scalar_list(
    const Tensor& scalarList_,
    int64_t expect_length) {
  std::vector<c10::Scalar> scalarList;
  TORCH_CHECK(
      scalarList_.device() == c10::kCPU,
      "Expected scalars to be on CPU, got ",
      scalarList_.device(),
      " instead.");
  TORCH_CHECK(
      scalarList_.is_contiguous(), "Expected scalars to be contiguous.");
  TORCH_CHECK(
      scalarList_.dim() == 1,
      "Expected packed scalar Tensor to be of dimension 1. Got ",
      scalarList_.dim(),
      " instead.");
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
      kComplexHalf,
      kHalf,
      kBool,
      kBFloat16,
      scalarList_.scalar_type(),
      "convert_tensor_to_scalar_list",
      [&]() {
        const scalar_t* scalar_data = scalarList_.data_ptr<scalar_t>();
        // Length check lives inside the dispatch so failures report after
        // dtype validation succeeds.
        TORCH_CHECK(
            (expect_length == scalarList_.size(0)),
            "Expected length of scalars to match input of length ",
            expect_length,
            " but got ",
            scalarList_.size(0),
            " instead.");
        for (int64_t i = 0; i < scalarList_.size(0); i++) {
          scalarList.emplace_back(scalar_data[i]);
        }
      });
  return scalarList;
}
233
+
234
// Thin public entry point: a foreach op may take the fused fast path iff
// check_fast_path_restrictions passes for the given lists/scalars.
inline bool can_use_fast_route(
    ArrayRef<TensorList> tensorLists,
    ArrayRef<Scalar> scalarList = {},
    bool does_op_promote_integer_inputs_to_float = false) {
  return check_fast_path_restrictions(
      tensorLists, scalarList, does_op_promote_integer_inputs_to_float);
}
241
+
242
// Convenience overload for binary foreach ops: wraps the two lists and
// forwards with an empty scalar list.
inline bool can_use_fast_route(
    TensorList tensors1,
    TensorList tensors2,
    bool does_op_promote_integer_inputs_to_float = false) {
  return can_use_fast_route(
      {tensors1, tensors2}, {}, does_op_promote_integer_inputs_to_float);
}
249
+
250
// Aliases used by _group_tensors_by_first_tensors_device_and_dtype below.
// Bucket key: (device, dtype) of a tensor.
using DeviceDtypeKey = std::pair<at::Device, at::ScalarType>;
// Original positions of the tensors placed into a bucket.
using IndicesT = std::vector<size_t>;
// Outer vector: one entry per input list; inner vector: optional tensors.
using nested_optional_tensorvec_t =
    std::vector<std::vector<c10::optional<at::Tensor>>>;
using TensorsAndIndicesT = std::pair<nested_optional_tensorvec_t, IndicesT>;
using FlatMap = std::unordered_map<
    DeviceDtypeKey,
    TensorsAndIndicesT,
    ParamsHash<DeviceDtypeKey>>;
259
+
260
// Buckets the tensors of `nested_tensorlist` by the (device, dtype) of the
// corresponding tensor in the FIRST inner list. Inner lists must either be
// empty or have the same length as the first list. When `with_indices` is
// true, each bucket also records the original tensor indices.
// Tensors in non-first lists may deviate from the key's dtype/device only in
// the float32/float64 "step"-tensor cases checked below.
inline FlatMap _group_tensors_by_first_tensors_device_and_dtype(
    const nested_optional_tensorvec_t& nested_tensorlist,
    const bool with_indices) {
  FlatMap grouped_tensors_with_indices;

  TORCH_CHECK(!nested_tensorlist.empty());
  TORCH_CHECK(!nested_tensorlist[0].empty());
  const auto num_lists = nested_tensorlist.size();
  const auto num_tensors = nested_tensorlist[0].size();

  TORCH_CHECK(std::all_of(
      nested_tensorlist.cbegin(),
      nested_tensorlist.cend(),
      [&](const auto& tensorlist) -> bool {
        // note(crcrpar): Allow empty tensorlists following
        // ref:
        // https://github.com/pytorch/pytorch/blob/85885301fd3c6adb8b9dc3cf7afadf6945566684/torch/utils/_foreach_utils.py#L21-L24
        return tensorlist.size() == num_tensors || tensorlist.size() == 0;
      }));

  for (const auto& tensor_index : c10::irange(num_tensors)) {
    // The grouping key comes from the first list's tensor at this index,
    // which therefore must be defined.
    const auto key = [&]() -> DeviceDtypeKey {
      const auto t = nested_tensorlist[0][tensor_index];
      TORCH_CHECK(
          t.has_value(),
          "Tensors of the first list of nested Tensor lists are supposed to be defined but ",
          "the ",
          tensor_index,
          "-th Tensor is not.");
      return {t->device(), t->scalar_type()};
    }();
    TORCH_CHECK(
        std::all_of(
            nested_tensorlist.cbegin(),
            nested_tensorlist.cend(),
            [&](const auto& tensorlist) -> bool {
              if (tensorlist.size() == 0) {
                return true;
              }
              const auto& tensor = tensorlist[tensor_index];
              // note(crcrpar): Currently the scope of this function is
              // optimizers so there could be `state_steps` and other scalars
              // whose elements are float tensors no matter what the parameter's
              // dtype is.
              if (!tensor.has_value()) {
                return true;
              } else {
                const auto s = tensor->scalar_type();
                const auto d = tensor->device();
                // Note: `step` or `state_step` is float32 by default.
                if (key.first == d) {
                  return key.second == s || s == at::ScalarType::Float ||
                      s == at::ScalarType::Double;
                } else if (d.is_cpu()) {
                  // note(crcrpar): There are some test cases (e.g.
                  // TestOptim::test_adam) where state_steps are on CPU and the
                  // others are on CUDA. Currently a state_step Tensor has the
                  // dtype of float.
                  return s == at::ScalarType::Float ||
                      s == at::ScalarType::Double;
                } else {
                  return false;
                }
              }
            }),
        "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding");
    if (!grouped_tensors_with_indices.count(key)) {
      // First occurrence of this (device, dtype): create the bucket with one
      // (possibly empty) inner vector per input list, reserving up front.
      grouped_tensors_with_indices.insert(
          {key,
           TensorsAndIndicesT{
               [&]() -> nested_optional_tensorvec_t {
                 nested_optional_tensorvec_t nested_tensorvec;
                 nested_tensorvec.reserve(num_lists);
                 for (const auto& i : c10::irange(num_lists)) {
                   std::vector<c10::optional<at::Tensor>> tensors;
                   if (!nested_tensorlist[i].empty()) {
                     // NB: num_tensors is the max possible length for any of
                     // the inner lists of tensor references. Reserving the max
                     // trades memory for perf. This should not have significant
                     // impact.
                     tensors.reserve(num_tensors);
                   }
                   nested_tensorvec.emplace_back(tensors);
                 }
                 return nested_tensorvec;
               }(),
               [&]() -> IndicesT {
                 if (!with_indices) {
                   return {};
                 } else {
                   IndicesT indices;
                   indices.reserve(num_tensors);
                   return indices;
                 }
               }()}});
    }
    // Append this index's tensor from every non-empty list into the bucket.
    for (const auto& list_index : c10::irange(num_lists)) {
      if (!nested_tensorlist[list_index].empty()) {
        grouped_tensors_with_indices[key].first[list_index].emplace_back(
            nested_tensorlist[list_index][tensor_index]);
      }
    }
    if (with_indices) {
      grouped_tensors_with_indices[key].second.emplace_back(tensor_index);
    }
  }

  return grouped_tensors_with_indices;
}
369
+
370
+ } // namespace
371
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebra.h ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+ #include <c10/util/Optional.h>
5
+
6
+ namespace c10 {
7
+ class Scalar;
8
+ }
9
+
10
+ namespace at {
11
+ struct TensorIterator;
12
+ }
13
+
14
+ namespace at::native {
15
+
16
+ using addr_fn = void (*)(TensorIterator &, const Scalar& beta, const Scalar& alpha);
17
+ DECLARE_DISPATCH(addr_fn, addr_stub);
18
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SortingUtils.h ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/NumericUtils.h>
4
+ #include <ATen/native/Resize.h>
5
+ #include <c10/util/irange.h>
6
+
7
+ #ifndef AT_PER_OPERATOR_HEADERS
8
+ #include <ATen/Functions.h>
9
+ #else
10
+ #include <ATen/ops/empty.h>
11
+ #endif
12
+
13
+ namespace at::native {
14
+
15
+ // ensure we get good values and indices for kthvalue, mode
16
+ // this will always be with the reducing dim as 1-d
17
// ensure we get good values and indices for kthvalue, mode
// this will always be with the reducing dim as 1-d
// Allocates `values`/`indices` if undefined, otherwise validates and resizes
// them to self's shape with size 1 along the (wrapped) reduction dim.
inline void _reduction_with_indices_allocate_or_resize_output(
    Tensor& values,
    Tensor& indices,
    const Tensor& self,
    int64_t dim_,
    bool keepdim) {
  int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
  auto result_sizes = self.sizes().vec();
  if (!result_sizes.empty()) {
    result_sizes[dim] = 1;
  }
  if (values.defined()) {
    TORCH_CHECK(
        self.options().type_equal(values.options()),
        "output values must be of same type as input");
    if (!keepdim && values.dim() == self.dim() - 1) {
      // unsqueeze to preserve passed in noncontiguous tensor in resize
      values.unsqueeze_(dim);
    }
    resize_output(values, result_sizes);
  } else {
    values = at::empty(result_sizes, self.options());
  }
  if (indices.defined()) {
    TORCH_CHECK(
        indices.dtype() == kLong, "output indices must be of scalar type Long");
    TORCH_CHECK(
        indices.device() == self.device(),
        "output indices must be on same device as input");
    if (!keepdim && indices.dim() == self.dim() - 1) {
      // unsqueeze to preserve passed in noncontiguous tensor in resize
      indices.unsqueeze_(dim);
    }
    resize_output(indices, result_sizes);
  } else {
    indices = at::empty(result_sizes, self.options().dtype(kLong));
  }
}
55
+
56
+ // ensure we get good values and indices for topk
57
+ inline void _allocate_or_resize_output_with_indices(
58
+ Tensor& values,
59
+ Tensor& indices,
60
+ const Tensor& self,
61
+ int64_t dim_,
62
+ int64_t k) {
63
+ int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
64
+ auto result_sizes = self.sizes().vec();
65
+ if (!result_sizes.empty()) {
66
+ result_sizes[dim] = k;
67
+ }
68
+ if (values.defined()) {
69
+ TORCH_CHECK(
70
+ self.options().type_equal(values.options()),
71
+ "output values must be of same type as input");
72
+ values.resize_(result_sizes);
73
+ } else {
74
+ values = at::empty(result_sizes, self.options());
75
+ }
76
+ if (indices.defined()) {
77
+ TORCH_CHECK(
78
+ indices.dtype() == kLong, "output indices must be of scalar type Long");
79
+ TORCH_CHECK(
80
+ indices.device() == self.device(),
81
+ "output indices must be on same device as input");
82
+ indices.resize_(result_sizes);
83
+ } else {
84
+ indices = at::empty(result_sizes, self.options().dtype(kLong));
85
+ }
86
+ }
87
+
88
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnaryOps.h ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+ #include <ATen/Generator.h>
5
+ #include <c10/core/Scalar.h>
6
+ #include <stdexcept>
7
+
8
+ namespace at {
9
+ class Tensor;
10
+ class TensorBase;
11
+ struct TensorIteratorBase;
12
+ }
13
+
14
+ namespace at::native {
15
+
16
+ using unary_fn = void(*)(TensorIteratorBase&);
17
+ using unary_fn_with_scalar = void(*)(TensorIteratorBase&, const Scalar& a);
18
+
19
+ inline namespace CPU_CAPABILITY {
20
+ void conj_kernel(TensorIteratorBase &iter);
21
+ void neg_kernel(TensorIteratorBase &iter);
22
+ void reciprocal_kernel(TensorIteratorBase &iter);
23
+ void rsqrt_kernel(TensorIteratorBase& iter);
24
+ void sqrt_kernel(TensorIteratorBase& iter);
25
+ } // namespace CPU_CAPABILITY
26
+
27
+ DECLARE_DISPATCH(unary_fn, abs_stub);
28
+ DECLARE_DISPATCH(unary_fn, angle_stub);
29
+ DECLARE_DISPATCH(unary_fn, conj_physical_stub);
30
+ DECLARE_DISPATCH(unary_fn, acos_stub);
31
+ DECLARE_DISPATCH(unary_fn, acosh_stub);
32
+ DECLARE_DISPATCH(unary_fn, asinh_stub);
33
+ DECLARE_DISPATCH(unary_fn, atanh_stub);
34
+ DECLARE_DISPATCH(unary_fn, asin_stub);
35
+ DECLARE_DISPATCH(unary_fn, atan_stub);
36
+ DECLARE_DISPATCH(unary_fn, bitwise_not_stub);
37
+ DECLARE_DISPATCH(unary_fn, logical_not_stub);
38
+ DECLARE_DISPATCH(unary_fn, ceil_stub);
39
+ DECLARE_DISPATCH(unary_fn, cos_stub);
40
+ DECLARE_DISPATCH(unary_fn, cosh_stub);
41
+ DECLARE_DISPATCH(unary_fn, digamma_stub);
42
+ DECLARE_DISPATCH(unary_fn, special_entr_stub);
43
+ DECLARE_DISPATCH(unary_fn, special_erfcx_stub);
44
+ DECLARE_DISPATCH(unary_fn, erf_stub);
45
+ DECLARE_DISPATCH(unary_fn, erfc_stub);
46
+ DECLARE_DISPATCH(unary_fn, erfinv_stub);
47
+ DECLARE_DISPATCH(unary_fn, exp_stub);
48
+ DECLARE_DISPATCH(unary_fn, exp2_stub);
49
+ DECLARE_DISPATCH(unary_fn, expm1_stub);
50
+ DECLARE_DISPATCH(unary_fn, floor_stub);
51
+ DECLARE_DISPATCH(unary_fn, frac_stub);
52
+ DECLARE_DISPATCH(unary_fn, frexp_stub);
53
+ DECLARE_DISPATCH(unary_fn, i0_stub);
54
+ DECLARE_DISPATCH(unary_fn, special_i0e_stub);
55
+ DECLARE_DISPATCH(unary_fn, special_i1_stub);
56
+ DECLARE_DISPATCH(unary_fn, special_i1e_stub);
57
+ DECLARE_DISPATCH(unary_fn, log_stub);
58
+ DECLARE_DISPATCH(unary_fn, log10_stub);
59
+ DECLARE_DISPATCH(unary_fn, log1p_stub);
60
+ DECLARE_DISPATCH(unary_fn, log2_stub);
61
+ DECLARE_DISPATCH(unary_fn, special_ndtri_stub);
62
+ DECLARE_DISPATCH(unary_fn, special_log_ndtr_stub);
63
+ DECLARE_DISPATCH(unary_fn, neg_stub);
64
+
65
+ DECLARE_DISPATCH(unary_fn, reciprocal_stub);
66
+ DECLARE_DISPATCH(unary_fn, round_stub);
67
+ DECLARE_DISPATCH(unary_fn, rsqrt_stub);
68
+ DECLARE_DISPATCH(unary_fn, sigmoid_stub);
69
+ DECLARE_DISPATCH(unary_fn_with_scalar, logit_stub);
70
+ DECLARE_DISPATCH(unary_fn, sign_stub);
71
+ DECLARE_DISPATCH(unary_fn, signbit_stub);
72
+ DECLARE_DISPATCH(unary_fn, sgn_stub);
73
+ DECLARE_DISPATCH(unary_fn, sin_stub);
74
+ DECLARE_DISPATCH(unary_fn, sinc_stub);
75
+ DECLARE_DISPATCH(unary_fn, sinh_stub);
76
+ DECLARE_DISPATCH(unary_fn, sqrt_stub);
77
+ DECLARE_DISPATCH(unary_fn, tan_stub);
78
+ DECLARE_DISPATCH(unary_fn, tanh_stub);
79
+ DECLARE_DISPATCH(unary_fn, trigamma_stub);
80
+ DECLARE_DISPATCH(unary_fn, trunc_stub);
81
+ DECLARE_DISPATCH(unary_fn, lgamma_stub);
82
+ DECLARE_DISPATCH(unary_fn, special_airy_ai_stub);
83
+ DECLARE_DISPATCH(unary_fn, special_bessel_j0_stub);
84
+ DECLARE_DISPATCH(unary_fn, special_bessel_j1_stub);
85
+ DECLARE_DISPATCH(unary_fn, special_bessel_y0_stub);
86
+ DECLARE_DISPATCH(unary_fn, special_bessel_y1_stub);
87
+ DECLARE_DISPATCH(unary_fn, special_modified_bessel_i0_stub);
88
+ DECLARE_DISPATCH(unary_fn, special_modified_bessel_i1_stub);
89
+ DECLARE_DISPATCH(unary_fn, special_modified_bessel_k0_stub);
90
+ DECLARE_DISPATCH(unary_fn, special_modified_bessel_k1_stub);
91
+ DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k0_stub);
92
+ DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k1_stub);
93
+ DECLARE_DISPATCH(unary_fn, special_spherical_bessel_j0_stub);
94
+
95
+ // NB: these are actually defined in Distribution
96
+ DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, c10::optional<Generator>), bernoulli_tensor_stub);
97
+ DECLARE_DISPATCH(void(*)(const TensorBase&, const double, c10::optional<Generator>), bernoulli_scalar_stub);
98
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), cauchy_stub);
99
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional<Generator>), exponential_stub);
100
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional<Generator>), geometric_stub);
101
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), log_normal_stub);
102
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), uniform_stub);
103
+ DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, c10::optional<Generator>), normal_stub);
104
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional<Generator>), random_from_to_stub);
105
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional<Generator>), random_full_64_bits_range_stub);
106
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional<Generator>), random_stub);
107
+
108
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t, const double), kaiser_window_stub);
109
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t), polygamma_stub);
110
+ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const Scalar& a, const Scalar& b), clamp_stub);
111
+ DECLARE_DISPATCH(
112
+ void (*)(Tensor&, const Tensor&, int64_t, c10::optional<Generator>),
113
+ multinomial_with_replacement_stub);
114
+ DECLARE_DISPATCH(
115
+ void (*)(
116
+ TensorIteratorBase&,
117
+ c10::optional<double>,
118
+ c10::optional<double>,
119
+ c10::optional<double>),
120
+ nan_to_num_stub);
121
+ DECLARE_DISPATCH(void (*)(TensorIteratorBase&, int64_t), round_decimals_stub);
122
+
123
+ // Missing unary functions
124
+ // digamma
125
+ // lgamma
126
+ // erfinv
127
+ // clone
128
+ // contiguous
129
+ // zero
130
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef ATOMIC_ADD_FLOAT
2
+ #define ATOMIC_ADD_FLOAT
3
+
4
+ #if (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))
5
+ #include <ATen/native/cpu/Intrinsics.h>
6
+ #else
7
+ #define _mm_pause()
8
+ #endif
9
+
10
+ #include <atomic>
11
+
12
// Atomically adds `fvalue` to the float at `dst` via a CAS loop on the 32-bit
// object representation.
// Fixes two defects in the previous version: (1) it re-read *dst with a plain
// non-atomic load inside the retry loop, which is a data race under the C++
// memory model for a function intended for concurrent use; (2) it type-punned
// through an inactive union member, which is UB in C++. This version performs
// every read through the std::atomic view (compare_exchange refreshes the
// expected value on failure) and converts bits with std::memcpy.
// NOTE(review): reinterpreting float* as std::atomic<unsigned>* is inherited
// from the original interface; it assumes unsigned and float share size and
// a compatible representation on the supported platforms.
static inline void cpu_atomic_add_float(float* dst, float fvalue)
{
  static_assert(sizeof(unsigned) == sizeof(float),
                "cpu_atomic_add_float requires 32-bit unsigned and float");

  std::atomic<unsigned>* dst_intV = (std::atomic<unsigned>*)(dst);

  // Seed the expected bits with an atomic load; on each failed CAS,
  // compare_exchange_strong stores the freshly observed bits back into
  // old_bits, so no separate re-read is needed.
  unsigned old_bits = dst_intV->load();
  while (true) {
    float old_float;
    std::memcpy(&old_float, &old_bits, sizeof(old_float));
    const float new_float = old_float + fvalue;
    unsigned new_bits;
    std::memcpy(&new_bits, &new_float, sizeof(new_bits));
    if (std::atomic_compare_exchange_strong(dst_intV, &old_bits, new_bits)) {
      return;
    }
    // Back off briefly before retrying under contention.
#ifdef __aarch64__
    __asm__ __volatile__("yield;" : : : "memory");
#else
    _mm_pause();
#endif
  }
}
36
+
37
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CatKernel.h ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+ #include <ATen/core/IListRef.h>
6
+
7
+ namespace at { namespace native {
8
+
9
+ using cat_serial_fn = void(*)(const Tensor &, const MaterializedITensorListRef&, int64_t);
10
+ DECLARE_DISPATCH(cat_serial_fn, cat_serial_stub);
11
+
12
+ }} // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/native/DispatchStub.h>
3
+ #include <cstdint>
4
+
5
+ namespace at {
6
+ class TensorBase;
7
+ }
8
+
9
+ namespace at { namespace native {
10
+
11
+ using channel_shuffle_fn = void(*)(TensorBase&, const TensorBase&, int64_t);
12
+ DECLARE_DISPATCH(channel_shuffle_fn, channel_shuffle_kernel);
13
+
14
+ }} // at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+ #include <c10/util/ArrayRef.h>
5
+
6
+ /*
7
+ Depthwise 3x3 Winograd convolution operator
8
+ */
9
+
10
+ namespace at {
11
+ class Tensor;
12
+
13
+ namespace native {
14
+
15
+ using convolution_depthwise3x3_winograd_fn =
16
+ Tensor (*)(const Tensor &, const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, int64_t);
17
+
18
+ DECLARE_DISPATCH(convolution_depthwise3x3_winograd_fn, convolution_depthwise3x3_winograd_stub);
19
+
20
+ } // namespace native
21
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Intrinsics.h ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
4
+ /* Clang-compatible compiler, targeting x86/x86-64 */
5
+ #include <x86intrin.h>
6
+ #elif defined(_MSC_VER)
7
+ /* Microsoft C/C++-compatible compiler */
8
+ #include <intrin.h>
9
+ #if _MSC_VER <= 1900
10
+ #define _mm256_extract_epi64(X, Y) (((uint64_t*)&X)[Y])
11
+ #endif
12
+ #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
13
+ /* GCC-compatible compiler, targeting x86/x86-64 */
14
+ #include <x86intrin.h>
15
+ #elif defined(__GNUC__) && defined(__ARM_NEON__)
16
+ /* GCC-compatible compiler, targeting ARM with NEON */
17
+ #include <arm_neon.h>
18
+ #elif defined(__GNUC__) && defined(__IWMMXT__)
19
+ /* GCC-compatible compiler, targeting ARM with WMMX */
20
+ #include <mmintrin.h>
21
+ #elif (defined(__GNUC__) || defined(__xlC__)) && \
22
+ (defined(__VEC__) || defined(__ALTIVEC__))
23
+ /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
24
+ #include <altivec.h>
25
+ /* We need to undef those tokens defined by <altivec.h> to avoid conflicts
26
+ with the C++ types. => Can still use __bool/__vector */
27
+ #undef bool
28
+ #undef vector
29
+ #undef pixel
30
+ #elif defined(__GNUC__) && defined(__SPE__)
31
+ /* GCC-compatible compiler, targeting PowerPC with SPE */
32
+ #include <spe.h>
33
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Loops.h ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // This file provides two functions to help write elementwise kernels:
4
+ //
5
+ // cpu_kernel(TensorIterator iter, <lambda>)
6
+ // cpu_kernel_vec(TensorIterator iter, <lambda>, <vec_lambda>)
7
+ //
8
+ // Both functions may generate vectorized code. The cpu_kernel implementation
9
+ // relies on the compiler's auto-vectorization. The cpu_kernel_vec
10
+ // implementation uses x86 SIMD intrinsics when available. These functions
11
+ // are only intended to be used in the ATen/native/cpu subdirectory, since files
12
+ // in other directories are not compiled with AVX/AVX2 enabled. See README.md
13
+ // for more details.
14
+ //
15
+ // For example, to write a multiplication kernel for float:
16
+ //
17
+ // cpu_kernel(iter, [](float a, float b) { return a * b; });
18
+ //
19
+ // Or you may write:
20
+ //
21
+ // cpu_kernel_vec(iter,
22
+ // [](float a, float b) { return a * b; },
23
+ // [](Vectorized<float> a, Vectorized<float> b) { return a * b; });
24
+ //
25
+ // See BinaryOpsKernel.cpp for the complete implementation
26
+ //
27
+ //
28
+
29
+ #include <stdint.h>
30
+ #include <c10/util/C++17.h>
31
+ #include <c10/util/Load.h>
32
+ #include <c10/util/irange.h>
33
+ #include <ATen/detail/FunctionTraits.h>
34
+ #include <ATen/native/cpu/IsContiguous.h>
35
+ #include <ATen/native/TensorIterator.h>
36
+ #include <ATen/native/TensorIteratorDynamicCasting.h>
37
+ #include <ATen/cpu/vec/vec.h>
38
+
39
+ #include <utility>
40
+
41
+ namespace at { namespace native { inline namespace CPU_CAPABILITY {
42
+
43
+ using namespace vec;
44
+
45
// Loads one scalar argument per input: for the INDEX-th input, reads a value
// of the op's INDEX-th parameter type from data[INDEX] + i * strides[INDEX]
// (through c10::load) and packs them all into traits::ArgsTuple.
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
                 std::index_sequence<INDEX...>) {
  return std::make_tuple(
      c10::load<typename traits::template arg<INDEX>::type>(
          data[INDEX] + i * strides[INDEX])...);
}
53
+
54
// Convenience wrapper over dereference_impl: generates the index sequence
// 0..arity-1 from the op's function traits.
template <typename traits>
typename traits::ArgsTuple
dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) {
  using Indices = std::make_index_sequence<traits::arity>;
  return dereference_impl<traits>(data, strides, i, Indices{});
}
60
+
61
// Vectorized analogue of dereference_impl: loads one Vec per input with
// Vec::loadu, except that the input at 1-based position S is replaced by the
// pre-broadcast scalar vector `opt_scalar` (S == 0 means no scalar input).
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_vec_impl(char* C10_RESTRICT data[],
                     const typename traits::result_type& opt_scalar,
                     size_t S,
                     int64_t i,
                     std::index_sequence<INDEX...>) {
  using Vec = typename traits::result_type;
  using scalar_t = typename Vec::value_type;
  return std::make_tuple(
      S == INDEX + 1 ?
      opt_scalar :
      Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...);
}
75
+
76
// Convenience wrapper over dereference_vec_impl: generates the index
// sequence 0..arity-1 from the vectorized op's function traits.
template <typename traits>
typename traits::ArgsTuple
dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) {
  using Indices = std::make_index_sequence<traits::arity>;
  return dereference_vec_impl<traits>(data, opt_scalar, S, i, Indices{});
}
82
+
83
// Scalar loop for ops that return a value: data[0]/strides[0] describe the
// output tensor, data[1..arity]/strides[1..arity] the inputs. Selected only
// when the op's result type is non-void.
template <typename func_t,
  typename std::enable_if<!std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
static inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  using result_type = typename traits::result_type;
  for (; i < n; i++) {
    result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
    *out_ptr = c10::guts::apply(std::forward<func_t>(op), dereference<traits>(
        &data[1],
        &strides[1],
        i));
  }
}
97
+
98
// Scalar loop for ops with a void result: all tensors, including any the op
// writes through, are passed straight to the lambda starting at data[0].
template <typename func_t,
  typename std::enable_if<std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
static inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  for (; i < n; i++) {
    c10::guts::apply(std::forward<func_t>(op), dereference<traits>(
        &data[0],
        &strides[0],
        i));
  }
}
110
+
111
+ // Basic loop operation (one output, N inputs). May be auto-vectorized
112
+ // by the compiler. Supports inputs and outputs of different types.
113
// Basic loop operation (one output, N inputs). May be auto-vectorized
// by the compiler. Supports inputs and outputs of different types.
template <typename func_t>
static inline void
basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  constexpr int ntensors = traits::arity + 1;

  // Copying strides to temporary array helps auto vectorization in older GCC
  // versions.
  int64_t strides[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    strides[arg] = strides_[arg];
  }

  // Dispatches to the void / non-void execute_op overload via SFINAE.
  execute_op(data, strides, i, n, std::forward<func_t>(op));
}
128
+
129
+ // the recursive variadic template for iterating over the returned tuple
130
// the recursive variadic template for iterating over the returned tuple
// Writes tuple element N-1 to output tensor N-1 after recursing on the
// first N-1 elements, so outputs are stored in tuple order.
template<class T, size_t N>
struct TupleOutput {
  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
                     const T &tuple) {
    TupleOutput<T, N - 1>::handle(data, strides, i, tuple);

    auto output = std::get<N - 1>(tuple);
    using output_type = decltype(output);
    output_type * out_ptr = (output_type *)(data[N - 1] + i * strides[N - 1]);
    *out_ptr = output;
  }
};
142
+
143
+ // Base case for the above recursive template
144
// Base case: writes tuple element 0 to output tensor 0, ending the recursion.
template<class T>
struct TupleOutput<T, 1> {
  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
                     const T &tuple) {
    auto output = std::get<0>(tuple);
    using output_type = decltype(output);
    output_type* out_ptr = (output_type *)(data[0] + i * strides[0]);
    *out_ptr = output;
  }
};
154
+
155
// Entry point for scattering a tuple of results into the output tensors;
// delegates to the TupleOutput recursion sized by the tuple arity.
template<class... Args>
void handle_tuple_outputs(char* C10_RESTRICT data[],
                          const int64_t* strides,
                          int64_t i,
                          const std::tuple<Args...> &tuple) {
  TupleOutput<decltype(tuple), sizeof...(Args)>::handle(data, strides, i, tuple);
}
162
+
163
+ // Loop operation for `cpu_kernel_multiple_outputs`.
164
+ // 1. Use `c10::guts::apply` to make dynamic method invocation
165
+ // for the lambda passed in `cpu_kernel_multiple_outputs`.
166
+ // 2. Iterate over the members of the returned tuple, set the corresponding
167
+ // output tensor by the tuple member in `handle_tuple_outputs` function.
168
// Loop operation for `cpu_kernel_multiple_outputs`.
// 1. Use `c10::guts::apply` to make dynamic method invocation
//    for the lambda passed in `cpu_kernel_multiple_outputs`.
// 2. Iterate over the members of the returned tuple, set the corresponding
//    output tensor by the tuple member in `handle_tuple_outputs` function.
// Layout convention: data[0..num_outputs-1] are outputs, the rest inputs.
template <typename func_t>
static inline void
multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;

  using result_type = typename traits::result_type;
  constexpr int num_outputs = std::tuple_size<result_type>::value;
  constexpr int ntensors = traits::arity + num_outputs;

  // Copying strides to temporary array helps auto vectorization in older GCC
  // versions.
  int64_t strides[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    strides[arg] = strides_[arg];
  }

  for (; i < n; i++) {
    auto output = c10::guts::apply(op, dereference<traits>(
        &data[num_outputs],
        &strides[num_outputs],
        i));
    handle_tuple_outputs(data, strides, i, output);
  }
}
192
+
193
+ // Explicitly vectorized loop implementation. All inputs and outputs must be
194
+ // the same type and contiguous with one exception: a single input may be
195
+ // a scalar (stride 0). It's position is indicated by the argument `S`. If `S`
196
+ // is 0, then there are no scalar inputs.
197
+ template <typename func_t, typename vec_func_t>
198
+ static inline void
199
+ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) {
200
+ using traits = function_traits<vec_func_t>;
201
+ using scalar_t = typename function_traits<func_t>::result_type;
202
+ using Vec = Vectorized<scalar_t>;
203
+ constexpr int ntensors = traits::arity + 1;
204
+
205
+ char* C10_RESTRICT data[ntensors];
206
+ for (const auto arg : c10::irange(ntensors)) {
207
+ data[arg] = data_[arg];
208
+ }
209
+
210
+ Vec opt_scalar = Vec(S > 0 ? *(scalar_t*)data[S] : scalar_t(0));
211
+ int64_t i = 0;
212
+ for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
213
+ auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
214
+ auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
215
+ auto out1 = c10::guts::apply(std::forward<vec_func_t>(vop), std::move(args1));
216
+ auto out2 = c10::guts::apply(std::forward<vec_func_t>(vop), std::move(args2));
217
+ out1.store(data[0] + i * sizeof(scalar_t));
218
+ out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
219
+ }
220
+ if (i < n) {
221
+ int64_t strides[ntensors];
222
+ for (const auto arg : c10::irange(ntensors)) {
223
+ strides[arg] = (S > 0 && arg == S) ? 0 : sizeof(scalar_t);
224
+ }
225
+ basic_loop(data, strides, i, n, std::forward<func_t>(op));
226
+ }
227
+ }
228
+
229
+
230
+ template <typename traits, typename cb_t>
231
+ static inline void unroll_contiguous_scalar_checks(
232
+ const int64_t* /*strides*/,
233
+ std::index_sequence<>,
234
+ cb_t&& cb) {
235
+ cb(0);
236
+ }
237
+
238
+ template <typename traits, typename cb_t, size_t INDEX0, size_t ...INDEX>
239
+ static inline void unroll_contiguous_scalar_checks(
240
+ const int64_t* strides,
241
+ std::index_sequence<INDEX0, INDEX...>,
242
+ cb_t&& cb) {
243
+ if (is_contiguous_scalar<traits, INDEX0 + 1>(strides)) {
244
+ cb(INDEX0 + 1);
245
+ } else {
246
+ unroll_contiguous_scalar_checks<traits>(strides, std::index_sequence<INDEX...>{}, std::forward<cb_t>(cb));
247
+ }
248
+ }
249
+
250
+ template <typename op_t, typename vop_t>
251
+ struct VectorizedLoop2d {
252
+ op_t op;
253
+ vop_t vop;
254
+
255
+ using traits = function_traits<op_t>;
256
+ static constexpr int ntensors = traits::arity + 1;
257
+ using data_t = std::array<char*, ntensors>;
258
+
259
+ VectorizedLoop2d(const op_t &op, vop_t vop):
260
+ op(op), vop(std::move(vop)) {}
261
+
262
+ static void advance(data_t &data, const int64_t *outer_strides) {
263
+ for (const auto arg : c10::irange(data.size())) {
264
+ data[arg] += outer_strides[arg];
265
+ }
266
+ }
267
+
268
+ void operator()(char** base, const int64_t *strides, int64_t size0, int64_t size1) {
269
+ data_t data;
270
+ std::copy_n(base, ntensors, data.data());
271
+ const int64_t *outer_strides = &strides[ntensors];
272
+
273
+ if (is_contiguous<traits>(strides)) {
274
+ for (const auto i C10_UNUSED : c10::irange(size1)) {
275
+ vectorized_loop(data.data(), size0, 0, op, vop);
276
+ advance(data, outer_strides);
277
+ }
278
+ } else {
279
+ using Indices = std::make_index_sequence<traits::arity>;
280
+ unroll_contiguous_scalar_checks<traits>(strides, Indices{}, [&](size_t idx) {
281
+ if (idx) {
282
+ for (const auto i C10_UNUSED : c10::irange(size1)) {
283
+ vectorized_loop(data.data(), size0, idx, op, vop);
284
+ advance(data, outer_strides);
285
+ }
286
+ } else {
287
+ for (const auto i C10_UNUSED : c10::irange(size1)) {
288
+ basic_loop(data.data(), strides, 0, size0, op);
289
+ advance(data, outer_strides);
290
+ }
291
+ }
292
+ });
293
+ }
294
+ }
295
+ };
296
+
297
+ template <typename op_t, typename vop_t>
298
+ VectorizedLoop2d<op_t, vop_t> make_vectorized_loop2d(
299
+ const op_t &op, const vop_t &vop) {
300
+ return VectorizedLoop2d<op_t, vop_t>(op, vop);
301
+ }
302
+
303
+ template <typename func_t>
304
+ void cpu_kernel(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
305
+ using traits = function_traits<func_t>;
306
+ // this could be extended to work with void return types
307
+ TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
308
+ TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
309
+ // dynamic casting not currently supported on CPU
310
+ TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
311
+
312
+ iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
313
+ // basic loop can handle 1d slices with arbitrary strides, and 1d slices is all that
314
+ // iter.for_each is ever sending to the loop lambda
315
+ basic_loop(data, strides, 0, n, std::forward<func_t>(op));
316
+ }, grain_size);
317
+ iter.cast_outputs();
318
+ }
319
+
320
+ // This function helps write elementwise kernels that requires multiple outputs.
321
+ // It follows the similar structure of cpu_kernel.
322
+ // Instead of `basic_loop` function, a new `multiple_outputs_loop` function is
323
+ // manipulated to handle multiple return values.
324
+ // For now `needs_dynamic_casting` check is not added as the passed lambda (`func_t`)
325
+ // of `multiple_outputs_loop` returns `std::tuple` instead of `scalar_t`.
326
+ // The `gpu_kernel_multiple_outputs` is also implemented without this check,
327
+ // We could extend `needs_dynamic_casting` to support both `std::tuple` and
328
+ // `thrust::tuple` in the future.
329
+ template <typename func_t>
330
+ void cpu_kernel_multiple_outputs(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
331
+ using traits = function_traits<func_t>;
332
+ TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
333
+
334
+ iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
335
+ multiple_outputs_loop(data, strides, 0, n, std::forward<func_t>(op));
336
+ }, grain_size);
337
+ iter.cast_outputs();
338
+ }
339
+
340
+ template <bool check_dynamic_cast=true, typename func_t, typename vec_func_t>
341
+ void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, int64_t grain_size = at::internal::GRAIN_SIZE) {
342
+ using traits = function_traits<func_t>;
343
+ // this could be extended to work with void return types
344
+ TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
345
+ TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
346
+ // dynamic casting not currently supported on CPU, but some kernels (like Fill)
347
+ // explicitly dynamic_cast, so we give the opt-out of checking.
348
+ if constexpr (check_dynamic_cast) {
349
+ TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
350
+ }
351
+
352
+ iter.for_each(make_vectorized_loop2d(op, vop), grain_size);
353
+ iter.cast_outputs();
354
+ }
355
+
356
+ template <typename func_t>
357
+ void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) {
358
+ using traits = function_traits<func_t>;
359
+ constexpr bool result_void = std::is_void<typename traits::result_type>::value;
360
+ TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity &&
361
+ ((result_void && iter.noutputs() == 0) || (!result_void && iter.noutputs() == 1)));
362
+ // dynamic casting not currently supported on CPU
363
+ TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
364
+
365
+ iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) {
366
+ basic_loop(data, strides, 0, n, std::forward<func_t>(op));
367
+ }, range);
368
+ iter.cast_outputs();
369
+ }
370
+
371
+ template <typename func_t>
372
+ void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op) {
373
+ cpu_serial_kernel(iter, op, {0, iter.numel()});
374
+ }
375
+
376
+ template <typename func_t, typename vec_func_t>
377
+ void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) {
378
+ using traits = function_traits<func_t>;
379
+ // this could be extended to work with void return types
380
+ TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
381
+ TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
382
+ // dynamic casting not currently supported on CPU
383
+ TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
384
+
385
+ iter.serial_for_each(make_vectorized_loop2d(op, vop), range);
386
+ iter.cast_outputs();
387
+ }
388
+
389
+ template <typename func_t, typename vec_func_t>
390
+ void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) {
391
+ cpu_serial_kernel_vec(iter, op, vop, {0, iter.numel()});
392
+ }
393
+
394
+ }}} // namespace at::native::<anonymous>
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/native/DispatchStub.h>
3
+
4
+ namespace at {
5
+ class Tensor;
6
+
7
+ namespace native {
8
+
9
+ using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&);
10
+
11
+ DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel);
12
+ DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel);
13
+
14
+ }} // at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/Parallel.h>
4
+ #include <ATen/NumericUtils.h>
5
+ #include <ATen/cpu/vec/vec.h>
6
+ #include <ATen/cpu/vec/functional.h>
7
+ #include <ATen/native/ReductionType.h>
8
+ #include <c10/util/irange.h>
9
+ #include <ATen/OpMathType.h>
10
+ #include <ATen/native/cpu/utils.h>
11
+ #include <ATen/OpMathType.h>
12
+
13
+ namespace at::native {
14
+ inline namespace CPU_CAPABILITY {
15
+
16
+ using namespace vec;
17
+
18
+ #define AT_DISPATCH_REDUCTION_TYPES(op, ...) \
19
+ [&] { \
20
+ switch (op) { \
21
+ case ReductionType::SUM: { \
22
+ static constexpr auto reduce = ReductionType::SUM; \
23
+ return __VA_ARGS__(); \
24
+ } \
25
+ case ReductionType::MEAN: { \
26
+ static constexpr auto reduce = ReductionType::MEAN; \
27
+ return __VA_ARGS__(); \
28
+ } \
29
+ case ReductionType::MIN: { \
30
+ static constexpr auto reduce = ReductionType::MIN; \
31
+ return __VA_ARGS__(); \
32
+ } \
33
+ case ReductionType::MAX: { \
34
+ static constexpr auto reduce = ReductionType::MAX; \
35
+ return __VA_ARGS__(); \
36
+ } \
37
+ case ReductionType::PROD: { \
38
+ static constexpr auto reduce = ReductionType::PROD; \
39
+ return __VA_ARGS__(); \
40
+ } \
41
+ } \
42
+ }()
43
+
44
+ template <typename scalar_t, ReductionType reduce>
45
+ inline vec_scalar_t<scalar_t> init_value() {
46
+ using acc_t = vec_scalar_t<scalar_t>;
47
+ acc_t val;
48
+ if (reduce == ReductionType::SUM ||
49
+ reduce == ReductionType::MEAN) {
50
+ val = static_cast<acc_t>(0);
51
+ } else if (reduce == ReductionType::PROD) {
52
+ val = static_cast<acc_t>(1);
53
+ } else if (reduce == ReductionType::MAX) {
54
+ val = -std::numeric_limits<acc_t>::infinity();
55
+ } else {
56
+ TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
57
+ val = std::numeric_limits<acc_t>::infinity();
58
+ }
59
+ return val;
60
+ }
61
+
62
+ template <typename scalar_t, ReductionType reduce>
63
+ inline vec_scalar_t<scalar_t> init_value(const c10::optional<Scalar>& initial) {
64
+ using acc_t = vec_scalar_t<scalar_t>;
65
+ if (initial.has_value()) {
66
+ return initial.value().to<acc_t>();
67
+ } else {
68
+ return init_value<scalar_t, reduce>();
69
+ }
70
+ }
71
+
72
+ template <typename scalar_t>
73
+ inline void init(scalar_t* out, int64_t size, const vec_scalar_t<scalar_t>& val) {
74
+ using Vec = Vectorized<vec_scalar_t<scalar_t>>;
75
+ map<scalar_t>(
76
+ [val](Vec x) { return Vec(val); },
77
+ out,
78
+ out,
79
+ size);
80
+ }
81
+
82
+ template <typename scalar_t, ReductionType reduce>
83
+ inline void init(scalar_t* out, int64_t size, const c10::optional<Scalar>& initial) {
84
+ using acc_t = vec_scalar_t<scalar_t>;
85
+ acc_t val = init_value<scalar_t, reduce>(initial);
86
+ init(out, size, val);
87
+ }
88
+
89
+ // overload with `include_self`, used by scatter_reduce
90
+ template <typename scalar_t, ReductionType reduce>
91
+ inline void init(scalar_t* out, int64_t size, bool include_self = false) {
92
+ using acc_t = vec_scalar_t<scalar_t>;
93
+ if (!include_self) {
94
+ acc_t val = init_value<scalar_t, reduce>();
95
+ init(out, size, val);
96
+ }
97
+ }
98
+
99
+ template <typename scalar_t, ReductionType reduce>
100
+ inline void _init(scalar_t* self_ptr, at::opmath_type<scalar_t>* buffer_ptr, int64_t size, bool include_self) {
101
+ if (!include_self) {
102
+ init<at::opmath_type<scalar_t>, reduce>(buffer_ptr, size, include_self);
103
+ } else {
104
+ vec::convert(self_ptr, buffer_ptr, size);
105
+ }
106
+ }
107
+
108
+ template <typename scalar_t>
109
+ inline typename std::enable_if<!std::is_same<scalar_t, Vec2>::value, scalar_t>::type
110
+ _max(const scalar_t& x, const scalar_t& y) {
111
+ return at::_isnan(y) ? y : std::max(x, y);
112
+ }
113
+
114
+ template <typename scalar_t>
115
+ inline Vectorized<scalar_t> _max(const Vectorized<scalar_t>& x, const Vectorized<scalar_t>& y) {
116
+ // vec::maximum propagates NaN
117
+ return vec::maximum(x, y);
118
+ }
119
+
120
+ template <typename vec_t>
121
+ inline typename std::enable_if<std::is_same<vec_t, Vec2>::value, Vec2>::type
122
+ _max(const vec_t& x, const vec_t& y) {
123
+ // vec::maximum propagates NaN
124
+ return maximum(x, y);
125
+ }
126
+
127
+ template <typename scalar_t>
128
+ inline typename std::enable_if<!std::is_same<scalar_t, Vec2>::value, scalar_t>::type
129
+ _min(const scalar_t& x, const scalar_t& y) {
130
+ return at::_isnan(y) ? y : std::min(x, y);
131
+ }
132
+
133
+ template <typename scalar_t>
134
+ inline Vectorized<scalar_t> _min(const Vectorized<scalar_t>& x, const Vectorized<scalar_t>& y) {
135
+ // vec::minimum propagates NaN
136
+ return vec::minimum(x, y);
137
+ }
138
+
139
+ template <typename vec_t>
140
+ inline typename std::enable_if<std::is_same<vec_t, Vec2>::value, Vec2>::type
141
+ _min(const vec_t& x, const vec_t& y) {
142
+ // vec::minimum propagates NaN
143
+ return minimum(x, y);
144
+ }
145
+
146
+ template <typename scalar_t, typename accumut, typename Op,
147
+ typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
148
+ inline void map_acc(
149
+ const Op& vec_fun,
150
+ accumut* output_data,
151
+ const accumut* input_data,
152
+ const scalar_t* input_data2,
153
+ int64_t size) {
154
+ using Vec = vec::Vectorized<scalar_t>;
155
+ using aVec = vec::Vectorized<accumut>;
156
+ int64_t d = 0;
157
+ constexpr int64_t kVecSize = Vec::size();
158
+ constexpr int64_t kaVecSize = aVec::size();
159
+ for (d = 0; d < size - (size % kVecSize); d += kVecSize) {
160
+ Vec data2_vec = Vec::loadu(input_data2 + d);
161
+ auto [data2_avec0, data2_avec1] = convert_to_float<scalar_t>(data2_vec);
162
+ aVec input_vec0 = aVec::loadu(input_data + d);
163
+ aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize);
164
+ vec_fun(input_vec0, data2_avec0).store(output_data + d);
165
+ vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize);
166
+ }
167
+ if (size - d > 0) {
168
+ int64_t tail_size = size - d;
169
+ Vec data2_vec = Vec::loadu(input_data2 + d, tail_size);
170
+ auto [data2_avec0, data2_avec1] = convert_to_float<scalar_t>(data2_vec);
171
+ if (tail_size > kaVecSize) {
172
+ aVec input_vec0 = aVec::loadu(input_data + d);
173
+ aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize, tail_size - kaVecSize);
174
+ vec_fun(input_vec0, data2_avec0).store(output_data + d);
175
+ vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize, tail_size - kaVecSize);
176
+ } else {
177
+ aVec input_vec0 = aVec::loadu(input_data + d, tail_size);
178
+ vec_fun(input_vec0, data2_avec0).store(output_data + d, tail_size);
179
+ }
180
+ }
181
+ }
182
+
183
+ // for Max and Min, propagate NaN:
184
+ template <typename T, ReductionType reduce>
185
+ inline T update(const T& x, const T& y) {
186
+ if (reduce == ReductionType::SUM ||
187
+ reduce == ReductionType::MEAN) {
188
+ return x + y;
189
+ } else if (reduce == ReductionType::PROD) {
190
+ return x * y;
191
+ } else if (reduce == ReductionType::MAX) {
192
+ return _max(x, y);
193
+ } else {
194
+ TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
195
+ return _min(x, y);
196
+ }
197
+ }
198
+
199
+ template <typename scalar_t, ReductionType reduce>
200
+ inline void update(scalar_t* out, const scalar_t* data, int64_t K) {
201
+ using Vec = vec::Vectorized<vec_scalar_t<scalar_t>>;
202
+ map2<scalar_t>(
203
+ [](Vec x, Vec y) { return update<Vec, reduce>(x, y); },
204
+ out,
205
+ out,
206
+ data,
207
+ K);
208
+ }
209
+
210
+ template <typename scalar_t, ReductionType reduce,
211
+ typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
212
+ inline void update(at::opmath_type<scalar_t>* out, const scalar_t* data, int64_t K) {
213
+ using opmath_t = at::opmath_type<scalar_t>;
214
+ using Vec = vec::Vectorized<opmath_t>;
215
+ map_acc<scalar_t, opmath_t>(
216
+ [](Vec x, Vec y) { return update<Vec, reduce>(x, y); },
217
+ out,
218
+ out,
219
+ data,
220
+ K);
221
+ }
222
+
223
+ template <typename scalar_t, ReductionType reduce>
224
+ inline void write(scalar_t* out, int64_t count, int64_t K) {
225
+ using Vec = vec::Vectorized<vec_scalar_t<scalar_t>>;
226
+ if (reduce == ReductionType::MEAN) {
227
+ if (count > 0) {
228
+ vec::map<scalar_t>(
229
+ [count](Vec x) { return x / Vec(count); },
230
+ out,
231
+ out,
232
+ K);
233
+ }
234
+ }
235
+ }
236
+
237
+ } // namespace CPU_CAPABILITY
238
+ } // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+ #include <ATen/native/ReductionType.h>
6
+
7
+ namespace at::native {
8
+
9
+ using spmm_reduce_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
10
+ using spmm_reduce_arg_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
11
+ using spmm_reduce_backward_input_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
12
+ using spmm_reduce_backward_input_arg_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
13
+ using spmm_reduce_backward_other_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
14
+
15
+ DECLARE_DISPATCH(spmm_reduce_fn, spmm_reduce_stub);
16
+ DECLARE_DISPATCH(spmm_reduce_arg_fn, spmm_reduce_arg_stub);
17
+ DECLARE_DISPATCH(spmm_reduce_backward_input_fn, spmm_reduce_backward_input_stub);
18
+ DECLARE_DISPATCH(spmm_reduce_backward_input_arg_fn, spmm_reduce_backward_input_arg_stub);
19
+ DECLARE_DISPATCH(spmm_reduce_backward_other_fn, spmm_reduce_backward_other_stub);
20
+ DECLARE_DISPATCH(spmm_reduce_backward_input_arg_fn, spmm_reduce_backward_other_arg_stub);
21
+
22
+ } // at::native