koichi12 committed on Feb 12, 2025

Commit

d9bcc7f

verified ·

1 Parent(s): 7e7bbc5

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.venv/lib/python3.11/site-packages/torch/_inductor/async_compile.py +297 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/__init__.py +0 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/__pycache__/autoheuristic_utils.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/artifacts/_MMRankingA100.py +296 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/artifacts/_MMRankingH100.py +321 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/artifacts/_MixedMMA100.py +150 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/artifacts/_MixedMMH100.py +149 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/artifacts/_PadMMA100.py +109 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/autoheuristic.py +315 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/autoheuristic_utils.py +339 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/learned_heuristic_controller.py +119 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/learnedheuristic_interface.py +92 -0
.venv/lib/python3.11/site-packages/torch/_inductor/autotune_process.py +876 -0
.venv/lib/python3.11/site-packages/torch/_inductor/codecache.py +0 -0
.venv/lib/python3.11/site-packages/torch/_inductor/comm_analysis.py +264 -0
.venv/lib/python3.11/site-packages/torch/_inductor/comms.py +640 -0
.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py +1629 -0
.venv/lib/python3.11/site-packages/torch/_inductor/config.py +1241 -0
.venv/lib/python3.11/site-packages/torch/_inductor/constant_folding.py +348 -0
.venv/lib/python3.11/site-packages/torch/_inductor/cpu_vec_isa.py +373 -0
.venv/lib/python3.11/site-packages/torch/_inductor/cudagraph_trees.py +0 -0
.venv/lib/python3.11/site-packages/torch/_inductor/cudagraph_utils.py +330 -0
.venv/lib/python3.11/site-packages/torch/_inductor/debug.py +693 -0
.venv/lib/python3.11/site-packages/torch/_inductor/decomposition.py +980 -0
.venv/lib/python3.11/site-packages/torch/_inductor/dependencies.py +745 -0
.venv/lib/python3.11/site-packages/torch/_inductor/exc.py +104 -0
.venv/lib/python3.11/site-packages/torch/_inductor/extern_node_serializer.py +25 -0
.venv/lib/python3.11/site-packages/torch/_inductor/freezing.py +269 -0
.venv/lib/python3.11/site-packages/torch/_inductor/fx_utils.py +251 -0
.venv/lib/python3.11/site-packages/torch/_inductor/graph.py +1930 -0
.venv/lib/python3.11/site-packages/torch/_inductor/hooks.py +30 -0
.venv/lib/python3.11/site-packages/torch/_inductor/index_propagation.py +373 -0
.venv/lib/python3.11/site-packages/torch/_inductor/inductor_prims.py +179 -0
.venv/lib/python3.11/site-packages/torch/_inductor/ir.py +0 -0
.venv/lib/python3.11/site-packages/torch/_inductor/jagged_lowerings.py +264 -0
.venv/lib/python3.11/site-packages/torch/_inductor/lowering.py +0 -0
.venv/lib/python3.11/site-packages/torch/_inductor/metrics.py +436 -0
.venv/lib/python3.11/site-packages/torch/_inductor/mkldnn_ir.py +1881 -0
.venv/lib/python3.11/site-packages/torch/_inductor/mkldnn_lowerings.py +1087 -0
.venv/lib/python3.11/site-packages/torch/_inductor/package/__init__.py +1 -0
.venv/lib/python3.11/site-packages/torch/_inductor/package/__pycache__/build_package.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/torch/_inductor/package/__pycache__/package.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/torch/_inductor/package/build_package.py +15 -0
.venv/lib/python3.11/site-packages/torch/_inductor/package/package.py +237 -0
.venv/lib/python3.11/site-packages/torch/_inductor/package/pt2_archive_constants.py +16 -0
.venv/lib/python3.11/site-packages/torch/_inductor/pattern_matcher.py +2005 -0
.venv/lib/python3.11/site-packages/torch/_inductor/quantized_lowerings.py +92 -0
.venv/lib/python3.11/site-packages/torch/_inductor/remote_cache.py +198 -0
.venv/lib/python3.11/site-packages/torch/_inductor/select_algorithm.py +1743 -0

.venv/lib/python3.11/site-packages/torch/_inductor/async_compile.py ADDED Viewed

	@@ -0,0 +1,297 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+import functools
+import logging
+import multiprocessing
+import os
+import sys
+from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor
+from concurrent.futures.process import BrokenProcessPool
+from functools import partial
+from time import time
+from typing import Any, Callable, Dict, List, Optional, Set, TYPE_CHECKING
+import torch
+from torch._dynamo.device_interface import get_registered_device_interfaces
+from torch._inductor import config
+from torch._inductor.codecache import (
+    CodeCacheFuture,
+    CppCodeCache,
+    CppPythonBindingsCodeCache,
+    CUDACodeCache,
+    HalideCodeCache,
+    LambdaFuture,
+    ROCmCodeCache,
+    TritonCodeCache,
+    TritonFuture,
+)
+from torch._inductor.compile_worker.subproc_pool import (
+    _warm_process_pool,
+    AnyPool,
+    SubprocPool,
+)
+from torch._inductor.compile_worker.watchdog import _async_compile_initializer
+from torch._inductor.runtime.compile_tasks import (
+    _set_triton_ptxas_path,
+    _worker_compile_triton,
+)
+from torch.hub import _Faketqdm, tqdm
+from torch.utils._triton import has_triton_package
+if TYPE_CHECKING:
+    from torch._inductor.runtime.hints import HalideMeta
+# timing metrics for time spent in the compilation
+# Running total, in seconds, of all intervals closed by _compile_end().
+_cumulative_compile_time = 0.0
+# Start timestamp of the interval currently being timed; None when no
+# interval is open.
+_t0: Optional[float] = None
+# Artifact logger ("kernel_code") used below to log generated kernel source.
+kernel_code_log = torch._logging.getArtifactLogger(__name__, "kernel_code")
+def pre_fork_setup():
+    """
+    Setup that must be done prior to forking with a process pool.
+
+    Calls ``caching_device_properties()`` and then, if Triton can be
+    imported, calls ``triton_key()`` once in the current process. An
+    ``ImportError`` raised by the import or by the call is ignored.
+    """
+    # ensure properties have been calculated before processes
+    # are forked
+    caching_device_properties()
+    # Computing the triton key can be slow. If we call it before fork,
+    # it will be cached for the forked subprocesses.
+    try:
+        from triton.compiler.compiler import triton_key
+        triton_key()
+    except ImportError:
+        # Triton might not be installed or might be an old version.
+        pass
+def caching_device_properties():
+    for _, device_interface in get_registered_device_interfaces():
+        if device_interface.is_available():
+            device_interface.Worker.get_device_properties()
+def _compile_start() -> None:
+    global _t0
+    if _t0 is None:
+        _t0 = time()
+def _compile_end() -> None:
+    global _cumulative_compile_time, _t0
+    if _t0 is not None:
+        t1 = time()
+        _cumulative_compile_time += t1 - _t0
+        _t0 = None
+        # print("CUMULATIVE COMPILE TIME", _cumulative_compile_time)
+# True when the interpreter reports the "win32" platform.
+_IS_WINDOWS = sys.platform == "win32"
+# Module-level logger.
+log = logging.getLogger(__name__)
+# Used to keep track of all process pools invoked so far.
+_pool_set: Set[AnyPool] = set()
+def shutdown_compile_workers() -> None:
+    """Shut down all outstanding compile-worker pools."""
+    for pool in _pool_set:
+        pool.shutdown()
+    after_fork()
+def after_fork():
+    """
+    Reset pools to initial state without shutting them down.
+
+    Empties ``_pool_set`` and clears the ``lru_cache`` on
+    ``AsyncCompile.process_pool`` so the next call creates a fresh pool.
+    """
+    _pool_set.clear()
+    AsyncCompile.process_pool.cache_clear()
+# Run after_fork() in every forked child process.
+try:
+    os.register_at_fork(after_in_child=after_fork)
+except AttributeError:
+    pass  # register_at_fork does not exist on Windows
+class AsyncCompile:
+    """
+    Front end for compiling generated kernels, in-process or via worker pools.
+
+    When ``config.compile_threads`` is 1 or less, work is done synchronously
+    in the calling process. Otherwise work is handed to a cached thread pool
+    (``pool``) or a cached process pool (``process_pool``), and callers
+    resolve the returned futures with ``wait``.
+    """
+    def __init__(self) -> None:
+        pass
+    @staticmethod
+    @functools.lru_cache(1)
+    def pool() -> ThreadPoolExecutor:
+        """
+        Return a thread pool with ``config.compile_threads`` workers.
+
+        The pool is created on the first call and cached by ``lru_cache``.
+        Requires ``config.compile_threads > 1``.
+        """
+        assert config.compile_threads > 1
+        return ThreadPoolExecutor(config.compile_threads)
+    @staticmethod
+    def _get_ready():
+        """No-op function to help mark when the subprocess pool is ready."""
+        return "ready"
+    @staticmethod
+    @functools.lru_cache(1)
+    def process_pool() -> AnyPool:
+        """
+        Return the process pool used for compilation, creating it if needed.
+
+        A ``SubprocPool`` is used when ``config.worker_start_method`` is
+        ``"subprocess"``; otherwise a ``ProcessPoolExecutor`` is created with
+        the configured multiprocessing start method. The new pool gets a
+        ``ready_future`` attribute and is recorded in ``_pool_set``. The
+        result is cached by ``lru_cache`` until ``cache_clear()`` is called.
+        Requires ``config.compile_threads > 1``.
+        """
+        assert config.compile_threads > 1
+        pool: AnyPool
+        if config.worker_start_method == "subprocess":
+            # Wrapper around ProcessPoolExecutor forks in a new process we control
+            pool = SubprocPool(config.compile_threads)
+        else:
+            pre_fork_setup()
+            ctx = multiprocessing.get_context(config.worker_start_method)
+            pool = ProcessPoolExecutor(
+                config.compile_threads,
+                mp_context=ctx,
+                initializer=partial(_async_compile_initializer, os.getpid()),
+            )
+            # when this pool is created in a subprocess object, the normal exit handler
+            # doesn't run, and we need to register our own handler.
+            # exitpriority has to be high, because another one of the finalizers will
+            # kill the worker thread that sends the shutdown message to the workers...
+            multiprocessing.util.Finalize(None, pool.shutdown, exitpriority=sys.maxsize)
+        # Set an attribute we can check to see if the pool is ready.
+        pool.ready_future = pool.submit(AsyncCompile._get_ready)  # type: ignore[union-attr]
+        _pool_set.add(pool)
+        return pool
+    @classmethod
+    def warm_pool(cls) -> None:
+        """
+        Warm the process pool; does nothing when ``config.compile_threads <= 1``.
+
+        The warm-up is bracketed by ``_compile_start()`` / ``_compile_end()``
+        so its duration is included in the cumulative compile time.
+        """
+        if config.compile_threads <= 1:
+            return
+        _compile_start()
+        _warm_process_pool(cls.process_pool(), config.compile_threads)
+        _compile_end()
+    @classmethod
+    def submit(cls, task: Callable[..., Any]) -> Any:
+        """
+        Run ``task`` with no arguments, inline or on the thread pool.
+
+        Returns the task's result directly when ``config.compile_threads <= 1``;
+        otherwise returns the future produced by the thread pool's ``submit``.
+        """
+        if config.compile_threads <= 1:
+            return task()
+        return cls.pool().submit(task)
+    def _use_process_pool(self):
+        """Return True when parallel compilation is on and the pool's ready future is done."""
+        return (
+            config.compile_threads > 1
+            and self.process_pool().ready_future.done()  # type: ignore[union-attr]
+        )
+    def triton(self, kernel_name: str, source_code: str, device_str: str = "cuda"):
+        """
+        Load a Triton kernel and compile it, in a worker process if one is ready.
+
+        Returns a ``TritonFuture`` when the process pool is used; otherwise the
+        kernel is precompiled in this process and returned directly.
+        ``device_str`` is not used in this method body.
+        """
+        kernel_code_log.info("Triton Kernel:\n%s", source_code)
+        _compile_start()
+        _set_triton_ptxas_path()
+        kernel = TritonCodeCache.load(kernel_name, source_code)
+        if self._use_process_pool():
+            # We want to support changing these env vars after (and while) the
+            # process pool is running, so pass them to the subprocess to reset.
+            env_vars = ["TORCHINDUCTOR_CACHE_DIR", "TRITON_CACHE_DIR"]
+            extra_env = {v: os.environ[v] for v in env_vars if v in os.environ}
+            return TritonFuture(
+                kernel,
+                self.process_pool().submit(
+                    _worker_compile_triton,
+                    kernel._reload_in_subproc,
+                    extra_env,
+                ),
+            )
+        else:
+            kernel.precompile()
+            return kernel
+    def multi_kernel(self, *args, **kwargs) -> Any:
+        """Construct a ``MultiKernelCall`` synchronously from the given arguments."""
+        from torch._inductor.codegen.multi_kernel import MultiKernelCall
+        # no need to call this in parallel since the sub-kernels are already parallel tasks
+        return MultiKernelCall(*args, **kwargs)
+    def cpp(self, source_code: str):
+        """
+        Compile C++ source and return its kernel.
+
+        With ``config.compile_threads <= 1`` the kernel is loaded synchronously;
+        otherwise a ``LambdaFuture`` that yields the kernel is returned.
+        """
+        kernel_code_log.info("CPP Kernel:\n%s", source_code)
+        if config.compile_threads <= 1:
+            return CppCodeCache.load(source_code).kernel
+        else:
+            get_result = CppCodeCache.load_async(source_code, submit_fn=self.submit)
+            return LambdaFuture(lambda: get_result().kernel)
+    def cpp_pybinding(self, argtypes: List[str], source_code: str):
+        """
+        Compile C++ source with Python bindings for the given argument types.
+
+        With ``config.compile_threads <= 1`` the binding is loaded synchronously;
+        otherwise a ``LambdaFuture`` wrapping the async loader is returned.
+        """
+        kernel_code_log.info("CPP+Bindings Kernel:\n%s", source_code)
+        if config.compile_threads <= 1:
+            return CppPythonBindingsCodeCache.load_pybinding(argtypes, source_code)
+        else:
+            get_result = CppPythonBindingsCodeCache.load_pybinding_async(
+                argtypes, source_code, submit_fn=self.submit
+            )
+            return LambdaFuture(get_result)
+    def cuda(self, source_code, dst_file_ext):
+        """Load CUDA source via ``CUDACodeCache`` through ``submit``; yields element 0 of the load result."""
+        kernel_code_log.info("CUDA Kernel:\n%s", source_code)
+        def task():
+            return CUDACodeCache.load(source_code, dst_file_ext)[0]
+        return self.submit(task)
+    def rocm(self, source_code, dst_file_ext):
+        """Load ROCm source via ``ROCmCodeCache`` through ``submit``; yields element 0 of the load result."""
+        kernel_code_log.info("ROCm Kernel:\n%s", source_code)
+        def task():
+            return ROCmCodeCache.load(source_code, dst_file_ext)[0]
+        return self.submit(task)
+    def halide(self, meta: HalideMeta, source_code: str):
+        """
+        Generate a Halide kernel from ``meta`` and ``source_code``.
+
+        With ``config.compile_threads <= 1`` generation is synchronous;
+        otherwise a ``LambdaFuture`` wrapping the async generator is returned.
+        """
+        kernel_code_log.info("Halide Kernel:\n%r\n%s", meta, source_code)
+        if config.compile_threads <= 1:
+            return HalideCodeCache.generate_halide(meta, source_code)
+        else:
+            get_result = HalideCodeCache.generate_halide_async(
+                meta, source_code, submit_fn=self.submit
+            )
+            return LambdaFuture(get_result)
+    def wait(self, scope: Dict[str, Any]) -> None:
+        """
+        Resolve every pending future in ``scope`` in place, then stop the timer.
+
+        Values that are ``Future`` or ``CodeCacheFuture`` instances are replaced
+        by their results, but only when ``config.compile_threads > 1``.
+
+        Raises:
+            RuntimeError: if a result raises ``BrokenProcessPool``.
+        """
+        # Count the futures up front so the progress bar has a total.
+        num_kernels = len(
+            [
+                value
+                for key, value in scope.items()
+                if isinstance(value, (Future, CodeCacheFuture))
+            ]
+        )
+        pbar = tqdm(
+            total=num_kernels,
+            desc="Inductor Compilation",
+            disable=config.disable_progress,
+            delay=0,
+        )
+        if config.compile_threads > 1:
+            for key, result in scope.items():
+                if config.verbose_progress and not isinstance(pbar, _Faketqdm):
+                    pbar.set_postfix_str(key)
+                if isinstance(result, (Future, CodeCacheFuture)):
+                    try:
+                        # Rebinding an existing key does not resize the dict,
+                        # so this is safe while iterating over items().
+                        scope[key] = result.result()
+                    except BrokenProcessPool as e:
+                        raise RuntimeError(
+                            "A compilation subprocess exited unexpectedly. This "
+                            "is likely due to a crash. To facilitate debugging, "
+                            "you can re-run with TORCHINDUCTOR_COMPILE_THREADS=1 "
+                            "to cause compilation to occur in the main process."
+                        ) from e
+                    pbar.update(1)
+        _compile_end()
+if (
+    os.environ.get("TORCH_TNT_IN_USE", "0") == "1"
+    or os.environ.get("TORCH_WARM_POOL", "1") != "1"
+    # The subprocess pool is only used for the Triton backend
+    or not has_triton_package()
+):
+    pass
+else:
+    AsyncCompile.warm_pool()

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/__init__.py ADDED Viewed

File without changes

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (202 Bytes). View file

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/__pycache__/autoheuristic_utils.cpython-311.pyc ADDED Viewed

Binary file (19 kB). View file

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/artifacts/_MMRankingA100.py ADDED Viewed

	@@ -0,0 +1,296 @@

+# flake8: noqa: B950
+# fmt: off
+# This file was generated by AutoHeuristic. Do not modify it manually!
+# To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/mm/
+from typing import List, Optional, Tuple
+from torch._inductor.autoheuristic.autoheuristic_utils import (
+    AHContext,
+    AHMetadata,
+    Choice,
+)
+from torch._inductor.autoheuristic.learnedheuristic_interface import (
+    LearnedHeuristicDecision,
+)
+class MMRankingA100(LearnedHeuristicDecision):
+    def __init__(self) -> None:
+        self.choices: List[Choice] = []
+        self.fill_choices()
+    def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
+        return (
+            metadata.name == self.get_name()
+            and metadata.shared_memory == 166912
+            and str(metadata.device_capa) == "(8, 0)"
+        )
+    def get_confidence_threshold(self) -> float:
+        return 0.0
+    def get_choice(self, idx: int) -> Optional[str]:
+        if idx < len(self.choices):
+            return self.choices[idx]
+        return None
+    def fill_choices(self) -> None:
+        self.choices.append('extern_mm')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=16_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=1')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=1_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=1')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=32_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=16_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=1_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=1_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=64_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=4')
+    def get_name(self) -> str:
+        return 'mm'
+    def get_best_choices(self, context: AHContext) -> Optional[List[Tuple[float, int]]]:
+        """Walk a hard-coded decision tree over ``context`` features and return the ranking at the matching leaf.
+
+        The features read via ``context.get_value`` are 'arith_intensity', 'n',
+        'k', 'k*n', 'm*n', 'mat1_stride_0' and 'using_tf32'. Only the features
+        on the path actually taken are read.
+
+        Every leaf returns a non-empty list of ``(float, int)`` tuples ordered
+        by non-increasing float, so this method never returns ``None`` even
+        though the annotation allows it.
+
+        NOTE(review): the float is presumably a learned score/probability and
+        the int presumably an index into ``self.choices`` -- confirm against
+        the caller and the generator before relying on either.
+        """
+        if context.get_value('arith_intensity') <= 52.6245059967041:
+            if context.get_value('n') <= 34.0:
+                if context.get_value('n') <= 18.0:
+                    if context.get_value('k*n') <= 312.0:
+                        return [(0.093, 12), (0.081, 16), (0.081, 148), (0.070, 10), (0.070, 17), (0.070, 149), (0.070, 151), (0.070, 150), (0.070, 14), (0.058, 11), (0.058, 15), (0.058, 13), (0.058, 122), (0.047, 121), (0.035, 123), (0.012, 92)]
+                    else:
+                        if context.get_value('k') <= 40.0:
+                            return [(0.083, 42), (0.083, 46), (0.083, 44), (0.083, 40), (0.083, 128), (0.067, 45), (0.067, 43), (0.067, 41), (0.067, 169), (0.067, 171), (0.067, 168), (0.067, 129), (0.067, 170), (0.033, 103), (0.017, 121)]
+                        else:
+                            return [(0.112, 137), (0.104, 136), (0.101, 0), (0.081, 1), (0.073, 135), (0.069, 67), (0.066, 187), (0.058, 41), (0.050, 71), (0.046, 68), (0.046, 70), (0.031, 44), (0.027, 43), (0.027, 170), (0.019, 189), (0.019, 188), (0.015, 169), (0.015, 171), (0.012, 115), (0.012, 168), (0.012, 69), (0.004, 103)]
+                else:
+                    if context.get_value('mat1_stride_0') <= 20.0:
+                        return [(0.069, 0), (0.059, 157), (0.059, 22), (0.059, 153), (0.059, 155), (0.059, 25), (0.059, 23), (0.059, 19), (0.044, 21), (0.044, 18), (0.044, 152), (0.044, 158), (0.044, 154), (0.044, 156), (0.044, 20), (0.044, 124), (0.044, 24), (0.030, 125), (0.029, 126), (0.015, 97), (0.015, 95), (0.015, 96), (0.010, 2), (0.010, 75)]
+                    else:
+                        if context.get_value('k') <= 68.0:
+                            return [(0.087, 72), (0.087, 74), (0.087, 73), (0.086, 76), (0.077, 75), (0.067, 192), (0.058, 190), (0.048, 47), (0.048, 193), (0.048, 49), (0.048, 51), (0.048, 191), (0.038, 53), (0.019, 133), (0.019, 50), (0.019, 175), (0.019, 172), (0.019, 48), (0.019, 174), (0.010, 173), (0.010, 177), (0.010, 52), (0.010, 54), (0.010, 178), (0.010, 176)]
+                        else:
+                            return [(0.154, 52), (0.154, 72), (0.102, 75), (0.087, 49), (0.087, 73), (0.086, 51), (0.057, 176), (0.045, 2), (0.038, 191), (0.038, 178), (0.038, 190), (0.029, 173), (0.029, 76), (0.026, 138), (0.013, 139), (0.013, 140), (0.003, 0)]
+            else:
+                if context.get_value('k') <= 35.0:
+                    if context.get_value('k') <= 18.0:
+                        if context.get_value('m*n') <= 19505152.0:
+                            return [(0.151, 159), (0.140, 160), (0.129, 164), (0.055, 127), (0.051, 29), (0.044, 161), (0.044, 147), (0.040, 146), (0.040, 31), (0.037, 145), (0.026, 28), (0.022, 90), (0.022, 93), (0.022, 94), (0.022, 100), (0.022, 125), (0.022, 158), (0.022, 157), (0.011, 87), (0.011, 88), (0.011, 89), (0.011, 91), (0.011, 95), (0.011, 96), (0.011, 98), (0.011, 99)]
+                        else:
+                            return [(0.069, 7), (0.069, 5), (0.067, 147), (0.066, 8), (0.061, 145), (0.058, 146), (0.052, 124), (0.049, 29), (0.049, 159), (0.046, 31), (0.043, 157), (0.041, 9), (0.041, 4), (0.040, 6), (0.035, 164), (0.035, 160), (0.026, 158), (0.017, 125), (0.017, 28), (0.017, 32), (0.017, 162), (0.017, 27), (0.017, 30), (0.017, 161), (0.009, 33), (0.009, 26), (0.009, 163), (0.006, 0)]
+                    else:
+                        if context.get_value('n') <= 68.0:
+                            return [(0.101, 182), (0.101, 59), (0.088, 57), (0.076, 184), (0.076, 61), (0.076, 179), (0.076, 62), (0.076, 58), (0.063, 180), (0.063, 60), (0.051, 56), (0.050, 181), (0.025, 130), (0.025, 177), (0.025, 183), (0.013, 178), (0.013, 55)]
+                        else:
+                            return [(0.089, 180), (0.079, 60), (0.066, 35), (0.066, 181), (0.066, 38), (0.066, 58), (0.066, 179), (0.066, 57), (0.062, 184), (0.053, 37), (0.044, 166), (0.040, 55), (0.040, 39), (0.040, 36), (0.040, 165), (0.040, 167), (0.027, 177), (0.027, 34), (0.022, 159)]
+                else:
+                    if context.get_value('m*n') <= 309760.0:
+                        return [(0.298, 0), (0.097, 140), (0.080, 83), (0.072, 86), (0.044, 84), (0.036, 178), (0.036, 117), (0.036, 82), (0.032, 120), (0.032, 85), (0.028, 119), (0.024, 130), (0.024, 109), (0.020, 108), (0.020, 118), (0.012, 104), (0.012, 116), (0.012, 141), (0.012, 144), (0.008, 105), (0.008, 106), (0.008, 111), (0.008, 114), (0.008, 107), (0.008, 132), (0.004, 101), (0.004, 102), (0.004, 110), (0.004, 112), (0.004, 113), (0.004, 131)]
+                    else:
+                        if context.get_value('n') <= 72.0:
+                            return [(0.227, 77), (0.118, 78), (0.102, 194), (0.086, 80), (0.059, 57), (0.054, 81), (0.049, 196), (0.048, 197), (0.048, 59), (0.043, 79), (0.032, 195), (0.027, 180), (0.022, 3), (0.021, 141), (0.016, 60), (0.016, 142), (0.011, 183), (0.011, 0), (0.011, 144)]
+                        else:
+                            return [(0.140, 186), (0.132, 185), (0.109, 63), (0.085, 65), (0.078, 37), (0.077, 35), (0.062, 197), (0.047, 194), (0.046, 165), (0.046, 57), (0.039, 78), (0.039, 79), (0.039, 66), (0.039, 64), (0.016, 195), (0.008, 159)]
+        else:
+            # The flag is compared by its string form: this branch is taken for
+            # any 'using_tf32' value whose str() is not exactly 'False'.
+            if str(context.get_value('using_tf32')) != 'False':
+                if context.get_value('m*n') <= 815360.0:
+                    if context.get_value('k') <= 1184.0:
+                        return [(0.218, 140), (0.205, 0), (0.154, 144), (0.115, 141), (0.051, 185), (0.051, 104), (0.039, 78), (0.038, 116), (0.026, 165), (0.026, 130), (0.026, 178), (0.013, 57), (0.013, 195), (0.013, 167), (0.013, 186)]
+                    else:
+                        return [(0.901, 0), (0.030, 144), (0.030, 134), (0.016, 3), (0.006, 78), (0.006, 77), (0.002, 57), (0.002, 194), (0.002, 59), (0.002, 60), (0.002, 143)]
+                else:
+                    if context.get_value('arith_intensity') <= 187.23922729492188:
+                        if context.get_value('mat1_stride_0') <= 198.0:
+                            return [(0.273, 63), (0.158, 37), (0.152, 35), (0.127, 57), (0.097, 165), (0.053, 185), (0.031, 0), (0.028, 64), (0.014, 60), (0.014, 78), (0.009, 55), (0.008, 134), (0.005, 34), (0.005, 167), (0.005, 179), (0.005, 65), (0.005, 66), (0.005, 186), (0.005, 194), (0.002, 166)]
+                        else:
+                            return [(0.296, 63), (0.235, 0), (0.132, 64), (0.074, 37), (0.069, 78), (0.051, 185), (0.051, 35), (0.030, 57), (0.020, 77), (0.016, 194), (0.008, 66), (0.007, 65), (0.003, 3), (0.003, 165), (0.003, 141), (0.001, 134), (0.001, 166)]
+                    else:
+                        return [(0.405, 0), (0.246, 37), (0.177, 63), (0.145, 35), (0.005, 185), (0.005, 65), (0.005, 64), (0.004, 57), (0.003, 66), (0.002, 165), (0.001, 78), (0.001, 55)]
+            else:
+                return [(0.357, 0), (0.112, 165), (0.101, 57), (0.094, 179), (0.086, 64), (0.074, 167), (0.067, 60), (0.064, 159), (0.033, 35), (0.007, 195), (0.002, 180), (0.001, 34), (0.001, 166), (0.001, 78)]

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/artifacts/_MMRankingH100.py ADDED Viewed

	@@ -0,0 +1,321 @@

+# flake8: noqa: B950
+# fmt: off
+# This file was generated by AutoHeuristic. Do not modify it manually!
+# To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/mm/
+from typing import List, Optional, Tuple
+from torch._inductor.autoheuristic.autoheuristic_utils import (
+    AHContext,
+    AHMetadata,
+    Choice,
+)
+from torch._inductor.autoheuristic.learnedheuristic_interface import (
+    LearnedHeuristicDecision,
+)
+class MMRankingH100(LearnedHeuristicDecision):
+    def __init__(self) -> None:
+        self.choices: List[Choice] = []
+        self.fill_choices()
+    def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
+        return (
+            metadata.name == self.get_name()
+            and metadata.shared_memory == 232448
+            and str(metadata.device_capa) == "(9, 0)"
+        )
+    def get_confidence_threshold(self) -> float:
+        return 0.0
+    def get_choice(self, idx: int) -> Optional[str]:
+        if idx < len(self.choices):
+            return self.choices[idx]
+        return None
+    def fill_choices(self) -> None:
+        self.choices.append('extern_mm')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=16_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=1')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=1')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=1')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=1_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=2_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=1')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=1')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=16_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=1_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=1_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=64_BLOCK-N=16_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=64_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=4')
+    def get_name(self) -> str:  # Identifier of this heuristic; always the literal 'mm'. Presumably matched against metadata.name by check_precondition (not visible in this chunk) - confirm.
+        return 'mm'
+    def get_best_choices(self, context: AHContext) -> Optional[List[Tuple[float, int]]]:  # Generated decision tree: walks threshold tests on context.get_value(<feature>) and returns the (float, int) list stored at the reached leaf. The int is presumably an index into self.choices and the float a relative weight - confirm against the caller.
+        if context.get_value('arith_intensity') <= 29.89772129058838:  # root split of the tree
+            if context.get_value('n') <= 34.0:
+                if context.get_value('n') <= 18.0:
+                    if context.get_value('k*n') <= 432.0:
+                        if context.get_value('arith_intensity') <= 7.8700292110443115:
+                            return [(0.098, 128), (0.098, 129), (0.098, 127), (0.073, 14), (0.073, 16), (0.073, 12), (0.073, 154), (0.073, 156), (0.073, 157), (0.073, 155), (0.049, 10), (0.049, 94), (0.049, 95), (0.048, 96)]
+                        else:
+                            return [(0.091, 154), (0.073, 10), (0.073, 15), (0.073, 13), (0.073, 11), (0.073, 17), (0.073, 16), (0.073, 14), (0.073, 12), (0.055, 127), (0.054, 157), (0.054, 156), (0.054, 155), (0.036, 129), (0.036, 128), (0.018, 41), (0.018, 43)]
+                    else:
+                        if context.get_value('k') <= 40.0:
+                            return [(0.070, 39), (0.069, 45), (0.069, 41), (0.069, 43), (0.069, 111), (0.069, 112), (0.056, 38), (0.056, 40), (0.056, 42), (0.056, 44), (0.056, 174), (0.056, 173), (0.056, 175), (0.056, 134), (0.056, 172), (0.056, 135), (0.014, 154), (0.014, 127)]
+                        else:
+                            return [(0.147, 144), (0.119, 143), (0.087, 142), (0.083, 0), (0.073, 191), (0.059, 69), (0.050, 67), (0.046, 70), (0.041, 1), (0.036, 174), (0.032, 43), (0.032, 123), (0.028, 40), (0.027, 42), (0.027, 173), (0.023, 175), (0.018, 66), (0.014, 192), (0.014, 193), (0.014, 139), (0.014, 68), (0.014, 127)]
+                else:
+                    if context.get_value('mat1_stride_0') <= 40.0:
+                        if context.get_value('mat1_stride_0') <= 20.0:
+                            return [(0.109, 23), (0.109, 21), (0.109, 20), (0.088, 0), (0.087, 131), (0.066, 18), (0.065, 130), (0.065, 132), (0.065, 159), (0.065, 160), (0.065, 161), (0.065, 158), (0.022, 22), (0.022, 19)]
+                        else:
+                            return [(0.065, 46), (0.064, 52), (0.064, 50), (0.064, 48), (0.064, 51), (0.064, 49), (0.064, 47), (0.064, 53), (0.064, 181), (0.064, 177), (0.064, 179), (0.064, 176), (0.038, 130), (0.038, 136), (0.026, 182), (0.026, 178), (0.026, 180), (0.026, 137), (0.025, 158), (0.013, 114), (0.013, 113)]
+                    else:
+                        if context.get_value('mat1_stride_0') <= 68.0:
+                            return [(0.138, 140), (0.125, 195), (0.100, 71), (0.100, 74), (0.100, 196), (0.100, 194), (0.100, 197), (0.075, 75), (0.062, 72), (0.062, 73), (0.012, 180), (0.012, 51), (0.012, 182)]
+                        else:
+                            return [(0.124, 180), (0.124, 182), (0.114, 75), (0.103, 74), (0.093, 51), (0.093, 71), (0.072, 72), (0.062, 194), (0.052, 145), (0.052, 195), (0.021, 48), (0.021, 50), (0.021, 47), (0.020, 124), (0.010, 147), (0.010, 146), (0.010, 46)]
+            else:
+                if context.get_value('k') <= 18.0:
+                    if context.get_value('m*k') <= 528.0:
+                        return [(0.097, 88), (0.087, 92), (0.077, 90), (0.058, 105), (0.058, 103), (0.058, 104), (0.058, 99), (0.058, 100), (0.058, 106), (0.058, 93), (0.057, 91), (0.057, 97), (0.057, 98), (0.057, 101), (0.048, 102), (0.029, 87), (0.029, 89)]
+                    else:
+                        if context.get_value('n') <= 80.0:
+                            return [(0.057, 161), (0.057, 130), (0.057, 24), (0.056, 164), (0.056, 163), (0.056, 166), (0.056, 168), (0.056, 30), (0.056, 28), (0.056, 26), (0.056, 25), (0.056, 27), (0.056, 29), (0.056, 31), (0.042, 131), (0.028, 99), (0.028, 101), (0.028, 100), (0.028, 167), (0.028, 165), (0.028, 133)]
+                        else:
+                            return [(0.110, 164), (0.108, 163), (0.106, 168), (0.069, 161), (0.066, 151), (0.060, 152), (0.055, 165), (0.050, 27), (0.050, 29), (0.048, 131), (0.043, 153), (0.037, 133), (0.037, 130), (0.028, 8), (0.028, 5), (0.027, 7), (0.026, 26), (0.016, 162), (0.012, 9), (0.007, 4), (0.005, 100), (0.005, 6), (0.005, 24)]
+                else:
+                    if context.get_value('k') <= 36.0:
+                        if context.get_value('n') <= 68.0:
+                            return [(0.097, 184), (0.097, 56), (0.086, 186), (0.086, 183), (0.086, 188), (0.086, 58), (0.086, 60), (0.065, 54), (0.043, 187), (0.043, 185), (0.043, 57), (0.043, 61), (0.032, 55), (0.032, 130), (0.032, 59), (0.011, 181), (0.011, 163), (0.011, 136), (0.011, 138)]
+                        else:
+                            return [(0.117, 184), (0.117, 170), (0.117, 169), (0.107, 183), (0.106, 188), (0.075, 181), (0.064, 130), (0.064, 56), (0.053, 171), (0.032, 57), (0.032, 59), (0.032, 185), (0.011, 163), (0.011, 32), (0.011, 37), (0.011, 34), (0.011, 33), (0.011, 35), (0.011, 36), (0.011, 54)]
+                    else:
+                        if context.get_value('mat2_stride_0') <= 384.0:
+                            return [(0.244, 0), (0.061, 76), (0.061, 79), (0.030, 3), (0.030, 183), (0.030, 189), (0.030, 187), (0.030, 64), (0.030, 190), (0.030, 62), (0.030, 198), (0.030, 201), (0.030, 77), (0.030, 200), (0.030, 80), (0.030, 199), (0.030, 78), (0.030, 184), (0.020, 86), (0.020, 84), (0.020, 120), (0.020, 81), (0.020, 121), (0.020, 85), (0.020, 122), (0.010, 83), (0.010, 118), (0.010, 119), (0.010, 82)]
+                        else:
+                            return [(0.274, 83), (0.171, 86), (0.152, 0), (0.071, 85), (0.061, 125), (0.050, 84), (0.020, 109), (0.020, 117), (0.020, 81), (0.020, 118), (0.020, 121), (0.020, 108), (0.020, 115), (0.020, 116), (0.010, 110), (0.010, 120), (0.010, 103), (0.010, 107), (0.010, 119), (0.010, 122)]
+        else:
+            if context.get_value('arith_intensity') <= 56.995582580566406:
+                if context.get_value('n') <= 68.0:
+                    if context.get_value('k*n') <= 4448.0:
+                        if context.get_value('m*n') <= 29626368.0:
+                            return [(0.107, 198), (0.107, 200), (0.107, 201), (0.107, 199), (0.106, 76), (0.106, 79), (0.064, 197), (0.063, 56), (0.043, 184), (0.043, 187), (0.042, 80), (0.042, 77), (0.042, 183), (0.021, 78)]
+                        else:
+                            return [(0.073, 201), (0.073, 198), (0.073, 200), (0.073, 199), (0.073, 197), (0.073, 56), (0.073, 58), (0.073, 79), (0.073, 76), (0.072, 59), (0.072, 78), (0.072, 77), (0.072, 80), (0.018, 184), (0.018, 55), (0.018, 54)]
+                    else:
+                        if context.get_value('k') <= 348.0:
+                            return [(0.206, 76), (0.183, 77), (0.169, 198), (0.160, 199), (0.053, 59), (0.046, 56), (0.038, 3), (0.030, 148), (0.030, 58), (0.030, 187), (0.023, 184), (0.015, 0), (0.008, 55), (0.008, 54)]
+                        else:
+                            return [(0.146, 198), (0.145, 199), (0.145, 148), (0.126, 0), (0.084, 76), (0.084, 77), (0.042, 80), (0.042, 79), (0.021, 149), (0.021, 150), (0.021, 3), (0.014, 46), (0.014, 74), (0.014, 75), (0.014, 124), (0.014, 194), (0.014, 195), (0.007, 145), (0.007, 146), (0.007, 2), (0.007, 72), (0.007, 147), (0.007, 71)]
+                else:
+                    if context.get_value('m') <= 3264.0:
+                        return [(0.247, 147), (0.115, 197), (0.066, 199), (0.066, 201), (0.066, 198), (0.049, 0), (0.049, 169), (0.049, 171), (0.033, 140), (0.033, 125), (0.033, 114), (0.016, 126), (0.016, 183), (0.016, 184), (0.016, 185), (0.016, 182), (0.016, 188), (0.016, 78), (0.016, 148), (0.016, 138), (0.016, 77), (0.016, 56), (0.016, 59)]
+                    else:
+                        if context.get_value('k') <= 62.5:
+                            return [(0.226, 190), (0.226, 189), (0.122, 62), (0.122, 64), (0.055, 77), (0.055, 78), (0.037, 198), (0.036, 201), (0.036, 33), (0.024, 163), (0.018, 56), (0.018, 35), (0.018, 169), (0.006, 171)]
+                        else:
+                            return [(0.162, 35), (0.118, 33), (0.096, 189), (0.096, 190), (0.088, 169), (0.074, 62), (0.073, 56), (0.066, 171), (0.051, 198), (0.051, 201), (0.044, 59), (0.037, 64), (0.029, 63), (0.007, 0), (0.007, 77)]
+            else:
+                if context.get_value('m*n') <= 1097728.0:
+                    return [(0.403, 0), (0.179, 141), (0.134, 150), (0.086, 147), (0.051, 148), (0.048, 3), (0.024, 189), (0.020, 199), (0.017, 64), (0.010, 65), (0.010, 77), (0.007, 114), (0.003, 138), (0.003, 59), (0.003, 182)]
+                else:
+                    if context.get_value('m*n') <= 3244032.0:
+                        return [(0.295, 189), (0.176, 64), (0.157, 65), (0.090, 0), (0.069, 62), (0.059, 63), (0.046, 77), (0.039, 169), (0.023, 199), (0.020, 35), (0.013, 33), (0.010, 171), (0.003, 141)]
+                    else:
+                        if context.get_value('n') <= 136.0:
+                            return [(0.197, 189), (0.197, 63), (0.161, 77), (0.157, 62), (0.061, 33), (0.044, 65), (0.039, 35), (0.039, 64), (0.030, 169), (0.026, 0), (0.017, 199), (0.017, 148), (0.009, 56), (0.004, 3)]
+                        else:
+                            return [(0.460, 0), (0.145, 62), (0.138, 63), (0.081, 35), (0.047, 33), (0.043, 189), (0.023, 64), (0.018, 77), (0.013, 169), (0.009, 65), (0.009, 56), (0.005, 32), (0.005, 59), (0.002, 183), (0.002, 163)]

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/artifacts/_MixedMMA100.py ADDED Viewed

	@@ -0,0 +1,150 @@

+# flake8: noqa: B950
+# fmt: off
+# This file was generated by AutoHeuristic. Do not modify it manually!
+# To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/mixed_mm/
+from typing import List, Optional, Tuple
+from torch._inductor.autoheuristic.autoheuristic_utils import (
+    AHContext,
+    AHMetadata,
+    Choice,
+)
+from torch._inductor.autoheuristic.learnedheuristic_interface import (
+    LearnedHeuristicDecision,
+)
+class MixedMMA100(LearnedHeuristicDecision):  # Generated decision heuristic named 'mixed_mm'; check_precondition accepts only shared_memory == 166912 with device capability "(8, 0)" (A100 per the class name).
+    def __init__(self) -> None:
+        self.choices: List[Choice] = []  # position -> choice string; filled once by fill_choices() below
+        self.fill_choices()
+    def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:  # True only when name, shared-memory size and stringified device capability all match; `context` is not read here.
+        return (
+            metadata.name == self.get_name()
+            and metadata.shared_memory == 166912
+            and str(metadata.device_capa) == "(8, 0)"
+        )
+    def get_confidence_threshold(self) -> float:  # Constant 0.0; how the caller applies the threshold is not visible in this file.
+        return 0.0
+    def get_choice(self, idx: int) -> Optional[str]:  # choices[idx], or None when idx >= len(choices). NOTE(review): a negative idx is not rejected and would index from the end of the list.
+        if idx < len(self.choices):
+            return self.choices[idx]
+        return None
+    def fill_choices(self) -> None:  # Appends the 22 candidate choice strings in a fixed order (positions 0..21); position 0 is the extern fallback.
+        self.choices.append('extern_fallback_mixed_mm')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
+    def get_name(self) -> str:  # Identifier compared against metadata.name in check_precondition.
+        return 'mixed_mm'
+    def get_best_choices(self, context: AHContext) -> Optional[List[Tuple[float, int]]]:  # Generated decision tree over context features. A leaf is None or a list of (float, int) pairs; the int is presumably a position in self.choices (resolve via get_choice) and the float a relative weight - confirm against the caller.
+        if str(context.get_value('1LEQmLEQ16')) != 'True':  # flag features are tested through their string form; this arm is taken when the value is not 'True'
+            if context.get_value('m') <= 32.5:
+                if context.get_value('n') <= 6976.0:
+                    if context.get_value('n') <= 3520.0:
+                        if context.get_value('m*n') <= 37632.0:
+                            return None
+                        else:
+                            return [(1.000, 13)]
+                    else:
+                        if context.get_value('m*k') <= 452352.0:
+                            return [(0.590, 13), (0.256, 8), (0.103, 7), (0.051, 11)]
+                        else:
+                            return [(0.778, 8), (0.222, 13)]
+                else:
+                    if context.get_value('k*n') <= 102776832.0:
+                        if context.get_value('n') <= 14656.0:
+                            return [(1.000, 11)]
+                        else:
+                            return [(0.889, 11), (0.111, 13)]
+                    else:
+                        return [(1.000, 11)]
+            else:
+                if context.get_value('m*n') <= 446464.0:
+                    if context.get_value('m*n') <= 223424.0:
+                        if context.get_value('mat1_stride_0') <= 3968.0:  # both arms of this split return None, so it does not change the result
+                            return None
+                        else:
+                            return None
+                    else:
+                        if context.get_value('m*n') <= 346112.0:
+                            return [(0.960, 16), (0.040, 7)]
+                        else:
+                            return [(0.750, 16), (0.136, 14), (0.114, 7)]
+                else:
+                    if str(context.get_value('33LEQmLEQ64')) != 'True':
+                        if context.get_value('n') <= 6976.0:
+                            return [(1.000, 14)]
+                        else:
+                            return [(0.753, 2), (0.222, 1), (0.015, 7), (0.007, 16), (0.004, 12)]
+                    else:
+                        if context.get_value('n') <= 13888.0:
+                            return [(0.710, 14), (0.275, 21), (0.014, 12)]
+                        else:
+                            return [(0.374, 19), (0.339, 20), (0.106, 21), (0.101, 16), (0.066, 17), (0.009, 14), (0.004, 18)]
+        else:
+            if context.get_value('n') <= 3520.0:
+                if context.get_value('arith_intensity') <= 3.994754433631897:
+                    if str(context.get_value('mat2_dtype')) != 'torch.uint8':
+                        if context.get_value('m*k') <= 18944.0:
+                            return [(0.577, 5), (0.423, 6)]
+                        else:
+                            return [(0.988, 5), (0.012, 6)]
+                    else:
+                        if context.get_value('arith_intensity') <= 2.9899919033050537:  # both arms of this split return None, so it does not change the result
+                            return None
+                        else:
+                            return None
+                else:
+                    if context.get_value('arith_intensity') <= 7.956453561782837:
+                        if context.get_value('k*n') <= 9244032.0:
+                            return [(0.822, 5), (0.178, 6)]
+                        else:
+                            return [(0.977, 5), (0.023, 0)]
+                    else:
+                        if context.get_value('m*k') <= 978944.0:
+                            return [(1.000, 5)]
+                        else:
+                            return [(0.971, 5), (0.029, 0)]
+            else:
+                if context.get_value('n') <= 13632.0:
+                    if context.get_value('n') <= 6976.0:
+                        return [(1.000, 6)]
+                    else:
+                        if context.get_value('k') <= 3968.0:
+                            return [(0.617, 3), (0.111, 5), (0.099, 7), (0.086, 9), (0.062, 6), (0.025, 8)]
+                        else:
+                            return [(0.779, 8), (0.119, 5), (0.053, 7), (0.035, 6), (0.013, 3)]
+                else:
+                    if context.get_value('k*n') <= 39518208.0:
+                        return [(0.385, 4), (0.327, 3), (0.192, 6), (0.038, 7), (0.038, 10), (0.019, 5)]
+                    else:
+                        if context.get_value('n') <= 20800.0:
+                            return [(0.821, 6), (0.121, 7), (0.029, 4), (0.014, 5), (0.007, 3), (0.007, 8)]
+                        else:
+                            return [(0.530, 7), (0.386, 6), (0.046, 8), (0.021, 3), (0.015, 4), (0.002, 5)]

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/artifacts/_MixedMMH100.py ADDED Viewed

	@@ -0,0 +1,149 @@

+# flake8: noqa: B950
+# fmt: off
+# This file was generated by AutoHeuristic. Do not modify it manually!
+# To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/mixed_mm/
+from typing import List, Optional, Tuple
+from torch._inductor.autoheuristic.autoheuristic_utils import (
+    AHContext,
+    AHMetadata,
+    Choice,
+)
+from torch._inductor.autoheuristic.learnedheuristic_interface import (
+    LearnedHeuristicDecision,
+)
+class MixedMMH100(LearnedHeuristicDecision):  # Generated decision heuristic named 'mixed_mm'; check_precondition accepts only shared_memory == 232448 with device capability "(9, 0)" (H100 per the class name).
+    def __init__(self) -> None:
+        self.choices: List[Choice] = []  # position -> choice string; filled once by fill_choices() below
+        self.fill_choices()
+    def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:  # True only when name, shared-memory size and stringified device capability all match; `context` is not read here.
+        return (
+            metadata.name == self.get_name()
+            and metadata.shared_memory == 232448
+            and str(metadata.device_capa) == "(9, 0)"
+        )
+    def get_confidence_threshold(self) -> float:  # Constant 0.0; how the caller applies the threshold is not visible in this file.
+        return 0.0
+    def get_choice(self, idx: int) -> Optional[str]:  # choices[idx], or None when idx >= len(choices). NOTE(review): a negative idx is not rejected and would index from the end of the list.
+        if idx < len(self.choices):
+            return self.choices[idx]
+        return None
+    def fill_choices(self) -> None:  # Appends the 21 candidate choice strings in a fixed order (positions 0..20); position 0 is the extern fallback.
+        self.choices.append('extern_fallback_mixed_mm')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=2')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
+        self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8')
+    def get_name(self) -> str:  # Identifier compared against metadata.name in check_precondition.
+        return 'mixed_mm'
+    def get_best_choices(self, context: AHContext) -> Optional[List[Tuple[float, int]]]:  # Generated decision tree over context features. A leaf is None or a list of (float, int) pairs; the int is presumably a position in self.choices (resolve via get_choice) and the float a relative weight - confirm against the caller.
+        if context.get_value('arith_intensity') <= 15.988086223602295:  # root split of the tree
+            if context.get_value('n') <= 25280.0:
+                if context.get_value('n') <= 1344.0:
+                    if context.get_value('mat1_stride_0') <= 7808.0:
+                        return [(0.581, 7), (0.419, 6)]
+                    else:
+                        if context.get_value('m*n') <= 7680.0:
+                            return [(0.875, 0), (0.125, 6)]
+                        else:
+                            return [(0.833, 0), (0.167, 7)]
+                else:
+                    if context.get_value('n') <= 8512.0:
+                        if str(context.get_value('mat2_dtype')) != 'torch.int8':  # dtype features are tested through their string form
+                            return [(0.763, 6), (0.237, 7)]
+                        else:
+                            return [(0.725, 7), (0.275, 6)]
+                    else:
+                        if str(context.get_value('mat1_dtype')) != 'torch.bfloat16':
+                            return [(0.736, 7), (0.197, 9), (0.048, 6), (0.014, 8), (0.005, 10)]
+                        else:
+                            return [(0.473, 7), (0.398, 6), (0.097, 9), (0.032, 10)]
+            else:
+                if context.get_value('n') <= 42254.0:
+                    if context.get_value('n') <= 33856.0:
+                        if context.get_value('k*n') <= 68157440.0:
+                            return [(0.370, 4), (0.370, 5), (0.074, 7), (0.074, 8), (0.074, 11), (0.037, 6)]
+                        else:
+                            return [(0.916, 8), (0.036, 7), (0.036, 9), (0.012, 4)]
+                    else:
+                        return [(0.659, 5), (0.341, 6)]
+                else:
+                    if context.get_value('k*n') <= 326052992.0:
+                        if context.get_value('n') <= 55232.0:
+                            return [(0.571, 6), (0.321, 7), (0.036, 4), (0.036, 8), (0.036, 9)]
+                        else:
+                            return [(0.506, 6), (0.325, 8), (0.104, 7), (0.039, 5), (0.026, 9)]
+                    else:
+                        if context.get_value('n') <= 57024.0:
+                            return [(0.462, 9), (0.385, 7), (0.115, 6), (0.038, 8)]
+                        else:
+                            return [(0.598, 8), (0.223, 9), (0.107, 6), (0.071, 7)]
+        else:
+            if context.get_value('m*n') <= 543936.0:
+                if str(context.get_value('17LEQmLEQ32')) != 'True':  # flag features are tested through their string form; this arm is taken when the value is not 'True'
+                    if context.get_value('m*n') <= 262272.0:
+                        if context.get_value('n') <= 1592.5:
+                            return [(0.860, 0), (0.140, 9)]
+                        else:
+                            return None
+                    else:
+                        if context.get_value('m*k') <= 1294336.0:
+                            return [(0.833, 17), (0.150, 18), (0.017, 15)]
+                        else:
+                            return [(0.917, 17), (0.083, 8)]
+                else:
+                    if context.get_value('n') <= 12416.0:
+                        if context.get_value('m*n') <= 43008.0:
+                            return None
+                        else:
+                            return [(0.853, 14), (0.147, 9)]
+                    else:
+                        return [(0.625, 12), (0.375, 14)]
+            else:
+                if context.get_value('m') <= 32.5:
+                    if context.get_value('mat2_stride_1') <= 6656.0:
+                        if context.get_value('n') <= 69184.0:
+                            return [(0.611, 12), (0.361, 14), (0.028, 13)]
+                        else:
+                            return [(1.000, 12)]
+                    else:
+                        if context.get_value('mat2_stride_1') <= 20864.0:
+                            return [(1.000, 12)]
+                        else:
+                            return [(0.958, 12), (0.042, 9)]
+                else:
+                    if context.get_value('m*n') <= 1085440.0:
+                        if context.get_value('n') <= 9152.0:
+                            return [(1.000, 18)]
+                        else:
+                            return [(0.780, 18), (0.160, 16), (0.060, 20)]
+                    else:
+                        if context.get_value('m') <= 67.0:
+                            return [(0.650, 16), (0.203, 19), (0.122, 18), (0.016, 20), (0.008, 1)]
+                        else:
+                            return [(0.561, 3), (0.185, 16), (0.096, 20), (0.083, 19), (0.076, 2)]

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/artifacts/_PadMMA100.py ADDED Viewed

	@@ -0,0 +1,109 @@

+# flake8: noqa: B950
+# fmt: off
+# This file was generated by AutoHeuristic. Do not modify it manually!
+# To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/pad_mm/
+from torch._inductor.autoheuristic.autoheuristic_utils import AHContext, AHMetadata, Choice, CHOICE_COL
+from torch._inductor.autoheuristic.learnedheuristic_interface import (
+    LearnedHeuristicRegression,
+)
+class PadMMA100(LearnedHeuristicRegression):  # Generated regression heuristic named 'pad_mm'; check_precondition accepts only shared_memory == 166912 with device capability "(8, 0)" (A100 per the class name).
+    def __init__(self) -> None:  # No per-instance state is set up.
+        pass
+    def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:  # True only when name, shared-memory size and stringified device capability all match; `context` is not read here.
+        return (
+            metadata.name == self.get_name()
+            and metadata.shared_memory == 166912
+            and str(metadata.device_capa) == "(8, 0)"
+        )
+    def get_feedback(self, context: AHContext, choice: Choice) -> float:  # Stores `choice` in the context, then returns the tree's prediction for that context.
+        context.context_dict[CHOICE_COL] = choice  # mutates the caller's context in place; the value stays set after this call returns
+        return self.predict(context)
+    def get_confidence_threshold(self) -> float:  # Constant threshold; how the caller applies it is not visible in this file.
+        return 1.7025303314066
+    def get_name(self) -> str:  # Identifier compared against metadata.name in check_precondition.
+        return 'pad_mm'
+    def predict(self, context: AHContext) -> float:  # Generated regression tree: returns the float stored at the leaf reached by the feature tests below.
+        if str(context.get_value('choice')) != 'pad':  # presumably reads the entry get_feedback stored under CHOICE_COL (assumes CHOICE_COL == 'choice') - confirm in autoheuristic_utils
+            if str(context.get_value('using_tf32')) != 'False':  # flag features are tested through their string form
+                if context.get_value('m*n') <= 4171264.0:
+                    if context.get_value('m*k') <= 3999308.0:
+                        return 1.8751469764071178
+                    else:
+                        if str(context.get_value('n_multiple_32')) != 'True':
+                            return 0.9117231355626345
+                        else:
+                            return 1.1607689608873861
+                else:
+                    if str(context.get_value('n_multiple_2')) != 'True':
+                        if str(context.get_value('using_tf32')) != 'True':  # using_tf32 is already != 'False' on this path, so the leaf just below is reached only when it stringifies to neither 'True' nor 'False'
+                            return 0.7430382200435992
+                        else:
+                            return 0.8531269794448678
+                    else:
+                        if str(context.get_value('k_multiple_2')) != 'True':
+                            return 0.7577181972719917
+                        else:
+                            return 0.8977349440424219
+            else:
+                if context.get_value('m*n') <= 1299712.0:
+                    return 1.1669723418995592
+                else:
+                    if context.get_value('mat2_stride_1') <= 45217.5:
+                        if context.get_value('m*n') <= 55884158.0:
+                            return 1.0262769936909601
+                        else:
+                            return 1.0022677428470845
+                    else:
+                        if context.get_value('m') <= 18478.0:
+                            return 1.1127066261894312
+                        else:
+                            return 1.0337740659894263
+        else:
+            if str(context.get_value('mat1_dtype')) != 'torch.float32':
+                if str(context.get_value('n_multiple_2')) != 'False':
+                    if str(context.get_value('k_multiple_2')) != 'True':
+                        if context.get_value('mat1_stride_0') <= 561.0:
+                            return 1.2900382135142956
+                        else:
+                            return 1.5761737616057887
+                    else:
+                        if context.get_value('num_dims_needs_padding') <= 1.5:
+                            return 1.0472263310239422
+                        else:
+                            return 1.1727673465762514
+                else:
+                    if context.get_value('k') <= 28238.5:
+                        if context.get_value('k/(m*n)') <= 0.00026227018679492176:
+                            return 1.6770542505397175
+                        else:
+                            return 1.3974785435105923
+                    else:
+                        if str(context.get_value('mat1_dtype')) != 'torch.bfloat16':
+                            return 1.3952699800111992
+                        else:
+                            return 1.5759286511628336
+            else:
+                if str(context.get_value('using_tf32')) != 'False':
+                    if context.get_value('m*n') <= 14119424.0:
+                        return 0.8875772670422478
+                    else:
+                        if str(context.get_value('mat2_innermost_needs_padding')) != 'True':
+                            return 1.1467728924377265
+                        else:
+                            return 1.215842963532998
+                else:
+                    if context.get_value('arith_intensity') <= 396.8774871826172:
+                        return 0.89940161869551
+                    else:
+                        if context.get_value('mat2_stride_1') <= 45217.5:
+                            return 0.9964328169353532
+                        else:
+                            return 0.9493479238294826

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/autoheuristic.py ADDED Viewed

	@@ -0,0 +1,315 @@

+import json
+import os
+from functools import partial
+from typing import Any, Callable, Dict, List, Optional
+import torch
+from torch._inductor.autoheuristic.autoheuristic_utils import (
+    AHContext,
+    AHMetadata,
+    AHOperation,
+    Choice,
+    CHOICE_COL,
+    Feedback,
+    FEEDBACK_COL,
+    get_metadata_str_from_log,
+)
+from torch._inductor.autoheuristic.learned_heuristic_controller import (
+    LearnedHeuristicController,
+)
+from torch._inductor.ir import ChoiceCaller
+from torch._inductor.runtime.runtime_utils import cache_dir
+from torch._inductor.utils import get_gpu_shared_memory
+class LocalFeedback:
+    """
+    To be able to collect data for a choice, a function providing feedback given a choice has to be provided.
+    LocalFeedback can be used when AutoHeuristic should immediately run the function to collect feedback for each choice
+    (see pad_mm.py, where the autotuning happens locally, for an example).
+    """
+    def __init__(self, feedback_fn: Callable[[Choice], Feedback]) -> None:
+        self.feedback_fn = feedback_fn
+    def __call__(self, choice: Choice) -> Feedback:
+        return self.feedback_fn(choice)
+class InconsistentMetadata(Exception):
+    """
+    Exception that is thrown when AutoHeuristic tries to log data to a file where the metadata stored in the file does
+    not match the metadata it would store if the file didn't exist.
+    """
+class AutoHeuristic:
+    """
+    AutoHeuristic is a framework that allows one to collect data, learn a heuristic (i.e. a regression tree) and
+    generate the heuristic to code. This class allows one to collect data. The collected data can then be used to train
+    a heuristic (see torchgen/autoheuristic/).
+    """
+    # Feedback recorded so far through save_data(), keyed by choice.
+    collected_feedback: Dict[Choice, Feedback]
+    def __init__(
+        self,
+        fallback: Callable[[], Choice],
+        choices: List[Choice],
+        feedback: Optional[LocalFeedback],
+        context: AHContext,
+        name: str,
+        augment_context: Optional[List[AHOperation]] = None,
+        precondition: Optional[Callable[[AHMetadata, AHContext], bool]] = None,
+    ) -> None:
+        """
+        Initializes an instance of the AutoHeuristic class.
+        When the precondition holds and data collection is enabled for `name`, the constructor itself
+        evaluates `feedback` for every choice and appends the results to the log file.
+        Args:
+            fallback: A callable that returns a Choice when the heuristic is unsure which choice to make, or
+            AutoHeuristic is in data collection mode.
+            choices: A list of possible choices the heuristic can make.
+            feedback: An instance of LocalFeedback that provides feedback for a given choice.
+            context: Context to store with each choice and feedback.
+            name: A string that identifies the heuristic.
+            augment_context: An optional list of AHOperation instances that augment the context.
+            precondition: A callable that returns a boolean indicating whether AutoHeuristic should run.
+        """
+        self.fallback = fallback
+        self.choices = choices
+        self.feedback = feedback
+        self.context = context
+        self.name = name
+        self.collected_feedback = {}
+        self.augment_context = augment_context
+        # The metadata bundles GPU shared memory, device capability, the choices and the heuristic name.
+        self.metadata = AHMetadata(
+            get_gpu_shared_memory(),
+            torch.cuda.get_device_capability(),
+            self.choices,
+            self.name,
+        )
+        self.precondition = precondition
+        # NOTE: returning here leaves self.log_path unset, so save_data() must not be called on an
+        # instance whose precondition failed.
+        if not self.satisfies_precondition():
+            return
+        # "DEFAULT" selects a per-device path under the inductor cache dir; any other value is used verbatim.
+        if torch._inductor.config.autoheuristic_log_path == "DEFAULT":
+            self.log_path = self.get_default_log_path()
+        else:
+            self.log_path = torch._inductor.config.autoheuristic_log_path
+        # Data collection mode: measure every choice right away and log the result.
+        if torch._inductor.config.collect_autoheuristic(self.name):
+            if self.feedback is not None:
+                for choice in self.choices:
+                    feedback_val = self.feedback(choice)
+                    self.save_data(choice, feedback_val)
+    def satisfies_precondition(self) -> bool:
+        """Returns True when no precondition was given or the precondition accepts (metadata, context)."""
+        return self.precondition is None or self.precondition(
+            self.metadata, self.context
+        )
+    def get_choice(self) -> Choice:
+        """
+        Returns the chosen option based on the value of autoheuristic_use.
+        If self.name is one of the comma separated strings in autoheuristic_use,
+        it queries a learned heuristic to make a decision. Otherwise, it returns the fallback option.
+        The fallback is also returned when the precondition fails or the decision is not one of self.choices.
+        """
+        if not self.satisfies_precondition():
+            return self.fallback()
+        if torch._inductor.config.use_autoheuristic(self.name):
+            # Derived features are added to the context before the learned heuristic is queried.
+            if self.augment_context is not None:
+                self.context.apply_operations(self.augment_context)
+            controller = LearnedHeuristicController(
+                self.metadata,
+                self.context,
+            )
+            decision = controller.get_decision()
+            # A None decision normally ends up here as well, assuming None is never one of the choices.
+            if decision not in self.choices:
+                # TODO(AlnisM): We might want to allow this in the future
+                return self.fallback()
+            if decision is not None:
+                return decision
+        return self.fallback()
+    def get_top_k_choices(
+        self, top_k: int, always_included: Optional[List[str]] = None
+    ) -> Optional[List[Choice]]:
+        """
+        Returns the ranked choices reported by the learned heuristic (the controller is asked for top_k),
+        extended by every entry of always_included that is not already present. Returns None when the
+        precondition fails, the heuristic is not enabled for self.name, or no ranking is available.
+        """
+        if not self.satisfies_precondition():
+            return None
+        if torch._inductor.config.use_autoheuristic(self.name):
+            if self.augment_context is not None:
+                self.context.apply_operations(self.augment_context)
+            controller = LearnedHeuristicController(
+                self.metadata,
+                self.context,
+            )
+            choices = controller.get_decisions_ranked(top_k)
+            if choices is None:
+                return None
+            # Appending can make the result longer than top_k.
+            if always_included is not None:
+                for choice in always_included:
+                    if choice not in choices:
+                        choices.append(choice)
+            return choices
+        return None
+    def get_collected_feedback(self, choice: Choice) -> Any:
+        """Returns the feedback recorded for choice, or None if nothing was recorded for it."""
+        return self.collected_feedback.get(choice, None)
+    @staticmethod
+    def get_device_identifier() -> str:
+        """Returns the CUDA device name with spaces replaced by underscores."""
+        # a heuristic might work well for one GPU, but not for another
+        # we store the collected data per GPU model and learn a heuristic per GPU model
+        # TODO(AlnisM): just using the device name for now, but the same GPU model can have different names
+        device_name = torch.cuda.get_device_name().replace(" ", "_")
+        return device_name
+    def get_default_log_path(self) -> str:
+        """
+        Returns <cache_dir>/autoheuristic/<device identifier>/<name>.txt and creates the directory if needed.
+        """
+        device_name = self.get_device_identifier()
+        path = f"{cache_dir()}/autoheuristic/{device_name}/"
+        os.makedirs(path, exist_ok=True)
+        path += f"{self.name}.txt"
+        return path
+    def serialize_metadata(self) -> str:
+        """
+        Returns the metadata as a JSON string, extended by the names of the numerical and categorical
+        features of the context.
+        """
+        metadata_dict = self.metadata.to_dict()
+        (
+            num_features,
+            cat_features,
+        ) = self.context.get_numerical_and_categorical_features()
+        metadata_dict["numerical_features"] = num_features
+        metadata_dict["categorical_features"] = cat_features
+        return json.dumps(metadata_dict)
+    def save_data(self, choice: Choice, feedback_val: Feedback) -> None:
+        """
+        Records feedback_val for choice in memory and appends one CSV line (feature values, choice, feedback)
+        to the log file. A new log file starts with the serialized metadata followed by a CSV header.
+        Raises:
+            InconsistentMetadata: if the log file exists and its first line differs from the current metadata.
+        """
+        self.collected_feedback[choice] = feedback_val
+        log_path = self.log_path
+        lines = []
+        log_exists = os.path.exists(log_path)
+        if log_exists:
+            # if log already exists, make sure it is consistent
+            metadata = self.serialize_metadata()
+            existing_metadata = get_metadata_str_from_log(self.log_path)
+            if existing_metadata != metadata:
+                raise InconsistentMetadata(
+                    "Given metadata does not match existing metadata"
+                )
+        else:
+            # first write to this file: emit the metadata line and the column header
+            lines.append(self.serialize_metadata())
+            feature_header = self.context.get_feature_names_csv()
+            header = feature_header + "," + CHOICE_COL + "," + FEEDBACK_COL
+            lines.append(header)
+        line = ""
+        feature_values = self.context.get_feature_values_csv()
+        line += feature_values + "," + choice + "," + str(feedback_val)
+        lines.append(line)
+        # append mode keeps the data logged by earlier calls
+        with open(log_path, "a") as f:
+            f.write("\n".join(lines) + "\n")
+class AutoHeuristicSelectAlgorithm(AutoHeuristic):
+    """
+    AutoHeuristicSelectAlgorithm is a subclass of AutoHeuristic that allows one to collect data and learn a heuristic
+    when one wants to use AutoHeuristic for kernel choice selection.
+    Choices are ChoiceCaller objects here; the base class only sees their autoheuristic_id() strings.
+    """
+    def __init__(
+        self,
+        fallback: Callable[[], Optional[ChoiceCaller]],
+        choices: List[ChoiceCaller],
+        input_nodes: List[Any],
+        context: AHContext,
+        name: str,
+        augment_context: Optional[List[AHOperation]] = None,
+        precondition: Optional[Callable[[AHMetadata, AHContext], bool]] = None,
+    ) -> None:
+        """
+        The arguments choices, input_nodes and name have to match the ones used in the call to
+        autotune_select_algorithm(), e.g. if the following call is made
+        autotune_select_algorithm(name, choices, input_nodes, layout), the same name, choices and input_nodes
+        have to be used here.
+        """
+        self.input_nodes = input_nodes
+        # Maps the string id of each choice back to its ChoiceCaller; a later choice with the same id
+        # overwrites an earlier one.
+        self.choicestr2choice: Dict[str, ChoiceCaller] = {}
+        for choice in choices:
+            self.choicestr2choice[choice.autoheuristic_id()] = choice
+        choices_str = list(self.choicestr2choice.keys())
+        # Adapts the ChoiceCaller-returning fallback to the string-returning one the base class expects.
+        def fallback_str() -> str:
+            fallback_choice = fallback()
+            if fallback_choice is None:
+                # TODO: Find a nicer way to handle this
+                return "unsure"
+            return fallback_choice.autoheuristic_id()
+        # No LocalFeedback is passed (None): feedback arrives through the callback registered below.
+        super().__init__(
+            fallback_str,
+            choices_str,
+            None,
+            context,
+            name,
+            augment_context,
+            precondition,
+        )
+        if (
+            torch._inductor.config.collect_autoheuristic(self.name)
+            and self.satisfies_precondition()
+        ):
+            self.register_global_feedback(input_nodes, choices)
+    def register_global_feedback(
+        self, input_nodes: List[Any], choices: List[ChoiceCaller]
+    ) -> None:
+        """
+        Registers a callback in select_algorithm, which is called with the timing of each choice.
+        """
+        # Imported here rather than at module level; presumably to avoid an import cycle -- TODO confirm.
+        from torch._inductor.select_algorithm import (
+            add_feedback_saver,
+            create_inputs_key,
+            create_precompile_key,
+        )
+        def store_global_feedback(
+            ah_inputs_key: str,
+            ah_precompile_key: str,
+            timings: Dict[ChoiceCaller, float],
+            name: str,
+            input_nodes: List[Any],
+            choices: List[ChoiceCaller],
+        ) -> None:
+            # Timings are ignored unless both keys computed for this call equal the keys bound at
+            # registration time (the first two arguments, fixed via functools.partial below).
+            current_inputs_key = create_inputs_key(input_nodes)
+            if current_inputs_key != ah_inputs_key:
+                return
+            current_precompile_key = create_precompile_key(
+                name, current_inputs_key, choices
+            )
+            if current_precompile_key != ah_precompile_key:
+                return
+            for choice, time in timings.items():
+                self.save_data(choice.autoheuristic_id(), time)
+        inputs_key = create_inputs_key(input_nodes)
+        precompile_key = create_precompile_key(self.name, inputs_key, choices)
+        feedback_saver = partial(store_global_feedback, inputs_key, precompile_key)
+        add_feedback_saver(feedback_saver)
+    def get_choice_caller(self) -> Optional[ChoiceCaller]:
+        """
+        Returns the ChoiceCaller belonging to get_choice(), or None when the returned string (e.g. the
+        "unsure" fallback) does not correspond to a known choice.
+        """
+        choice = self.get_choice()
+        return self.choicestr2choice.get(choice, None)
+    def get_top_k_choices_caller(
+        self, top_k: int, always_included: Optional[List[str]] = None
+    ) -> Optional[List[ChoiceCaller]]:
+        """
+        Returns the ChoiceCaller objects for get_top_k_choices(), or None when no ranking is available.
+        """
+        choices = self.get_top_k_choices(top_k, always_included)
+        if choices is None:
+            return None
+        # NOTE: plain indexing raises KeyError if an id (e.g. one from always_included) is unknown.
+        return [self.choicestr2choice[choice] for choice in choices]

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/autoheuristic_utils.py ADDED Viewed

	@@ -0,0 +1,339 @@

+import functools
+from typing import Any, Callable, Dict, List, Tuple
+import torch
+Feedback = float
+Choice = str
+Value = Any
+CHOICE_COL = "choice"
+FEEDBACK_COL = "feedback"
+class AHFeature:
+    """
+    The context, that AutoHeuristic stores, is a list of features. AutoHeuristic needs to know whether a feature is
+    categorical (i.e., not a continuous variable) to learn a machine learning model.
+    """
+    def __init__(self, name: str, value: Value, is_categorical: bool = False) -> None:
+        self.name = name
+        self.value = value
+        self.is_categorical = is_categorical
+class AHOperation:
+    """
+    AHOperation can be used to augment the data collected by AutoHeuristic.
+    One might for example store features like m, k, n, but also want to use
+    features like m*n, or k*n, to learn a heuristic. Instead of storing features
+    that can be created from the collected data, one can use AHOperation to
+    create new features from the collected data.
+    """
+    def __init__(
+        self, name: str, func: Callable[[Any], Value], is_categorical: bool = False
+    ) -> None:
+        self.name = name
+        self.func = func
+        self.is_categorical = is_categorical
+    def apply_operation(self, data: Any) -> None:
+        data[self.name] = self.func(data)
+class AHContext:
+    """
+    This class is used to specify which information AutoHeuristic should store. For each choice, AutoHeursitic will
+    store the context and the collected feedback. The context could be something like the shape of a tensor, i.e.,
+    information that will help to learn a heuristic.
+    """
+    features: List[AHFeature]
+    context_dict: Dict[str, Value]
+    def __init__(self) -> None:
+        self.features = []
+        self.context_dict = {}
+    def add_feature(
+        self, name: str, value: Value, is_categorical: bool = False
+    ) -> None:
+        self.features.append(AHFeature(name, value, is_categorical=is_categorical))
+        self.context_dict[name] = value
+    def get_numerical_and_categorical_features(self) -> Tuple[List[str], List[str]]:
+        numerical_features = []
+        categorical_features = []
+        for feature in self.features:
+            if feature.is_categorical:
+                categorical_features.append(feature.name)
+            else:
+                numerical_features.append(feature.name)
+        return numerical_features, categorical_features
+    def get_feature_names_csv(self) -> str:
+        return ",".join(feature.name for feature in self.features)
+    def get_feature_values_csv(self) -> str:
+        return ",".join(str(feature.value) for feature in self.features)
+    def get_value(self, name: str) -> Value:
+        return self.context_dict[name]
+    def apply_operations(self, operations: List[AHOperation]) -> None:
+        for op in operations:
+            op.apply_operation(self.context_dict)
+class AHMetadata:
+    def __init__(
+        self,
+        shared_memory: Any,
+        device_capa: Tuple[int, int],
+        choices: List[Choice],
+        name: str,
+    ) -> None:
+        # use amount of shared_memory and device_capability to identify GPU
+        # TODO(AlnisM): there might be a better way to do this
+        self.shared_memory = shared_memory
+        self.device_capa = device_capa
+        self.choices = choices
+        self.name = name
+    def to_dict(self) -> Dict[str, Value]:
+        return {
+            "shared_memory": self.shared_memory,
+            "device_capa": self.device_capa,
+            "name": self.name,
+        }
+def get_metadata_str_from_log(log_path: str) -> str:
+    with open(log_path, newline="") as file:
+        json_string = file.readline().strip()
+        return json_string
+def check_minsize(context: AHContext, minsize: int) -> bool:
+    return (
+        context.get_value("m") >= minsize
+        and context.get_value("k") >= minsize
+        and context.get_value("n") >= minsize
+    )
+def pad_mm_precondition(metadata: AHMetadata, context: AHContext) -> bool:
+    if metadata.shared_memory == 166912 and metadata.device_capa == (8, 0):
+        # A100 precondition
+        return check_minsize(context, 512)
+    elif metadata.shared_memory == 232448 and metadata.device_capa == (9, 0):
+        # H100 precondition
+        return check_minsize(context, 768)
+    return True
+def get_mixedmm_precondition(metadata: AHMetadata, context: AHContext) -> bool:
+    m = context.get_value("m")
+    k = context.get_value("k")
+    n = context.get_value("n")
+    if m > 128 or k < 1024 or n < 1024:
+        return False
+    mat1_iscontig = context.get_value("mat1_iscontig")
+    mat2_iscontig = context.get_value("mat2_iscontig")
+    return mat1_iscontig and not mat2_iscontig
+def get_mult_dims_ops() -> List[AHOperation]:
+    m_times_k_op = AHOperation("m*k", lambda data: data["m"] * data["k"])
+    m_times_n_op = AHOperation("m*n", lambda data: data["m"] * data["n"])
+    k_times_n_op = AHOperation("k*n", lambda data: data["k"] * data["n"])
+    return [m_times_k_op, m_times_n_op, k_times_n_op]
+def get_arith_intensity(data: Any) -> float:
+    m = data["m"]
+    k = data["k"]
+    n = data["n"]
+    if m == 0 or k == 0 or n == 0:
+        return 0.0
+    return m * k * n / (m * k + k * n + m * n)
+def pad_mm_operations() -> List[AHOperation]:
+    mult_dims_ops = get_mult_dims_ops()
+    k_div_m_times_n_op = AHOperation(
+        "k/(m*n)", lambda data: data["k"] / (data["m"] * data["n"])
+    )
+    def bfloat_perf_hit(data: Any) -> bool:
+        m = data["m"]
+        k = data["k"]
+        n = data["n"]
+        is_bfloat = str(data["mat1_dtype"]) == "torch.bfloat16"
+        return k > (m * 1024) and k > (n * 1024) and is_bfloat
+    bfloat_perf_hit_op = AHOperation(
+        "bfloat_perf_hit", bfloat_perf_hit, is_categorical=True
+    )
+    arith_intensity_op = AHOperation("arith_intensity", get_arith_intensity)
+    dims_need_padding_ops = get_dims_need_padding_ops()
+    dims_multiple_ops = get_dims_multiple_ops()
+    is_contig_ops = get_is_contig_ops()
+    ah_operations = mult_dims_ops + [
+        k_div_m_times_n_op,
+        bfloat_perf_hit_op,
+        arith_intensity_op,
+    ]
+    ah_operations.extend(dims_need_padding_ops)
+    ah_operations.extend(dims_multiple_ops)
+    ah_operations.extend(is_contig_ops)
+    return ah_operations
+def between_op(data: Any, dim: str, lower: int, upper: int) -> bool:
+    return data[dim] >= lower and data[dim] <= upper
+def between_ops() -> List[AHOperation]:
+    dims = ["m", "k", "n"]
+    limits = [(1, 16), (17, 32), (33, 64), (65, 128), (129, 256)]
+    ah_operations = []
+    for dim in dims:
+        for lower, upper in limits:
+            between_op_fn = functools.partial(
+                between_op, dim=dim, lower=lower, upper=upper
+            )
+            # using 'LEQ' instead of '<=' because '<=' cannot be exported to dot
+            between_op_name = f"{lower}LEQ{dim}LEQ{upper}"
+            ah_operations.append(
+                AHOperation(between_op_name, between_op_fn, is_categorical=True)
+            )
+    return ah_operations
+def pow2_op(data: Any, dim: str, exponent: int) -> bool:
+    return data[dim] == 2**exponent
+def mm_operations() -> List[AHOperation]:
+    mult_dims_ops = get_mult_dims_ops()
+    arith_intensity_op = AHOperation("arith_intensity", get_arith_intensity)
+    return mult_dims_ops + [arith_intensity_op]
+def mixed_mm_operations() -> List[AHOperation]:
+    return mm_operations() + between_ops()
+def is_multiple(data: Any, dim: str, mult: int) -> bool:
+    return data[dim] % mult == 0
+def get_dims_multiple_ops() -> List[AHOperation]:
+    multiples = [2, 4, 8, 16, 32]
+    dims = ["m", "k", "n"]
+    dims_multiple_ops = []
+    for dim in dims:
+        for mult in multiples:
+            is_multiple_fn = functools.partial(is_multiple, dim=dim, mult=mult)
+            dims_multiple_op = AHOperation(
+                f"{dim}_multiple_{mult}", is_multiple_fn, is_categorical=True
+            )
+            dims_multiple_ops.append(dims_multiple_op)
+    return dims_multiple_ops
+def get_dims_need_padding_ops() -> List[AHOperation]:
+    def mat1_innermost_needs_padding_fn(data: Any) -> bool:
+        mat1_stride_0 = data["mat1_stride_0"]
+        mat1_stride_1 = data["mat1_stride_1"]
+        m_padded_length = data["m_padded_length"]
+        k_padded_length = data["k_padded_length"]
+        mat1_innermost_needs_padding = False
+        if mat1_stride_0 == 1 and m_padded_length != 0:
+            mat1_innermost_needs_padding = True
+        if mat1_stride_1 == 1 and k_padded_length != 0:
+            mat1_innermost_needs_padding = True
+        return mat1_innermost_needs_padding
+    mat1_innermost_op = AHOperation(
+        "mat1_innermost_needs_padding",
+        mat1_innermost_needs_padding_fn,
+        is_categorical=True,
+    )
+    def mat2_innermost_needs_padding_fn(data: Any) -> bool:
+        mat2_stride_0 = data["mat2_stride_0"]
+        mat2_stride_1 = data["mat2_stride_1"]
+        k_padded_length = data["k_padded_length"]
+        n_padded_length = data["n_padded_length"]
+        mat2_innermost_needs_padding = False
+        if mat2_stride_0 == 1 and k_padded_length != 0:
+            mat2_innermost_needs_padding = True
+        if mat2_stride_1 == 1 and n_padded_length != 0:
+            mat2_innermost_needs_padding = True
+        return mat2_innermost_needs_padding
+    mat2_innermost_op = AHOperation(
+        "mat2_innermost_needs_padding",
+        mat2_innermost_needs_padding_fn,
+        is_categorical=True,
+    )
+    def num_dims_needs_padding_fn(data: Any) -> int:
+        m_padded_length = data["m_padded_length"]
+        k_padded_length = data["k_padded_length"]
+        n_padded_length = data["n_padded_length"]
+        num_dims_needs_padding = 0
+        if m_padded_length != 0:
+            num_dims_needs_padding += 1
+        if k_padded_length != 0:
+            num_dims_needs_padding += 1
+        if n_padded_length != 0:
+            num_dims_needs_padding += 1
+        return num_dims_needs_padding
+    num_dims_op = AHOperation("num_dims_needs_padding", num_dims_needs_padding_fn)
+    return [mat1_innermost_op, mat2_innermost_op, num_dims_op]
+def get_is_contig_ops() -> List[AHOperation]:
+    def mat1_is_contig_fn(data: Any) -> bool:
+        stride_0 = data["mat1_stride_0"]
+        stride_1 = data["mat1_stride_1"]
+        k = data["k"]
+        return stride_0 == k and stride_1 == 1
+    mat1_is_contig_op = AHOperation(
+        "mat1_iscontig", mat1_is_contig_fn, is_categorical=True
+    )
+    def mat2_is_contig_fn(data: Any) -> bool:
+        stride_0 = data["mat2_stride_0"]
+        stride_1 = data["mat2_stride_1"]
+        n = data["n"]
+        return stride_0 == n and stride_1 == 1
+    mat2_is_contig_op = AHOperation(
+        "mat2_iscontig", mat2_is_contig_fn, is_categorical=True
+    )
+    return [mat1_is_contig_op, mat2_is_contig_op]
+def context_add_strides(context: AHContext, name: str, stride: Tuple[int, ...]) -> None:
+    for i, s in enumerate(stride):
+        context.add_feature(f"{name}_stride_{i}", s)
+def context_add_using_tf32(context: AHContext, dtype: torch.dtype) -> None:
+    using_tf32 = "not_float_32"
+    if dtype == torch.float32:
+        using_tf32 = torch.backends.cuda.matmul.allow_tf32
+    context.add_feature("using_tf32", using_tf32, is_categorical=True)

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/learned_heuristic_controller.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import importlib
+import inspect
+import pkgutil
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
+from torch._inductor.autoheuristic.autoheuristic_utils import (
+    AHContext,
+    AHMetadata,
+    Choice,
+)
+from torch._inductor.autoheuristic.learnedheuristic_interface import LearnedHeuristic
+def find_and_instantiate_subclasses(
+    package_name: str, base_class: Any
+) -> List[LearnedHeuristic]:
+    instances = []
+    package = importlib.import_module(package_name)
+    for _, module_name, _ in pkgutil.walk_packages(
+        package.__path__, package.__name__ + "."
+    ):
+        try:
+            module_basename = module_name.split(".")[-1]
+            if not module_basename.startswith("_"):
+                # learned heuristics start with an underscore
+                continue
+            module = importlib.import_module(module_name)
+            # look for classes that are subclasses of base_class
+            for name, obj in inspect.getmembers(module):
+                if (
+                    inspect.isclass(obj)
+                    and issubclass(obj, base_class)
+                    and obj != base_class
+                ):
+                    instance = obj()
+                    instances.append(instance)
+        except Exception as e:
+            print(f"Error processing module {module_name}: {e}")
+    return instances
+class LearnedHeuristicController:
+    """
+    Class that finds and instantiates all learned heuristics. It also provides
+    a way to get the decision of a learned heuristic.
+    """
+    existing_heuristics: Dict[str, List[LearnedHeuristic]] = defaultdict(list)
+    """
+    A dictionary that stores all the learned heuristics for each optimization.
+    The key is the optimization name, and the value is a list of LearnedHeuristic objects.
+    """
+    heuristics_initialized: bool = False
+    """
+    A flag that indicates whether the learned heuristics have been initialized.
+    Set to true when the get_decision() function is called for the first time.
+    """
+    def __init__(
+        self,
+        metadata: AHMetadata,
+        context: AHContext,
+    ) -> None:
+        self.metadata = metadata
+        self.context = context
+    def get_heuristics(self, name: str) -> List[LearnedHeuristic]:
+        """
+        Returns a list of learned heuristics for the given optimization name.
+        """
+        if not LearnedHeuristicController.heuristics_initialized:
+            # learned heuristics are generated into the following package
+            learned_heuristics_package = "torch._inductor.autoheuristic.artifacts"
+            # learned heuristics have to be of type LearnedHeuristic
+            base_class = LearnedHeuristic
+            found_heuristics = find_and_instantiate_subclasses(
+                learned_heuristics_package, base_class
+            )
+            for learned_heuristic in found_heuristics:
+                opt_name = learned_heuristic.get_name()
+                LearnedHeuristicController.existing_heuristics[opt_name].append(
+                    learned_heuristic
+                )
+            LearnedHeuristicController.heuristics_initialized = True
+        return LearnedHeuristicController.existing_heuristics[name]
+    def get_decision(self) -> Optional[Choice]:
+        """
+        Returns the decision made by the learned heuristic or None if no heuristic was found or the heuristic is unsure
+        which choice to make.
+        """
+        heuristics = self.get_heuristics(self.metadata.name)
+        for heuristic in heuristics:
+            if heuristic.check_precondition(self.metadata, self.context):
+                return heuristic.get_decision(self.context, self.metadata.choices)
+        return None
+    def get_decisions_ranked(self, top_k: int) -> Optional[List[Choice]]:
+        heuristics = self.get_heuristics(self.metadata.name)
+        for heuristic in heuristics:
+            if heuristic.check_precondition(self.metadata, self.context):
+                choices = heuristic.get_decisions_ranked(self.context)
+                if choices is None:
+                    return None
+                avail_choices = [
+                    choice for choice in choices if choice in self.metadata.choices
+                ]
+                return avail_choices[:top_k]
+        return None

.venv/lib/python3.11/site-packages/torch/_inductor/autoheuristic/learnedheuristic_interface.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from typing import List, Optional, Tuple
+from torch._inductor.autoheuristic.autoheuristic_utils import (
+    AHContext,
+    AHMetadata,
+    Choice,
+)
+class LearnedHeuristic:
+    """
+    LearnedHeuristic is a base class for all learned heuristics.
+    """
+    def __init__(self) -> None:
+        pass
+    def check_precondition(
+        self,
+        metadata: AHMetadata,
+        context: AHContext,
+    ) -> bool:
+        return True
+    def get_decision(
+        self, context: AHContext, choices: List[Choice]
+    ) -> Optional[Choice]:
+        return None
+    def get_confidence_threshold(self) -> float:
+        return 1.0
+    def get_name(self) -> str:
+        return ""
+    def get_decisions_ranked(self, context: AHContext) -> Optional[List[str]]:
+        return None
+class LearnedHeuristicRegression(LearnedHeuristic):
+    def __init__(self) -> None:
+        super().__init__()
+    def get_feedback(self, context: AHContext, choice: Choice) -> float:
+        return 1.0
+    def get_decision(
+        self, context: AHContext, choices: List[Choice]
+    ) -> Optional[Choice]:
+        choice2feedback = {}
+        for choice in choices:
+            predicted_feedback = self.get_feedback(context, choice)
+            choice2feedback[choice] = predicted_feedback
+        sorted_choices_feedback = sorted(choice2feedback.items(), key=lambda t: t[1])
+        highest_feedback = sorted_choices_feedback[-1][1]
+        second_highest_feedback = sorted_choices_feedback[-2][1]
+        if highest_feedback / second_highest_feedback > self.get_confidence_threshold():
+            return sorted_choices_feedback[-1][0]
+        # We are not sure which choice is the best one
+        return None
+class LearnedHeuristicDecision(LearnedHeuristic):
+    def __init__(self) -> None:
+        super().__init__()
+    def get_choice(self, idx: int) -> Optional[str]:
+        return None
+    def get_decision(
+        self, context: AHContext, choices: List[Choice]
+    ) -> Optional[Choice]:
+        best_choices = self.get_best_choices(context)
+        if not best_choices:
+            return None
+        (best_choice_proba, best_choice_idx) = best_choices[0]
+        if best_choice_proba <= self.get_confidence_threshold():
+            return None
+        return self.get_choice(best_choice_idx)
+    def get_decisions_ranked(self, context: AHContext) -> Optional[List[str]]:
+        feedback_idx_list = self.get_best_choices(context)
+        if feedback_idx_list is None:
+            return None
+        choices = [
+            self.get_choice(feedback_idx[1]) for feedback_idx in feedback_idx_list
+        ]
+        choices = [choice for choice in choices if choice is not None]
+        return choices
+    def get_best_choices(self, context: AHContext) -> Optional[List[Tuple[float, int]]]:
+        return []

.venv/lib/python3.11/site-packages/torch/_inductor/autotune_process.py ADDED Viewed

	@@ -0,0 +1,876 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+import contextlib
+import ctypes
+import dataclasses
+import functools
+import logging
+import os
+import queue
+import time
+import warnings
+from concurrent.futures import ThreadPoolExecutor
+from ctypes import byref, c_size_t, c_void_p, CDLL
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Sequence,
+    TYPE_CHECKING,
+    Union,
+)
+import torch
+import torch._inductor.async_compile  # noqa: F401 required to warm up AsyncCompile pools
+from torch import multiprocessing
+from torch._dynamo.testing import rand_strided
+from torch._inductor import ir
+from torch._inductor.codecache import (
+    CppCodeCache,
+    CUDACodeCache,
+    DLLWrapper,
+    get_hash,
+    PyCodeCache,
+)
+if TYPE_CHECKING:
+    from multiprocessing.process import BaseProcess
+    from multiprocessing.queues import Queue
+    from types import ModuleType
+    from torch._inductor.select_algorithm import TritonTemplateCaller
+from . import config
+from .runtime.benchmarking import benchmarker
+from .virtualized import V
+CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"  # name of the environment variable read and written in this module
+EXIT_HANDLER_REGISTERED = False  # flipped to True once TuningProcessPool.initialize() registers its atexit hook
+log = logging.getLogger(__name__)  # module-level logger
+# Used to synchronize between parent and child processes
+class Ping:
+    pass
+class Pong:
+    pass
+class NonzeroWorkspaceNotSupportedError(Exception):
+    pass
+@contextlib.contextmanager
+def set_cuda_visible_device(device: Optional[int]):
+    """
+    Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the
+    specified single device. If device is None, don't manipulate the environment.
+    """
+    if device is None:
+        yield
+        return
+    current = os.environ.get(CUDA_VISIBLE_DEVICES)
+    os.environ[CUDA_VISIBLE_DEVICES] = str(device)
+    try:
+        yield
+    finally:
+        if current is None:
+            del os.environ[CUDA_VISIBLE_DEVICES]
+        else:
+            os.environ[CUDA_VISIBLE_DEVICES] = current
+@dataclasses.dataclass
+class TuningProcess:
+    """
+    Abstraction for launching a helper process to benchmark kernels. Spawns
+    the parent process and uses multiprocessing queues to send benchmark
+    requests and return results.
+    """
+    device: Optional[int] = None
+    process: Optional[BaseProcess] = None
+    request_queue: Optional[Queue[Any]] = None
+    response_queue: Optional[Queue[Any]] = None
+    @staticmethod
+    def process_main(
+        request_queue: Queue[Any],
+        response_queue: Queue[Any],
+    ) -> None:
+        """
+        Entry point for the child process.
+        """
+        log.debug(
+            "Entering TuningProcess child. Visible devices = %s",
+            os.environ.get(CUDA_VISIBLE_DEVICES),
+        )
+        try:
+            TuningProcess.workloop(request_queue, response_queue)
+        except Exception as ex:
+            log.exception("Exception in TuningProcess")
+    @staticmethod
+    def workloop(request_queue: Queue[Any], response_queue: Queue[Any]) -> None:
+        """
+        Work loop for the benchmarking subprocess.
+        """
+        while True:
+            obj = request_queue.get()
+            if obj is None:
+                break  # None is a sentinel for the child to terminate
+            elif isinstance(obj, Ping):
+                response_queue.put(Pong())
+            elif isinstance(obj, BenchmarkRequest):
+                response_queue.put(obj.benchmark())
+            else:
+                raise RuntimeError(f"Invalid request type {type(obj)}")
+    def valid(self) -> bool:
+        """
+        True if the sub-process has been initialized.
+        """
+        return (
+            self.process is not None
+            and self.request_queue is not None
+            and self.response_queue is not None
+        )
+    def clear(self) -> None:
+        """
+        Reset to an uninitialized state.
+        """
+        self.process = self.request_queue = self.response_queue = None
+    def initialize(self) -> None:
+        """
+        Create child process, request/response queues, and do the warm up.
+        Set the environment to make only the provided GPU device visible
+        to the process.
+        """
+        if self.valid():
+            return
+        # cuda runtime does not work with "fork", use "spawn" to start processes.
+        ctx = multiprocessing.get_context("spawn")
+        self.request_queue = ctx.Queue()
+        self.response_queue = ctx.Queue()
+        self.process = ctx.Process(
+            target=self.process_main,
+            args=(
+                self.request_queue,
+                self.response_queue,
+            ),
+        )
+        assert self.process is not None
+        with set_cuda_visible_device(self.device):
+            self.process.start()
+    def put(self, obj: Any) -> None:
+        """
+        Push a work item to the child process.
+        """
+        # In case of a prior crash, ensure the subprocess is running
+        self.initialize()
+        assert self.request_queue is not None
+        self.request_queue.put(obj)
+    def get(
+        self, result_timeout=120.0, graceful_timeout=3.0, terminate_timeout=1.0
+    ) -> Any:
+        """
+        Get a response from the child process. Raises queue.Empty on timeout
+        or if the process dies.
+        This method is (so far) only used by TuningProcessPool, where torch._inductor.config entries are being used
+        to populate the timeouts:
+        Arguments:
+            @param result_timeout: Timeout in seconds, defaults to 120.0 or to
+                                   config.max_autotune_subproc_result_timeout_seconds when called by TuningProcessPool
+            @param graceful_timeout: Timeout in seconds to allow graceful shutdown (SIGTERM is sent after this time).
+                                    Defaults to 3.0 or to config.max_autotune_subproc_graceful_timeout_seconds
+            @param terminate_timeout: Timeout in seconds after SIGTERM, until we send SIGKILL if the process
+                                      remains alive. Defaults to 1.0 or to
+                                      config.max_autotune_subproc_terminate_timeout_seconds.
+        Returns:
+            A response from the child process (Any type)
+        """
+        assert self.process is not None
+        assert self.response_queue is not None
+        while True:
+            try:
+                remaining_timeout = result_timeout
+                res = None
+                while remaining_timeout is not None and remaining_timeout >= 1.0:
+                    remaining_timeout -= 0.5
+                    try:
+                        res = self.response_queue.get(timeout=0.5)
+                        break
+                    except queue.Empty:
+                        if not self.process.is_alive():
+                            raise  # is being caught a few lines below
+                if res is None:
+                    res = self.response_queue.get(timeout=remaining_timeout)
+                return res
+            except queue.Empty:
+                status = self.process.exitcode
+                if status is None:
+                    self.kill(
+                        graceful_timeout=graceful_timeout,
+                        terminate_timeout=terminate_timeout,
+                    )
+                else:
+                    # child process crashed
+                    self.clear()
+                raise
+    def terminate(self) -> None:
+        """
+        Signal the child process to terminate.
+        """
+        if self.valid():
+            assert self.process is not None
+            assert self.request_queue is not None
+            self.request_queue.put(None)
+    def wait(self) -> None:
+        """
+        Wait for the child process to exit.
+        """
+        if self.process is not None:
+            self.process.join()
+            self.clear()
+    def kill(self, graceful_timeout=5.0, terminate_timeout=1.0) -> None:
+        # Tries to kill the process, using a graceful_timeout in which the process
+        # is allowed to exit gracefully. If the process is still alive,
+        # it will be terminated. If that is not sufficient to end it
+        # within terminate_timeout seconds, it will be killed.
+        if self.process is not None:
+            self.terminate()
+            self.process.join(timeout=graceful_timeout)
+            if self.process.is_alive():
+                log.warning(
+                    "Sending SIGTERM to process with PID %d",
+                    self.process.pid,
+                )
+                self.process.terminate()
+                self.process.join(timeout=terminate_timeout)
+                if self.process.is_alive():
+                    log.error(
+                        "Sending SIGKILL to process with PID %d",
+                        self.process.pid,
+                    )
+                    self.process.kill()  # This should definitely end the process
+            self.clear()
+@dataclasses.dataclass
+class TuningProcessPool:
+    """
+    Maintains a pool of TuningProcesses to benchmark kernels in parallel
+    across devices. By default, we create one TuningProcess per device and
+    set the sub-process environment to make only that device visible.
+    """
+    processes: Optional[queue.Queue[TuningProcess]] = None
+    executor: Optional[ThreadPoolExecutor] = None
+    def initialize(self) -> None:
+        """
+        Start the child processes.
+        """
+        assert (self.processes is None) == (self.executor is None)
+        if self.processes is not None:
+            return
+        devices = self.get_device_list()
+        log.debug("Sub-process autotune device list: %s", devices)
+        # Launch the child processes and push a msg to "warm up"
+        self.processes = queue.Queue()
+        for device in devices:
+            p = TuningProcess(device=device)
+            p.initialize()
+            p.put(Ping())
+            self.processes.put(p)
+        # Wait for the initialization to finish
+        for p in self.processes.queue:
+            assert isinstance(p.get(result_timeout=None), Pong)
+        # Use a thread pool to manage distributing work to the subprocesses.
+        # Threads block on an available process, so it makes sense to match
+        # the number of threads with the number of devices.
+        self.executor = ThreadPoolExecutor(max_workers=len(devices))
+        # Register the exit handler for the parent process so it will terminate
+        # the child processes.
+        global EXIT_HANDLER_REGISTERED
+        if not EXIT_HANDLER_REGISTERED:
+            EXIT_HANDLER_REGISTERED = True
+            import atexit
+            atexit.register(self.terminate)
+    def get_device_list(self) -> Sequence[Optional[int]]:
+        """
+        Gather the list of devices to be used in the pool.
+        """
+        if not config.autotune_multi_device:
+            # Don't use multiple devices
+            return [None]
+        count = torch.cuda.device_count()
+        # If the user specified the visible devices in the env, use those.
+        if CUDA_VISIBLE_DEVICES in os.environ:
+            devices = [int(d) for d in os.environ[CUDA_VISIBLE_DEVICES].split(",")]
+            assert len(devices) <= count
+            return devices
+        return list(range(count))
+    def terminate(self) -> None:
+        """
+        Signal all child processes to terminate.
+        """
+        if self.executor is not None:
+            self.executor.shutdown()
+            self.executor = None
+        if self.processes is not None:
+            for p in self.processes.queue:
+                p.terminate()
+            for p in self.processes.queue:
+                p.wait()
+            self.processes = None
+    def target(self, choice: TritonTemplateCaller) -> float:
+        """
+        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
+        remove it from the queue, execute the benchmark in that subprocess, and return
+        the TuningProcess to the queue.
+        """
+        assert choice.bmreq is not None
+        assert self.processes is not None
+        process = self.processes.get()
+        process.put(choice.bmreq)
+        try:
+            return process.get(
+                config.max_autotune_subproc_result_timeout_seconds,
+                config.max_autotune_subproc_graceful_timeout_seconds,
+                config.max_autotune_subproc_terminate_timeout_seconds,
+            )
+        except queue.Empty:
+            warnings.warn(
+                f"Failed to benchmark choice '{choice}'. It will be ignored. "
+                "Please debug the root cause in case the choice can bring perf gains."
+            )
+            # set to INF so this choice will be ignored
+            return float("inf")
+        finally:
+            self.processes.put(process)
+    def benchmark(
+        self,
+        choices: List[TritonTemplateCaller],
+    ) -> Dict[TritonTemplateCaller, float]:
+        """
+        Benchmark each choice in a separate process.
+        """
+        assert self.processes is not None, "Tuning process pool is not initialized"
+        assert self.executor is not None
+        results = {}
+        # Use a ThreadExecutorPool to spread the work across the subprocesses and
+        # to grab subprocesses as soon as they're free.
+        for choice, result in zip(choices, self.executor.map(self.target, choices)):
+            results[choice] = result
+        return results
+tuning_pool = TuningProcessPool()  # module-level pool used by benchmark_in_sub_process()
+LayoutOrBuffer = Union[ir.Layout, ir.Buffer]  # node types accepted by TensorMeta.from_irnodes()
+@dataclasses.dataclass
+class TensorMeta:
+    device: torch.device
+    dtype: torch.dtype
+    sizes: torch._prims_common.ShapeType
+    strides: torch._prims_common.StrideType
+    offset: int
+    name: Optional[str] = None
+    @classmethod
+    def from_irnodes(
+        cls, irnodes: Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]
+    ) -> Union[TensorMeta, List[TensorMeta]]:
+        if isinstance(irnodes, Sequence):
+            result: List[Any] = [cls.from_irnodes(x) for x in irnodes]
+            assert all(isinstance(x, TensorMeta) for x in result)
+            return result
+        node = irnodes
+        if isinstance(node, ir.Layout):
+            node = ir.Buffer("fake", node)
+        dtype = node.get_dtype()
+        assert dtype is not None
+        return TensorMeta(
+            device=node.get_device(),
+            dtype=dtype,
+            sizes=V.graph.sizevars.size_hints(
+                node.get_size(),
+                fallback=config.unbacked_symint_fallback,
+            ),
+            strides=V.graph.sizevars.size_hints(
+                node.get_stride(),
+                fallback=config.unbacked_symint_fallback,
+            ),
+            offset=V.graph.sizevars.size_hint(
+                node.get_layout().offset,
+                fallback=config.unbacked_symint_fallback,
+            ),
+            name=node.get_name(),
+        )
+    def to_tensor(self) -> torch.Tensor:
+        return rand_strided(
+            self.sizes,
+            self.strides,
+            device=self.device,
+            dtype=self.dtype,
+            extra_size=self.offset,
+        )
+@dataclasses.dataclass
+class BenchmarkRequest:
+    """
+    Only handle triton template benchmark for now. The extern kernel benchmark
+    can be done inside the same process since they usually don't cause crash.
+    Important: Instances of this class and subclasses have to be serializable
+    across process boundaries. Do not put CUDA Tensors in here!
+    """
+    def __init__(
+        self,
+        kernel_name: str,
+        input_tensor_meta: Union[TensorMeta, List[TensorMeta]],
+        output_tensor_meta: Union[TensorMeta, List[TensorMeta]],
+        extra_args: Iterable[Any],
+    ) -> None:
+        # the kernel name defined in the module
+        self.kernel_name = kernel_name
+        if isinstance(input_tensor_meta, TensorMeta):
+            input_tensor_meta = [input_tensor_meta]
+        self.input_tensor_meta = input_tensor_meta
+        if isinstance(output_tensor_meta, (tuple, list)):
+            assert len(output_tensor_meta) == 1
+            output_tensor_meta = output_tensor_meta[0]
+        self.output_tensor_meta = output_tensor_meta
+        self.extra_args = extra_args
+    def make_run_fn(
+        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
+    ) -> Callable[[], None]:
+        raise NotImplementedError
+    def cleanup_run_fn(self) -> None:
+        pass
+    def do_bench(
+        self,
+        fn,
+        *input_tensors: torch.Tensor,
+        output_tensor: Optional[torch.Tensor] = None,
+    ) -> float:
+        raise NotImplementedError
+    def benchmark(
+        self,
+        *input_tensors: torch.Tensor,
+        output_tensor: Optional[torch.Tensor] = None,
+    ) -> float:
+        debug = log.isEnabledFor(logging.DEBUG)
+        if debug:
+            start_ts = time.time()
+        # create args and out tensor
+        if output_tensor is None:
+            assert len(input_tensors) == 0
+            input_tensors = tuple(x.to_tensor() for x in self.input_tensor_meta)
+            output_tensor = self.output_tensor_meta.to_tensor()
+        if debug:
+            create_tensor_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
+            start_ts = time.time()
+        try:
+            fn = self.make_run_fn(*input_tensors, output_tensor=output_tensor)
+        except NonzeroWorkspaceNotSupportedError:
+            # Skipping all ops with nonzero workspace requirements
+            log.info("Skipping op due to nonzero workspace requirement")
+            return float("inf")
+        if debug:
+            load_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
+            start_ts = time.time()
+        out = self.do_bench(fn, *input_tensors, output_tensor)
+        if debug:
+            bench_elapse = time.time() - start_ts  # type: ignore[possibly-undefined]
+            log.debug(
+                "InChildProcess %s: load %f, create tensor %f, bench %f",
+                str(self),
+                load_elapse,  # type: ignore[possibly-undefined]
+                create_tensor_elapse,  # type: ignore[possibly-undefined]
+                bench_elapse,
+            )
+        self.cleanup_run_fn()
+        return out
+class TestBenchmarkRequest(BenchmarkRequest):
+    """
+    Supports unit testing. Defined in this file so that the TuningProcess
+    sub-process knows how to unpickle these objects.
+    """
+    def __init__(self, value: Optional[float] = None) -> None:
+        self.value = value
+    def benchmark(
+        self, *input_tensors: torch.Tensor, output_tensor: Optional[torch.Tensor] = None
+    ) -> float:
+        if self.value is None:
+            raise Exception("Failed to run")  # noqa: TRY002
+        return self.value
+class GPUDeviceBenchmarkRequest(BenchmarkRequest):
+    def do_bench(
+        self,
+        fn,
+        *input_tensors: torch.Tensor,
+        output_tensor: Optional[torch.Tensor] = None,
+    ) -> float:
+        device_idx_set = {
+            tensor.device.index
+            for tensor in [*input_tensors, output_tensor]
+            if isinstance(tensor, torch.Tensor)
+            and tensor.is_cuda
+            and tensor.device.index is not None
+        }
+        assert len(device_idx_set) <= 1, f"Can not mix devices {device_idx_set}"
+        if len(device_idx_set) == 1:
+            device_idx = next(iter(device_idx_set))
+        else:
+            device_idx = torch.cuda.current_device()
+        with torch.cuda.device(device_idx):
+            out = benchmarker.benchmark_gpu(fn)
+            torch.cuda.synchronize()  # shake out any CUDA errors
+        return out
+class TritonBenchmarkRequest(GPUDeviceBenchmarkRequest):
+    # Important: Instances of this class have to be serializable
+    # across process boundaries. Do not put CUDA Tensors in here!
+    def __init__(
+        self,
+        kernel_name: str,
+        input_tensor_meta: Union[TensorMeta, List[TensorMeta]],
+        output_tensor_meta: Union[TensorMeta, List[TensorMeta]],
+        extra_args: Iterable[Any],
+        module_path: str,  # the path of the module defining the triton kernel
+        module_cache_key: str,
+        grid: List[int],
+        num_stages: int,
+        num_warps: int,
+        matrix_instr_nonkdim: int = 0,  # only used for hip to choose the shape of mfma instruction.
+    ) -> None:
+        super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args)
+        self.module_path = module_path
+        self.module_cache_key = module_cache_key
+        self.grid = grid
+        self.num_stages = num_stages
+        self.num_warps = num_warps
+        self.matrix_instr_nonkdim = matrix_instr_nonkdim
+    def make_run_fn(
+        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
+    ) -> Callable[[], None]:
+        mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path)
+        log.debug(
+            "benchmark module key: %s, path: %s",
+            self.module_cache_key,
+            self.module_path,
+        )
+        run_method = getattr(mod, self.kernel_name).run
+        extra_args = list(self.extra_args)
+        # Newer version of triton add warmup argument to JITFunction.run.
+        # This code handles backward-compatibility.
+        warmup_arg = {}
+        import inspect
+        if "warmup" in inspect.signature(run_method).parameters:
+            warmup_arg["warmup"] = False
+        from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+        if torch.version.hip and self.matrix_instr_nonkdim != 0:
+            return functools.partial(
+                run_method,
+                *input_tensors,
+                output_tensor,
+                *self.extra_args,
+                grid=self.grid,
+                **warmup_arg,
+                stream=get_raw_stream(self.output_tensor_meta.device.index),
+            )
+        else:
+            return functools.partial(
+                run_method,
+                *input_tensors,
+                output_tensor,
+                *self.extra_args,
+                grid=self.grid,
+                **warmup_arg,
+                stream=get_raw_stream(self.output_tensor_meta.device.index),
+            )
+    def precompile(self):
+        mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path)
+        getattr(mod, self.kernel_name).precompile()
+    def __str__(self) -> str:
+        return f"{self.kernel_name=}, {self.module_path=}, {self.module_cache_key=}"
+class CUDABenchmarkRequest(GPUDeviceBenchmarkRequest):
+    # Important: Instances of this class have to be serializable
+    # across process boundaries. Do not put CUDA Tensors in here!
+    def __init__(
+        self,
+        kernel_name: str,
+        input_tensor_meta: Union[TensorMeta, List[TensorMeta]],
+        output_tensor_meta: Union[TensorMeta, List[TensorMeta]],
+        extra_args: Iterable[Any],
+        source_code: str,
+    ) -> None:
+        super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args)
+        self.source_code = source_code
+        self.workspace_size: int = 0
+        self.workspace: Optional[torch.Tensor] = None
+        self.DLL: Optional[DLLWrapper] = None
+        self._workspace_size_updated = False
+        self.hash_key: str = ""
+        self.source_file: str = ""
+        self.hash_key, self.source_file = CUDACodeCache.write(self.source_code, "so")
+    def precompile(self):
+        # Prepopulate CUDACodeCache
+        # may happen in separate Threadpool
+        log.debug("Precompiling %s", self)
+        CUDACodeCache.compile(self.source_code, "so")
+        log.debug("Done precompiling %s", self)
+    def make_run_fn(
+        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
+    ) -> Callable[[], None]:
+        self.ensure_dll_loaded()
+        self.update_workspace_size()
+        args = [
+            c_void_p(tensor.data_ptr())
+            for tensor in list(input_tensors) + [output_tensor]
+        ]
+        log.debug(
+            "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",
+            self.kernel_name,
+            self.source_file,
+            self.hash_key,
+            self.DLL,
+            args,
+            self.extra_args,
+        )
+        stream_ptr = c_void_p(torch.cuda.current_stream().cuda_stream)
+        run_method = getattr(self.DLL, self.kernel_name)
+        workspace_ptr = c_void_p(0)
+        if self.workspace_size > 0:
+            self.workspace = torch.zeros(
+                (self.workspace_size + 7) // 8,
+                dtype=torch.float64,
+                device=output_tensor.device,
+            )
+            workspace_ptr = c_void_p(self.workspace.data_ptr())
+        # Generate partial function.
+        return functools.partial(
+            run_method,
+            *args,
+            *self.extra_args,
+            None,  # null workspace size ptr
+            workspace_ptr,  # set workspace ptr,
+            stream_ptr,
+        )
+    def update_workspace_size(self) -> None:
+        if self._workspace_size_updated:
+            return
+        self.ensure_dll_loaded()
+        unique_input_count = len({meta.name for meta in self.input_tensor_meta})
+        args = [c_void_p(None) for _ in range(unique_input_count + 1)]
+        stream_ptr = c_void_p(torch.cuda.current_stream().cuda_stream)
+        run_method = getattr(self.DLL, self.kernel_name)
+        # Retrieve workspace_size and initialize workspace.
+        c_workspace_size = c_size_t()
+        run_method(
+            *args,  # input ptrs and output ptrs
+            *self.extra_args,
+            byref(
+                c_workspace_size
+            ),  # set workspace size ptr to retrieve workspace size
+            None,  # null workspace ptr
+            stream_ptr,
+        )
+        torch.cuda.synchronize()  # shake out any CUDA errors
+        self.workspace_size = c_workspace_size.value
+        log.debug(
+            "update_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s",  # noqa: B950
+            self.workspace_size,
+            self.kernel_name,
+            self.source_file,
+            self.hash_key,
+            self.DLL,
+            args,
+            self.extra_args,
+        )
+        self._workspace_size_updated = True
+    def ensure_dll_loaded(self):
+        if self.DLL is None:
+            self.DLL, self.hash_key, self.source_file = CUDACodeCache.load(
+                self.source_code, "so"
+            )
+    def cleanup_run_fn(self) -> None:
+        if self.DLL is not None:
+            self.DLL.close()
+        self.workspace = None
+    def __str__(self) -> str:
+        return f"{self.kernel_name=}, {self.source_file=}, {self.hash_key=}"
+class CPUDeviceBenchmarkRequest(BenchmarkRequest):
+    def do_bench(
+        self,
+        fn,
+        *input_tensors: torch.Tensor,
+        output_tensor: Optional[torch.Tensor] = None,
+    ) -> float:
+        return benchmarker.benchmark_cpu(fn)
+class CppBenchmarkRequest(CPUDeviceBenchmarkRequest):
+    # Important: Instances of this class have to be serializable
+    # across process boundaries. Do not put Tensors in here!
+    def __init__(
+        self,
+        kernel_name: str,
+        input_tensor_meta: Union[TensorMeta, List[TensorMeta]],
+        output_tensor_meta: Union[TensorMeta, List[TensorMeta]],
+        extra_args: Iterable[Any],
+        source_code: str,
+    ) -> None:
+        super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args)
+        self.source_code = source_code
+        self.hash_key = get_hash(source_code)
+        self.DLL: Optional[Union[CDLL, ModuleType]] = None
+    def precompile(self):
+        # Prepopulate CppCodeCache
+        # may happen in separate Threadpool
+        log.debug("Precompiling %s", self)
+        CppCodeCache.load(self.source_code, cuda=False)
+        log.debug("Done precompiling %s", self)
+    def make_run_fn(
+        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
+    ) -> Callable[[], None]:
+        # TODO(jgong5): use CppPythonBindingsCodeCache for better binding perf
+        self.DLL = CppCodeCache.load(self.source_code, cuda=False)
+        args = [tensor.data_ptr() for tensor in list(input_tensors) + [output_tensor]]
+        log.debug(
+            "make_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%s",
+            self.kernel_name,
+            self.DLL,
+            args,
+            self.extra_args,
+        )
+        run_method = getattr(self.DLL, self.kernel_name)
+        # Assume only size with type ctypes.c_ulonglong in extra_args
+        assert all(isinstance(arg, ctypes.c_ulonglong) for arg in self.extra_args)
+        run_method.argtypes = [ctypes.c_ulonglong] * (
+            len(args) + len(list(self.extra_args))
+        )
+        # Generate partial function.
+        return functools.partial(
+            run_method,
+            *args,
+            *self.extra_args,
+        )
+    def cleanup_run_fn(self) -> None:
+        if self.DLL is not None:
+            """
+            Check close attr due to it crash on Windows.
+            """
+            if hasattr(self.DLL, "close"):
+                self.DLL.close()
+    def __str__(self) -> str:
+        return f"{self.kernel_name=}"
+def benchmark_in_sub_process(
+    choices: List[TritonTemplateCaller],
+) -> Dict[TritonTemplateCaller, float]:
+    """
+    Do benchmarking in a subprocess and return the perf number (latency).
+    """
+    return tuning_pool.benchmark(choices)

.venv/lib/python3.11/site-packages/torch/_inductor/codecache.py ADDED Viewed

The diff for this file is too large to render. See raw diff

.venv/lib/python3.11/site-packages/torch/_inductor/comm_analysis.py ADDED Viewed

	@@ -0,0 +1,264 @@

+import functools
+import math
+from enum import IntEnum
+import sympy
+import torch
+from . import ir
+from .utils import get_dtype_size, sympy_product
+from .virtualized import V
+class NCCL_COLL(IntEnum):
+    ALL_REDUCE = 0
+    ALL_GATHER = 1
+    REDUCE_SCATTER = 2
+class NVIDIA_GPU_TYPE(IntEnum):
+    VOLTA = 0
+    AMPERE = 1
+    HOPPER = 2
+@functools.lru_cache
+def get_gpu_type() -> NVIDIA_GPU_TYPE:
+    gpu_info = torch.utils.collect_env.get_gpu_info(torch.utils.collect_env.run) or ""
+    if "V100" in gpu_info:
+        return NVIDIA_GPU_TYPE.VOLTA
+    elif "A100" in gpu_info:
+        return NVIDIA_GPU_TYPE.AMPERE
+    elif "H100" in gpu_info:
+        return NVIDIA_GPU_TYPE.HOPPER
+    else:
+        # for other gpu types, assume Ampere
+        return NVIDIA_GPU_TYPE.AMPERE
+def get_collective_type(node: ir.IRNode) -> NCCL_COLL:
+    if not isinstance(node, ir._CollectiveKernel):
+        raise ValueError(f"node is not a collective kernel: {node}")
+    kernel_name = node.python_kernel_name
+    assert kernel_name is not None
+    if "all_reduce" in kernel_name:
+        return NCCL_COLL.ALL_REDUCE
+    elif "all_gather" in kernel_name:
+        return NCCL_COLL.ALL_GATHER
+    elif "reduce_scatter" in kernel_name:
+        return NCCL_COLL.REDUCE_SCATTER
+    else:
+        raise ValueError(f"Unsupported collective kernel: {kernel_name}")
+def get_collective_input_size_bytes(node: ir.IRNode) -> int:
+    sz_bytes = 0
+    for inp in node.inputs:  # type: ignore[attr-defined]
+        numel = sympy_product(inp.layout.size)
+        if isinstance(numel, sympy.Integer):
+            # For ease of testing
+            numel = int(numel)
+        else:
+            numel = V.graph.sizevars.size_hint(numel, fallback=0)
+        sz_bytes += numel * get_dtype_size(inp.layout.dtype)
+    return sz_bytes
+def get_collective_group_size(node: ir.IRNode) -> int:
+    if type(node) == ir._CollectiveKernel:
+        from torch.distributed.distributed_c10d import _get_group_size_by_name
+        return _get_group_size_by_name(node.constant_args[-1])
+    else:
+        raise TypeError(f"Unsupported collective type: {node}")
+####################################################################################################################
+# The following code and constants are adapted from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc #
+####################################################################################################################
+class NCCL_HW(IntEnum):
+    """Interconnect kinds; the value is the first index into `hwLat`."""
+    NVLINK = 0
+    PCI = 1
+    NET = 2
+class NCCL_ALGO(IntEnum):
+    """NCCL algorithms; the value indexes the algorithm dimension of `baseLat` and `hwLat`."""
+    TREE = 0
+    RING = 1
+class NCCL_PROTO(IntEnum):
+    """NCCL protocols; only LL is enabled here. The value indexes the protocol dimension of the latency tables."""
+    # The ordering and enum values here match the original in
+    # https://github.com/NVIDIA/nccl/blob/0b083e52096c387bad7a5c5c65b26a9dca54de8c/src/include/devcomm.h#L28
+    # For the differences between these protocols, see https://github.com/NVIDIA/nccl/issues/281#issuecomment-571816990
+    LL = 0  # Low-latency
+    # LL128 = 1   # Low-latency 128-byte
+    # SIMPLE = 2
+# Latencies in us
+# len(NCCL_ALGO) x len(NCCL_PROTO)
+# NOTE: use array instead of tensor to prevent incompatibility with fake mode
+baseLat = [
+    # Tree
+    [
+        6.8,  # LL
+    ],
+    # Ring
+    [
+        6.6,  # LL
+    ],
+]
+# Latencies in us
+# len(NCCL_HW) x len(NCCL_ALGO) x len(NCCL_PROTO)
+hwLat = [
+    # NVLINK
+    [
+        [0.6],  # Tree (LL)
+        [0.6],  # Ring (LL)
+    ],
+    # PCI
+    [
+        [1.0],  # Tree (LL)
+        [1.0],  # Ring (LL)
+    ],
+    # NET
+    [
+        [5.0],  # Tree (LL)
+        [2.7],  # Ring (LL)
+    ],
+]
+# LL128 max BW per channel
+llMaxBws = [
+    # Volta-N1/Intel-N2/Intel-N4
+    [
+        39.0,
+        39.0,
+        20.4,
+    ],
+    # Ampere-N1/AMD-N2/AMD-N4
+    [
+        87.7,
+        22.5,  # avg of ring & tree
+        19.0,
+    ],
+    # Hopper-N1/AMD-N2/AMD-N4
+    [
+        87.7,
+        22.5,  # avg of ring & tree
+        19.0,
+    ],
+]
+def estimate_nccl_collective_runtime(node: ir.IRNode) -> float:
+    """
+    Returns estimated NCCL collective runtime in nanoseconds (ns).
+    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
+    We aim to estimate the runtime as accurately as possible.
+    Assumptions:
+    - only ring algorithm (NCCL_ALGO_RING) is used
+    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
+    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
+    - collective is one of: allreduce, reducescatter, allgather
+    """
+    tensor_storage_size_bytes = get_collective_input_size_bytes(node)
+    # Convert bytes to GB
+    tensor_storage_size_GB = tensor_storage_size_bytes / 1024 / 1024 / 1024
+    # Currently assumes each node has 8 gpus. And when >1 node is used, assumes each node uses all 8 gpus.
+    # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
+    num_gpus_per_node = 8
+    group_size = get_collective_group_size(node)
+    nNodes = math.ceil(group_size / num_gpus_per_node)
+    nRanks = group_size  # this is total # of gpus globally that participate in this collective op
+    if nRanks <= 1:
+        return 0
+    # Assumes ring algorithm
+    nccl_algo = NCCL_ALGO.RING
+    nccl_proto = NCCL_PROTO.LL
+    coll = get_collective_type(node)
+    # =============== bandwidth computation ===============
+    # First compute bandwidth in GB/s; then at the end, convert it to GB/ns
+    bwIntra = torch._inductor.config.intra_node_bw
+    bwInter = torch._inductor.config.inter_node_bw
+    compCapIndex = get_gpu_type()
+    index2 = nNodes - 1 if nNodes <= 2 else 2
+    # LL: for single node, we look at GPU type; for multi-node, we look at CPU type
+    index1 = compCapIndex if nNodes == 1 else 0
+    llMaxBw = llMaxBws[index1][index2]
+    # NOTE: each step of ring algorithm is synchronized,
+    # and is bottlenecked by the slowest link which is the inter-node interconnect.
+    # hence when nNodes >= 2, bw is inter-node bandwidth.
+    # NOTE: the original code in https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc
+    # have this as `if nNodes <= 2` which seems wrong. Corrected it here.
+    bw = bwIntra if nNodes == 1 else bwInter
+    nChannels = 2  # Assume # channels is 2
+    busBw = nChannels * bw
+    # Various model refinements
+    busBw = min(
+        llMaxBw,
+        busBw
+        * (1.0 / 4.0 if (nNodes > 1 or coll == NCCL_COLL.ALL_REDUCE) else 1.0 / 3.0),
+    )
+    if coll == NCCL_COLL.ALL_REDUCE:
+        nsteps = 2 * (nRanks - 1)
+    elif coll in (NCCL_COLL.REDUCE_SCATTER, NCCL_COLL.ALL_GATHER):
+        nsteps = nRanks - 1
+    # Convert bus BW to algorithm BW (tensor bytes / algoBW = actual execution time)
+    ratio = (1.0 * nRanks) / nsteps  # type: ignore[possibly-undefined]
+    bandwidth = busBw * ratio
+    # Convert GB/s to GB/ns
+    bandwidth_GB_per_ns = bandwidth / 1e9
+    # =============== latency computation ===============
+    intraHw = NCCL_HW.NVLINK
+    if coll == NCCL_COLL.ALL_REDUCE:
+        if nNodes > 1:
+            nInterSteps = 2 * nNodes
+        else:
+            nInterSteps = 0
+    elif coll in (NCCL_COLL.REDUCE_SCATTER, NCCL_COLL.ALL_GATHER):
+        nInterSteps = nNodes - 1
+    # First compute latency in us; then at the end, convert it to ns
+    latency = baseLat[nccl_algo][nccl_proto]
+    intraLat = hwLat[intraHw][nccl_algo][nccl_proto]
+    interLat = hwLat[NCCL_HW.NET][nccl_algo][nccl_proto]
+    # Inter-node rings still have to launch nsteps * net overhead.
+    netOverhead = 0.0
+    if nNodes > 1:
+        netOverhead = 1.0  # getNetOverhead(comm);
+    intraLat = max(intraLat, netOverhead)
+    latency += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat  # type: ignore[possibly-undefined]
+    # Convert us to ns
+    latency_ns = latency * 1e3
+    # =============== final result ===============
+    transport_ns = tensor_storage_size_GB / bandwidth_GB_per_ns
+    return transport_ns + latency_ns
+################################################################################################################
+# The above code and constants are adapted from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc #
+################################################################################################################

.venv/lib/python3.11/site-packages/torch/_inductor/comms.py ADDED Viewed

	@@ -0,0 +1,640 @@

+# mypy: allow-untyped-defs
+# pyre-strict
+from __future__ import annotations
+import heapq
+import operator
+import sys
+from collections import defaultdict
+from typing import Dict, List, Set, TYPE_CHECKING
+import torch
+from . import config, ir
+from .dependencies import WeakDep
+from .utils import (
+    contains_collective,
+    contains_wait,
+    find_recursive_deps_of_node,
+    find_recursive_users_of_node,
+    is_collective,
+    is_fallback_op,
+    is_wait,
+)
+overlap_log = torch._logging.getArtifactLogger(__name__, "overlap")
+if TYPE_CHECKING:
+    from .scheduler import BaseSchedulerNode
+def sink_waits(snodes: List[BaseSchedulerNode]) -> List[BaseSchedulerNode]:
+    """
+    Greedily schedules waits as late as possible.
+    """
+    return _schedule_for_comm(
+        snodes, raise_comms=False, sink_waits=True, reorder_for_overlap=False
+    )
+def raise_comms(snodes: List[BaseSchedulerNode]) -> List[BaseSchedulerNode]:
+    """
+    Greedily schedules comms as early as possible.
+    """
+    return _schedule_for_comm(
+        snodes, raise_comms=True, sink_waits=False, reorder_for_overlap=False
+    )
+def reorder_compute_for_overlap(
+    snodes: List[BaseSchedulerNode],
+) -> List[BaseSchedulerNode]:
+    """
+    This achieves the following overall scheduling procedure:
+        Step 1: Given that we've currently scheduled comm N, we now schedule all compute nodes
+            that are required for comm N + 1 but do not depend on comm N, to run at the same time with comm N.
+        Step 2: If all those compute nodes are sufficient to overlap comm N, we're done.
+            Otherwise, we now need to look elsewhere to find compute that overlaps with comm N.
+            We prioritize compute nodes that are needed sooner.
+        Step 3: We schedule the compute nodes dependent on comm N and required for comm N + 1.
+        Step 4: We schedule comm N + 1.
+        Repeat this for subsequent comm nodes.
+    """
+    return _schedule_for_comm(
+        snodes, raise_comms=True, sink_waits=True, reorder_for_overlap=True
+    )
+def _schedule_for_comm(
+    snodes: List[BaseSchedulerNode],
+    raise_comms: bool,
+    sink_waits: bool,
+    reorder_for_overlap: bool,
+) -> List[BaseSchedulerNode]:
+    """
+    Schedule `snodes` for various comm optimization objectives.
+    Args:
+        snodes: the nodes to be scheduled.
+        raise_comms: whether to greedily schedule collectives as early as possible
+        sink_waits: whether to greedily schedule waits as late as possible
+        reorder_for_overlap: whether to reorder compute nodes to
+            optimize for compute/communication overlapping.
+    Returns:
+        The new schedule order.
+    Some notes on the synergy between different options:
+        - `raise_comms` provides more overlapping opportunities for `reorder_for_overlap`.
+        - When both `raise_comms` and `sink_waits` are `True`, `raise_comms` is prioritized.
+    """
+    # We assign each node a tuple of scores (score_0, score_1, score_2),
+    # decreasing in importance, with a lower value indicating a higher ranking:
+    #
+    # - score_0: the lowest comm_idx among the comm nodes that the node blocks.
+    # If a node doesn't block any comm nodes, its score_0 is set to
+    # sys.maxsize. This score ensures that comm nodes get scheduled as early as
+    # possible.
+    # - score_1: 1 if the node is a wait node, 0 otherwise. This score ensures
+    # that wait nodes are deferred as late as possible.
+    # - score_2: the index of the node in the original topological order. This
+    # score provides stability in case of ties.
+    #
+    # When only raise_comms is True, only score_0 and score_2 are considered.
+    # When only sink_waits is True, only score_1 and score_2 are considered.
+    # When neither is True, the original order is yielded.
+    buf_name_to_snode = {}
+    name_to_fused_node = {}
+    scores_0, scores_1, scores_2 = {}, {}, {}
+    # Initialize every node with the default scores and record name -> node lookups.
+    for idx, snode in enumerate(snodes):
+        for buf_name in snode.get_buffer_names():
+            buf_name_to_snode[buf_name] = snode
+        for op_name in snode.get_operation_names():
+            name_to_fused_node[op_name] = snode
+        name_to_fused_node[snode.get_name()] = snode
+        node_name = snode.get_name()
+        scores_0[node_name] = sys.maxsize
+        scores_1[node_name] = 0
+        scores_2[node_name] = idx
+    comm_idx = 0
+    for snode in snodes:
+        if raise_comms and contains_collective(snode):
+            scores_0[snode.get_name()] = comm_idx
+            # Every ancestor of this collective inherits its comm_idx unless it already
+            # blocks an earlier collective.
+            for anc in snode.ancestors:
+                anc_fused_name = name_to_fused_node[anc].get_name()
+                scores_0[anc_fused_name] = min(scores_0[anc_fused_name], comm_idx)
+            comm_idx += 1
+        elif sink_waits and contains_wait(snode):
+            scores_1[snode.get_name()] = 1
+    # Heap entry wrapping a node; ordering is by the (score_0, score_1, score_2) tuple.
+    class Runnable:
+        def __init__(self, snode) -> None:
+            self.snode = snode
+            name = next(iter(snode.get_operation_names()))
+            fused_name = name_to_fused_node[name].get_name()
+            self.score = (
+                scores_0[fused_name],
+                scores_1[fused_name],
+                scores_2[fused_name],
+            )
+        def __lt__(self, other):
+            return self.score < other.score
+    # Names of the dependencies each node is still waiting on.
+    unmet_deps: Dict[BaseSchedulerNode, Set[str]] = {
+        snode: {dep.name for dep in snode.unmet_dependencies} for snode in snodes
+    }
+    ready: List[Runnable] = []
+    buffer_users: Dict[str, Set[BaseSchedulerNode]] = defaultdict(set)
+    snode_to_cost = {snode: estimate_op_runtime(snode) for snode in snodes}
+    for snode, deps in unmet_deps.items():
+        if len(deps) == 0:
+            heapq.heappush(ready, Runnable(snode))
+        for dep in deps:
+            buffer_users[dep].add(snode)
+    scheduled = []
+    def schedule(snode):
+        """
+        Schedules `snode` and put all unblocked nodes onto the ready queue.
+        """
+        scheduled.append(snode)
+        for buf_name in snode.get_buffer_names():
+            # NOTE: the inner loop rebinds `snode` to each user; the outer iterable was
+            # already evaluated, so the outer iteration is unaffected.
+            for snode in buffer_users[buf_name]:
+                unmet_deps[snode].remove(buf_name)
+                if len(unmet_deps[snode]) == 0:
+                    heapq.heappush(ready, Runnable(snode))
+    def get_overlapping_candidate():
+        """
+        Return the next node in the ready queue that's neither a collective or
+        a wait.
+        """
+        candidates = [
+            x
+            for x in ready
+            if not contains_collective(x.snode) and not contains_wait(x.snode)
+        ]
+        if len(candidates) == 0:
+            return None
+        return min(candidates, key=lambda x: x.score)
+    def schedule_collective_for_overlap(snode):
+        """
+        Schedules collective node `snode`, along with one or more compute nodes
+        to overlap with it. The strategy is described in the comment of
+        `reorder_compute_for_overlap`.
+        """
+        assert contains_collective(snode)
+        schedule(snode)
+        collective_cost = snode_to_cost[snode]
+        # Keep scheduling compute until its estimated cost covers the collective's cost
+        # or no eligible compute node is ready.
+        while (
+            collective_cost > 0
+            and (candidate := get_overlapping_candidate()) is not None
+        ):
+            ready.remove(candidate)
+            schedule(candidate.snode)
+            collective_cost -= snode_to_cost[candidate.snode]
+        # `ready.remove` may have broken the heap invariant; restore it.
+        heapq.heapify(ready)
+    while len(ready):
+        snode = heapq.heappop(ready).snode
+        if reorder_for_overlap and contains_collective(snode):
+            schedule_collective_for_overlap(snode)
+        else:
+            schedule(snode)
+    # Sanity check: every node must have had all of its dependencies satisfied.
+    for snode, deps in unmet_deps.items():
+        assert len(deps) == 0, (
+            "Detected unscheduled nodes. "
+            f"Nodes with unmet dependencies: {unmet_deps}"
+        )
+    return scheduled
+def decide_global_ordering_of_comms(
+    nodes: List[BaseSchedulerNode], name_to_buf, name_to_fused_node
+) -> List[BaseSchedulerNode]:
+    """
+    Decide global ordering of comms, by just enforcing the ordering that's in the input graph
+    (might not be the same ordering as the eager mode program).
+    TODO: Come up with a better approach
+    """
+    # If FSDP2 is used, we apply FSDP-specific passes.
+    if any(
+        is_fallback_op(
+            x.node,
+            {
+                torch.ops.fsdp.all_gather_copy_in.default,
+                torch.ops.fsdp.chunk_cat.default,
+            },
+        )
+        for x in nodes
+    ):
+        nodes = enforce_comm_ordering_for_fsdp(nodes, name_to_buf, name_to_fused_node)
+    comm_nodes = [n for n in nodes if contains_collective(n)]
+    for i in range(1, len(comm_nodes)):
+        # Enforce ordering by making previous comm a `WeakDep` dependency of the next comm
+        mutating_buf = next(iter(comm_nodes[i].get_buffer_names()))
+        for buf in comm_nodes[i - 1].get_buffer_names():
+            comm_nodes[i].add_fake_dep(WeakDep(buf, mutating_buf=mutating_buf))
+    return nodes
+def estimate_op_runtime(snode: BaseSchedulerNode) -> float:
+    """
+    Returns estimated op runtime in nanoseconds (ns)
+    """
+    if config.estimate_op_runtime == "default":
+        runtime = snode.get_estimated_runtime()
+    else:
+        assert callable(config.estimate_op_runtime)
+        runtime = config.estimate_op_runtime(snode)
+    return runtime
+def node_summary(snode):
+    detail = ""
+    if isinstance(snode.node, ir.ExternKernelOut):
+        detail = f" ({snode.node.python_kernel_name})"
+    out_tensor_info = ""
+    if (
+        hasattr(snode.node, "layout")
+        and hasattr(snode.node.layout, "size")
+        and hasattr(snode.node.layout, "stride")
+    ):
+        out_tensor_info = (
+            f" (size={snode.node.layout.size}, stride={snode.node.layout.stride})"
+        )
+    node_name = ""
+    if hasattr(snode.node, "name"):
+        node_name = snode.node.name
+    return f"{snode.node.__class__.__name__}{detail}{out_tensor_info} ({node_name})"
+def visualize_overlap(order):
+    total_est_runtime: float = 0.0
+    cur_comm_node = None
+    for snode in order:
+        if cur_comm_node is None:
+            if contains_collective(snode):
+                total_est_runtime += estimate_op_runtime(snode)
+                cur_comm_node = snode.node
+            elif is_wait(snode.node):
+                raise AssertionError(
+                    "Wait is not expected when there is no collective running"
+                )
+            else:  # exposed compute op
+                total_est_runtime += estimate_op_runtime(snode)
+            overlap_log.debug(f"{node_summary(snode)}")  # noqa: G004
+        else:  # cur_comm_node is not None
+            if contains_collective(snode):
+                raise AssertionError(
+                    "Found two collectives running at the same time. "
+                    "`visualize_overlap` needs to be updated to handle this case"
+                )
+            elif is_wait(snode.node):  # end of this comm op
+                overlap_log.debug(f"{node_summary(snode)}")  # noqa: G004
+                cur_comm_node = None
+            else:  # overlapped compute op
+                overlap_log.debug(f"| {node_summary(snode)}")  # noqa: G004
+    overlap_log.debug(
+        f"Est. runtime (ms): {total_est_runtime / 1000 / 1000}"  # noqa: G004
+    )
+def reorder_compute_and_comm_for_overlap(
+    snodes: List[BaseSchedulerNode],
+) -> List[BaseSchedulerNode]:
+    order = snodes
+    for p in config.reorder_for_compute_comm_overlap_passes:
+        if isinstance(p, str) and p in globals():
+            p = globals()[p]  # it is a builtin pass
+        if torch.distributed.get_rank() == 0:
+            overlap_log.debug(
+                f"==== Visualize overlap before reordering pass {p} ===="  # noqa: G004
+            )
+            try:
+                visualize_overlap(order)
+            except Exception as e:
+                overlap_log.debug(str(e))
+        order = p(order)  # type: ignore[operator]
+        if torch.distributed.get_rank() == 0:
+            overlap_log.debug(
+                f"==== Visualize overlap after reordering pass {p} ===="  # noqa: G004
+            )
+            try:
+                visualize_overlap(order)
+            except Exception as e:
+                overlap_log.debug(str(e))
+    return order
+def reinplace_fsdp_all_gather(graph: torch.fx.Graph) -> None:
+    """
+    Rewrite FSDP all-gathers in `graph` to write into the buffer produced by copy-in.
+    Matches `all_gather_into_tensor(all_gather_copy_in(...)[0], group_size, group_name)` and
+    replaces it with `all_gather_into_tensor_out(..., out=all_gather_copy_in(...)[1])`.
+    Returns without touching the graph if the FSDP collectives module or the required
+    `_c10d_functional` ops are unavailable.
+    """
+    try:
+        import torch.distributed._composable.fsdp._fsdp_collectives
+        assert torch.distributed.is_available()
+        # Assert existence of these ops
+        assert (
+            torch.ops._c10d_functional.all_gather_into_tensor
+            and torch.ops._c10d_functional.all_gather_into_tensor_out
+        )
+    except (ImportError, AttributeError, AssertionError):
+        # Required modules/ops are missing: nothing to rewrite.
+        return
+    from .pattern_matcher import (
+        CallFunction,
+        KeywordArg,
+        Match,
+        PatternMatcherPass,
+        register_graph_pattern,
+    )
+    # The bare string below describes the rewrite (before -> after); it has no runtime effect.
+    """
+    all_gather_copy_in = torch.ops.fsdp.all_gather_copy_in.default(...);
+    getitem = all_gather_copy_in[0];
+    (getitem_1 = all_gather_copy_in[1];)  # optional
+    all_gather_into_tensor = torch.ops._c10d_functional.all_gather_into_tensor.default(getitem, ...);
+    ->
+    all_gather_copy_in = torch.ops.fsdp.all_gather_copy_in.default(...);
+    getitem = all_gather_copy_in[0];
+    getitem_1 = all_gather_copy_in[1];
+    all_gather_into_tensor = torch.ops._c10d_functional.all_gather_into_tensor_out.default(getitem, ..., out=getitem_1);
+    """
+    def remove_unused_getitem(g):
+        # Remove `getitem_X = all_gather_copy_in[1]` which is never used.
+        # Iterate over a snapshot because nodes are erased during the loop.
+        node_list = list(g.nodes)
+        for n in node_list:
+            if (
+                n.target == operator.getitem
+                and n.args[0].target is torch.ops.fsdp.all_gather_copy_in.default
+                and n.args[1] == 1
+            ):
+                g.erase_node(n)
+    graph_pass = PatternMatcherPass()
+    # Pattern: all_gather_into_tensor(getitem(all_gather_copy_in(...), item_idx), group_size, group_name),
+    # accepted only when item_idx == 0 (see `extra_check`).
+    @register_graph_pattern(
+        CallFunction(
+            torch.ops._c10d_functional.all_gather_into_tensor.default,
+            CallFunction(
+                operator.getitem,
+                CallFunction(
+                    torch.ops.fsdp.all_gather_copy_in.default,
+                    KeywordArg("all_gather_inputs"),
+                    KeywordArg("inp_split_sizes"),
+                    KeywordArg("all_gather_input_numel"),
+                    KeywordArg("world_size"),
+                    KeywordArg("rank"),
+                    KeywordArg("dtype"),
+                    KeywordArg("device"),
+                ),
+                KeywordArg("item_idx"),
+            ),
+            KeywordArg("group_size"),
+            KeywordArg("group_name"),
+        ),
+        pass_dict=graph_pass,
+        extra_check=lambda match: match.kwargs["item_idx"] == 0,
+    )
+    def reinplace_all_gather(match: Match, *args, **kwargs):
+        def repl(
+            *args,
+        ):
+            # The last two positional args are group_size and group_name;
+            # everything before them is forwarded to all_gather_copy_in.
+            copy_in_args = args[:-2]
+            group_size = args[-2]
+            group_name = args[-1]
+            all_gather_copy_in = torch.ops.fsdp.all_gather_copy_in.default(
+                *copy_in_args
+            )
+            getitem = all_gather_copy_in[0]
+            getitem_1 = all_gather_copy_in[1]
+            all_gather_into_tensor = (
+                torch.ops._c10d_functional.all_gather_into_tensor_out.default(
+                    getitem, group_size, group_name, out=getitem_1
+                )
+            )
+            return all_gather_into_tensor
+        # Argument order here must line up with how `repl` slices its positional args.
+        match.replace_by_example(
+            repl,
+            [
+                kwargs["all_gather_inputs"],
+                kwargs["inp_split_sizes"],
+                kwargs["all_gather_input_numel"],
+                kwargs["world_size"],
+                kwargs["rank"],
+                kwargs["dtype"],
+                kwargs["device"],
+                kwargs["group_size"],
+                kwargs["group_name"],
+            ],
+        )
+    # Drop index-1 getitem nodes first, then run the registered rewrite.
+    remove_unused_getitem(graph)
+    graph_pass.apply(graph)  # type: ignore[arg-type]
+def get_op_idx(snode):
+    assert not isinstance(
+        snode,
+        (
+            torch._inductor.scheduler.FusedSchedulerNode,
+            torch._inductor.scheduler.GroupedSchedulerNode,
+        ),
+    )
+    return int(snode.get_name()[2:])
+def enforce_comm_ordering_for_fsdp(
+    snodes: List[torch._inductor.scheduler.BaseSchedulerNode],
+    name_to_buf: Dict[str, torch._inductor.scheduler.SchedulerBuffer],
+    name_to_fused_node: Dict[str, BaseSchedulerNode],
+) -> List[torch._inductor.scheduler.BaseSchedulerNode]:
+    """
+    Group FSDP all-gather / reduce-scatter code blocks into `GroupedSchedulerNode`s and
+    serialize them with fake `WeakDep` dependencies.
+    For each recognized collective, two groups are created: the nodes up to (but excluding)
+    the first wait kernel, and the nodes from that wait kernel onwards. Consecutive
+    collectives of the same kind are then ordered so that the previous "wait" group runs
+    before the next "comm" group.
+    Args:
+        snodes: scheduler nodes in their current order.
+        name_to_buf: mapping from buffer name to scheduler buffer (forwarded to the
+            dependency/user search helpers).
+        name_to_fused_node: mapping from node name to the scheduler node that contains it.
+    Returns:
+        The new node order, in which grouped nodes stand in for their members at the
+        position of the first member encountered.
+    """
+    from . import scheduler
+    new_order: list[BaseSchedulerNode] = []
+    scheduled = set()
+    ag_exists = False
+    rs_exists = False
+    # comm group node -> its matching wait group node, in creation order
+    ag_grouped_node_to_wait_grouped_node = {}
+    rs_grouped_node_to_wait_grouped_node = {}
+    # member node name (and group node name) -> the group node that replaces it
+    snode_name_to_final_snode = {}
+    def _create_group_node(snodes_to_group):
+        # Create the grouped node and register it as the replacement for all of its members.
+        group_node = scheduler.GroupedSchedulerNode.create(snodes_to_group)
+        for snode in snodes_to_group:
+            snode_name_to_final_snode[snode.get_name()] = group_node
+        snode_name_to_final_snode[group_node.get_name()] = group_node
+        return group_node
+    # Create grouped nodes for specific sets of ops
+    for snode in snodes:
+        # Case 1: Handle AllGather
+        if is_collective(
+            snode.node, op=torch.ops._c10d_functional.all_gather_into_tensor_out.default
+        ) and any(
+            is_fallback_op(
+                name_to_fused_node[x].node, torch.ops.fsdp.all_gather_copy_in.default
+            )
+            for x in snode.ancestors
+        ):
+            ag_exists = True
+            ag_snode = snode
+            ag_related_snode_set: set[scheduler.BaseSchedulerNode] = set()
+            # Find the "cast + copy_in + getitem + all_gather" code block
+            find_recursive_deps_of_node(
+                ag_snode,
+                ag_related_snode_set,
+                name_to_buf,
+                name_to_fused_node,
+            )
+            # Find the "all_gather + all_gather_wait_tensor + copy_out + set_" code block
+            allowed_ops = {
+                torch.ops._c10d_functional.all_gather_into_tensor_out.default,
+                torch.ops._c10d_functional.wait_tensor.default,
+                torch.ops.fsdp.split_with_sizes_copy.default,
+                torch.ops.aten.set_.source_Tensor,
+            }
+            # The callback is True for nodes that are neither nop kernels nor extern kernels
+            # running one of `allowed_ops`.
+            # NOTE(review): presumably a True result makes the search skip that user - confirm
+            # against `find_recursive_users_of_node`.
+            find_recursive_users_of_node(
+                ag_snode,
+                ag_related_snode_set,
+                name_to_buf,
+                name_to_fused_node,
+                criteria_cb=lambda x: not (
+                    isinstance(x, scheduler.NopKernelSchedulerNode)
+                    or (
+                        isinstance(x, scheduler.ExternKernelSchedulerNode)
+                        and x.node.op_overload in allowed_ops  # type: ignore[union-attr]
+                    )
+                ),
+            )
+            # sort nodes by original operation order
+            ag_related_snodes = sorted(
+                ag_related_snode_set, key=lambda x: get_op_idx(x)
+            )
+            # In the "reuse layer" case, some ops in the 2nd all-gather code block could also
+            # depend on ops in the 1st all-gather code block, and we don't want to group them together.
+            # The block is therefore cut right before the second copy-out op, if there is one.
+            end_idx_of_current_ag_block = len(ag_related_snodes)
+            copy_out_count = 0
+            for i in range(len(ag_related_snodes)):
+                cur_snode = ag_related_snodes[i]
+                if is_fallback_op(
+                    cur_snode.node, torch.ops.fsdp.split_with_sizes_copy.default
+                ):
+                    copy_out_count += 1
+                if copy_out_count > 1:
+                    end_idx_of_current_ag_block = i
+                    break
+            ag_related_snodes = ag_related_snodes[:end_idx_of_current_ag_block]
+            # Group "cast + copy_in + getitem + all_gather" into one GroupedSchedulerNode
+            # The search starts at index 1, so the first group always has at least one node.
+            wait_node_idx = None
+            for i in range(len(ag_related_snodes) - 1):
+                if isinstance(ag_related_snodes[i + 1].node, ir._WaitKernel):
+                    wait_node_idx = i + 1
+                    break
+            assert wait_node_idx is not None
+            ag_group_node = _create_group_node(ag_related_snodes[:wait_node_idx])
+            # Group "all_gather_wait_tensor + copy_out + set_" into one GroupedSchedulerNode
+            ag_wait_group_node = _create_group_node(ag_related_snodes[wait_node_idx:])
+            ag_grouped_node_to_wait_grouped_node[ag_group_node] = ag_wait_group_node
+        # Case 2: Handle ReduceScatter
+        elif is_fallback_op(snode.node, torch.ops.fsdp.chunk_cat.default):
+            rs_exists = True
+            rs_snode = snode
+            # Find the "reduce_scatter copy-in + reduce_scatter comm + reduce_scatter wait" code block
+            rs_related_snode_set: set[scheduler.BaseSchedulerNode] = set()
+            find_recursive_users_of_node(
+                rs_snode,
+                rs_related_snode_set,
+                name_to_buf,
+                name_to_fused_node,
+            )
+            # sort nodes by original operation order
+            rs_related_snodes = sorted(
+                rs_related_snode_set, key=lambda x: get_op_idx(x)
+            )
+            # Group "reduce_scatter copy-in + reduce_scatter comm" into one GroupedSchedulerNode
+            wait_node_idx = None
+            for i in range(len(rs_related_snodes) - 1):
+                if isinstance(rs_related_snodes[i + 1].node, ir._WaitKernel):
+                    wait_node_idx = i + 1
+                    break
+            assert wait_node_idx is not None
+            rs_group_node = _create_group_node(rs_related_snodes[:wait_node_idx])
+            # Group "reduce_scatter wait + related output nodes" into one GroupedSchedulerNode
+            rs_wait_group_node = _create_group_node(rs_related_snodes[wait_node_idx:])
+            rs_grouped_node_to_wait_grouped_node[rs_group_node] = rs_wait_group_node
+    # At least one group must have been created, otherwise this pass should not have run.
+    assert len(snode_name_to_final_snode) > 0
+    if ag_exists:
+        assert len(ag_grouped_node_to_wait_grouped_node) > 0
+    if rs_exists:
+        assert len(rs_grouped_node_to_wait_grouped_node) > 0
+    # Build the new node schedule, taking GroupedSchedulerNode into account
+    for snode in snodes:
+        if snode.get_name() in snode_name_to_final_snode:
+            snode = snode_name_to_final_snode[snode.get_name()]
+        # A group node is emitted once, at the position of its first member.
+        if snode in scheduled:
+            continue
+        new_order.append(snode)
+        scheduled.add(snode)
+    # Enforce AllGather ordering: previous AllGather's "wait then copy_out" group node must run
+    # before next AllGather's "copy_in then AG" group node
+    prev_ag_wait = None
+    for ag_group_node, wait_group_node in ag_grouped_node_to_wait_grouped_node.items():
+        if prev_ag_wait is not None:
+            mutating_buf = next(iter(ag_group_node.get_buffer_names()))
+            for o in prev_ag_wait.get_outputs():
+                ag_group_node.add_fake_dep(
+                    WeakDep(o.get_name(), mutating_buf=mutating_buf)
+                )
+        prev_ag_wait = wait_group_node
+    # Enforce ReduceScatter ordering: previous ReduceScatter's "wait" group node must run
+    # before next ReduceScatter's "copy_in then RS" group node
+    prev_rs_wait = None
+    for rs_group_node, wait_group_node in rs_grouped_node_to_wait_grouped_node.items():
+        if prev_rs_wait is not None:
+            mutating_buf = next(iter(rs_group_node.get_buffer_names()))
+            for o in prev_rs_wait.get_outputs():
+                rs_group_node.add_fake_dep(
+                    WeakDep(o.get_name(), mutating_buf=mutating_buf)
+                )
+        prev_rs_wait = wait_group_node
+    return new_order  # type: ignore[return-value]

.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py ADDED Viewed

	@@ -0,0 +1,1629 @@

+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+import contextlib
+import functools
+import io
+import itertools
+import logging
+import os
+import sys
+import time
+import warnings
+from itertools import count
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
+from unittest import mock
+import torch._inductor.async_compile  # noqa: F401 required to warm up AsyncCompile pools
+import torch.fx
+import torch.utils._pytree as pytree
+from functorch.compile import min_cut_rematerialization_partition
+from torch._dynamo import (
+    compiled_autograd,
+    config as dynamo_config,
+    logging as dynamo_logging,
+    utils as dynamo_utils,
+)
+from torch._dynamo.device_interface import get_interface_for_device
+from torch._dynamo.repro.after_aot import wrap_compiler_debug
+from torch._dynamo.utils import (
+    counters,
+    detect_fake_mode,
+    flatten_graph_inputs,
+    lazy_format_graph_code,
+)
+from torch._functorch import config as functorch_config
+from torch._functorch.aot_autograd import aot_export_module, make_boxed_func
+from torch._inductor.codecache import (
+    _StrideExprStr,
+    code_hash,
+    CompiledFxGraph,
+    FxGraphCache,
+)
+from torch._inductor.cudagraph_utils import (
+    BoxedDeviceIndex,
+    CudagraphCachedInfo,
+    get_placeholder_info,
+    log_cudagraph_skip_and_bump_counter,
+    PlaceholderInfo,
+)
+from torch._inductor.debug import save_args_for_compile_fx_inner
+from torch._inductor.runtime.runtime_utils import cache_dir
+from torch._inductor.utils import (
+    BoxedBool,
+    count_tangents,
+    fresh_inductor_cache,
+    InputType,
+    is_gpu,
+    should_assume_input_aligned,
+    tensor_is_aligned,
+)
+from torch._logging import trace_structured
+from torch._ops import OpOverload
+from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols, SymExprPrinter
+from torch.fx.passes.fake_tensor_prop import FakeTensorProp
+from torch.monitor import _WaitCounter
+from .._dynamo.backends.common import aot_autograd
+from ..fx._lazy_graph_module import _use_lazy_graph_module  # type: ignore[attr-defined]
+from ..fx.graph import _PyTreeCodeGen
+from . import config, metrics
+from .debug import DebugContext
+from .decomposition import select_decomp_table
+from .fx_passes.joint_graph import joint_graph_passes
+from .fx_passes.post_grad import post_grad_passes, view_to_reshape
+from .fx_passes.pre_grad import pre_grad_passes
+from .graph import GraphLowering
+from .ir import ExternKernelNode
+from .utils import (
+    align_inputs_from_check_idxs,
+    clone_preserve_strides,
+    copy_misaligned_inputs,
+    get_cloned_parameter_buffer_name,
+    has_incompatible_cudagraph_ops,
+    maybe_get_suppress_shape_guards_ctx,
+    output_node,
+    remove_unaligned_input_idxs,
+    shape_env_from_inputs,
+)
+from .virtualized import V
+# Pick the `time_and_log` decorator factory depending on the build flavour.
+if config.is_fbcode():
+    # fbcode builds import the helpers from torch._inductor.fb.utils.
+    from torch._inductor.fb.utils import log_optimus_to_scuba, time_and_log
+else:
+    # no-op decorator: `attr` is ignored and dynamo_utils.identity is returned
+    # as the decorator (presumably leaving the decorated function unchanged).
+    def time_and_log(attr: str):
+        return dynamo_utils.identity
+# Module-level logger plus artifact loggers for the "perf_hints",
+# "post_grad_graphs" and "cudagraph_static_inputs" artifacts.
+log = logging.getLogger(__name__)
+perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints")
+post_grad_graphs_log = torch._logging.getArtifactLogger(__name__, "post_grad_graphs")
+static_inputs_log = torch._logging.getArtifactLogger(
+    __name__, "cudagraph_static_inputs"
+)
+# copy_ fails when trying to write to tensors with memory overlap,
+# for expanded dimensions (a dimension which used to have size 1 -> ?)
+# we can select one element from that dimension and write to it
+# to achieve writing to all values of that dimension of the input tensor
+def get_expanded_dims(t):
+    if not isinstance(t, torch.Tensor):
+        return None
+    return [i for i in range(t.ndim) if t.stride(i) == 0 and t.size(i) != 1]
+def index_expanded_dims(t: torch.Tensor, expanded_dims: List[int]) -> torch.Tensor:
+    for expanded_dim in expanded_dims:
+        t = torch.ops.aten.slice(t, expanded_dim, 0, 1)
+    return t
+def complex_memory_overlap(t: torch.Tensor) -> bool:
+    # if torch._debug_has_internal_overlap thinks this tensor potentially has
+    # memory overlap internally, let's dig deeper to find out whether it's true.
+    #
+    # Call squeeze() so that dimension with size 1 does not cause false positive.
+    t = index_expanded_dims(t, get_expanded_dims(t)).squeeze()
+    if torch._debug_has_internal_overlap(t) != 0:
+        strides = t.stride()
+        sizes = t.shape
+        indices = list(range(len(strides)))
+        indices = [x for _, x in sorted(zip(strides, indices))]
+        for i in range(len(strides)):
+            prev_stride = 1 if i == 0 else strides[indices[i - 1]]
+            prev_size = 1 if i == 0 else sizes[indices[i - 1]]
+            if strides[indices[i]] < prev_stride * prev_size:
+                return True
+    return False
+def get_static_input_idxs(num_fixed):
+    # If we are inlining NNModules, we treat all torch.nn.Parameters as static for the purposes
+    # of cudagraphs. Rather than copying these into cudagraph-owned memory
+    # like we do for normal inputs on each run, we will re-record a cudagraph if these
+    # parameter locations change.
+    context = torch._guards.TracingContext.try_get()
+    fixed = list(range(num_fixed))
+    if not context or not context.fw_metadata:
+        return fixed
+    return fixed + context.fw_metadata.static_input_indices
+@functools.lru_cache(None)
+def _step_logger():
+    return dynamo_logging.get_step_logger(log)
+@functools.lru_cache(None)
+def _warn_tf32_disabled():
+    if (
+        torch.cuda.is_available()
+        and not torch.backends.cuda.matmul.allow_tf32
+        and torch.cuda.get_device_capability() >= (8, 0)
+    ):
+        warnings.warn(
+            "TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. "
+            "Consider setting `torch.set_float32_matmul_precision('high')` for better performance."
+        )
+def _unlift_graph(mod, gm, graph_signature):
+    from torch.export.unflatten import _assign_attr, _AttrKind
+    state_dict = {}
+    for name, param in mod.named_parameters(remove_duplicate=False):
+        state_dict[name] = param
+        _assign_attr(
+            param,
+            gm,
+            name,
+            attr_kind=_AttrKind.PARAMETER,
+        )
+    for name, buffer in mod.named_buffers(remove_duplicate=False):
+        state_dict[name] = buffer
+        _assign_attr(
+            buffer,
+            gm,
+            name,
+            attr_kind=_AttrKind.BUFFER,
+        )
+    placeholder_nodes = gm.graph.find_nodes(op="placeholder")
+    lifted_inputs = []
+    # In AOTI, module parameters and buffers are not lifted as graph inputs.
+    # As a result, mutation to buffers has side effect which makes their initial
+    # values different from Eager. So we clone them here as a copy.
+    # We are not cloning for parameters, although it will be needed if we want to
+    # support training.
+    for node in placeholder_nodes:
+        node_name = node.name
+        if node_name in graph_signature.inputs_to_parameters:
+            parameter_name = graph_signature.inputs_to_parameters[node_name]
+            lifted_inputs.append(parameter_name)
+        elif node_name in graph_signature.inputs_to_buffers:
+            buffer_name = graph_signature.inputs_to_buffers[node_name]
+            lifted_inputs.append(buffer_name)
+            gm.meta[
+                get_cloned_parameter_buffer_name(buffer_name)
+            ] = clone_preserve_strides(state_dict[buffer_name])
+        else:
+            assert node_name in graph_signature.user_inputs
+            lifted_inputs.append(None)
+    from torch.export._unlift import _unlift
+    outputs = list(gm.graph.nodes)[-1].args[0]
+    mutated_outputs = []
+    buffer_mutations = graph_signature.buffers_to_mutate
+    user_input_mutations = graph_signature.user_inputs_to_mutate
+    output_tokens = graph_signature.output_tokens
+    for idx, out in enumerate(outputs):
+        value = None
+        if idx < len(buffer_mutations) + len(user_input_mutations) + len(output_tokens):
+            if out.name in buffer_mutations:
+                value = buffer_mutations[out.name]
+            elif out.name in user_input_mutations:
+                value = user_input_mutations[out.name]
+        mutated_outputs.append(value)
+    unlifted_gm = _unlift(
+        gm,
+        lifted_inputs,
+        mutated_outputs,
+        pytree.LeafSpec(),
+        None,
+        state_dict,
+        {},
+    )
+    return unlifted_gm
+def _get_subgraph_names(gm):
+    for node in sorted(
+        itertools.chain(
+            gm.graph.find_nodes(op="call_function", target=torch.ops.higher_order.cond),
+            gm.graph.find_nodes(
+                op="call_function", target=torch.ops.higher_order.while_loop
+            ),
+        )
+    ):
+        if node.target == torch.ops.higher_order.cond:
+            true_subgraph_name = node.args[1].name
+            false_subgraph_name = node.args[2].name
+            yield true_subgraph_name
+            yield false_subgraph_name
+        elif node.target == torch.ops.higher_order.while_loop:
+            cond_subgraph_name = node.args[0].name
+            body_subgraph_name = node.args[1].name
+            yield cond_subgraph_name
+            yield body_subgraph_name
+def _recursive_pre_grad_passes(gm, example_inputs):
+    for subgraph_name in _get_subgraph_names(gm):
+        subgraph = getattr(gm, subgraph_name)
+        # as we don't have recursive example inputs, passing None here
+        new_subgraph = _recursive_pre_grad_passes(subgraph, example_inputs=None)
+        setattr(gm, subgraph_name, new_subgraph)
+    return pre_grad_passes(gm, example_inputs)
+def _recursive_joint_graph_passes(gm):
+    for subgraph_name in _get_subgraph_names(gm):
+        subgraph = getattr(gm, subgraph_name)
+        _recursive_joint_graph_passes(subgraph)
+    joint_graph_passes(gm)
+def _recursive_post_grad_passes(gm, is_inference: bool = False):
+    for subgraph_name in _get_subgraph_names(gm):
+        subgraph = getattr(gm, subgraph_name)
+        _recursive_post_grad_passes(subgraph, is_inference)
+    post_grad_passes(gm, is_inference)
+def split_const_gm(
+    gm: torch.fx.GraphModule,
+    lifted_constants: Optional[Dict[str, Any]] = None,
+    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
+) -> Tuple[torch.fx.GraphModule, Dict[str, int]]:
+    """
+    This function takes an GraphModule input "gm".
+    The gm will be split into 2 components,
+      1) const_gm, which consists the subgraph of gm that can be constant folded.
+      2) gm (being inplace modified,) which returns the graph after constant folding.
+    If an additional "lifted_constants" argument is passed in, we will assume the gm has
+    been lifted and run the transformation accordingly.
+    When a "skip_folding_node_fn" callback is passed, we will skip constant folding on
+    the nodes for which the callback returns True.
+    const_output_index is a mapping of corresponding node name from gm to the
+    output index of const_gm.
+    Returns (const_gm, const_output_index)
+    """
+    from torch._inductor.constant_folding import (
+        CONST_MODULE_TAG,
+        META_TAG,
+        MODULE_TAG,
+        replace_node_with_constant,
+        run_and_get_constant_graph,
+    )
+    const_gm, const_result = run_and_get_constant_graph(
+        gm, lifted_constants, skip_folding_node_fn
+    )
+    const_outputs = {
+        x.name: idx for idx, x in enumerate(tuple(const_gm.graph.nodes)[-1].args[0])
+    }
+    to_erase_node = []
+    to_replace_node = []
+    const_output_index = {}
+    for node in gm.graph.nodes:
+        if node.name in const_outputs:
+            to_replace_node.append(node)
+        elif node.meta[META_TAG] == CONST_MODULE_TAG and node.op != "placeholder":
+            to_erase_node.append(node)
+    for node in to_replace_node:
+        new_const_name = "_FOLDED_CONST_" + node.name
+        replace_node_with_constant(
+            gm,
+            node,
+            const_result[const_outputs[node.name]],
+            new_const_name,
+        )
+        const_output_index[new_const_name] = const_outputs[node.name]
+    for node in to_erase_node[::-1]:
+        if node.users:
+            for n in node.users:
+                assert n.meta[META_TAG] == MODULE_TAG, f"node: {node} user not empty."
+        else:
+            gm.graph.erase_node(node)
+    gm.recompile()
+    return const_gm, const_output_index
+def is_tf32_warning_applicable(gm: torch.fx.GraphModule):
+    aten = torch.ops.aten
+    tf32_ops = {
+        aten.mm.default,
+        aten.addmm.default,
+        aten.bmm.default,
+        aten.baddbmm.default,
+    }
+    for target in tf32_ops:
+        for node in gm.graph.find_nodes(op="call_function", target=target):
+            if (
+                isinstance(node.meta.get("val", None), torch.Tensor)
+                and node.meta["val"].dtype == torch.float32
+                and node.meta["val"].device.type == "cuda"
+            ):
+                return True
+    return False
+def maybe_disable_comprehensive_padding(example_inputs: List[torch.Tensor]):
+    """
+    For CPU backend, enable comprehensive padding causes some unit tests
+    fail due to changing number of generated kernels. Skip for now.
+    """
+    has_gpu = any(
+        is_gpu(t.device.type) for t in example_inputs if isinstance(t, torch.Tensor)
+    )
+    if config.disable_padding_cpu and config.comprehensive_padding and not has_gpu:
+        perf_hint_log.info("Skip comprehensive padding on CPU")
+        return config.patch(comprehensive_padding=False)
+    else:
+        return contextlib.nullcontext()
+def fake_tensor_prop(
+    gm: torch.fx.GraphModule,
+    example_inputs: List[torch.Tensor],
+    force_allow_non_fake_inputs: bool = False,
+):
+    """
+    If we can not detect fake mode from the context of inputs, create one.
+    The created fake mode will be returned.
+    """
+    fake_mode = detect_fake_mode(example_inputs)
+    if not fake_mode:
+        fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
+        FakeTensorProp(gm, mode=fake_mode).propagate(*example_inputs)
+    else:
+        ctx = (
+            contextlib.nullcontext()
+            if not force_allow_non_fake_inputs
+            else mock.patch.object(fake_mode, "allow_non_fake_inputs", True)
+        )
+        with ctx:  # type: ignore[attr-defined]
+            FakeTensorProp(gm, mode=fake_mode).propagate_dont_convert_inputs(
+                *example_inputs
+            )
+    return fake_mode
+def should_use_remote_fx_graph_cache():
+    if config.fx_graph_remote_cache is not None:
+        return config.fx_graph_remote_cache
+    if not config.is_fbcode():
+        return False
+    if torch._utils_internal.is_fb_unit_test():
+        return False
+    try:
+        from torch._inductor.fb.remote_cache import REMOTE_CACHE_VERSION
+    except ModuleNotFoundError:
+        return False
+    jk_name = "pytorch/remote_cache:fx_graph_memcache_version"
+    if torch.version.hip is not None:
+        jk_name = "pytorch/remote_cache:fx_graph_memcache_version_amd"
+    return REMOTE_CACHE_VERSION >= torch._utils_internal.justknobs_getval_int(jk_name)
+# pass config dict back to user
+def get_patched_config_dict(config_patches=None) -> Dict[str, Any]:
+    with config.patch(config_patches):
+        return config.get_config_copy()
+@contextlib.contextmanager
+def with_fresh_cache_if_config():
+    if config.force_disable_caches:
+        # Don't delete the cache dir because it has to survive beyond the
+        # compile_fx call. Let's put the temp dirs under the default cache
+        # dir so they're easier to locate.
+        with fresh_inductor_cache(dir=cache_dir(), delete=False):
+            yield
+    else:
+        yield
+def compile_fx_inner(*args, **kwargs):
+    # Need with_fresh_cache_if_config for compile_fx_inner even if we already have one for
+    # compile_fx. The reason is the compilation for backward graph may happen after
+    # compile_fx return and we may want to use the _LazyGraphModule for compiling
+    # the backward graph as well.
+    with contextlib.ExitStack() as stack:
+        stack.enter_context(torch.utils._python_dispatch._disable_current_modes())
+        stack.enter_context(_use_lazy_graph_module(dynamo_config.use_lazy_graph_module))
+        stack.enter_context(
+            dynamo_utils.dynamo_timed(
+                "compile_fx_inner", phase_name="inductor_compile", fwd_only=False
+            )
+        )
+        stack.enter_context(with_fresh_cache_if_config())
+        stack.enter_context(DebugContext())
+        return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")(
+            *args, **kwargs
+        )
+@time_and_log(attr="compilation time (in seconds)")
+def _compile_fx_inner(
+    gm: torch.fx.GraphModule,
+    example_inputs: List[torch.Tensor],
+    cudagraphs: Optional[BoxedBool] = None,
+    static_input_idxs: Optional[List[int]] = None,
+    is_backward: bool = False,
+    graph_id: Optional[int] = None,
+    cpp_wrapper: bool = False,
+    aot_mode: bool = False,
+    is_inference: bool = False,
+    boxed_forward_device_index: Optional[BoxedDeviceIndex] = None,
+    user_visible_outputs: Optional[Dict[str, None]] = None,
+    layout_opt: Optional[bool] = None,
+    extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None,
+) -> Union[CompiledFxGraph, str]:
+    """
+    Inductor API that compiles a single graph.
+    If you change the argument list for this function, make sure you
+    also update the call to save_args_for_compile_fx_inner below accordingly.
+    Return value, as visible in this function:
+      - a boxed wrapper around gm.forward when the graph contains no calls
+        and aot_mode is False (nothing is compiled);
+      - whatever codegen_and_compile returns (a string, per the comments
+        below) when aot_mode is True;
+      - otherwise the compiled graph, loaded through FxGraphCache or freshly
+        compiled and post-processed, with `_boxed_call` set to True.
+    """
+    # Fast path: a graph without any calls needs no compilation.
+    if dynamo_utils.count_calls(gm.graph) == 0 and not aot_mode:
+        # trigger the real recompilation for _LazyGraphModule before returning
+        # the forward method.
+        from torch.fx._lazy_graph_module import _LazyGraphModule
+        _LazyGraphModule.force_recompile(gm)
+        return make_boxed_func(gm.forward)
+    if static_input_idxs is None:
+        static_input_idxs = []
+    static_inputs_log.debug("static input idxs compile_fx_inner: %s", static_input_idxs)
+    # The last node of the graph (the output node) must return a tuple/list.
+    assert isinstance(
+        next(iter(reversed(gm.graph.nodes))).args[0], (tuple, list)
+    ), f"inductor can only compile FX graphs which return a tuple/list, but got {gm.graph}"
+    if config.save_args:
+        # NOTE(review): extern_node_serializer is not forwarded here, unlike
+        # the other arguments - confirm this is intentional.
+        save_args_for_compile_fx_inner(
+            gm,
+            example_inputs,
+            cudagraphs=cudagraphs,
+            static_input_idxs=static_input_idxs,
+            is_backward=is_backward,
+            graph_id=graph_id,
+            cpp_wrapper=cpp_wrapper,
+            aot_mode=aot_mode,
+            is_inference=is_inference,
+            boxed_forward_device_index=boxed_forward_device_index,
+            user_visible_outputs=user_visible_outputs,
+            layout_opt=layout_opt,
+        )
+    # Default the cudagraphs flag from the config when the caller gave none.
+    if cudagraphs is None:
+        cudagraphs = BoxedBool(config.triton.cudagraphs)
+    # Inputs to fx_codegen_and_compile
+    # Anything that affects codegen should go here, so if the signature
+    # of fx_codegen_and_compile changes, the dict should be updated accordingly
+    graph_kwargs = {
+        "cudagraphs": cudagraphs,
+        "static_input_idxs": static_input_idxs,
+        "is_backward": is_backward,
+        "graph_id": graph_id,
+        "cpp_wrapper": cpp_wrapper,
+        "aot_mode": aot_mode,
+        "is_inference": is_inference,
+        "user_visible_outputs": user_visible_outputs,
+        "layout_opt": layout_opt,
+        "extern_node_serializer": extern_node_serializer,
+    }
+    # Start of the interval reported by the "took %.3fs" debug log below.
+    start = time.time()
+    fx_graph_remote_cache = should_use_remote_fx_graph_cache()
+    inputs_to_check = get_input_idxs_to_check(example_inputs, static_input_idxs)  # type: ignore[arg-type]
+    def codegen_and_compile(
+        gm,
+        example_inputs,
+        inputs_to_check,
+        fx_kwargs,
+    ):
+        """
+        This function calls fx_codegen_and_compile and also adds some extra metadata to the resulting
+        compiled fx graph. The metadata is saved to FXGraphCache.
+        Note: `cudagraphs`, `static_input_idxs` and `boxed_forward_device_index`
+        are read from the enclosing scope rather than from `fx_kwargs`.
+        """
+        compiled_graph = fx_codegen_and_compile(gm, example_inputs, **fx_kwargs)
+        if isinstance(compiled_graph, str):
+            # We only return a string in aot mode, in which case we don't
+            # need to do any post-compilation steps: we just return the string,
+            # which is the filename of the compiled code.
+            return compiled_graph
+        # Stays None unless cudagraphs are enabled and lowering did not
+        # already record a reason to disable them.
+        cudagraph_info = None
+        if cudagraphs:
+            # check cudagraph disabling reasons from inductor lowering
+            if compiled_graph.disabled_cudagraphs_reason:
+                if "cuda" in compiled_graph.device_types:
+                    log_cudagraph_skip_and_bump_counter(
+                        f"skipping cudagraphs due to {compiled_graph.disabled_cudagraphs_reason}"
+                    )
+                else:
+                    # no cuda device involved: only bump the skip counter
+                    counters["inductor"]["cudagraph_skips"] += 1
+                BoxedBool.disable(cudagraphs)
+            else:
+                complex_memory_overlap_inputs = any(
+                    complex_memory_overlap(t)
+                    for t in example_inputs
+                    if isinstance(t, torch.Tensor)
+                )
+                if not config.triton.cudagraph_support_input_mutation:
+                    # Skip supports for cudagraph-managed tensors
+                    from torch._inductor.cudagraph_utils import (
+                        check_for_mutation_ignore_cuda_graph_managed_tensor,
+                    )
+                    has_mutation_str = (
+                        check_for_mutation_ignore_cuda_graph_managed_tensor(
+                            gm,
+                            compiled_graph,
+                            static_input_idxs,  # type:ignore[arg-type]
+                        )
+                    )
+                    has_mutation = has_mutation_str is not None
+                    if has_mutation:
+                        compiled_graph.disabled_cudagraphs_reason = has_mutation_str
+                else:
+                    # Check mutation later to support cudagraph-managed tensors
+                    has_mutation = None
+                # (passed, reason) pairs; the reasons of the failed checks are
+                # collected into cudagraph_fail_reasons below.
+                cudagraph_tests = [
+                    (not has_mutation, "mutated inputs"),
+                    (not has_incompatible_cudagraph_ops(gm), "incompatible ops"),
+                    (not complex_memory_overlap_inputs, "complex memory overlap"),
+                    (
+                        all(
+                            isinstance(t, (torch.Tensor, torch.SymInt))
+                            for t in example_inputs
+                        ),
+                        "non-Tensor inputs",
+                    ),
+                ]
+                output = output_node(gm)
+                # output args are tuple of first argument
+                assert len(output.args) == 1
+                # one stack trace per graph output; None for non-Node outputs
+                stack_traces = [
+                    (arg.stack_trace if isinstance(arg, torch.fx.node.Node) else None)
+                    for arg in output.args[0]
+                ]
+                cudagraph_fail_reasons = [s for b, s in cudagraph_tests if not b]
+                placeholders = tuple(get_placeholder_info(gm.graph))
+                cudagraph_info = CudagraphCachedInfo(
+                    placeholders, stack_traces, cudagraph_fail_reasons
+                )
+        # Attach the extra metadata to the compiled graph.
+        compiled_graph.cudagraph_info = cudagraph_info
+        compiled_graph.inputs_to_check = inputs_to_check
+        compiled_graph.fx_kwargs = fx_kwargs
+        # TODO: should this be part of fx_kwargs
+        compiled_graph.boxed_forward_device_index = boxed_forward_device_index
+        return compiled_graph
+    with _WaitCounter("pytorch.wait_counter.fx_codegen_and_compile").guard() as _:
+        # Go through FxGraphCache only when caches are not force-disabled, a
+        # local or remote FX graph cache is enabled, and we are not in AOT mode.
+        if (
+            not config.force_disable_caches
+            and (config.fx_graph_cache or fx_graph_remote_cache)
+            and not aot_mode
+        ):
+            # Tag CUDA tensor inputs whose index is in static_input_idxs.
+            for i, input in enumerate(example_inputs):
+                if (
+                    isinstance(input, torch.Tensor)
+                    and input.device.type == "cuda"
+                    and i in static_input_idxs
+                ):
+                    input._is_inductor_static = True  # type: ignore[attr-defined]
+            compiled_graph = FxGraphCache.load(
+                codegen_and_compile,
+                gm,
+                example_inputs,
+                graph_kwargs,
+                inputs_to_check,
+                local=config.fx_graph_cache,
+                remote=fx_graph_remote_cache,
+            )
+        else:
+            compiled_graph = codegen_and_compile(
+                gm, example_inputs, inputs_to_check, graph_kwargs  # type: ignore[arg-type]
+            )
+            if aot_mode:
+                # AOT mode is special because codegen_and_compile returns a string.
+                # In that case, we don't need to run all post compilation steps, we just need
+                # to return the string directly.
+                return compiled_graph
+            compiled_graph = FxGraphCache.post_compile(
+                compiled_graph, example_inputs, cudagraphs
+            )
+    log.debug("FX codegen and compilation took %.3fs", time.time() - start)
+    _step_logger()(
+        logging.INFO,
+        "torchinductor done compiling "
+        f"{'BACKWARDS' if is_backward else 'FORWARDS'} "
+        f"graph {graph_id}",
+    )
+    # aot autograd needs to know to pass in inputs as a list
+    compiled_graph._boxed_call = True
+    return compiled_graph
+def fx_codegen_and_compile(
+    gm: torch.fx.GraphModule,
+    example_inputs: List[torch.Tensor],
+    cudagraphs: Optional[BoxedBool] = None,
+    static_input_idxs: Optional[List[int]] = None,
+    is_backward: bool = False,
+    graph_id: Optional[int] = None,
+    cpp_wrapper: bool = False,
+    aot_mode: bool = False,
+    is_inference: bool = False,
+    # Use a dict with None value rather than a set for deterministic
+    # iteration order just in case.
+    user_visible_outputs: Optional[Dict[str, None]] = None,
+    layout_opt: Optional[bool] = None,
+    extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None,
+) -> Union[CompiledFxGraph, str]:
+    if (sleep_sec := config.sleep_sec_TESTING_ONLY) is not None:
+        import time
+        log.warning("Sleeping for %s since sleep_sec_TESTING_ONLY is set", sleep_sec)
+        time.sleep(sleep_sec)
+    with dynamo_utils.preserve_rng_state():
+        if is_tf32_warning_applicable(gm):
+            _warn_tf32_disabled()
+        inductor_counters = counters["inductor"].copy()
+        # lift the maximum depth of the Python interpreter stack
+        # to adapt large/deep models
+        sys.setrecursionlimit(max(sys.getrecursionlimit(), 2000))
+        _step_logger()(
+            logging.INFO,
+            "torchinductor compiling "
+            f"{'BACKWARDS' if is_backward else 'FORWARDS'} "
+            f"graph {graph_id}",
+        )
+        def log_graph_runnable():
+            fd = io.StringIO()
+            torch._dynamo.repro.after_aot.save_graph_repro(
+                fd, gm, example_inputs, "inductor", save_dir=None
+            )
+            return fd.getvalue()
+        torch._logging.trace_structured(
+            "artifact",
+            metadata_fn=lambda: {
+                "name": "fx_graph_runnable",
+                "encoding": "string",
+            },
+            payload_fn=lambda: log_graph_runnable(),
+        )
+        V.debug.fx_graph(gm, example_inputs)
+        # TODO: Should we actually dump this?  It should be redundant with the aot
+        # structured logs...
+        # trace_structured("inductor_input_graph", payload_fn=lambda: gm.print_readable(print_output=False))
+        shape_env = shape_env_from_inputs(example_inputs)
+        # Convert view to reshape in the graph. This is necessary primarily for
+        # layout optimization. Do it unconditionally for uniformity.
+        #
+        # It's needed because when we do layout optimization, an contiguous tensor
+        # in eager mode may becomes a channels last tensor. A view op previously
+        # can be applied to the contiguous tensor may not be able to be applied
+        # on the channels tensor any more. An error like
+        #   RuntimeError: view size is not compatible with input tensor's size and stride
+        #   (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
+        # will be printed.
+        #
+        # Replace view op to reshape op in this case.
+        # As an example, timm_resnest/botnet26t_256/convnext_base etc. will fail if we don't do this.
+        #
+        # Also this has to be done before FakeTensorProp below to avoid the failed
+        # .view() call.
+        view_to_reshape(gm)
+        # It is safe to run FakeTensorProp under no_grad because by the time
+        # we're in inductor, we assume that AOTAutograd has already "taken care"
+        # of autograd, so there should be no more autograd-related API's in the
+        # graph.
+        with torch.no_grad():
+            fake_mode = fake_tensor_prop(gm, example_inputs)
+        # pattern matcher passes might not preserve striding information
+        # on node.meta["val"]. if in the future we rely on these being
+        # correct we will need to fix.
+        with V.set_fake_mode(fake_mode):
+            # has some issues with memory in training
+            _recursive_post_grad_passes(gm, is_inference=is_inference)
+            V.debug.fx_graph_transformed(gm, example_inputs)
+            post_grad_graphs_log.debug(
+                "%s",
+                lazy_format_graph_code(
+                    "AFTER POST GRAD",
+                    gm,
+                    include_stride=True,
+                    include_device=True,
+                    colored=True,
+                ),
+            )
+            trace_structured(
+                "inductor_post_grad_graph",
+                payload_fn=lambda: gm.print_readable(
+                    print_output=False, include_stride=True, include_device=True
+                ),
+            )
+            if config.is_fbcode():
+                log_optimus_to_scuba(
+                    extra_logging={"pt2_configs": str(get_patched_config_dict())}
+                )
+        with V.set_fake_mode(fake_mode), maybe_disable_comprehensive_padding(
+            example_inputs
+        ):
+            const_output_index = None
+            const_graph = None
+            const_code = None
+            if aot_mode and config.aot_inductor.use_runtime_constant_folding:
+                const_gm, const_output_index = split_const_gm(gm)
+                const_graph = GraphLowering(
+                    const_gm,
+                    example_inputs=[],
+                    shape_env=shape_env,
+                    graph_id=graph_id,
+                    cpp_wrapper=cpp_wrapper,
+                    aot_mode=aot_mode,
+                    user_visible_outputs=user_visible_outputs,
+                    extern_node_serializer=extern_node_serializer,
+                    is_inference=is_inference,
+                    is_const_graph=True,
+                )
+                with V.set_graph_handler(const_graph):
+                    assert cpp_wrapper, "AOT mode only supports C++ wrapper"
+                    const_graph.run()
+                    const_code, _ = const_graph.codegen_with_cpp_wrapper()
+            graph = GraphLowering(
+                gm,
+                # example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.
+                # For the forward pass, we have the real inputs to be used as example_inputs. For the backward pass,
+                # we currently use fake tensors and defake them later.
+                example_inputs=example_inputs,
+                shape_env=shape_env,
+                graph_id=graph_id,
+                cpp_wrapper=cpp_wrapper,
+                aot_mode=aot_mode,
+                user_visible_outputs=user_visible_outputs,
+                extern_node_serializer=extern_node_serializer,
+                is_inference=is_inference,
+                const_output_index=const_output_index,
+                const_code=const_code,
+                const_module=const_graph,
+            )
+            metrics_helper = metrics.CachedMetricsHelper()
+            with V.set_graph_handler(graph):
+                graph.run(*example_inputs)
+                output_strides: List[Optional[Tuple[_StrideExprStr, ...]]] = []
+                if graph.graph_outputs is not None:
+                    # We'll put the output strides in the compiled graph so we
+                    # can later return them to the caller via TracingContext
+                    p = SymExprPrinter()
+                    for out in graph.graph_outputs:
+                        if (
+                            hasattr(out, "layout")
+                            and len(free_unbacked_symbols(out.layout.stride)) == 0
+                        ):
+                            # Convert to string for eval on the load path
+                            output_strides.append(
+                                tuple(p.doprint(s) for s in out.layout.stride)
+                            )
+                        else:
+                            output_strides.append(None)
+                _check_triton_bf16_support(graph)
+                compiled_fn = graph.compile_to_fn()
+                num_bytes, nodes_num_elem, node_runtimes = graph.count_bytes()
+                metrics.num_bytes_accessed += num_bytes
+                metrics.node_runtimes += node_runtimes
+                metrics.nodes_num_elem += nodes_num_elem
+                if (
+                    cudagraphs
+                    and config.triton.cudagraph_skip_dynamic_graphs
+                    and not V.graph.disable_cudagraphs_reason
+                    and torch._inductor.utils.any_is_symbolic(*example_inputs)
+                ):
+                    stack_trace = None
+                    for node in gm.graph.nodes:
+                        meta_val = node.meta.get("val", None)
+                        if (
+                            node.op == "placeholder"
+                            or not isinstance(meta_val, torch.Tensor)
+                            or not torch._inductor.utils.any_is_symbolic(meta_val)
+                        ):
+                            continue
+                        if stack_trace := node.meta.get("stack_trace", None):
+                            break
+                    disable = "graph with symbolic shapes inputs and config.triton.cudagraph_skip_dynamic_graphs=True."
+                    if stack_trace:
+                        disable = f"{disable} Found from {stack_trace}\n"
+                    else:
+                        disable = f"{disable}\n"
+                    V.graph.disable_cudagraphs_reason = disable
+                if V.aot_compilation is True:
+                    return compiled_fn
+                if cudagraphs and not V.graph.disable_cudagraphs_reason:
+                    from torch._inductor.cudagraph_utils import (
+                        check_lowering_disable_cudagraph,
+                    )
+                    V.graph.disable_cudagraphs_reason = (
+                        check_lowering_disable_cudagraph(V.graph.device_node_mapping)
+                    )
+                compiled_graph = CompiledFxGraph(
+                    compiled_fn,
+                    graph,
+                    output_strides,
+                    V.graph.disable_cudagraphs_reason,
+                    metrics_helper.get_deltas(),
+                    counters["inductor"] - inductor_counters,
+                )
+        return compiled_graph
+def get_input_idxs_to_check(
+    inputs: List[InputType],
+    static_input_idxs: Sequence[int],
+) -> Sequence[int]:
+    """
+    This function runs at compile time, and generates a list of indices for which we
+    might need to do a copy to preserve alignment requirements.
+    """
+    ids_to_check = []
+    for i, input in enumerate(inputs):
+        if not isinstance(input, torch.Tensor):
+            # non-tensors don't need alignment
+            continue
+        if not is_gpu(input.device.type):
+            # right now we only care for gpu tensors
+            continue
+        with maybe_get_suppress_shape_guards_ctx():
+            # suppress guards so that tensor_is_aligned and should_assume_input_aligned
+            # do not add guards on input's storage offset
+            if i in static_input_idxs and tensor_is_aligned(input):
+                continue
+            if not should_assume_input_aligned(input):
+                continue
+        # if we get here, then
+        # (a) our triton code assumes that the input is aligned
+        # (b) we can't be sure ahead of time that the input will actually be aligned.
+        # therefore, at runtime, we'll need to check that the input is aligned
+        # (and if not, clone it to make it aligned.)
+        ids_to_check.append(i)
+    return ids_to_check
+def cudagraphify(
+    model: Callable[..., Any],
+    static_input_idxs: Sequence[int] = (),
+    *,
+    device_index: int,
+    stack_traces: List[Optional[str]],
+    is_backward: bool,
+    is_inference: bool,
+    constants: Tuple[torch.Tensor, ...] = (),
+    placeholders: Sequence[PlaceholderInfo] = (),
+    mutated_input_idxs: Tuple[int, ...] = (),
+) -> Callable[..., Any]:
+    from torch._inductor.cudagraph_trees import (
+        cudagraphify_impl as new_cudagraphify_impl,
+    )
+    cudagraphify_fn: Callable[..., Any]
+    if config.triton.cudagraph_trees:
+        cudagraphify_fn = functools.partial(
+            new_cudagraphify_impl,
+            device_index=device_index,
+            stack_traces=stack_traces,
+            is_backward=is_backward,
+            is_inference=is_inference,
+            constants=constants,
+            placeholders=placeholders,
+            mutated_input_idxs=mutated_input_idxs,
+        )
+    else:
+        cudagraphify_fn = cudagraphify_impl
+    compiled_fn = None
+    def run(new_inputs):
+        nonlocal compiled_fn
+        if compiled_fn is None:
+            with dynamo_utils.dynamo_timed(
+                "cudagraphify"
+            ), dynamo_utils.preserve_rng_state():
+                compiled_fn = cudagraphify_fn(model, new_inputs, static_input_idxs)
+        return compiled_fn(new_inputs)
+    return run
+def static_input(x: torch.Tensor) -> torch.Tensor:
+    """
+    Copy and input while preserving strides
+    """
+    return torch.empty_strided(x.size(), x.stride(), dtype=x.dtype, device=x.device)
+def index_expanded_dims_and_copy_(
+    dst: torch.Tensor,
+    src: torch.Tensor,
+    expanded_dims: List[int],
+):
+    "Index into expanded dimensions of both dst and src then copy_"
+    dst = index_expanded_dims(dst, expanded_dims)
+    src = index_expanded_dims(src, expanded_dims)
+    dst.copy_(src)
+def cudagraphify_impl(
+    model: Callable[..., Any],
+    inputs: List[torch.Tensor],
+    static_input_idxs: Sequence[int] = (),
+):
+    """
+    Assumes inputs[static_input_idxs[i]] are always the same memory address
+    """
+    check_input_idxs = get_input_idxs_to_check(inputs, static_input_idxs)  # type: ignore[arg-type]
+    static_input_idxs = remove_unaligned_input_idxs(inputs, static_input_idxs)  # type: ignore[arg-type]
+    copy_misaligned_inputs(inputs, check_input_idxs)  # type: ignore[arg-type]
+    assert isinstance(inputs, list)
+    inps_expanded_dims = [
+        get_expanded_dims(x) if idx not in static_input_idxs else []
+        for idx, x in enumerate(inputs)
+    ]
+    # allocate static tensor inputs
+    static_inputs = [
+        x
+        if not isinstance(x, torch.Tensor)
+        else static_input(x)
+        if idx not in static_input_idxs
+        else x.detach()
+        for idx, x in enumerate(inputs)
+    ]
+    # copy over input values for fresh allocations
+    for idx, (x, expanded_dims) in enumerate(zip(inputs, inps_expanded_dims)):
+        if isinstance(x, torch.Tensor) and idx not in static_input_idxs:
+            index_expanded_dims_and_copy_(static_inputs[idx], x, expanded_dims)
+    # warmup
+    torch.cuda.synchronize()
+    stream = torch.cuda.Stream()
+    stream.wait_stream(torch.cuda.current_stream())
+    # copy static_inputs because it will be cleared in model
+    with torch.cuda.stream(stream):
+        model(list(static_inputs))
+    stream.synchronize()
+    torch.cuda.current_stream().wait_stream(stream)
+    torch.cuda.synchronize()
+    # record
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph, stream=stream, capture_error_mode="thread_local"):
+        static_outputs = model(list(static_inputs))
+    if not isinstance(static_outputs, (list, tuple)):
+        static_outputs = (static_outputs,)
+    if config.size_asserts:
+        def run(new_inputs):
+            assert len(static_inputs) == len(new_inputs)
+            for idx, (dst, src, expanded_dims) in enumerate(
+                zip(static_inputs, new_inputs, inps_expanded_dims)
+            ):
+                if not isinstance(dst, torch.Tensor):
+                    pass
+                elif idx in static_input_idxs:
+                    assert dst.data_ptr() == src.data_ptr()
+                else:
+                    # TODO - could make one single op of multiple slices
+                    # and avoid dispatch.
+                    # Could also pre-index the `dst` tensors
+                    index_expanded_dims_and_copy_(dst, src, expanded_dims)
+            new_inputs.clear()
+            graph.replay()
+            return static_outputs
+    else:
+        copy_indices = [
+            idx for idx in range(len(static_inputs)) if idx not in static_input_idxs
+        ]
+        def run(new_inputs):
+            for idx in copy_indices:
+                expanded_dims = inps_expanded_dims[idx]
+                index_expanded_dims_and_copy_(
+                    static_inputs[idx], new_inputs[idx], expanded_dims
+                )
+            new_inputs.clear()
+            graph.replay()
+            return static_outputs
+    return align_inputs_from_check_idxs(run, check_input_idxs)
+def compile_fx_aot(
+    model_: torch.fx.GraphModule,
+    example_inputs_: List[torch.Tensor],
+    inner_compile: Callable[..., Any] = compile_fx_inner,
+    config_patches: Optional[Dict[str, Any]] = None,
+):
+    config_patches: Dict[str, Any] = (
+        {"cpp_wrapper": True}
+        if config_patches is None
+        else {**config_patches, "cpp_wrapper": True}
+    )
+    if (
+        "aot_inductor.output_path" not in config_patches
+        and not config.aot_inductor.output_path
+    ):
+        config_patches = {
+            **config_patches,
+            "aot_inductor.output_path": code_hash(model_.code),
+        }
+    extern_node_serializer = config_patches.pop("extern_node_serializer", None)
+    with V.set_aot_compilation(True):
+        compiled_lib_path = compile_fx(
+            model_,
+            example_inputs_,
+            inner_compile=functools.partial(
+                inner_compile,
+                aot_mode=True,
+                extern_node_serializer=extern_node_serializer,
+            ),
+            config_patches=config_patches,
+        )
+        assert os.path.exists(
+            compiled_lib_path
+        ), f"AOTInductor compiled library does not exist at {compiled_lib_path}"
+        return compiled_lib_path
+# Module-level counter; compile_fx takes the next value from it as the
+# graph_id of each compilation.
+_graph_counter = count(0)
+def fw_compiler_freezing(
+    aot_autograd_model: torch.fx.GraphModule,
+    aot_example_inputs: List[torch.Tensor],
+    dynamo_model: torch.fx.GraphModule,
+    num_example_inputs: int,
+    inner_compile: Callable[..., Any],
+    cudagraphs: BoxedBool,
+    graph_id: int,
+    forward_device: BoxedDeviceIndex,
+):
+    """
+    Inference compiler used by compile_fx when config.freezing is enabled.
+    Runs the joint-graph passes, optionally converts conv weights to channels
+    last, freezes the graph with ``freeze`` and compiles the result with
+    ``inner_compile`` (is_inference=True, every graph output user visible).
+    Returns:
+        Under V.aot_compilation, the result of ``inner_compile`` unchanged.
+        Otherwise a boxed-call wrapper that keeps only the arguments listed in
+        ``preserved_arg_indices``, clears the caller's argument list and calls
+        the compiled function.
+    """
+    from torch._inductor.freezing import convert_conv_weights_to_channels_last, freeze
+    # partition_fn won't be called
+    _recursive_joint_graph_passes(aot_autograd_model)
+    layout_opt = GraphLowering.decide_layout_opt(aot_autograd_model, is_inference=True)
+    if layout_opt:
+        # make sure meta['val'] is properly setup
+        fake_tensor_prop(aot_autograd_model, aot_example_inputs, True)
+        convert_conv_weights_to_channels_last(aot_autograd_model)
+    opt_model, preserved_arg_indices = freeze(
+        dynamo_model,
+        aot_autograd_model,
+        aot_example_inputs,  # type: ignore[arg-type]
+    )
+    # Keep only the example inputs that the frozen graph still takes.
+    aot_example_inputs = [aot_example_inputs[ind] for ind in preserved_arg_indices]
+    # Preserved arguments beyond the original example inputs; they are treated
+    # as static inputs below (presumably the surviving parameters -- confirm).
+    num_fixed = len(preserved_arg_indices) - num_example_inputs
+    fake_mode = detect_fake_mode(aot_example_inputs)
+    # for freezing, all graph outputs should be user visible
+    *_, model_outputs_node = opt_model.graph.nodes
+    model_outputs = model_outputs_node.args[0]
+    user_visible_outputs = dict.fromkeys(
+        n.name for n in model_outputs if isinstance(n, torch.fx.Node)
+    )
+    static_input_idxs = list(range(num_fixed))
+    # constant params will be real tensors, not fake
+    tracing_context = torch._guards.TracingContext.try_get()
+    if tracing_context is not None:
+        params_flat = tracing_context.params_flat
+        assert params_flat is not None
+        # Drop the tracing context's references to params that were not preserved.
+        for i in range(len(params_flat)):
+            if i not in preserved_arg_indices:
+                params_flat[i] = None
+        if tracing_context.fw_metadata:
+            static_input_idxs += tracing_context.fw_metadata.static_input_indices
+    # Real (non-fake) tensors are allowed for the duration of the compile.
+    with mock.patch.object(fake_mode, "allow_non_fake_inputs", True):
+        optimized_function = inner_compile(
+            opt_model,
+            aot_example_inputs,
+            static_input_idxs=static_input_idxs,
+            cudagraphs=cudagraphs,
+            graph_id=graph_id,
+            is_inference=True,
+            boxed_forward_device_index=forward_device,
+            layout_opt=layout_opt,
+            user_visible_outputs=user_visible_outputs,
+        )
+    # aot_inductor codegens a call that takes in just the inputs, so we don't return a wrapper
+    # that drops constant-ified params
+    if V.aot_compilation is True:
+        return optimized_function
+    def wrapper(args):
+        # Select the preserved arguments, then clear the caller's list.
+        args_new = [args[i] for i in preserved_arg_indices]
+        args.clear()
+        return optimized_function(args_new)
+    wrapper._boxed_call = True  # type: ignore[attr-defined]
+    return wrapper
+def compile_fx(
+    model_: torch.fx.GraphModule,
+    example_inputs_: List[torch.Tensor],
+    inner_compile: Callable[..., Any] = compile_fx_inner,
+    config_patches: Optional[Dict[str, Any]] = None,
+    decompositions: Optional[Dict[OpOverload, Callable[..., Any]]] = None,
+):
+    """
+    Main entry point for compiling an FX graph with Inductor.
+    The function normalises its input by calling itself again:
+    - with ``config_patches``, under config.patch and with ``inner_compile``
+      patched the same way;
+    - with config.cpp_wrapper set, under a patched config and with
+      ``inner_compile`` bound to cpp_wrapper=True;
+    - for graphs that do not return a tuple, for graphs using _PyTreeCodeGen,
+      and for list/tuple/dict example inputs, through
+      make_graph_return_tuple, handle_dynamo_export_graph and
+      flatten_graph_inputs respectively.
+    Once normalised, it builds forward, backward and inference compilers around
+    ``inner_compile`` and hands them to aot_autograd. Under V.aot_compilation it
+    instead exports the module with aot_export_module and returns the result of
+    the inference compiler directly.
+    Args:
+        model_: graph module to compile.
+        example_inputs_: example inputs for the graph.
+        inner_compile: callable used to compile each individual graph.
+        config_patches: optional config overrides applied for this compile.
+        decompositions: optional decomposition table; defaults to
+            select_decomp_table().
+    """
+    with _use_lazy_graph_module(dynamo_config.use_lazy_graph_module):
+        """Main entrypoint to a compile given FX graph"""
+        if config_patches:
+            with config.patch(config_patches):
+                return compile_fx(
+                    model_,
+                    example_inputs_,
+                    # need extra layer of patching as backwards is compiled out of scope
+                    inner_compile=config.patch(config_patches)(inner_compile),
+                    decompositions=decompositions,
+                )
+        if config.cpp_wrapper:
+            with config.patch(
+                {
+                    "cpp_wrapper": False,
+                    # For triton.autotune_at_compile_time, disable by default for
+                    # FBCode, but enabled by default for OSS.
+                    "triton.autotune_at_compile_time": config.triton.autotune_at_compile_time
+                    if config.is_fbcode()
+                    else os.environ.get(
+                        "TORCHINDUCTOR_TRITON_AUTOTUNE_AT_COMPILE_TIME", "1"
+                    )
+                    == "1",
+                    "triton.autotune_cublasLt": False,
+                    "triton.cudagraphs": False,
+                    "triton.store_cubin": True,
+                }
+            ), V.set_real_inputs(example_inputs_):
+                inputs_ = example_inputs_
+                if isinstance(model_, torch.fx.GraphModule):
+                    # Use the placeholders' recorded meta["val"] values as inputs,
+                    # but only if every placeholder has one.
+                    fake_inputs = [
+                        node.meta.get("val")
+                        for node in model_.graph.nodes
+                        if node.op == "placeholder"
+                    ]
+                    if all(v is not None for v in fake_inputs):
+                        # Validate devices before switching to fake tensors.
+                        for idx, fi, i in zip(count(), fake_inputs, inputs_):
+                            if fi.device != i.device:
+                                raise ValueError(
+                                    f"Device mismatch between fake input and example input at position #{idx}: "
+                                    f"{fi.device} vs {i.device}. If the model was exported via torch.export(), "
+                                    "make sure torch.export() and torch.aot_compile() run on the same device."
+                                )
+                        inputs_ = fake_inputs
+                return compile_fx(
+                    model_,
+                    inputs_,
+                    inner_compile=functools.partial(inner_compile, cpp_wrapper=True),
+                    decompositions=decompositions,
+                )
+        # Re-entry point handed to the graph-normalising helpers below.
+        recursive_compile_fx = functools.partial(
+            compile_fx,
+            inner_compile=inner_compile,
+            decompositions=decompositions,
+        )
+        if not graph_returns_tuple(model_):
+            return make_graph_return_tuple(
+                model_,
+                example_inputs_,
+                recursive_compile_fx,
+            )
+        if isinstance(model_, torch.fx.GraphModule):
+            if isinstance(model_.graph._codegen, _PyTreeCodeGen):
+                # this graph is the result of dynamo.export()
+                return handle_dynamo_export_graph(
+                    model_,
+                    example_inputs_,
+                    recursive_compile_fx,
+                )
+            model_ = _recursive_pre_grad_passes(model_, example_inputs_)
+        if any(isinstance(x, (list, tuple, dict)) for x in example_inputs_):
+            return flatten_graph_inputs(
+                model_,
+                example_inputs_,
+                recursive_compile_fx,
+            )
+        assert not config._raise_error_for_testing
+        num_example_inputs = len(example_inputs_)
+        # Boxed values shared by the forward, backward and inference compilers.
+        cudagraphs = BoxedBool(config.triton.cudagraphs)
+        forward_device = BoxedDeviceIndex(None)
+        graph_id = next(_graph_counter)
+        decompositions = (
+            decompositions if decompositions is not None else select_decomp_table()
+        )
+        def fw_compiler_base(
+            model: torch.fx.GraphModule,
+            example_inputs: List[torch.Tensor],
+            is_inference: bool,
+        ):
+            # Timing wrapper around _fw_compiler_base.
+            with dynamo_utils.dynamo_timed("compile_fx.<locals>.fw_compiler_base"):
+                return _fw_compiler_base(model, example_inputs, is_inference)
+        def _fw_compiler_base(
+            model: torch.fx.GraphModule,
+            example_inputs: List[torch.Tensor],
+            is_inference: bool,
+        ):
+            if is_inference:
+                # partition_fn won't be called
+                _recursive_joint_graph_passes(model)
+            fixed = torch._inductor.utils.num_fw_fixed_arguments(
+                num_example_inputs, len(example_inputs)
+            )
+            user_visible_outputs = {}
+            if config.keep_output_stride:
+                model_outputs_node = output_node(model)
+                model_outputs = pytree.arg_tree_leaves(*model_outputs_node.args)
+                num_model_outputs = len(model_outputs)
+                context = torch._guards.TracingContext.try_get()
+                # See Note [User Outputs in the inductor graph]
+                if context is not None and context.fw_metadata and not is_inference:
+                    original_output_start_index = (
+                        context.fw_metadata.num_mutated_inp_runtime_indices
+                    )
+                else:
+                    original_output_start_index = 0
+                if isinstance(model_, torch.fx.GraphModule):
+                    *_, orig_model_outputs_node = model_.graph.nodes
+                    assert orig_model_outputs_node.op == "output"
+                    orig_model_outputs, _ = pytree.tree_flatten(
+                        orig_model_outputs_node.args
+                    )
+                    num_orig_model_outputs = len(orig_model_outputs)
+                else:
+                    num_orig_model_outputs = num_model_outputs
+                assert num_orig_model_outputs <= num_model_outputs
+                # Note [User Outputs in the inductor graph]
+                # We make the following assumptions
+                # For inference
+                #   len(orig_model_outputs) == len(model_outputs)
+                # For training
+                #   len(orig_model_outputs) <= len(model_outputs)
+                # During training, most of the time the model_outputs starts with
+                # original module's outputs followed by saved activations.
+                # But this may not be true if the model has inplace updated tensors.
+                # AOTAutograd will make those tensors being returned before the original
+                # module's output.
+                # To make things safe, we'll use original_output_start_index field
+                # set by AOTAutograd to decide where the original module outputs start.
+                orig_output_end_idx = (
+                    original_output_start_index + num_orig_model_outputs
+                )
+                # Sanity check: we are about to splice out the "user" outputs from the full set
+                # of "graph" outputs. Make sure we're within bounds.
+                assert orig_output_end_idx <= num_model_outputs
+                user_visible_outputs = dict.fromkeys(
+                    n.name
+                    for n in model_outputs[
+                        original_output_start_index:orig_output_end_idx
+                    ]
+                    if isinstance(n, torch.fx.Node)
+                )
+            return inner_compile(
+                model,
+                example_inputs,
+                static_input_idxs=get_static_input_idxs(fixed),
+                cudagraphs=cudagraphs,
+                graph_id=graph_id,
+                is_inference=is_inference,
+                boxed_forward_device_index=forward_device,
+                user_visible_outputs=user_visible_outputs,
+            )
+        fw_compiler = functools.partial(fw_compiler_base, is_inference=False)
+        # Freezing is only used for the inference compiler, and only when grad is disabled.
+        if config.freezing and not torch.is_grad_enabled():
+            inference_compiler = functools.partial(
+                fw_compiler_freezing,
+                dynamo_model=model_,
+                num_example_inputs=num_example_inputs,
+                inner_compile=inner_compile,
+                cudagraphs=cudagraphs,
+                graph_id=graph_id,
+                forward_device=forward_device,
+            )
+        else:
+            inference_compiler = functools.partial(fw_compiler_base, is_inference=True)
+        def partition_fn(graph, joint_inputs, **kwargs):
+            # Run the joint-graph passes before partitioning the joint graph.
+            _recursive_joint_graph_passes(graph)
+            return min_cut_rematerialization_partition(
+                graph, joint_inputs, **kwargs, compiler="inductor"
+            )
+        def bw_compiler(
+            model: torch.fx.GraphModule, example_inputs: List[torch.Tensor]
+        ):
+            with dynamo_utils.dynamo_timed("compile_fx.<locals>.bw_compiler"):
+                user_visible_outputs = {}
+                if config.bw_outputs_user_visible:
+                    model_outputs_node = output_node(model)
+                    model_outputs = pytree.arg_tree_leaves(*model_outputs_node.args)
+                    user_visible_outputs = dict.fromkeys(
+                        n.name for n in model_outputs if isinstance(n, torch.fx.Node)
+                    )
+                # The first count_tangents(model) inputs are passed as static inputs.
+                fixed = count_tangents(model)
+                return inner_compile(
+                    model,
+                    example_inputs,
+                    static_input_idxs=list(range(fixed)),
+                    cudagraphs=cudagraphs,
+                    is_backward=True,
+                    graph_id=graph_id,
+                    boxed_forward_device_index=forward_device,
+                    user_visible_outputs=user_visible_outputs,
+                )
+        # TODO: can add logging before/after the call to create_aot_dispatcher_function
+        # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
+        # once torchdynamo is merged into pytorch
+        # Reuse the fake mode / tracing context already in effect, else create new ones.
+        fake_mode = detect_fake_mode(
+            example_inputs_
+        ) or torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
+        tracing_context = (
+            torch._guards.TracingContext.try_get()
+            or torch._guards.TracingContext(fake_mode)
+        )
+        if V.aot_compilation is True:
+            with functorch_config.patch(unlift_effect_tokens=True):
+                gm, graph_signature = aot_export_module(
+                    model_,
+                    example_inputs_,
+                    trace_joint=False,
+                    decompositions=decompositions,
+                )
+            unlifted_gm = _unlift_graph(model_, gm, graph_signature)
+            if "dynamo_flat_name_to_original_fqn" in model_.meta:
+                unlifted_gm.meta["dynamo_flat_name_to_original_fqn"] = model_.meta[
+                    "dynamo_flat_name_to_original_fqn"
+                ]
+            # Disable amp as in aot_dispatch_autograd (https://github.com/pytorch/pytorch/pull/86515)
+            # In inference_compiler (fw_compiler_base), _recursive_joint_graph_passes will call into
+            # _sfdp_init() to register patterns.
+            # When fallback_random is set to True, the sdpa patterns will be traced during runtime.
+            # If amp is turned on, the traced FP32 patterns will have prims.convert_element_type which
+            # will be the same as the generated FP16 patterns.
+            disable_amp = torch._C._is_any_autocast_enabled()
+            context = (
+                torch._C._DisableAutocast if disable_amp else contextlib.nullcontext
+            )
+            with V.set_fake_mode(fake_mode), compiled_autograd.disable(), context():
+                return inference_compiler(unlifted_gm, example_inputs_)
+        with V.set_fake_mode(fake_mode), torch._guards.tracing(
+            tracing_context
+        ), compiled_autograd.disable(), functorch_config.patch(
+            unlift_effect_tokens=True
+        ):
+            return aot_autograd(
+                fw_compiler=fw_compiler,
+                bw_compiler=bw_compiler,
+                inference_compiler=inference_compiler,
+                decompositions=decompositions,
+                partition_fn=partition_fn,
+                keep_inference_input_mutations=True,
+                cudagraphs=cudagraphs,
+            )(model_, example_inputs_)
+def graph_returns_tuple(gm: torch.fx.GraphModule):
+    """True if a FX graph returns a tuple"""
+    if not isinstance(gm, torch.fx.GraphModule):
+        return True  # can't check this, assume true
+    (rv,) = output_node(gm).args
+    if isinstance(rv, (list, tuple)):
+        return True
+    if (
+        isinstance(rv, torch.fx.node.Node)
+        and hasattr(rv.target, "_schema")
+        and len(rv.target._schema.returns) > 1
+        and all(str(ret.type) == "Tensor" for ret in rv.target._schema.returns)
+    ):
+        # for graphs whose result is one node with multiple outputs
+        return True
+    return False
+def make_graph_return_tuple(
+    gm: torch.fx.GraphModule,
+    inputs: List[torch.Tensor],
+    compile_gm: Callable[..., Any],
+):
+    """
+    Mutate gm so it returns a tuple.  This is only needed for graphs
+    not created by torchdynamo that return non-tuples.
+    """
+    node = output_node(gm)
+    (rv,) = node.args
+    rv, spec = pytree.tree_flatten(rv)
+    with gm.graph.inserting_before(node):
+        gm.graph.output(rv)
+    gm.graph.erase_node(node)
+    assert graph_returns_tuple(gm)
+    compiled_fn = compile_gm(gm, inputs)
+    @functools.wraps(compiled_fn)
+    def wrapper(*args, **kwargs):
+        return pytree.tree_unflatten(compiled_fn(*args, **kwargs), spec)
+    return wrapper
+def handle_dynamo_export_graph(
+    gm: torch.fx.GraphModule,
+    inputs: List[torch.Tensor],
+    compile_gm: Callable[..., Any],
+):
+    """
+    `torch._dynamo.export` embeds pytrees in the FX graph codegen object,
+    convert that to a normal FX graph so inductor can compile it.
+    """
+    codegen = gm.graph._codegen
+    gm.graph._codegen = torch.fx.graph.CodeGen()
+    gm.recompile()
+    compiled_fn = compile_gm(gm, codegen.process_inputs(*inputs))
+    @functools.wraps(compiled_fn)
+    def wrapper(*args):
+        return codegen.process_outputs(compiled_fn(*codegen.process_inputs(*args)))
+    return wrapper
+def _check_triton_bf16_support(graph: GraphLowering) -> None:
+    def warn_and_skip(device) -> None:
+        from torch._dynamo.exc import SkipFrame
+        device_interface = get_interface_for_device(device.type)
+        device_props = device_interface.get_device_properties(device)
+        warnings.warn(
+            f"{device_props.name} does not support bfloat16 compilation natively, skipping"
+        )
+        raise SkipFrame("BF16 is not supported")
+    for inp in graph.graph_inputs.values():
+        device = getattr(inp, "get_device", lambda: torch.device("meta"))()
+        if (not is_gpu(device.type)) or inp.get_dtype() != torch.bfloat16:
+            continue
+        # Print warning and skip frame if attempting to compile for bfloat16
+        # on device without hardware support for dtype
+        device_interface = get_interface_for_device(device.type)
+        if device_interface.is_bf16_supported(including_emulation=False):
+            return
+        warn_and_skip(device)
+    for out in graph.graph_outputs:
+        device = getattr(out, "get_device", lambda: torch.device("meta"))()
+        if (not is_gpu(device.type)) or out.get_dtype() != torch.bfloat16:
+            continue
+        # Print warning and skip frame if attempting to compile for bfloat16
+        # on device without hardware support for dtype
+        device_interface = get_interface_for_device(device.type)
+        if device_interface.is_bf16_supported(including_emulation=False):
+            return
+        warn_and_skip(device)

.venv/lib/python3.11/site-packages/torch/_inductor/config.py ADDED Viewed

	@@ -0,0 +1,1241 @@

+import os  # noqa: C101
+import sys
+from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union
+import torch
+def is_fbcode() -> bool:
+    return not hasattr(torch.version, "git_version")
+def fx_graph_remote_cache_default() -> Optional[bool]:
+    if os.environ.get("TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE") == "1":
+        return True
+    if os.environ.get("TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE") == "0":
+        return False
+    return None
+def autotune_remote_cache_default() -> Optional[bool]:
+    if os.environ.get("TORCHINDUCTOR_AUTOTUNE_REMOTE_CACHE") == "1":
+        return True
+    if os.environ.get("TORCHINDUCTOR_AUTOTUNE_REMOTE_CACHE") == "0":
+        return False
+    return None
+# Enable auto_functionalized_v2 (disabled by default; set TORCHDYNAMO_AUTO_FUNCTIONALIZED_V2=1 to enable)
+enable_auto_functionalized_v2 = (
+    os.environ.get("TORCHDYNAMO_AUTO_FUNCTIONALIZED_V2", "0") == "1"
+)
+# add some debug printouts
+debug = False
+# Whether to disable a progress bar for autotuning
+disable_progress = True
+# Whether to enable printing the source code for each future
+verbose_progress = False
+# use fx aot graph codegen cache
+fx_graph_cache = (
+    os.environ.get("TORCHINDUCTOR_FX_GRAPH_CACHE", "0" if is_fbcode() else "1") == "1"
+)
+# use remote fx aot graph codegen cache
+# False: Disables the cache
+# True: Enables the cache
+# None: Not set -- Off for OSS, JustKnobs based for internal
+fx_graph_remote_cache: Optional[bool] = fx_graph_remote_cache_default()
+# enable autotune local cache
+autotune_local_cache = True
+# enable autotune remote cache
+# False: Disables the cache
+# True: Enables the cache
+# None: Not set -- Off for OSS, JustKnobs based for internal
+autotune_remote_cache: Optional[bool] = autotune_remote_cache_default()
+# Force disabled all inductor level caching -- This will override any other caching flag
+force_disable_caches = os.environ.get("TORCHINDUCTOR_FORCE_DISABLE_CACHES") == "1"
+# sleep in inductor for testing
+sleep_sec_TESTING_ONLY: Optional[int] = None
+# The default layout constraint for custom operators.
+# This must be the name of one of the layout constraint tags
+# (that is, one of {"needs_fixed_stride_order", "flexible_layout"}).
+# If the custom op does not have a layout constraint tag already
+# then we assume the following applies.
+custom_op_default_layout_constraint = "flexible_layout"
+# use cpp wrapper instead of python wrapper
+cpp_wrapper = os.environ.get("TORCHINDUCTOR_CPP_WRAPPER", "0") == "1"
+# codegen cpp wrapper code in an ABI compatible mode (defaults to on when is_fbcode(), off otherwise)
+abi_compatible = (
+    os.environ.get("TORCHINDUCTOR_ABI_COMPATIBLE", "1" if is_fbcode() else "0") == "1"
+)
+c_shim_version = os.environ.get("TORCHINDUCTOR_C_SHIM_VERSION", "2")  # kept as a string, not an int
+# dead code elimination
+dce = False
+# assume weight tensors are fixed size
+static_weight_shapes = True
+# put correctness assertions in generated code (size_asserts defaults to on, nan_asserts to off)
+size_asserts = os.environ.get("TORCHINDUCTOR_SIZE_ASSERTS", "1") == "1"
+nan_asserts = os.environ.get("TORCHINDUCTOR_NAN_ASSERTS") == "1"
+# enable loop reordering based on input orders
+pick_loop_orders = True
+# reuse a kernel input as the output
+inplace_buffers = True
+# reuse a buffer for an unrelated purpose
+allow_buffer_reuse = True
+# Enable pooled allocations for non-output tensors
+memory_planning = os.environ.get("TORCHINDUCTOR_MEMORY_PLANNING", "0") == "1"
+# How to organize memory under memory_planning=True:
+# - "none": do not try to pool storage, just reuse
+# - "intermediates": all non-outputs share storage, outputs each get unique storage
+# - "outputs": two pools, one for intermediates (freed on return) and one for outputs
+# - "combined": a single pool for both intermediates and outputs
+memory_pool = os.environ.get("TORCHINDUCTOR_MEMORY_POOL", "intermediates")
+# codegen benchmark harness
+benchmark_harness = True
+# fuse pointwise into templates
+epilogue_fusion = True
+# do epilogue fusions before other fusions
+epilogue_fusion_first = False
+# enable pattern match+replace optimizations
+pattern_matcher = True
+# set to True to enable the back-to-back GEMM pass
+b2b_gemm_pass = False
+# Register custom graph optimization pass hooks. So far, pre/post passes are
+# only applied before/after pattern_matcher in post_grad_passes.
+#
+# def my_custom_pre_pass(graph: torch.fx.graph.Graph):
+#     # my custom graph optimization pass
+#     ...
+#
+# def my_custom_post_pass(graph: torch.fx.graph.Graph):
+#     # my custom graph optimization pass
+#     ...
+#
+# torch._inductor.config.post_grad_custom_pre_pass = my_custom_pre_pass
+# torch._inductor.config.post_grad_custom_post_pass = my_custom_post_pass
+post_grad_custom_pre_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None
+post_grad_custom_post_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None
+# Registers a custom joint graph pass. NOTE(review): annotated with torch.fx.Graph while the post-grad hooks above use torch.fx.graph.Graph -- presumably the same class; confirm.
+joint_custom_pre_pass: Optional[Callable[[torch.fx.Graph], None]] = None
+joint_custom_post_pass: Optional[Callable[[torch.fx.Graph], None]] = None
+# Registers a custom pregrad pass. Note that the pre-grad IR is 1.
+# non-functional, 2. non-normalized, and 3. prone to change. Ideally we should
+# use post-grad passes.
+pre_grad_custom_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None
+# Registers a custom pass to be run right before fusion in Inductor scheduler.
+# It takes the list of scheduler nodes and returns a list of scheduler nodes.
+# WARNING: Inductor scheduler IR is at prototype stage and subject to change, so custom IR passes built on it might break.
+_pre_fusion_custom_pass: Optional[
+    Callable[
+        [List["torch._inductor.scheduler.BaseSchedulerNode"]],
+        List["torch._inductor.scheduler.BaseSchedulerNode"],
+    ]
+] = None
+# Deprecated
+split_cat_fx_passes = True
+# Optimize conv-batchnorm if batchnorm is in eval mode. Slightly reduces numerical stability.
+efficient_conv_bn_eval_fx_passes = False
+# Enable predispatch aten IR for export
+is_predispatch = False
+# Deprecated
+group_fusion = False
+# Deprecated
+batch_fusion = True
+# Pre grad fusion and options in order, set to empty dict to disable fusion.
+# Call `torch._inductor.fx_passes.group_batch_fusion.list_group_batch_fusions()` to see available fusions.
+# batch fusion options:
+# batch_linear
+# batch_linear_lhs
+# batch_layernorm
+# batch_tanh
+# batch_relu
+# batch_sigmoid
+# split cat fusion options:
+# normalization_pass
+# remove_split_with_size_one_pass
+# merge_getitem_cat_pass
+# merge_stack_tahn_unbind  (NOTE(review): "tahn" looks like a misspelling of "tanh" -- keep it only if it matches the registered pass name)
+# merge_splits_pass
+# mutate_cat_pass
+# split_cat_pass
+pre_grad_fusion_options: Dict[str, Dict[str, Any]] = {
+    "batch_linear": {},
+    "batch_linear_lhs": {},
+    "batch_layernorm": {},
+    "batch_tanh": {},
+    "batch_relu": {},
+    "batch_sigmoid": {},
+}
+# Post grad fusion and options, set to empty dict to disable fusion (the default here is empty).
+# Call `torch._inductor.fx_passes.group_batch_fusion.list_group_batch_fusions(False)` to see available fusions.
+post_grad_fusion_options: Dict[str, Dict[str, Any]] = {}
+# enable reordering pass for improving memory locality
+reorder_for_locality = True
+# Scale down RBLOCK for better occupancy
+dynamic_scale_rblock = os.environ.get("TORCHINDUCTOR_DYNAMIC_SCALE_RBLOCK", "1") == "1"
+# this forces fusion for int_mm with mul. Needed when you want to avoid realizing the int32
+# but the mul gets fused with other pointwise ops instead.
+force_fuse_int_mm_with_mul = False
+# for pattern torch.mm(a, b.to(dtype)) with cuda tensors,
+# enable torch._inductor.kernel.mm.tuned_mixed_mm fused kernel.
+# Autotune will compare perf with normal cast->then->mm option
+use_mixed_mm = True
+# enable runtime numeric check for pre/post grad fx passes
+# floating point provides limited accuracy (about 7 decimal digits for single precision
+# floating point numbers, about 16 decimal digits for double precision floating point numbers)
+# according to PyTorch documentation.
+# https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations
+fx_passes_numeric_check: Dict[str, Any] = {
+    "pre_grad": False,
+    "precision": 1e-4,
+    "num_iterations": 1,
+    "requires_optimizer": True,
+}
+# mixed_mm_choice can be used to control the behaviour for pattern torch.mm(a, b.to(dtype)) with cuda tensors.
+# The fallback aten implementation is normal cast->then->mm option.
+# If mixed_mm_choice is "default": this flag is ignored (use_mixed_mm applies instead).
+# If mixed_mm_choice is "triton":
+# - Always use torch._inductor.kernel.mm.tuned_mixed_mm's fused kernel.
+# - Autotune will not compare with fallback.
+# If mixed_mm_choice is "aten": always use the fallback aten implementation.
+# If mixed_mm_choice is "heuristic":
+# - Enables the heuristic.
+# - If the heuristic decides to add a config, it will add the config as the first choice.
+# - If autotune is disabled, this config will always be chosen.
+# - If autotune is enabled, it will also compare with fallback aten implementation and fused kernel.
+# The use_mixed_mm flag will be ignored if mixed_mm_choice != "default".
+mixed_mm_choice = "heuristic"
+# enable reordering pass for increasing overlap between compute and communication
+reorder_for_compute_comm_overlap = False
+# passes (in execution order) for increasing overlap between compute and communication
+# for built-in passes, use string name; for user-defined passes, pass in the function handle
+# WARNING: Inductor scheduler IR is at prototype stage and subject to change,
+# hence custom IR passes built on top of it might break in the future.
+reorder_for_compute_comm_overlap_passes = [
+    "reorder_compute_for_overlap",
+    "sink_waits",
+    "raise_comms",
+]
+# runtime estimation function for ops
+# for built-in estimation function, pass in "default"; for user-defined estimation function, pass in the function handle
+estimate_op_runtime = "default"
+# unit: GB/s, uni-directional P2P bandwidth per card
+# default value is NVLink
+intra_node_bw = 300
+# unit: GB/s, uni-directional P2P bandwidth per node
+# default value is InfiniBand
+inter_node_bw = 25
+# enable slow autotuning passes to select algorithms
+max_autotune = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE") == "1"
+# enable slow autotuning passes to select pointwise/reductions algorithms
+max_autotune_pointwise = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE") == "1"
+# enable slow autotuning passes to select gemm algorithms
+max_autotune_gemm = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_GEMM") == "1"
+# force cublas and triton to use the same precision; cublas supports TF32 for matmul operations
+# when m, n, k are multiples of 16, 16, 8, whereas triton supports TF32 for matmul operations
+# for any combinations of m, n, k, regardless of their alignment. setting this flag will ensure
+# that triton does not use TF32 wherever cublas would not use TF32
+force_same_precision = (
+    True if is_fbcode() else os.environ.get("TORCHINDUCTOR_FORCE_SAME_PRECISION") == "1"
+)
+# Specify candidate backends for gemm autotune.
+# Possible choices are comma-separated combinations of: ATen, Triton, CUTLASS, CK, CPP (the value is upper-cased).
+# ATen: default Pytorch ATen kernels.
+# Triton: Triton templates defined in torch inductor (AMD and NVidia GPUs).
+# CUTLASS: Cutlass templates and kernels (NVidia GPUs only).
+# CK: Composable Kernel templates and kernels (AMD Instinct GPUs only).
+# CPP: CPP templates and kernels for CPU.
+max_autotune_gemm_backends = os.environ.get(
+    "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS", "ATEN,TRITON,CPP"
+).upper()
+# As above, specify candidate backends for conv autotune.
+# NB: in some cases for 1x1 convs we emit as matmul,
+# which will use the backends of `max_autotune_gemm_backends`
+max_autotune_conv_backends = os.environ.get(
+    "TORCHINDUCTOR_MAX_AUTOTUNE_CONV_BACKENDS", "ATEN,TRITON"
+).upper()
+# Specify the size of the search space for GEMM autotuning.
+# DEFAULT     - balance between compile time overhead and performance
+# EXHAUSTIVE  - maximize performance
+max_autotune_gemm_search_space = os.environ.get(
+    "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE", "DEFAULT"
+).upper()
+# Whether we fall back to ATen or hard error when no matches are found during autotuning
+autotune_fallback_to_aten = (
+    os.environ.get("TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN", "1") == "1"
+)
+# the value used as a fallback for the unbacked SymInts
+# that can appear in the input shapes (e.g., in autotuning)
+unbacked_symint_fallback = 8192
+# DEPRECATED, DO NOT USE
+search_autotune_cache = False
+save_args = os.environ.get("TORCHINDUCTOR_SAVE_ARGS") == "1"  # NOTE(review): purpose is not documented here
+# We will disable creating subprocess for autotuning if this is False
+autotune_in_subproc = os.environ.get("TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC") == "1"
+# The following three timeouts are applicable if autotune_in_subproc is True:
+# Max time that a valid benchmark result may take during autotuning
+max_autotune_subproc_result_timeout_seconds = 60.0
+# Additional time we allow subprocesses to terminate gracefully after the timeout until we send a SIGTERM
+max_autotune_subproc_graceful_timeout_seconds = 1.0
+# Additional time that we grant after a SIGTERM until we do a hard SIGKILL of subprocesses
+max_autotune_subproc_terminate_timeout_seconds = 2.0
+# If autotuning in subprocess, whether to use multiple devices
+autotune_multi_device = os.environ.get("TORCHINDUCTOR_AUTOTUNE_MULTI_DEVICE") == "1"
+coordinate_descent_tuning = (  # off unless TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
+    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_TUNING") == "1"
+)
+coordinate_descent_check_all_directions = (  # off unless the env var below is "1"
+    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_CHECK_ALL_DIRECTIONS") == "1"
+)
+coordinate_descent_search_radius = int(  # integer, defaults to 1
+    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_RADIUS", "1")
+)
+# AutoHeuristic is a framework that allows one to collect data from autotuning, use the data to learn a heuristic, and
+# generate the learned heuristic as code which is shipped with the compiler
+# Specify a list of comma separated optimizations to collect data for
+autoheuristic_collect = os.environ.get("TORCHINDUCTOR_AUTOHEURISTIC_COLLECT", "")
+# Specify a list of comma separated optimizations to use learned heuristics for
+autoheuristic_use = os.environ.get("TORCHINDUCTOR_AUTOHEURISTIC_USE", "mixed_mm")
+def run_autoheuristic(name: str) -> bool:
+    return collect_autoheuristic(name) or use_autoheuristic(name)
+def collect_autoheuristic(name: str) -> bool:
+    return name in torch._inductor.config.autoheuristic_collect.split(",")
+def use_autoheuristic(name: str) -> bool:
+    return name in torch._inductor.config.autoheuristic_use.split(",")
+# If set to "DEFAULT", this will use the default log path specified in autoheuristic.py.
+# If set to another path, autoheuristic will instead log results to the given path.
+autoheuristic_log_path = os.environ.get(
+    "TORCHINDUCTOR_AUTOHEURISTIC_LOG_PATH", "DEFAULT"
+)
+# Disabled by default on ROCm, opt-in if model utilises NHWC convolutions
+layout_opt_default = "1" if not torch.version.hip else "0"
+layout_optimization = (
+    os.environ.get("TORCHINDUCTOR_LAYOUT_OPTIMIZATION", layout_opt_default) == "1"
+)
+force_layout_optimization = os.environ.get("TORCHINDUCTOR_FORCE_LAYOUT_OPT", "0") == "1"
+# Whether to keep the output strides the same as eager after layout optimization.
+keep_output_stride = os.environ.get("TORCHINDUCTOR_KEEP_OUTPUT_STRIDE", "1") == "1"
+# Enabling this will let the compiler print warning messages if a generated triton
+# kernel has inputs with mixed layouts.  This is helpful for perf debugging
+# since a kernel with mixed layout inputs may run much slower than one whose inputs
+# have uniform layouts.
+warn_mix_layout = os.environ.get("TORCHINDUCTOR_WARN_MIX_LAYOUT") == "1"
+# control store vs recompute heuristic
+# For fanouts, rematerialization can lead to exponential blowup. So, have
+# smaller threshold
+realize_reads_threshold = 4
+realize_opcount_threshold = 30
+# Threshold to prevent excessive accumulation of ops in one buffer during lowering
+realize_acc_reads_threshold = 8
+# fallback to eager for random/dropout, this is slow but useful for debugging
+fallback_random = False
+# automatically create fallbacks when encountering an unhandled op
+implicit_fallbacks = True
+# fuse even in cases without common reads
+aggressive_fusion = False
+# For each fused kernel in the wrapper, comment with the nodes that get fused.
+# Useful for debugging fusion.
+debug_fusion = os.environ.get("TORCHINDUCTOR_DEBUG_FUSION") == "1"
+benchmark_fusion = os.environ.get("TORCHINDUCTOR_BENCHMARK_FUSION") == "1"
+enabled_metric_tables = os.environ.get("TORCHINDUCTOR_ENABLED_METRIC_TABLES", "")
+loop_ordering_after_fusion = (
+    os.environ.get("TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSION", "0") == "1"
+)
+# For Triton Templates, select fastest of best template + epilogue vs best template + separate epilogue kernel
+benchmark_epilogue_fusion = (
+    os.environ.get("TORCHINDUCTOR_BENCHMARK_EPILOGUE_FUSION", "1") == "1"
+)
+# How many of the top triton kernels to benchmark the epilogue for
+max_epilogue_benchmarked_choices = 1
+# how many nodes to allow into a single fusion
+max_fusion_size = 64
+# max number of inputs to generate cat as a pointwise op with masked loads
+max_pointwise_cat_inputs = 8
+# replace small reductions with pointwise, disable with `= 1`
+unroll_reductions_threshold = 8
+# Add extra comments to output code (causes compile cache misses)
+comment_origin = False
+# Convert 1x1 convs into matmuls
+conv_1x1_as_mm = False
+# Enable split reductions for better utilization when the dimension
+# being reduced over is large (by splitting it)
+split_reductions = True
+benchmark_kernel = os.environ.get("TORCHINDUCTOR_BENCHMARK_KERNEL", "0") == "1"
+# Enable constant and index_expr folding
+constant_and_index_propagation = True
+# when enabled, we always add constants into graph.constants without
+# performing any constant-inlining optimization
+always_keep_tensor_constants = False
+# assert that indirect indexing does not read / write out of bounds
+assert_indirect_indexing = True
+# compute CSE bounds on variables that do not appear in the FX graph
+compute_all_bounds = False
+# enable the combo kernel that combines data-independent kernels (additional
+# to foreach kernels) into a single one (Experimental)
+combo_kernels = False
+# benchmark combo kernels and only allow ones with perf gains
+benchmark_combo_kernel = False
+# combo_kernel autotuning options: 0 - disable, 1 - enable except for foreach,
+# 2 - enable for all
+combo_kernels_autotune = 1
+# Enable masking for combining kernels of mixed sizes: 0 - disable, 1 - enable
+# for all except for foreach, 2 - enable for all
+combo_kernel_allow_mixed_sizes = 1
+# Enable dynamic shapes for foreach kernels
+combo_kernel_foreach_dynamic_shapes = False
+# constant folding on the joint graph
+joint_graph_constant_folding = True
+# Enable indirect_indexing asserts for decompositions and lowerings
+debug_index_asserts = False
+# Mode to emulate pytorch eager numerics for lower precision (fp16, bf16)
+# Pytorch eager computes bf16/fp16 by upcasting inputs to fp32 and downcasting after
+# For multiple, fused pointwise nodes, inductor will elide the intermediary upcasts and downcasts
+# Typically this should be closer to fp64 ref numerics. However, it can be useful for debugging
+# to emulate the eager numerics.
+emulate_precision_casts = False
+# warnings intended for PyTorch developers, disable for point releases
+is_nightly_or_source = "dev" in torch.__version__ or "git" in torch.__version__
+developer_warnings = is_fbcode() or is_nightly_or_source
+# This pattern matches a special usage of scatter
+# 1. It's applied to a constant tensor
+# 2. The index tensor has size 1 in the scatter dimension
+# Such a pattern generates a sparse matrix when the const tensor is all-zero.
+# We can lower this pattern to a pointwise kernel for more fusion opportunities
+# and a smaller memory footprint.
+optimize_scatter_upon_const_tensor = (
+    os.environ.get("TORCHINDUCTOR_OPTIMIZE_SCATTER_UPON_CONST_TENSOR", "1") == "1"
+)
+# The multiprocessing start method to use for inductor workers in the codecache.
+# Can be "subprocess" or "fork".
+def decide_worker_start_method() -> str:
+    start_method = os.environ.get(
+        "TORCHINDUCTOR_WORKER_START", "fork" if is_fbcode() else "subprocess"
+    )
+    assert start_method in (
+        "subprocess",
+        "fork",
+    ), f"Invalid start method: {start_method}"
+    return start_method
+worker_start_method = decide_worker_start_method()
+# Flags to turn on all_reduce fusion. These 2 flags should be automatically turned
+# on by DDP and should not be set by the users.
+_fuse_ddp_communication = False
+_fuse_ddp_bucket_size = 25  # NOTE(review): unit is not stated here -- confirm
+# Flag to control which fusion passes to apply. Functions in the list will
+# be applied in order. There are two different fusion passes
+# --"fuse_ddp_with_concat_op" and "fuse_ddp_with_coalesced_op". The default
+# one is "fuse_ddp_with_concat_op". Users can also change this to a customized
+# fusion function.
+#
+# The fusion currently does not support multiple DDP with different PG or
+# data type. This feature will be added in the future PRs.
+#
+# "schedule_comm_wait" is used to delay the wait ops to maximize comm/comp
+# overlapping. At this moment, this pass performs better than
+# reorder_for_compute_comm_overlap_passes but we will add the logic of
+# "schedule_comm_wait" in the future and remove the one here.
+_fuse_ddp_communication_passes: List[Union[Callable[..., None], str]] = [
+    "fuse_ddp_with_concat_op",
+    "schedule_comm_wait",
+]
+_micro_pipeline_tp: bool = False
+def decide_compile_threads() -> int:
+    """
+    Here are the precedence to decide compile_threads
+    1. User can override it by TORCHINDUCTOR_COMPILE_THREADS.  One may want to disable async compiling by
+       setting this to 1 to make pdb happy.
+    2. Set to 1 if it's win32 platform
+    3. decide by the number of CPU cores
+    """
+    if "TORCHINDUCTOR_COMPILE_THREADS" in os.environ:
+        return int(os.environ["TORCHINDUCTOR_COMPILE_THREADS"])
+    elif sys.platform == "win32":
+        return 1
+    elif is_fbcode():
+        return 1
+    else:
+        cpu_count = (
+            len(os.sched_getaffinity(0))
+            if hasattr(os, "sched_getaffinity")
+            else os.cpu_count()
+        )
+        assert cpu_count
+        return min(32, cpu_count)
+compile_threads = decide_compile_threads()
+# gemm autotuning global cache dir (only looked up when is_fbcode(); None otherwise)
+if is_fbcode():
+    try:
+        from libfb.py import parutil
+        if __package__:
+            global_cache_dir = parutil.get_dir_path(
+                os.path.join(__package__.replace(".", os.sep), "fb/cache")
+            )
+        else:
+            global_cache_dir = parutil.get_dir_path("fb/cache")
+    except (ValueError, ModuleNotFoundError):  # import or path lookup failed: no global cache dir
+        global_cache_dir = None
+else:
+    global_cache_dir = None
+# If kernel is fused, the name is generated from the origin node op names
+# for larger kernels limit this
+kernel_name_max_ops = 10
+# Pad input tensors of matmul/bmm/addmm to leverage Tensor Cores in NVIDIA GPUs
+shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "1") == "1"
+# Control if we will do padding for pointwise/reductions
+comprehensive_padding = (
+    os.environ.get("TORCHINDUCTOR_COMPREHENSIVE_PADDING", "1") == "1"
+)
+pad_channels_last = False
+# Disable comprehensive padding on the CPU
+disable_padding_cpu = True
+# The width of comprehensive padding, in bytes.
+# CUDA max memory transaction size is 128 bytes for a warp.
+padding_alignment_bytes = 128
+# Threshold on the minimum stride that will be padded.
+#
+# Don't align a stride that is too small, since that increases memory use too much.
+# Padding a too-small stride may also cause perf loss: we may end up with many tiny data blocks
+# with gaps in between, which makes GPU memory accesses less coalesced!
+#
+# Initially we picked 320 as the threshold since, for alignment=16,
+# that results in at most 5% memory cost.
+#
+# But later on we raised the threshold to 1024 to avoid interfering with persistent reductions.
+# Let's say an inner reduction has a row size 513. Inductor will generate
+# persistent reduction code.
+# If we do padding, the strides are not contiguous any more. Inductor
+# uses a much smaller threshold for persistent reduction in this case and
+# generates potentially worse non-persistent reduction code.
+#
+# This change turns HF AllenaiLongformerBase amp training from a loss of 1.09x to a win of 1.05x.
+# (baseline: 71.09ms, padding w/o this change: 77.38ms, padding with this change: 67.77ms)
+padding_stride_threshold = 1024
+# Enable padding outputs, even if they would not be padded in eager mode.
+# By default, we use the same strides as eager mode.
+pad_outputs = False
+# Whether to treat output of the backward graph as user visible.
+# For user visible outputs, inductor will make sure the stride matches with eager.
+bw_outputs_user_visible = True
+# Whether to always use shape padding if it is enabled and possible
+force_shape_pad: bool = False
+# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
+permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"
+# Mark the wrapper call in PyTorch profiler
+profiler_mark_wrapper_call = False
+# Generate hook calls to torch._inductor.hooks.run_intermediate_hooks for
+# every intermediate for which we can correlate it with an intermediate
+# from the original FX graph
+generate_intermediate_hooks = False
+# Populate traceback field on IRNode; good for debugging why origin_node is
+# not populated, or finding out where an IRNode was constructed
+debug_ir_traceback = False
+# used for debugging to make sure config is properly set
+_raise_error_for_testing = False
+_profile_var = os.environ.get("TORCHINDUCTOR_PROFILE", "")  # "" (unset) = off; "1" = on with no regex; any other value = on and used as the regex
+profile_bandwidth = _profile_var != ""
+profile_bandwidth_regex = "" if _profile_var == "1" else _profile_var
+# Specify a file where we print out the profiling results.
+# None means we do not dump results to a file.
+profile_bandwidth_output = os.environ.get("TORCHINDUCTOR_PROFILE_OUTPUT", None)
+# Switch to do_bench_using_profiling to exclude the CPU overheads
+profile_bandwidth_with_do_bench_using_profiling = (
+    os.environ.get("TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILING") == "1"
+)
+# TODO: remove later
+disable_cpp_codegen = False
+# Freezing will attempt to inline weights as constants in optimization
+# and run constant folding and other optimizations on them. After freezing, weights
+# can no longer be updated.
+freezing: bool = os.environ.get("TORCHINDUCTOR_FREEZING", "0") == "1"
+# Make freezing invalidate the eager Parameters of nn modules, to avoid memory overhead
+# of potentially keeping multiple copies of weights.
+freezing_discard_parameters: bool = False
+# Kill switch for allowing temporary tensors to be allocated as stack arrays. Tests
+# should be run with this flag both on and off to make sure we have coverage (defaults to on when is_fbcode(), off otherwise).
+allow_stack_allocation: bool = (
+    os.environ.get("TORCHINDUCTOR_STACK_ALLOCATION", "1" if is_fbcode() else "0") == "1"
+)
+# Enables an alternate DSO interface (the "minimal ArrayRef interface") intended
+# to maximize performance for use cases that it can accommodate at the expense of
+# generality. In brief:
+# - inputs and outputs are ArrayRefTensor<T> (note that strides are required, but the
+#   tensor must be contiguous)
+# - constant handling is unchanged because it is not a per-inference-iteration bottleneck
+#
+# When the DSO is generated in this mode, the usual interface will also be supported,
+# but performance for that interface may be degraded.
+use_minimal_arrayref_interface: bool = False
+# decompose some memory bound matmul/bmm to mul
+decompose_mem_bound_mm: bool = False
+# assume_aligned_inputs means that we assume that inputs will be aligned; we generate
+# code using this assumption, and clone tensors before use if they aren't aligned.
+# In the common case, most inputs will be aligned.
+assume_aligned_inputs: bool = False
+# For the user-written Triton kernels compiled with the model, ignore the unsupported
+# arguments passed to the @triton.autotune in the user's code; this is unsafe, as
+# ignoring the unsupported args may lead to unexpected autotuning behavior: don't
+# set unless you know what you're doing.
+unsafe_ignore_unsupported_triton_autotune_args: bool = False
+# When True, we will check in scheduler.py _codegen that there are no "loops"
+# in the call stack; that is to say, the same frame multiple times.  This
+# ensures that a cProfile trace to this frame will be a straight line without
+# any cycles.
+check_stack_no_cycles_TESTING_ONLY: bool = False
+# config specific to codegen/cpp.py
+class cpp:
+    # NOTE(review): -1 presumably means "use torch.get_num_threads()" -- confirm against codegen/cpp.py
+    threads = -1
+    # Do not generate loops when the condition doesn't hold, like:
+    # for(long i0=4096; i0<4096; i0+=1)
+    no_redundant_loops = (
+        os.environ.get("TORCHINDUCTOR_CPP_NO_REDUNDANT_LOOPS", "1") == "1"
+    )
+    # Assume number of threads is dynamic, don't specialize thread number.
+    # Kernels don't recompile on thread number changes with this flag on.
+    # For single-threaded workload, turning it on would incur a slight
+    # performance degradation.
+    dynamic_threads = os.environ.get("TORCHINDUCTOR_CPP_DYNAMIC_THREADS", "0") == "1"
+    simdlen: Optional[int] = None  # NOTE(review): undocumented here; presumably a SIMD width override -- confirm
+    min_chunk_size = int(os.environ.get("TORCHINDUCTOR_CPP_MIN_CHUNK_SIZE", "4096"))
+    cxx = (
+        None,  # download gcc12 from conda-forge if conda is installed
+        # "g++-12",
+        # "g++-11",
+        # "g++-10",
+        # "clang++",
+        os.environ.get("CXX", "clang++" if sys.platform == "darwin" else "g++"),
+        # "g++.par",
+    )
+    # Allow kernel performance profiling via PyTorch profiler
+    enable_kernel_profile = (
+        os.environ.get("TORCHINDUCTOR_CPP_ENABLE_KERNEL_PROFILE", "0") == "1"
+    )
+    # enable weight prepacking to get a better performance; may lead to large memory footprint
+    weight_prepack = os.environ.get("TORCHINDUCTOR_CPP_WEIGHT_PREPACK", "1") == "1"
+    # Inject a bug into our relu implementation; useful for testing our repro
+    # extraction and minification functionality.
+    # Valid values: "compile_error", "runtime_error", "accuracy"
+    inject_relu_bug_TESTING_ONLY: Optional[str] = None
+    inject_log1p_bug_TESTING_ONLY: Optional[str] = None
+    # If None, autodetect whether or not AVX512/AVX2 can be used.  Otherwise,
+    # force usage as specified, without testing.
+    vec_isa_ok: Optional[bool] = None
+    # similar to config.triton.descriptive_names
+    descriptive_names = "original_aten"
+    # how many nodes to allow into a single horizontal fusion
+    max_horizontal_fusion_size = int(
+        os.environ.get("TORCHINDUCTOR_CPP_MAX_HORIZONTAL_FUSION_SIZE", "16")
+    )
+    # Make scatter_reduce fallback when reduce is sum to avoid performance regression
+    # using atomic_add.
+    fallback_scatter_reduce_sum = (
+        os.environ.get("TORCHINDUCTOR_CPP_FALLBACK_SCATTER_REDUCE_SUM", "1") == "1"
+    )
+    # Use funsafe-math-optimizations when compiling
+    enable_unsafe_math_opt_flag = (
+        os.environ.get("TORCHINDUCTOR_CPP_ENABLE_UNSAFE_MATH_OPT_FLAG", "0") == "1"
+    )
+    # Use ffp-contract when compiling
+    enable_floating_point_contract_flag = (
+        os.environ.get("TORCHINDUCTOR_CPP_ENABLE_FLOATING_POINT_CONTRACT_FLAG", "0")
+        == "1"
+    )
+    # Enable the tiling select heuristic (on by default; set TORCHINDUCTOR_CPP_ENABLE_TILING_HEURISTIC=0 to disable)
+    enable_tiling_heuristics = (
+        os.environ.get("TORCHINDUCTOR_CPP_ENABLE_TILING_HEURISTIC", "1") == "1"
+    )
+    # Maximal allowed number of slices on K-dim for a GEMM kernel. This controls
+    # the maximal parallelism of K-slicing. Since K-slicing requires extra thread
+    # synchronization and buffers,  the maximal number of slices is limited to
+    # mitigate the sync overhead and memory usage.
+    # When set to 0, the number of slices is unlimited.
+    gemm_max_k_slices = int(os.environ.get("TORCHINDUCTOR_CPP_GEMM_MAX_K_SLICES", "1"))
+    # For perf tuning and debugging purpose, configure the pre-defined cache blocking for
+    # MxNxK dims respectively. The blockings are separated by comma and the unit is
+    # the number of register blocks.
+    # For example, "4,1,10" means 4 register blocks on M, 1 on N and 10 on K respectively.
+    gemm_cache_blocking = os.environ.get("TORCHINDUCTOR_CPP_GEMM_CACHE_BLOCKING", None)
+    # For perf tuning and debugging purpose, configure the pre-defined thread blocking factors for
+    # MxNxK dims respectively. The factors are separated by comma and their product
+    # should be the same as the total number of threads.
+    # For example, if the total number of threads is 56, "7,4,2" means the work is
+    # decomposed into 7x4x2 thread blocks along MxNxK of a GEMM.
+    gemm_thread_factors = os.environ.get("TORCHINDUCTOR_CPP_GEMM_THREAD_FACTORS", None)
+    # Whether to enable masked vectorization for the tail_loop.
+    enable_loop_tail_vec = True
+# config specific to codegen/triton.py
+# Plain namespace class: each attribute below is the default value of one flag.
+class triton:
+    # Use cudagraphs on output code
+    # (enabled only when the TORCHINDUCTOR_CUDAGRAPHS env var is exactly "1")
+    cudagraphs = os.environ.get("TORCHINDUCTOR_CUDAGRAPHS") == "1"
+    # Use cudagraph trees for memory pooling if `cudagraphs` is True
+    cudagraph_trees = True
+    # Should we skip cudagraphing graphs with dynamic shape inputs
+    # If False, we will re-record a graph for each unique set of shape inputs
+    cudagraph_skip_dynamic_graphs = False
+    # assertions not on the fast path, steady state
+    slow_path_cudagraph_asserts = True
+    # TODO - need to debug why this prevents cleanup
+    cudagraph_trees_history_recording = False
+    # Enable cudagraph support for mutated inputs from prior cudagraph pool
+    # (defaults to off when is_fbcode() is true, on otherwise)
+    cudagraph_support_input_mutation = False if is_fbcode() else True
+    # Maximal number of allowed cudagraph re-record for a function and
+    # a cudagraph node due to static input tensor address changes or
+    # cudagraph managed tensor data pointer changed.
+    # i.e., allow num_recording <= cudagraph_unexpected_rerecord_limit
+    # note: we are conservative here and choose a large limit.
+    cudagraph_unexpected_rerecord_limit = 128
+    # Warn loudly when the number of cudagraphs due to dynamic shape
+    # exceeds this limit
+    cudagraph_dynamic_shape_warn_limit: Optional[int] = 50
+    # synchronize after cudagraph invocation
+    force_cudagraph_sync = False
+    # always run cudagraphs in the eager warmup stage
+    # instead of recording and executing cudagraphs
+    force_cudagraphs_warmup = False
+    # assertions on the fast path
+    fast_path_cudagraph_asserts = False
+    # skip warmup for cudagraph trees
+    skip_cudagraph_warmup = False
+    # Synchronize before and after every compiled graph.
+    debug_sync_graph = False
+    # Synchronize after every kernel launch, to help pinpoint bugs
+    debug_sync_kernel = False
+    # Always load full blocks (rather than broadcasting inside the block)
+    dense_indexing = False
+    # limit tiling dimensions
+    max_tiles = 2
+    # Prefer higher dimensional tilings. This simplifies indexing expressions, making
+    # it easier to identify block pointers.
+    prefer_nd_tiling: bool = False
+    # use triton.autotune for pointwise ops with complex layouts
+    # this should only be disabled for debugging/testing
+    autotune_pointwise = True
+    # max autotune gemm with cublasLt
+    autotune_cublasLt = True
+    # Tune the generated Triton kernels at compile time instead of first time they run
+    autotune_at_compile_time = False
+    # should we stop a fusion to allow better tiling?
+    tiling_prevents_pointwise_fusion = True
+    tiling_prevents_reduction_fusion = True
+    # should we give different names to kernels
+    # Note: This is orthogonal to descriptive_names - this is deciding whether
+    # our triton kernel names should all be `triton_` (to maximize caching) or
+    # whether they should be unique.
+    # (enabled only when TORCHINDUCTOR_UNIQUE_KERNEL_NAMES is exactly "1")
+    unique_kernel_names = os.environ.get("TORCHINDUCTOR_UNIQUE_KERNEL_NAMES") == "1"
+    # should we put op names in kernel names
+    # False: No special names (just triton__1, triton__2, etc.)
+    # "torch": Maps to the fx op in the Dynamo graph (module name, method name, etc.)
+    # "original_aten": Maps to the highest-level aten op (i.e. pre-decompositions)
+    # "inductor_node": Maps to the node name in the FX graph passed to Inductor
+    descriptive_names = "original_aten"
+    # use alternate codegen for smaller reductions
+    # (on by default; any TORCHINDUCTOR_PERSISTENT_REDUCTIONS value other than "1" turns it off)
+    persistent_reductions = (
+        os.environ.get("TORCHINDUCTOR_PERSISTENT_REDUCTIONS", "1") == "1"
+    )
+    # 0/False: disable
+    # 1/True: enable, use tuning to pick between different subkernels
+    # 2: enable, force using persistent reduction (for debugging)
+    # 3: enable, force using non-persistent reduction (for debugging)
+    # (parsed with int() from TORCHINDUCTOR_MULTI_KERNEL; defaults to 0)
+    multi_kernel = int(os.environ.get("TORCHINDUCTOR_MULTI_KERNEL", "0"))
+    # hint to Triton when arguments are divisible by 16
+    divisible_by_16 = True
+    # Minimum RBLOCK to be used for a TritonSplitScanKernel
+    # NOTE: This also indirectly controls the size of workspace buffer required
+    min_split_scan_rblock = 256
+    # Store the generated cubin files for cpp wrapper code to load
+    store_cubin = False
+    # the max number of spills we allow for the configs we benchmark.
+    # Setting this to 0 means we skip a config if it spills even a single
+    # register.
+    # Setting it to a larger value allows a config spilling a small amount
+    # of registers being benchmarked.
+    #
+    # NOTE: triton will always report >0 register spills for kernels using sin/cos.
+    # (check this issue https://github.com/openai/triton/issues/1756 )
+    # So far we see a fixed 8 spilled registers for kernels using sin/cos.
+    # Raise the threshold to 16 to be safe.
+    # We should revisit this once we understand more of the source of register spills.
+    spill_threshold: int = 16
+    # Generate code containing the newer tl.make_block_ptr() API for loads/stores
+    use_block_ptr = False
+    # Inject a bug into our relu implementation; useful for testing our repro
+    # extraction and minification functionality.
+    # Valid values: "compile_error", "runtime_error", "accuracy"
+    inject_relu_bug_TESTING_ONLY: Optional[str] = None
+    # Whether to upcast float16 / bfloat16 to float32 in triton codegen (Experimental)
+    codegen_upcast_to_fp32 = True
+# Plain namespace class holding the default values of the AOTInductor flags.
+class aot_inductor:
+    # AOTInductor output path
+    # If an absolute path is specified, the generated lib files will be stored under the directory;
+    # If a relative path is specified, it will be used as a subdirectory under the default caching path;
+    # If not specified, a temp directory will be created under the default caching path.
+    # If the specified path contains something like "model.so", the sub-string will be used
+    # to name the generated library.
+    output_path = ""
+    # True only when the AOT_INDUCTOR_DEBUG_COMPILE env var is exactly "1"
+    debug_compile = os.environ.get("AOT_INDUCTOR_DEBUG_COMPILE", "0") == "1"
+    # True only when the AOT_INDUCTOR_DEBUG_DUMP_CONSTS_BIN env var is exactly "1"
+    debug_dump_consts_bin: bool = (
+        os.environ.get("AOT_INDUCTOR_DEBUG_DUMP_CONSTS_BIN", "0") == "1"
+    )
+    # option for debug printing/saving for intermediate tensor values for aot inductor
+    # 0: disable debug dumping
+    # 1: enable saving intermediate tensor values
+    # 2: enable printing intermediate tensor values
+    # NOTE: the value is kept as the raw env-var string (default "0"), not converted to int
+    debug_intermediate_value_printer = os.environ.get(
+        "AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER", "0"
+    )
+    # filtered nodes to be printed for debug values. Specify this option when debug_intermediate_value_printer is set to 2
+    # (None when AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT is unset)
+    filtered_kernel_names = os.environ.get(
+        "AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT", None
+    )
+    # Serialized tree spec for flattening inputs
+    serialized_in_spec = ""
+    # Serialized tree spec for flattening outputs
+    serialized_out_spec = ""
+    # flag to decide whether to create a submodule for constant graph.
+    use_runtime_constant_folding: bool = False
+    # flag to force weights to be appended to the shared library and mmapped by the runtime
+    # rather than embedded into the data section. Needed to support 1B+ parameter models
+    force_mmap_weights: bool = False
+    # NOTE(review): undocumented here; presumably toggles packaging of the AOTInductor output - confirm
+    package: bool = False
+# Plain namespace class holding the default values of the CUDA (CUTLASS) template-kernel flags.
+class cuda:
+    # CUDA arch to use for CUDA template kernel compilation.
+    # e.g. "70", "75", "80", "90", etc.
+    # When arch is None, Inductor uses torch.cuda.get_device_capability(0).
+    arch: Optional[str] = None
+    # CUDA version to use for CUDA template kernel compilation.
+    # e.g. "11.4", "12.1", etc.
+    # When version is None, Inductor uses torch.version.cuda.
+    version: Optional[str] = None
+    # Optimization level for the host compiler.
+    compile_opt_level = "-O1"
+    # Whether to enable device LTO (link-time-optimization).
+    enable_cuda_lto = False
+    # Whether to keep intermediate files during compilation.
+    # NOTE(review): the flag name suggests it controls ptxas info output rather than
+    # intermediate files - confirm which one the consumer of this flag implements.
+    enable_ptxas_info = False
+    # Whether to enable debug info, e.g. line number, cutlass debug info.
+    enable_debug_info = False
+    # Whether to use fast math.
+    use_fast_math = False
+    # Path to the CUTLASS repo root directory.
+    # The default path only works under PyTorch local development environment.
+    # (TORCHINDUCTOR_CUTLASS_DIR overrides the default, which is resolved relative to torch.__file__)
+    cutlass_dir = os.environ.get(
+        "TORCHINDUCTOR_CUTLASS_DIR",
+        os.path.abspath(
+            os.path.join(os.path.dirname(torch.__file__), "../third_party/cutlass/")
+        ),
+    )
+    # Configures the maximum number of CUTLASS configs to profile in max_autotune.
+    # By default it's None, so that all CUTLASS configs are tuned.
+    # This is mainly used to reduce test time in CI.
+    cutlass_max_profiling_configs: Optional[int] = None
+    # Path to CUDA NVCC.
+    # NVCC search order:
+    # 1) cuda_cxx set in this config
+    # 2) CUDACXX environment variable
+    # 3) CUDA_HOME environment variable
+    # 4) default system search PATH.
+    cuda_cxx: Optional[str] = None
+    # Minimum value of M*N*K to consider the CUTLASS backend for GEMM ops.
+    cutlass_backend_min_gemm_size: int = 1
+    # enable generation of inline standalone runner in CUDA CPP generated code
+    # which allows to compile the generated code into a standalone executable.
+    # (on by default; any INDUCTOR_CUDA_BACKEND_GENERATE_TEST_RUNNER_CODE value other than "1" turns it off)
+    generate_test_runner: bool = (
+        os.environ.get("INDUCTOR_CUDA_BACKEND_GENERATE_TEST_RUNNER_CODE", "1") == "1"
+    )
+    # Keep only Cutlass op configs which contain this regular expression pattern
+    # Set this to "warpspecialized_cooperative_epi_tma" to enable only SM90 TMA Cutlass Kernels for large GEMMs
+    cutlass_op_allowlist_regex: Optional[str] = None
+    # Note: Names of Cutlass ops can be obtained by calling
+    # op.configuration_name() on a Cutlass op instance, for example those
+    # returned from cutlass_utils.gen_ops() or the op argument passed to
+    # CUTLASSGemmTemplate.render(...)
+    # Filter Cutlass configs which contain this regular expression pattern
+    # Set this to "pingpong" to avoid numerical issues
+    # caused by the op ordering of the "pingpong" memory access
+    # pattern used by some Cutlass Kernels.
+    cutlass_op_denylist_regex: Optional[str] = "pingpong"
+# Plain namespace class holding the default values of the ROCm / Composable Kernel flags.
+class rocm:
+    # Offload arch list for device code compilation, e.g. ["gfx941", "gfx942"].
+    # If empty, the `native` arch is used
+    arch: List[str] = []
+    # Enable the CK backend for CDNA2 and CDNA3 only (for now)
+    # Processor name reference: https://llvm.org/docs/AMDGPUUsage.html#processors
+    ck_supported_arch: List[str] = ["gfx90a", "gfx940", "gfx941", "gfx942"]
+    # Optimization level, use to balance compilation speed and runtime performance
+    compile_opt_level = "-O2"
+    # Flag to keep debug information in compiled objects
+    is_debug = False
+    # Flag to keep intermediate files (assembly listings, preprocessed sources, etc.)
+    save_temps = False
+    # Flag to add `-ffast-math` to compile flags
+    use_fast_math = True
+    # Flag to add `-fgpu-flush-denormals-to-zero` to compile flags
+    flush_denormals = True
+    # Flag to print register and LDS usage during compilation
+    print_kernel_resource_usage = False
+    # Path to ROCm installation, if None, use env variable ROCM_HOME
+    rocm_home: Optional[str] = None
+    # Path to Composable Kernel library.
+    # Install with `pip install git+https://github.com/rocm/composable_kernel@develop`.
+    # (None when the TORCHINDUCTOR_CK_DIR env var is unset)
+    ck_dir = os.environ.get("TORCHINDUCTOR_CK_DIR")
+    # Number of op instance choices to trade off between runtime perf and compilation time
+    n_max_profiling_configs: Optional[int] = None
+    # Flag to use a short list of CK instances which perform well across a variety of shapes.
+    # Currently RCR and F16 only
+    use_preselected_instances: bool = False
+# Backend to use for CPU codegen: either "cpp" or "halide" (experimental)
+cpu_backend = "cpp"
+# Backend to use for CUDA codegen: either "triton" or "halide" (experimental)
+cuda_backend = "triton"
+# Plain namespace class holding the default values of the Halide backend flags.
+class halide:
+    # Base halide target to use for CPU devices
+    cpu_target = "host"
+    # Base halide target to use for CUDA devices
+    gpu_target = "host-cuda"
+    # Halide autoscheduler to use, choices are:
+    # "Anderson2021" (gpu-only), "Li2018", "Adams2019" (cpu-only), or "Mullapudi2016" (cpu-only)
+    scheduler_cuda = "Anderson2021"
+    scheduler_cpu = "Adams2019"
+    # Controls `no_asserts` flag passed to Halide target (warning: can produce false positives)
+    asserts = False
+    # Controls `debug` flag passed to Halide target
+    debug = False
+    # Enable (or fallback on) scan kernels such as cumsum
+    # Halide autoschedulers struggle with these kernels
+    scan_kernels = False
+# create a directory containing lots of debug information
+class trace:
+    # master switch for all debugging flags below
+    # (enabled only when the TORCH_COMPILE_DEBUG env var is exactly "1")
+    enabled = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1"
+    # Save debug information to a temporary directory
+    # If not specified, a temp directory will be created by system
+    debug_dir: Optional[str] = None
+    # Save python logger call >=logging.DEBUG
+    debug_log = False
+    # Save python logger call >=logging.INFO
+    info_log = False
+    # Save input FX graph (post decomps, pre optimization)
+    fx_graph = True
+    # Save FX graph after transformations
+    fx_graph_transformed = True
+    # Save TorchInductor IR before fusion pass
+    ir_pre_fusion = True
+    # Save TorchInductor IR after fusion pass
+    ir_post_fusion = True
+    # Copy generated code to trace dir
+    output_code = True
+    # SVG figure showing post-fusion graph
+    # (enabled only when INDUCTOR_POST_FUSION_SVG is exactly "1")
+    graph_diagram = os.environ.get("INDUCTOR_POST_FUSION_SVG", "0") == "1"
+    # SVG figure showing fx with fusion
+    # (enabled only when INDUCTOR_ORIG_FX_SVG is exactly "1")
+    draw_orig_fx_graph = os.environ.get("INDUCTOR_ORIG_FX_SVG", "0") == "1"
+    # We draw our fx graphs with the "record" shape attribute by default.
+    # Sometimes, when the graph is very complex, we may hit dot errors like below:
+    #   "flat edge between adjacent nodes one of which has a record shape -
+    #    replace records with HTML-like labels"
+    # and thus fail to generate a graph. So, let's give the user an option
+    # to specify the shape attribute for the dot graph. For example, passing
+    # INDUCTOR_DOT_GRAPH_SHAPE_SVG = "none" would let us generate HTML-like labels
+    # to work around the above failure.
+    dot_graph_shape = os.environ.get("INDUCTOR_DOT_GRAPH_SHAPE_SVG", None)
+    # If not None, this is the URL that saves the SVG files of the input/output
+    # graph of each pass that changed the graph
+    # The nodes that are being transformed in each pass will be colored in yellow
+    # URL only supports local directory for now
+    log_url_for_graph_xform = os.environ.get("INDUCTOR_LOG_URL_FOR_GRAPH_XFORM", None)
+    # Store cProfile (see snakeviz to view)
+    compile_profile = False
+    # Upload the .tar.gz file
+    # Needs to be overridden based on specific environment needs
+    upload_tar: Optional[Callable[[str], None]] = None
+    # NOTE(review): undocumented here; presumably enables logging of autotuning results - confirm
+    log_autotuning_results: bool = False
+# Config entries listed here hold callables, which cannot be pickled
+# (presumably skipped when the config is saved - confirm against _config_module).
+_save_config_ignore = [
+    # workaround: "Can't pickle <function ...>"
+    "trace.upload_tar",
+    "post_grad_custom_post_pass",
+    "post_grad_custom_pre_pass",
+    "joint_custom_pre_pass",
+    "joint_custom_post_pass",
+    "pre_grad_custom_pass",
+]
+# Name prefixes of config entries that are not relevant to config caching.
+_cache_config_ignore_prefix = [
+    # trace functions are not relevant to config caching
+    "trace",
+    # uses absolute path
+    "cuda.cutlass_dir",
+    # not relevant
+    "compile_threads",
+]
+# Typing-only import; never executed at runtime.
+if TYPE_CHECKING:
+    from torch.utils._config_typing import *  # noqa: F401, F403
+from torch.utils._config_module import install_config_module
+# adds patch, save_config, etc
+install_config_module(sys.modules[__name__])

.venv/lib/python3.11/site-packages/torch/_inductor/constant_folding.py ADDED Viewed

	@@ -0,0 +1,348 @@

+import collections
+from typing import Any, Callable, Dict, List, Optional, Tuple
+import torch
+import torch.utils._pytree as pytree
+# Shorthand for the aten operator namespace.
+aten = torch.ops.aten
+# We would like to split modules into two subgraphs for runtime weight updates to work correctly.
+# The use case and more information could be found at:
+# https://docs.google.com/document/d/1inZC-8KarJ6gKB7G9egmYLx1V_dKX_apxon0w4zPC0Q/edit?usp=sharing
+# Key stored in `node.meta`; its value is one of the two tags below.
+META_TAG = "MODULE_TYPE"
+# Tag for nodes that stay in the main module.
+MODULE_TAG = "_MAIN_MODULE"
+# Tag for nodes that belong to the constant (foldable) subgraph.
+CONST_MODULE_TAG = "_CONST_MODULE"
+def replace_node_with_constant(
+    gm: torch.fx.GraphModule,
+    node: torch.fx.Node,
+    constant: torch.Tensor,
+    name: Optional[str] = None,
+) -> None:
+    """
+    Swap ``node`` for a ``get_attr`` node that reads ``constant`` from ``gm``.
+    When ``name`` is falsy, the first free ``_frozen_param{i}`` attribute name is
+    used, starting the search at ``gm._frozen_param_count`` (0 if absent), and the
+    counter is advanced past the chosen index. All users of ``node`` are rewired,
+    ``node``'s meta is copied onto the new node, ``node`` is erased, and the
+    tensor is registered on ``gm`` as a buffer. The graph module is not recompiled.
+    """
+    graph = gm.graph
+    if name:
+        qualname = name
+    else:
+        # Resume from the stored counter and skip over names that are already taken.
+        index = getattr(gm, "_frozen_param_count", 0)
+        qualname = f"_frozen_param{index}"
+        while hasattr(gm, qualname):
+            index += 1
+            qualname = f"_frozen_param{index}"
+        gm._frozen_param_count = index + 1  # type: ignore[assignment]
+    with graph.inserting_before(node):
+        attr_node = graph.create_node("get_attr", qualname, (), {})
+        node.replace_all_uses_with(attr_node)
+        attr_node.meta.update(node.meta)
+        graph.erase_node(node)
+    # needed to suppress `does not reference an nn.Module, nn.Parameter, or buffer` warning
+    gm.register_buffer(qualname, constant)
+    setattr(gm, qualname, constant)
+def is_const_source(
+    node: torch.fx.Node, lifted_constants: Optional[Dict[str, Any]]
+) -> bool:
+    """
+    Return True when ``node`` directly provides a constant: either a
+    ``get_attr`` node, or a ``placeholder`` whose name is a key of
+    ``lifted_constants`` (when that mapping is given).
+    """
+    if node.op == "get_attr":
+        return True
+    if node.op != "placeholder" or lifted_constants is None:
+        return False
+    return node.name in lifted_constants
+class ConstantFolder(torch.fx.Interpreter):
+    """
+    FX interpreter that evaluates the nodes whose inputs are all known and
+    records the resulting tensors in ``node_replacements``.
+    Values that cannot be determined are represented by the sentinel
+    ``self.unknown_value``; any node with an unknown input evaluates to it.
+    """
+    def __init__(
+        self,
+        gm: torch.fx.GraphModule,
+        skip_constructors: bool = False,
+        lifted_constants: Optional[Dict[str, torch.Tensor]] = None,
+        skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
+    ) -> None:
+        """
+        Args:
+            gm: graph module to interpret.
+            skip_constructors: when True, nodes that are not constant sources and
+                have no tensor inputs are treated as unknown (see ``run_node``).
+            lifted_constants: optional mapping from placeholder name to the
+                constant value bound to that placeholder.
+            skip_folding_node_fn: accepted for interface compatibility; this
+                class does not read it.
+        """
+        super().__init__(gm)
+        # node -> tensor that can replace it
+        self.node_replacements: Dict[torch.fx.Node, Any] = {}
+        # node -> number of its uses that were consumed by a recorded replacement
+        self.replaced_uses: Dict[torch.fx.Node, int] = collections.Counter()
+        # unique sentinel; always compare against it by identity
+        self.unknown_value = object()
+        self.skip_constructors: bool = skip_constructors
+        # overwrite this to deallocate env values if their only remaining use
+        # is the output
+        self.user_to_last_uses = self.node_to_last_non_output_use()
+        self.lifted_constants = lifted_constants
+    def _support_dynamic_shape(self) -> bool:
+        # ConstantFolder does not support dynamic shapes yet
+        return False
+    def _deduce_value(self, node: torch.fx.Node) -> Any:
+        """Compute the value of ``node`` by running it through the base interpreter."""
+        return super().run_node(node)
+    def is_impure(self, node: torch.fx.node.Node) -> bool:
+        """
+        Return True for nodes whose computed value must not be recorded as a
+        replacement (``run_node`` returns the unknown sentinel for them).
+        """
+        if (
+            node.target == torch.ops.prims.convert_element_type.default
+            and is_const_source(node.args[0], self.lifted_constants)  # type: ignore[arg-type]
+            and node.args[0].meta["val"].dtype == torch.int8  # type: ignore[union-attr]
+            and node.args[1] == torch.bfloat16
+        ):
+            # For int8_weight -> dq -> bf16_weight
+            return True
+        if node.target in [
+            torch.ops.quantized_decomposed.dequantize_per_channel.default,
+            torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+            torch.ops.quantized_decomposed.dequantize_per_tensor.tensor,
+        ]:
+            # For the pattern fp32_weight -> q -> dq
+            # We only folding fp32_weight -> q
+            # int8_weight and leave dq in graph to be fused
+            return True
+        return False
+    def node_to_last_non_output_use(self) -> Dict[torch.fx.Node, List[torch.fx.Node]]:
+        """
+        Map each non-output node to the input nodes for which it is the last
+        non-output user. A node whose only user is the output node is also
+        listed under itself.
+        """
+        last_non_output_use = collections.defaultdict(list)
+        seen_uses = set()
+        output_node = next(iter(reversed(self.module.graph.nodes)))
+        # Walking backwards, the first user that mentions an input is its last user.
+        for node in reversed(self.module.graph.nodes):
+            if node.target == "output":
+                continue
+            def add_use(inp: torch.fx.Node) -> None:
+                if inp in seen_uses:
+                    return
+                seen_uses.add(inp)
+                last_non_output_use[node].append(inp)
+            # In-place is fine since we don't mutate
+            pytree.tree_map_only_(torch.fx.Node, add_use, (node.args, node.kwargs))
+            # if this node is only used in output, we want to gc it right away
+            if len(node.users) == 1 and output_node in node.users:
+                last_non_output_use[node].append(node)
+        return last_non_output_use
+    def run_node(self, node: torch.fx.Node) -> Any:
+        """
+        Evaluate ``node``. Returns ``self.unknown_value`` when the value cannot
+        or should not be folded; otherwise returns the computed value and, for
+        insertable non-meta tensors, records it via ``add_node_replacement``.
+        """
+        if node.target == "output":
+            # because we remove nodes from env on last non output use,
+            # re-define them now or we'll get error in interpreter
+            def set_env(arg: torch.fx.Node) -> None:
+                self.env[arg] = self.unknown_value
+            # In-place is fine since we don't mutate
+            pytree.tree_map_only_(torch.fx.Node, set_env, node.args)
+            return super().run_node(node)
+        args, kwargs = self.fetch_args_kwargs_from_env(node)
+        flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs)
+        # We need to do this weird thing because in cases where flattened_inputs
+        # contains a ScriptObject, equality checking results in a type error if
+        # the types are different.
+        if any(
+            type(self.unknown_value) == type(input_) and self.unknown_value == input_
+            for input_ in flattened_inputs
+        ):
+            return self.unknown_value
+        # TODO - fix errors with this
+        if (
+            node.op == "call_function"
+            and node.target == aten._efficientzerotensor.default
+        ):
+            return self.unknown_value
+        # TODO - constant folding triton kernel returns the inputs -- fix this
+        if (
+            node.op == "call_function"
+            and node.name == "triton_kernel_wrapper_functional_proxy"
+        ):
+            return self.unknown_value
+        # skip constructors, since inductor generates optimal code for them already
+        # and turning into tensor would result in an additional global memory read
+        # TODO - more complicated strategy
+        if (
+            self.skip_constructors
+            and not is_const_source(node, self.lifted_constants)
+            and not any(isinstance(e, torch.Tensor) for e in flattened_inputs)
+        ):
+            return self.unknown_value
+        # All mutations should either be removed or on inputs which we did not make constant
+        if (
+            isinstance(node.target, torch._ops.OpOverload)
+            and torch.Tag.nondeterministic_seeded in node.target.tags
+        ):
+            return self.unknown_value
+        out = self._deduce_value(node)
+        # Identity test: `==` would dispatch to the output's own __eq__, which
+        # can raise for values such as ScriptObjects (see the note above).
+        if out is self.unknown_value:
+            return self.unknown_value
+        if not is_const_source(node, self.lifted_constants) and isinstance(
+            out, torch.Tensor
+        ):
+            if out.device.type == "meta":
+                return out
+            if not self.insertable_tensor_check(out):
+                return out
+            if self.is_impure(node):
+                return self.unknown_value
+            self.add_node_replacement(node, out)
+            flattened_node_inps = pytree.arg_tree_leaves(*node.args, **node.kwargs)
+            for n in flattened_node_inps:
+                if not isinstance(n, torch.fx.Node):
+                    continue
+                self.replaced_uses[n] += 1
+            # An input whose every use has been folded no longer needs its own replacement.
+            for to_delete in self.user_to_last_uses.get(node, []):
+                if self.replaced_uses[to_delete] == len(to_delete.users):
+                    self.node_replacements.pop(to_delete, None)
+        return out
+    def insertable_tensor_check(self, tensor: torch.Tensor) -> bool:
+        """Hook deciding whether ``tensor`` may be recorded as a replacement; always True here."""
+        return True
+    def add_node_replacement(self, node: torch.fx.Node, tensor: torch.Tensor) -> None:
+        """Record ``tensor`` as the constant replacement for ``node``."""
+        self.node_replacements[node] = tensor
+    def run(self) -> Any:  # type: ignore[override]
+        """Run the interpreter with the placeholders pre-bound by ``insert_placerholder_values``."""
+        env: Dict[torch.fx.Node, Any] = {}
+        self.insert_placerholder_values(env)
+        return super().run(initial_env=env)
+    def insert_placerholder_values(self, env: Dict[torch.fx.Node, Any]) -> None:
+        """
+        Fill ``env`` for every placeholder: the lifted constant when one is
+        registered under the placeholder's name, the unknown sentinel otherwise.
+        """
+        for n in self.module.graph.find_nodes(op="placeholder"):
+            if self.lifted_constants is not None and n.name in self.lifted_constants:
+                env[n] = self.lifted_constants[n.name]
+            else:
+                env[n] = self.unknown_value  # type: ignore[assignment]
+def constant_fold(
+    gm: torch.fx.GraphModule,
+    constraint_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
+) -> None:
+    """
+    Fold constants in ``gm`` in place.
+    Runs a ``ConstantFolder`` (constructors skipped), replaces every foldable
+    node accepted by ``constraint_fn`` (all of them when it is None) with a
+    constant, removes ``get_attr`` nodes left without users together with their
+    attributes on ``gm``, then cleans up, lints and recompiles the graph.
+    """
+    with torch.utils._python_dispatch._disable_current_modes():
+        folder = ConstantFolder(gm, skip_constructors=True)
+        folder.run()
+        for folded_node, value in folder.node_replacements.items():
+            if constraint_fn is None or constraint_fn(folded_node):
+                replace_node_with_constant(gm, folded_node, value)
+        # Collect the now-unused attribute reads before touching the graph.
+        unused_attrs = [
+            attr_node
+            for attr_node in gm.graph.find_nodes(op="get_attr")
+            if len(attr_node.users) == 0
+        ]
+        for attr_node in unused_attrs:
+            if hasattr(gm, attr_node.target):
+                delattr(gm, attr_node.target)
+        for attr_node in unused_attrs:
+            gm.graph.erase_node(attr_node)
+        gm.graph.eliminate_dead_code()
+        gm.graph.lint()
+        gm.recompile()
+def constant_graph_tag(
+    gm: torch.fx.GraphModule,
+    lifted_constants: Optional[Dict[str, Any]],
+    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]],
+) -> None:
+    """
+    Write ``node.meta[META_TAG]`` for every node of ``gm``.
+    A node gets ``CONST_MODULE_TAG`` when it is a constant source, has a recorded
+    replacement, or had a use consumed by folding; nodes selected by
+    ``skip_folding_node_fn`` and all remaining nodes get ``MODULE_TAG``.
+    """
+    with torch.utils._python_dispatch._disable_current_modes():
+        folder = ConstantFolder(
+            gm, skip_constructors=True, lifted_constants=lifted_constants
+        )
+        folder.run()
+        for node in gm.graph.nodes:
+            # The skip callback takes precedence and is consulted first.
+            forced_main = skip_folding_node_fn is not None and skip_folding_node_fn(
+                node
+            )
+            if not forced_main and (
+                is_const_source(node, lifted_constants)
+                or node in folder.node_replacements
+                or node in folder.replaced_uses
+            ):
+                tag = CONST_MODULE_TAG
+            else:
+                tag = MODULE_TAG
+            node.meta[META_TAG] = tag
+def run_and_get_constant_graph(
+    gm: torch.fx.GraphModule,
+    lifted_constants: Optional[Dict[str, Any]],
+    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]],
+) -> Tuple[torch.fx.GraphModule, Tuple[torch.Tensor, ...]]:
+    """
+    Build and execute the constant-foldable part of ``gm``.
+    Returns a new GraphModule containing the nodes tagged ``CONST_MODULE_TAG``,
+    and the result of calling it with the lifted constants that feed the folding.
+    """
+    constant_graph_tag(gm, lifted_constants, skip_folding_node_fn)
+    def untag(node: torch.fx.Node) -> bool:
+        # Keep the constant tag only when some user is itself part of the constant graph.
+        feeds_folding = any(
+            user.meta[META_TAG] == CONST_MODULE_TAG for user in node.users
+        )
+        if feeds_folding:
+            return True
+        node.meta[META_TAG] = MODULE_TAG
+        return False
+    const_args = []
+    if lifted_constants is not None:
+        for placeholder in list(gm.graph.find_nodes(op="placeholder")):
+            if placeholder.meta[META_TAG] != MODULE_TAG and untag(placeholder):
+                const_args.append(lifted_constants[placeholder.name])
+    # We rewrite the tags, if it's a constant being directly consumed, without
+    # any folding opportunity, we keep it in main gm.
+    for attr_node in gm.graph.find_nodes(op="get_attr"):
+        untag(attr_node)
+    const_graph = torch.fx.Graph()
+    copied: Dict[torch.fx.Node, torch.fx.Node] = {}
+    boundary_outputs = []
+    for original in gm.graph.nodes:
+        if original.meta[META_TAG] == MODULE_TAG:
+            continue
+        clone = const_graph.node_copy(original, lambda arg: copied[arg])
+        copied[original] = clone
+        # A constant node consumed by the main module becomes an output of the constant graph.
+        if any(user.meta[META_TAG] == MODULE_TAG for user in original.users):
+            boundary_outputs.append(clone)
+    const_graph.output(tuple(boundary_outputs))
+    const_graph.lint()
+    new_gm = torch.fx.GraphModule(gm, const_graph)
+    return new_gm, new_gm(*const_args)

.venv/lib/python3.11/site-packages/torch/_inductor/cpu_vec_isa.py ADDED Viewed

	@@ -0,0 +1,373 @@

+# mypy: allow-untyped-defs
+import dataclasses
+import functools
+import os
+import platform
+import re
+import subprocess
+import sys
+from typing import Any, Callable, Dict, List
+import torch
+from torch._inductor import config
+_IS_WINDOWS = sys.platform == "win32"
+def _get_isa_dry_compile_fingerprint(isa_flags: str) -> str:
+    """
+    Build the cache fingerprint used for the ISA dry-compile check.
+    The dry compile costs about one second at startup
+    (https://github.com/pytorch/pytorch/issues/100378), so the compiler version
+    info, the ISA flags and the PyTorch version are combined into one string,
+    joined by "=", that feeds the hash of the output binary path; an existing
+    binary can then be reused instead of being compiled again.
+    """
+    from torch._inductor.cpp_builder import get_compiler_version_info, get_cpp_compiler
+    fingerprint_parts = (
+        get_compiler_version_info(get_cpp_compiler()),
+        isa_flags,
+        torch.__version__,
+    )
+    return "=".join(str(part) for part in fingerprint_parts)
+class VecISA:
+    """
+    Base description of a CPU vector instruction set.
+    Subclasses provide the class attributes below and ``__str__``; truthiness
+    of an instance tells whether the ISA is usable (see ``__bool__``).
+    """
+    # vector register width in bits
+    _bit_width: int
+    # preprocessor macros returned by build_macro()
+    _macro: List[str]
+    # compiler flags returned by build_arch_flags()
+    _arch_flags: str
+    # number of elements per vector for each supported dtype
+    _dtype_nelements: Dict[torch.dtype, int]
+    # Note [Checking for Vectorized Support in Inductor]
+    # TorchInductor CPU vectorization reuses PyTorch vectorization utility functions
+    # Hence, TorchInductor would depend on Sleef* to accelerate mathematical functions
+    # like exp, pow, sin, cos and etc.
+    # But PyTorch and TorchInductor might use different compilers to build code. If
+    # PyTorch uses gcc-7/g++-7 to build the release package, the libtorch_cpu.so
+    # will not expose the Sleef* AVX512 symbols since gcc-7/g++-7 cannot pass
+    # avx512 check in CMake - FindAVX.cmake. But TorchInductor install the latest
+    # gcc/g++ compiler by default while it could support the AVX512 compilation.
+    # Therefore, there would be a conflict sleef version between PyTorch and
+    # TorchInductor. Hence, we dry-compile the following code to check whether current
+    # HW platform and PyTorch both could support AVX512 or AVX2. And suppose ARM
+    # also needs the logic
+    # In fbcode however, we are using the same compiler for pytorch and for inductor codegen,
+    # making the runtime check unnecessary.
+    _avx_code = """
+#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) || defined(CPU_CAPABILITY_NEON) || defined(CPU_CAPABILITY_VSX)
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/cpu/vec/vec.h>
+#endif
+alignas(64) float in_out_ptr0[16] = {0.0};
+extern "C" void __avx_chk_kernel() {
+    auto tmp0 = at::vec::Vectorized<float>(1);
+    auto tmp1 = tmp0.exp();
+    tmp1.store(in_out_ptr0);
+}
+"""  # noqa: B950
+    # Script run in a child interpreter to check that the built library loads.
+    _avx_py_load = """
+import torch
+from ctypes import cdll
+cdll.LoadLibrary("__lib_path__")
+"""
+    def bit_width(self) -> int:
+        """Return the vector width in bits."""
+        return self._bit_width
+    def nelements(self, dtype: torch.dtype = torch.float) -> int:
+        """Return the number of ``dtype`` elements per vector (KeyError if unsupported)."""
+        return self._dtype_nelements[dtype]
+    def build_macro(self) -> List[str]:
+        """Return the preprocessor macros for this ISA."""
+        return self._macro
+    def build_arch_flags(self) -> str:
+        """Return the compiler architecture flags for this ISA."""
+        return self._arch_flags
+    def __hash__(self) -> int:
+        return hash(str(self))
+    def check_build(self, code: str) -> bool:
+        """
+        Dry-compile ``code`` for this ISA and try to load the result.
+        Returns True when the shared library builds (or already exists) and a
+        child Python process can load it; returns False on any failure.
+        """
+        from torch._inductor.codecache import get_lock_dir, LOCK_TIMEOUT, write
+        from torch._inductor.cpp_builder import (
+            CppBuilder,
+            CppTorchOptions,
+            normalize_path_separator,
+        )
+        key, input_path = write(
+            code,
+            "cpp",
+            extra=_get_isa_dry_compile_fingerprint(self._arch_flags),
+        )
+        from filelock import FileLock
+        lock_dir = get_lock_dir()
+        lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
+        with lock:
+            output_dir = os.path.dirname(input_path)
+            build_options = CppTorchOptions(vec_isa=self, warning_all=False)
+            x86_isa_help_builder = CppBuilder(
+                key,
+                [input_path],
+                build_options,
+                output_dir,
+            )
+            try:
+                # Check if the output file exist, and compile when not.
+                output_path = normalize_path_separator(
+                    x86_isa_help_builder.get_target_file_path()
+                )
+                if not os.path.isfile(output_path):
+                    x86_isa_help_builder.build()
+                # Check build result
+                subprocess.check_call(
+                    [
+                        sys.executable,
+                        "-c",
+                        VecISA._avx_py_load.replace("__lib_path__", output_path),
+                    ],
+                    cwd=output_dir,
+                    stderr=subprocess.DEVNULL,
+                    # os.pathsep, not ":", so the child's PYTHONPATH is also valid on Windows
+                    env={**os.environ, "PYTHONPATH": os.pathsep.join(sys.path)},
+                )
+            except Exception:
+                # Deliberate best-effort probe: any build or load failure means "not supported".
+                return False
+            return True
+    @functools.lru_cache(None)  # noqa: B019
+    def __bool__(self) -> bool:
+        """
+        Tell whether this ISA is usable. ``config.cpp.vec_isa_ok`` wins when it
+        is not None; fbcode builds are always accepted; otherwise the dry
+        compile decides. The result is cached per instance hash.
+        """
+        if config.cpp.vec_isa_ok is not None:
+            return config.cpp.vec_isa_ok
+        if config.is_fbcode():
+            return True
+        return self.check_build(VecISA._avx_code)
+@dataclasses.dataclass
+class VecNEON(VecISA):
+    _bit_width = 256  # This is required to leverage the compute implemented in aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
+    _macro = ["CPU_CAPABILITY_NEON"]
+    if sys.platform == "darwin" and platform.processor() == "arm":
+        _macro.append("AT_BUILD_ARM_VEC256_WITH_SLEEF")
+    _arch_flags = ""  # Unused
+    _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16}
+    def __str__(self) -> str:
+        return "asimd"  # detects the presence of advanced SIMD on armv8-a kernels
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+@dataclasses.dataclass
+class VecAVX512(VecISA):
+    _bit_width = 512
+    _macro = ["CPU_CAPABILITY_AVX512"]
+    _arch_flags = (
+        "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma"
+        if not _IS_WINDOWS
+        else "/arch:AVX512"
+    )  # TODO: use cflags
+    _dtype_nelements = {torch.float: 16, torch.bfloat16: 32, torch.float16: 32}
+    def __str__(self) -> str:
+        return "avx512"
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+@dataclasses.dataclass
+class VecAMX(VecAVX512):
+    _arch_flags = VecAVX512._arch_flags + " -mamx-tile -mamx-bf16 -mamx-int8"
+    def __str__(self) -> str:
+        return super().__str__() + " amx_tile"
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+    _amx_code = """
+#include <cstdint>
+#include <immintrin.h>
+struct amx_tilecfg {
+  uint8_t palette_id;
+  uint8_t start_row;
+  uint8_t reserved_0[14];
+  uint16_t colsb[16];
+  uint8_t rows[16];
+};
+extern "C" void __amx_chk_kernel() {
+  amx_tilecfg cfg = {0};
+  _tile_loadconfig(&cfg);
+  _tile_zero(0);
+  _tile_dpbf16ps(0, 1, 2);
+  _tile_dpbusd(0, 1, 2);
+}
+"""
+    @functools.lru_cache(None)  # noqa: B019
+    def __bool__(self) -> bool:
+        if super().__bool__():
+            if config.is_fbcode():
+                return False
+            if self.check_build(VecAMX._amx_code) and torch.cpu._init_amx():
+                return True
+        return False
+@dataclasses.dataclass
+class VecAVX2(VecISA):
+    _bit_width = 256
+    _macro = ["CPU_CAPABILITY_AVX2"]
+    _arch_flags = (
+        "-mavx2 -mfma -mf16c" if not _IS_WINDOWS else "/arch:AVX2"
+    )  # TODO: use cflags
+    _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16}
+    def __str__(self) -> str:
+        return "avx2"
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+@dataclasses.dataclass
+class VecZVECTOR(VecISA):
+    _bit_width = 256
+    _macro = [
+        "CPU_CAPABILITY_ZVECTOR",
+        "CPU_CAPABILITY=ZVECTOR",
+        "HAVE_ZVECTOR_CPU_DEFINITION",
+    ]
+    _arch_flags = "-mvx -mzvector"
+    _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16}
+    def __str__(self) -> str:
+        return "zvector"
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+@dataclasses.dataclass
+class VecVSX(VecISA):
+    _bit_width = 256  # VSX simd supports 128 bit_width, but aten is emulating it as 256
+    _macro = ["CPU_CAPABILITY_VSX"]
+    _arch_flags = "-mvsx"
+    _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16}
+    def __str__(self) -> str:
+        return "vsx"
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+class InvalidVecISA(VecISA):
+    _bit_width = 0
+    _macro = [""]
+    _arch_flags = ""
+    _dtype_nelements = {}
+    def __str__(self) -> str:
+        return "INVALID_VEC_ISA"
+    def __bool__(self) -> bool:  # type: ignore[override]
+        return False
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+def x86_isa_checker() -> List[str]:
+    supported_isa: List[str] = []
+    def _check_and_append_supported_isa(
+        dest: List[str], isa_supported: bool, isa_name: str
+    ) -> None:
+        if isa_supported:
+            dest.append(isa_name)
+    Arch = platform.machine()
+    """
+    Arch value is x86_64 on Linux, and the value is AMD64 on Windows.
+    """
+    if Arch != "x86_64" and Arch != "AMD64":
+        return supported_isa
+    avx2 = torch.cpu._is_avx2_supported()
+    avx512 = torch.cpu._is_avx512_supported()
+    amx_tile = torch.cpu._is_amx_tile_supported()
+    _check_and_append_supported_isa(supported_isa, avx2, "avx2")
+    _check_and_append_supported_isa(supported_isa, avx512, "avx512")
+    _check_and_append_supported_isa(supported_isa, amx_tile, "amx_tile")
+    return supported_isa
+# Shared InvalidVecISA instance; pick_vec_isa() returns it when no ISA fits.
+invalid_vec_isa = InvalidVecISA()
+# Candidates that valid_vec_isa_list() walks, in this order, on x86_64/AMD64.
+supported_vec_isa_list = [VecAMX(), VecAVX512(), VecAVX2(), VecNEON()]
+# Cache the cpuinfo to avoid I/O overhead. Meanwhile, the cpuinfo content
+# might have too much redundant content that is useless for ISA check. Hence,
+# we only cache some key isa information.
+@functools.lru_cache(None)
+def valid_vec_isa_list() -> List[VecISA]:
+    isa_list: List[VecISA] = []
+    if sys.platform == "darwin" and platform.processor() == "arm":
+        isa_list.append(VecNEON())
+    if sys.platform not in ["linux", "win32"]:
+        return isa_list
+    arch = platform.machine()
+    if arch == "s390x":
+        with open("/proc/cpuinfo") as _cpu_info:
+            while True:
+                line = _cpu_info.readline()
+                if not line:
+                    break
+                # process line
+                featuresmatch = re.match(r"^features\s*:\s*(.*)$", line)
+                if featuresmatch:
+                    for group in featuresmatch.groups():
+                        if re.search(r"[\^ ]+vxe[\$ ]+", group):
+                            isa_list.append(VecZVECTOR())
+                            break
+    elif arch == "ppc64le":
+        isa_list.append(VecVSX())
+    elif arch == "aarch64":
+        isa_list.append(VecNEON())
+    elif arch in ["x86_64", "AMD64"]:
+        """
+        arch value is x86_64 on Linux, and the value is AMD64 on Windows.
+        """
+        _cpu_supported_x86_isa = x86_isa_checker()
+        for isa in supported_vec_isa_list:
+            if all(flag in _cpu_supported_x86_isa for flag in str(isa).split()) and isa:
+                isa_list.append(isa)
+    return isa_list
+def pick_vec_isa() -> VecISA:
+    if config.is_fbcode() and (platform.machine() in ["x86_64", "AMD64"]):
+        return VecAVX2()
+    _valid_vec_isa_list: List[VecISA] = valid_vec_isa_list()
+    if not _valid_vec_isa_list:
+        return invalid_vec_isa
+    # If the simdlen is None, it indicates determine the vectorization length automatically
+    if config.cpp.simdlen is None:
+        assert _valid_vec_isa_list
+        return _valid_vec_isa_list[0]
+    for isa in _valid_vec_isa_list:
+        if config.cpp.simdlen == isa.bit_width():
+            return isa
+    return invalid_vec_isa

.venv/lib/python3.11/site-packages/torch/_inductor/cudagraph_trees.py ADDED Viewed

The diff for this file is too large to render. See raw diff

.venv/lib/python3.11/site-packages/torch/_inductor/cudagraph_utils.py ADDED Viewed

	@@ -0,0 +1,330 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+import dataclasses
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
+import torch
+from torch._dynamo.utils import counters
+from torch._inductor.utils import InputType
+# Artifact logger for the "perf_hints" artifact.
+perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints")
+# Artifact logger for the "cudagraph_static_inputs" artifact.
+static_inputs_log = torch._logging.getArtifactLogger(
+    __name__, "cudagraph_static_inputs"
+)
+# Alias: a list whose entries are ints, tensors, or None.
+OutputType = List[Optional[Union[int, torch.Tensor]]]
+# Alias: a callable that takes a list of InputType and returns OutputType.
+ModelType = Callable[[List[InputType]], OutputType]
+@dataclasses.dataclass(frozen=True)
+class FunctionID:
+    "Unique counter of a function wrapped in cudagraphify_impl"
+    # The integer identifier; the dataclass is frozen, so it cannot be reassigned.
+    id: int
+@dataclasses.dataclass(frozen=True)
+class PlaceholderInfo:
+    """
+    A serializable version of torch.fx.Node that contains information
+    pertinent to placeholder stack traces. We use these in logging and error messages
+    related to cudagraphs, and will cache these results.
+    """
+    # Copied from the fx node's ``name``.
+    name: str
+    # The node's meta["stack_trace"], or None when that key is absent.
+    stack_trace: Optional[str]
+    # This field is recursive, but never cyclic (since a node never uses itself)
+    # to_placeholder_info() fills it only for nodes whose op is "placeholder".
+    users: List[PlaceholderInfo]
+    # Result of get_mutating_use_stack_trace_from_node() for placeholder
+    # nodes; None for every other node.
+    mutating_use_stack_trace: Optional[str]
+@dataclasses.dataclass(frozen=True)
+class WrappedFunction:
+    """
+    Represents a function that you want to record for CUDA graph replay,
+    with a little more metadata so we can identify if we have an applicable
+    CUDA graph in our CUDA graph tree for it.
+    """
+    # The wrapped callable.
+    model: Callable[..., Any]
+    # Input indices that check_for_mutation() exempts from its mutation check.
+    static_input_idxs: Sequence[int]
+    id: FunctionID
+    constants: Tuple[torch.Tensor, ...]
+    # Indexed by input position in get_mutation_stack_trace().
+    placeholders: Sequence[PlaceholderInfo]
+    # Input indices that check_for_mutation() inspects for mutation.
+    mutated_input_idxs: Sequence[int]
+def get_mutating_use_stack_trace_from_node(
+    placeholder_node: torch.fx.Node,
+) -> Optional[str]:
+    # reinplaced uses might have a single, non-copy_ use
+    if len(placeholder_node.users) == 1:
+        return next(iter(placeholder_node.users)).meta.get("stack_trace", None)
+    for use in placeholder_node.users:
+        if use.target == torch.ops.aten.copy_.default:
+            if stack_trace := use.meta.get("stack_trace", None):
+                return stack_trace
+    return None
+def get_mutating_use_stack_trace(placeholder_info: PlaceholderInfo) -> Optional[str]:
+    return placeholder_info.mutating_use_stack_trace
+def to_placeholder_info(placeholder_node: torch.fx.Node) -> PlaceholderInfo:
+    name = placeholder_node.name
+    stack_trace = placeholder_node.meta.get("stack_trace", None)
+    users = []
+    mutating_use_stack_trace = None
+    # Only recurse to users once, since we only care about user's stack traces
+    if placeholder_node.op == "placeholder":
+        users = [to_placeholder_info(i) for i in placeholder_node.users]
+        mutating_use_stack_trace = get_mutating_use_stack_trace_from_node(
+            placeholder_node
+        )
+    return PlaceholderInfo(name, stack_trace, users, mutating_use_stack_trace)
+def get_placeholder_info(graph: torch.fx.Graph) -> List[PlaceholderInfo]:
+    return [
+        to_placeholder_info(node) for node in graph.nodes if node.op == "placeholder"
+    ]
+def format_default_skip_message(reason: str) -> str:
+    return f"skipping cudagraphs due to {reason}"
+def get_mutation_stack_trace(
+    placeholders: Sequence[PlaceholderInfo], mutation_indices: Sequence[int]
+) -> str:
+    stack_trace: Optional[str] = ""
+    for idx in mutation_indices:
+        placeholder = placeholders[idx]
+        if stack_trace := get_mutating_use_stack_trace(placeholder):
+            break
+    msg = format_default_skip_message(
+        f"mutated inputs ({len(mutation_indices)} instances)"
+    )
+    if stack_trace:
+        return f"{msg}. Found from : \n {stack_trace}"
+    return msg
+def check_for_mutation(
+    func: WrappedFunction,
+    inputs: List[InputType],
+    is_cuda_graph_recorded_tensor: Callable[[torch.Tensor], bool],
+) -> Optional[str]:
+    # doesnt work for non-trees because the warmup run would apply mutation twice
+    if torch._inductor.config.triton.cudagraph_trees:
+        # checking if mutation is only on parameters/static inputs
+        mutation_indices: Sequence[int] = [
+            idx
+            for idx in func.mutated_input_idxs
+            if not (
+                idx in func.static_input_idxs
+                or is_cuda_graph_recorded_tensor(inputs[idx])  # type: ignore[arg-type]
+            )
+        ]
+    else:
+        mutation_indices = func.mutated_input_idxs
+    static_inputs_log.debug(
+        "check mutation static input indices: %s", func.static_input_idxs
+    )
+    static_inputs_log.debug("check mutation mutation indices: %s", mutation_indices)
+    return (
+        get_mutation_stack_trace(func.placeholders, mutation_indices)
+        if mutation_indices
+        else None
+    )
+def _get_use_stack_trace(node) -> Optional[str]:
+    for use in node.users:
+        if stack_trace := use.meta.get("stack_trace", None):
+            return stack_trace
+    return None
+def check_multiple_devices_or_any_cpu_nodes(
+    device_node_mapping: Dict[torch.device, torch.fx.Node]
+) -> Optional[str]:
+    if cpu_node := device_node_mapping.get(torch.device("cpu")):
+        msg = f"cpu device ({cpu_node.name})"
+        if stack_trace := _get_use_stack_trace(cpu_node):
+            return format_default_skip_message(f"{msg}. Found from : \n {stack_trace}")
+        return format_default_skip_message(msg)
+    if (
+        len(device_node_mapping) == 1
+        and next(iter(device_node_mapping.keys())).type == "cuda"
+    ):
+        return None
+    keys_repr = (repr(key) for key in device_node_mapping.keys())
+    return format_default_skip_message(f"multiple devices: {', '.join(keys_repr)}")
+def check_lowering_disable_cudagraph(
+    device_node_mapping: Dict[torch.device, torch.fx.Node]
+):
+    return check_multiple_devices_or_any_cpu_nodes(device_node_mapping)
+def log_cudagraph_skip_and_bump_counter(msg):
+    perf_hint_log.warning(msg)
+    counters["inductor"]["cudagraph_skips"] += 1
+@dataclasses.dataclass
+class BoxedDeviceIndex:
+    value: Optional[int]
+    def set(self, device_idx: Optional[int]):
+        assert device_idx is None or isinstance(device_idx, int)
+        self.value = device_idx
+def check_for_mutation_ignore_cuda_graph_managed_tensor(
+    gm: torch.fx.GraphModule, compiled_graph, static_input_idxs: Sequence[int]
+) -> Optional[str]:
+    default_msg = format_default_skip_message("mutated inputs")
+    # doesnt work for non-trees because the warmup run would apply mutation twice
+    if torch._inductor.config.triton.cudagraph_trees:
+        unique_idxs = set(static_input_idxs)
+        # checking if mutation is only on parameters/static inputs
+        mutation_indices = [
+            idx for idx in compiled_graph.mutated_input_idxs if idx not in unique_idxs
+        ]
+        has_mutation = len(mutation_indices) != 0
+        if not has_mutation:
+            return None
+        placeholders = get_placeholder_info(gm.graph)
+        return get_mutation_stack_trace(placeholders, mutation_indices)
+    else:
+        has_mutation = len(compiled_graph.mutated_inputs) != 0
+        return None if not has_mutation else default_msg
+def get_placeholder_stack_trace(placeholder: PlaceholderInfo) -> Optional[str]:
+    """
+    Gets the first non-empty stack trace of a placeholder or its users.
+    """
+    if placeholder.stack_trace:
+        return placeholder.stack_trace
+    for user in placeholder.users:
+        if user.stack_trace:
+            return user.stack_trace
+    return None
+class CheckInvariantStatus(Enum):
+    # Check invariant succeeded
+    SUCCESS = 1
+    # Previously managed data pointers are not stable
+    CudagraphManagedIdxMismatch = 2
+    # Static tensor input addresses are not stable
+    StaticInputIdxMismatch = 3
+    # Expected dead indices before graph are live
+    ExpectedDeadIndicesBeforeGraphMismatch = 4
+    def __str__(self) -> str:
+        if self.name == "CudagraphManagedIdxMismatch":
+            return "cudagraph managed tensor data pointer changed"
+        elif self.name == "StaticInputIdxMismatch":
+            return "static input data pointer changed"
+        elif self.name == "ExpectedDeadIndicesBeforeGraphMismatch":
+            return "expected dead indices before graph are live"
+        else:
+            return f"{self.name}: {self.value}"
+def log_data_ptr_mismatch(
+    placeholders: Sequence[PlaceholderInfo],
+    inputs: List[InputType],
+    recorded_data_ptr: Sequence[Optional[int]],
+    target_idxs: Sequence[int],
+    mismatch: CheckInvariantStatus,
+) -> str:
+    """
+    Logs the mismatch between input data pointers and recorded data pointers.
+    This checks only idxs in target_idxs.
+    """
+    assert len(inputs) == len(recorded_data_ptr) and len(inputs) == len(
+        placeholders
+    ), "length mismatch between inputs, recorded_data_ptr, and placeholders"
+    t_tensors = [inputs[i] for i in target_idxs]
+    t_data_ptrs = [recorded_data_ptr[i] for i in target_idxs]
+    error_msg = f"{mismatch}.\n"
+    for i, (tensor, data_ptr) in enumerate(zip(t_tensors, t_data_ptrs)):
+        assert isinstance(tensor, torch.Tensor)
+        index = target_idxs[i]
+        if tensor.data_ptr() != data_ptr:
+            placeholder = placeholders[index]
+            error_msg = (
+                f"{error_msg}input name: {placeholder.name}. "
+                f"data pointer changed from {data_ptr} to {tensor.data_ptr()}. "
+                f"input stack trace: {get_placeholder_stack_trace(placeholder)}\n"
+            )
+    return error_msg
+def maybe_warning_due_to_dynamic_shape(
+    fn_cache: Dict[Tuple[int, ...], Callable[..., Any]],
+    new_int_key: Any,
+) -> bool:
+    num_cudagraphs = len(fn_cache.keys()) + 1
+    def warn_msg():
+        return (
+            "CUDAGraph supports dynamic shapes by recording a new graph for each "
+            "distinct input size. Recording too many CUDAGraphs may lead to "
+            f"extra overhead. We have observed {num_cudagraphs} distinct sizes. "
+            "Please consider the following options for better performance: "
+            "a) padding inputs to a few fixed number of shapes; or b) set "
+            "torch._inductor.config.triton.cudagraph_skip_dynamic_graphs=True. "
+            "Set torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit=None "
+            "to silence this warning."
+        )
+    if (
+        torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit
+        and num_cudagraphs
+        > torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit
+    ):
+        perf_hint_log.warning(warn_msg())
+        return True
+    return False
+@dataclasses.dataclass(frozen=True)
+class CudagraphCachedInfo:
+    """
+    Info needed to realign inputs
+    """
+    # Placeholder descriptions (see PlaceholderInfo).
+    placeholders: Sequence[PlaceholderInfo]
+    # Optional stack-trace strings; NOTE(review): presumably one per graph
+    # output — confirm against the code that builds this object.
+    stack_traces: List[Optional[str]]
+    # Reason strings; NOTE(review): presumably why cudagraphs were skipped —
+    # confirm against the code that builds this object.
+    cudagraph_fail_reasons: List[str]

.venv/lib/python3.11/site-packages/torch/_inductor/debug.py ADDED Viewed

	@@ -0,0 +1,693 @@

+import collections
+import contextlib
+import dataclasses
+import functools
+import itertools
+import logging
+import os
+import os.path
+import pickle
+import pstats
+import shutil
+import subprocess
+from typing import Any, Callable, Dict, IO, Iterator, List, Optional, Type, Union
+from unittest.mock import patch
+import torch
+from functorch.compile import draw_graph, get_aot_graph_name, get_graph_being_compiled
+from torch import fx as fx
+from torch._dynamo.repro.after_aot import save_graph_repro
+from torch._dynamo.utils import get_debug_dir
+from torch.fx.graph_module import GraphModule
+from torch.fx.passes.shape_prop import _extract_tensor_metadata, TensorMetadata
+from torch.fx.passes.tools_common import legalize_graph
+from torch.utils._pytree import tree_map
+from . import config, ir  # noqa: F811, this is needed
+from .scheduler import (
+    BaseSchedulerNode,
+    FusedSchedulerNode,
+    NopKernelSchedulerNode,
+    OutputNode,
+    SchedulerNode,
+)
+from .virtualized import V
+# Module-level logger for this file.
+log = logging.getLogger(__name__)
+# Loosely typed list of scheduler nodes.
+SchedulerNodeList = List[Any]
+# (buffer name, number of origin fx node names mapped to that buffer);
+# instances are created in get_node_name_to_buf_meta().
+BufMeta = collections.namedtuple("BufMeta", ["name", "n_origin"])
+# `dot` command line with extra -G options.
+# NOTE(review): presumably meant for very large graphs — confirm at the use site.
+GRAPHVIZ_COMMAND_SCALABLE = ["dot", "-Gnslimit=2", "-Gnslimit1=2", "-Gmaxiter=5000"]
+@functools.lru_cache(None)
+def has_dot() -> bool:
+    try:
+        subprocess.check_output(["which", "dot"], stderr=subprocess.PIPE)
+        return True
+    except subprocess.SubprocessError:
+        return False
+def draw_buffers(
+    nodes: List[BaseSchedulerNode],
+    print_graph: bool = False,
+    fname: Optional[str] = None,
+) -> None:
+    """
+    Draw a graph in fname.svg.
+    """
+    if not has_dot():
+        log.warning("draw_buffers() requires `graphviz` package")
+        return
+    if fname is None:
+        fname = get_graph_being_compiled()
+    graph = create_fx_from_snodes(nodes)
+    for node in graph.nodes:
+        if "fusion_meta" not in node.meta:
+            continue
+        group = node.meta["fusion_meta"].group
+        if isinstance(group, tuple):
+            if isinstance(group[1], int):
+                group = (group[1],)
+            else:
+                group = group[1]
+        # gather meta data
+        dtype = None
+        if isinstance(node, ir.ComputedBuffer):
+            dtype = node.data.dtype
+        metadata = TensorMetadata(group, dtype, None, None, None, None, None)  # type: ignore[arg-type]
+        node.meta["tensor_meta"] = metadata
+    if print_graph:
+        print(graph)
+    gm = GraphModule({}, graph)
+    legalize_graph(gm)
+    gm.graph.lint()
+    draw_graph(
+        gm, fname, clear_meta=False, dot_graph_shape=config.trace.dot_graph_shape
+    )
+def create_fx_from_snodes(snodes: List[BaseSchedulerNode]) -> fx.Graph:
+    """
+    Creates a FX Graph from a list of SchedulerNode objects.
+
+    Each scheduler node becomes one call_function node whose target is a
+    dummy function named "<node_type>: <fused_name>". Reads that no node in
+    ``snodes`` produces become placeholder nodes. Nodes with an output buffer
+    used by an OutputNode become the graph outputs.
+
+    Raises:
+        RuntimeError: for a scheduler node of an unrecognized kind.
+    """
+    def get_fake_func(name: str) -> Callable[..., int]:
+        # Build a do-nothing callable whose __name__ carries the label.
+        def func1(*args: Any) -> int:
+            return 0
+        func1.__name__ = name
+        return func1
+    FusionMeta = collections.namedtuple("FusionMeta", ["group", "snode", "type"])
+    # buffer name -> fx node that produces it (or its placeholder)
+    buf_to_fx_node = {}
+    # scheduler node name -> its fx node
+    node_to_fx_node = {}
+    graph = torch.fx.Graph()
+    first_node = None
+    outputs = []
+    group: Any = None
+    # create call_function node for each Buffer and Kernel
+    for snode in snodes:
+        # Branch order matters: extern/template are tested before the
+        # isinstance checks.
+        if snode.is_extern():
+            node_type = "extern"
+            group = node_type
+        elif snode.is_template():
+            node_type = "template"
+            group = node_type
+        elif isinstance(snode, NopKernelSchedulerNode):
+            node_type = "nop"
+            group = node_type
+        elif isinstance(snode, SchedulerNode):
+            node_type = "compute"
+            group = snode.group
+        elif isinstance(snode, FusedSchedulerNode):
+            node_type = "fused"
+            group = snode.group
+        else:
+            raise RuntimeError("Unknown node type")
+        fused_name = torch._inductor.utils.get_fused_kernel_name(
+            snode.get_nodes(), "original_aten"
+        )
+        func_name = f"{node_type}: {fused_name}"
+        node_func = get_fake_func(func_name)
+        kwargs = {}
+        if hasattr(snode, "get_device"):
+            kwargs = {"device": snode.get_device()}
+        fx_node = graph.call_function(node_func, args=(), kwargs=kwargs)  # type: ignore[arg-type]
+        # True when any output buffer of the node (or, for fused nodes, of
+        # any member node) has a user that is an OutputNode.
+        def in_output(snode: Union[BaseSchedulerNode, FusedSchedulerNode]) -> bool:
+            if isinstance(snode, FusedSchedulerNode):
+                return any(in_output(x) for x in snode.snodes)
+            return any(
+                isinstance(user.node, OutputNode)
+                for buf in snode.get_outputs()
+                for user in buf.users
+            )
+        if in_output(snode):
+            outputs.append(fx_node)
+        name = snode.get_name()
+        fx_node.name = name
+        fx_node.meta["fusion_meta"] = FusionMeta(group, snode, node_type)
+        node_to_fx_node[name] = fx_node
+        for buf in snode.get_outputs():
+            buf_to_fx_node[buf.get_name()] = fx_node
+        # Remember the first call_function node; placeholders are inserted
+        # before it in the second pass.
+        if first_node is None:
+            first_node = fx_node
+    # create edges between nodes
+    for snode in snodes:
+        name = snode.get_name()
+        deps = snode.read_writes.reads
+        fx_node = node_to_fx_node[name]
+        new_args = []
+        for dep in deps:
+            if dep.name in buf_to_fx_node:
+                dep_node = buf_to_fx_node[dep.name]
+            else:
+                # No producer among snodes: model the read as a graph input.
+                with graph.inserting_before(first_node):
+                    dep_node = graph.placeholder(dep.name)
+                    buf_to_fx_node[dep.name] = dep_node
+            if dep_node == fx_node:  # to avoid cycles
+                continue
+            new_args.append(dep_node)
+        fx_node.args = tuple(new_args)
+    # A single output is passed bare; otherwise a tuple (possibly empty).
+    graph.output(outputs[0] if len(outputs) == 1 else tuple(outputs))
+    return graph
+def update_orig_fx_node_name_to_buf_name(
+    nodes: Optional[SchedulerNodeList],
+    node_name_to_buf_name: Dict[str, str],
+    parent_buf_name: Optional[str] = None,
+    n_origins: int = 0,
+) -> None:
+    if nodes is None:
+        return
+    for node in nodes:
+        # for FusedSchedulerNode, traverse recursively into get_nodes()
+        buf_name = node.get_name()
+        children_nodes = node.get_nodes()
+        if children_nodes is not None and len(children_nodes) > 1:
+            update_orig_fx_node_name_to_buf_name(
+                children_nodes,
+                node_name_to_buf_name,
+                buf_name if parent_buf_name is None else parent_buf_name,
+            )
+            continue
+        else:
+            assert len(children_nodes) == 1 and children_nodes[0] == node
+        ir_node = node.node
+        if ir_node is None or ir_node.origins is None:
+            continue
+        for origin in ir_node.origins:
+            node_name = origin.name
+            # when buf1 and buf2 both have origin=node1
+            # we draw node1 according to buf1
+            if node_name not in node_name_to_buf_name:
+                node_name_to_buf_name[node_name] = (
+                    buf_name if parent_buf_name is None else parent_buf_name
+                )
+def get_node_name_to_buf_meta(
+    node_name_to_buf_name: Dict[str, str]
+) -> Dict[str, BufMeta]:
+    buf_name_to_n_node = {}
+    for node_name, buf_name in node_name_to_buf_name.items():
+        if buf_name not in buf_name_to_n_node:
+            buf_name_to_n_node[buf_name] = {node_name}
+        else:
+            buf_name_to_n_node[buf_name].add(node_name)
+    node_name_to_buf_meta = {}
+    for node_name, buf_name in node_name_to_buf_name.items():
+        n_node = len(buf_name_to_n_node[buf_name])
+        node_name_to_buf_meta[node_name] = BufMeta(buf_name, n_node)
+    return node_name_to_buf_meta
+def annotate_orig_fx_with_snodes(
+    gm: torch.fx.GraphModule,
+    snodes: SchedulerNodeList,
+) -> None:
+    """
+    Creates a FX Graph from a list of SchedulerNode objects.
+    """
+    node_name_to_buf_name: Dict[str, str] = {}
+    update_orig_fx_node_name_to_buf_name(snodes, node_name_to_buf_name)
+    if node_name_to_buf_name is None:
+        return
+    node_name_to_buf_meta = get_node_name_to_buf_meta(node_name_to_buf_name)
+    for node in gm.graph.nodes:
+        if node.name in node_name_to_buf_meta:
+            node.meta["buf_meta"] = node_name_to_buf_meta.get(node.name)
+@contextlib.contextmanager
+def enable_aot_logging() -> Iterator[None]:
+    compile_debug = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1"
+    import torch._functorch.aot_autograd
+    log = logging.getLogger(torch._functorch.aot_autograd.__name__)
+    stack = contextlib.ExitStack()
+    if not compile_debug:
+        try:
+            yield
+        finally:
+            stack.close()
+        return
+    # Enable all graphs to be logged to a file by setting the flags to True
+    # and the log level of the file logger to DEBUG
+    stack.enter_context(patch("functorch.compile.config.debug_partitioner", True))
+    path = os.path.join(get_debug_dir(), "torchinductor")
+    os.makedirs(path, exist_ok=True)
+    fh = logging.FileHandler(
+        os.path.join(
+            path,
+            f"aot_{get_aot_graph_name()}_debug.log",
+        )
+    )
+    fh.setLevel(logging.DEBUG)
+    fh.setFormatter(
+        logging.Formatter("[%(filename)s:%(lineno)d %(levelname)s] %(message)s")
+    )
+    log.addHandler(fh)
+    try:
+        yield
+    finally:
+        log.removeHandler(fh)
+        stack.close()
+class DebugContext:
+    """
+    Context manager that installs itself as the debug handler on ``V`` and,
+    when ``config.trace.enabled`` is set, creates a per-graph debug directory
+    and captures logs into it.
+    """
+    # Class-level counter shared by every create_debug_dir() call.
+    _counter = itertools.count()
+    @staticmethod
+    def create_debug_dir(folder_name: str) -> Optional[str]:
+        """
+        Create and return the first not-yet-existing directory
+        ``<debug_dir>/torchinductor/<folder_name>.<n>``.
+        """
+        debug_dir = config.trace.debug_dir or get_debug_dir()
+        for n in DebugContext._counter:
+            dirname = os.path.join(
+                debug_dir,
+                "torchinductor",
+                f"{folder_name}.{n}",
+            )
+            if not os.path.exists(dirname):
+                os.makedirs(dirname)
+                return dirname
+        # Only reached if the counter were ever exhausted.
+        return None
+    def __init__(self) -> None:
+        # NOTE(review): nothing in this class assigns _prof; presumably a
+        # profiler is attached from outside — confirm.
+        self._prof = None
+        # Debug directory; set in __enter__ when tracing is enabled.
+        self._path = None
+        # Collects cleanups that __exit__ runs via close().
+        self._stack = contextlib.ExitStack()
+    def copy(self, new_path: str) -> None:
+        """
+        Replace ``new_path`` with a copy of this context's debug directory.
+
+        No-op when no debug directory was created. ``new_path`` must end with
+        ".debug". An OSError is logged as a warning and not re-raised.
+        """
+        if not self._path:
+            return
+        assert new_path.endswith(".debug"), new_path
+        from filelock import FileLock
+        try:
+            with FileLock(f"{new_path}.lock"):
+                if os.path.exists(new_path):
+                    shutil.rmtree(new_path)
+                shutil.copytree(self._path, new_path)
+        except OSError:
+            log.warning(
+                "Failed to copy debug files from %s to %s", self._path, new_path
+            )
+    def fopen(
+        self,
+        filename: str,
+        write_mode: str = "w",
+        *args: Any,
+        **kwargs: Any,
+    ) -> IO[Any]:
+        """Open ``filename`` inside the debug directory; the caller closes it."""
+        assert self._path
+        return open(os.path.join(self._path, filename), write_mode, *args, **kwargs)
+    @contextlib.contextmanager
+    def fopen_context(
+        self,
+        filename: str,
+        write_mode: str = "w",
+        *args: Any,
+        **kwargs: Any,
+    ) -> Iterator[IO[Any]]:
+        """Like fopen(), but as a context manager that closes the file on exit."""
+        assert self._path
+        with open(os.path.join(self._path, filename), write_mode, *args, **kwargs) as f:
+            yield f
+    def filename(self, suffix: str) -> str:
+        """Return the path of ``suffix`` inside the debug directory."""
+        assert self._path
+        return os.path.join(self._path, suffix)
+    def upload_tar(self) -> None:
+        """
+        When ``config.trace.upload_tar`` is set, archive the debug directory
+        as a .tar.gz and pass the archive path to that callable.
+        """
+        if config.trace.upload_tar is not None:
+            import tarfile
+            assert self._path
+            # The archive is written inside the directory being archived.
+            tar_file = os.path.join(
+                self._path, f"{os.path.basename(self._path)}.tar.gz"
+            )
+            with tarfile.open(tar_file, "w:gz") as tar:
+                tar.add(self._path, arcname=os.path.basename(self._path))
+            config.trace.upload_tar(tar_file)
+    def __enter__(self) -> None:
+        """
+        Install this object as the debug handler and, if tracing is enabled,
+        create the debug directory and start log capture.
+        """
+        if config.debug:
+            log = logging.getLogger("torch._dynamo")
+            prev_level = log.level
+            log.setLevel(logging.DEBUG)
+            def reset_log_level(level: Any) -> None:
+                log.setLevel(level)
+            # Restore the previous level when the context exits.
+            self._stack.callback(reset_log_level, prev_level)
+        self._stack.enter_context(V.set_debug_handler(self))
+        if not config.trace.enabled:
+            return
+        self._path = self.create_debug_dir(get_aot_graph_name())  # type: ignore[assignment]
+        if config.trace.debug_log:
+            self._setup_log_capture("debug.log", logging.DEBUG)
+        if config.trace.info_log:
+            self._setup_log_capture("info.log", logging.INFO)
+    def _setup_log_capture(
+        self,
+        filename: str,
+        level: int,
+    ) -> None:
+        """
+        Attach a handler to the "torch._inductor" logger that writes records
+        at ``level`` and above to ``filename`` in the debug directory.
+        """
+        log = logging.getLogger("torch._inductor")
+        # The file is registered on the stack so it is closed on exit.
+        fd = self._stack.enter_context(self.fopen(filename))
+        ch = logging.StreamHandler(fd)
+        ch.setLevel(level)
+        ch.setFormatter(
+            logging.Formatter("[%(filename)s:%(lineno)d %(levelname)s] %(message)s")
+        )
+        log.addHandler(ch)
+        log.setLevel(min(log.level, level))
+        # Registered last, so the handler is removed before the file closes.
+        self._stack.callback(log.removeHandler, ch)
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[Any],
+    ) -> None:
+        """
+        Save profile data if a profiler is set, upload and report the debug
+        directory if one exists, then run the registered cleanups.
+        """
+        if self._prof:
+            self._prof.disable()
+            self._save_profile_data()
+        if self._path:
+            self.upload_tar()
+            log.warning("%s debug trace: %s", get_graph_being_compiled(), self._path)
+        self._stack.close()
+    def _save_profile_data(self) -> None:
+        """
+        Write raw stats to "compile.prof" and a text report to
+        "compile.stats", sorted by cumulative time and then by total time.
+        """
+        assert self._prof
+        self._prof.dump_stats(self.filename("compile.prof"))
+        with self.fopen("compile.stats") as fd:
+            stats = pstats.Stats(self._prof, stream=fd)
+            stats.strip_dirs()
+            stats.sort_stats("cumtime")
+            stats.print_stats(100)
+            stats.sort_stats("tottime")
+            stats.print_stats(100)
+    def __getattr__(self, name: str) -> Optional[Callable[..., None]]:
+        """
+        Resolve attributes that normal lookup does not find.
+
+        When tracing is enabled and ``config.trace.<name>`` is truthy, the
+        attribute is taken from a fresh DebugFormatter (None if that lookup
+        raises). Otherwise a callable that ignores its arguments is returned.
+        """
+        if config.trace.enabled and getattr(config.trace, name):
+            try:
+                return getattr(DebugFormatter(self), name)
+            except Exception:
+                log.warning("Ignoring exception in debug code", exc_info=True)
+                return None
+        else:
+            def ignored(*args: Any, **kwargs: Any) -> None:
+                pass
+            return ignored
class DebugFormatter:
    """Writes individual debug artifacts through a ``DebugContext`` handler.

    The formatter borrows the handler's ``fopen``, ``fopen_context`` and
    ``filename`` helpers, so every artifact goes wherever the handler puts it.
    """

    def __init__(self, handler: DebugContext) -> None:
        self.fopen = handler.fopen
        self.fopen_context = handler.fopen_context
        self.filename = handler.filename
        self.handler = handler

    def fx_graph(
        self,
        gm: torch.fx.GraphModule,
        inputs: List[torch.Tensor],
    ) -> None:
        """Write a runnable repro and a readable dump of ``gm``."""
        with self.fopen("fx_graph_runnable.py") as fd:
            save_graph_repro(fd, gm, inputs, "inductor")

        with self.fopen("fx_graph_readable.py") as fd:
            fd.write(gm.print_readable(print_output=False))

    def fx_graph_transformed(
        self,
        gm: torch.fx.GraphModule,
        inputs: List[torch.Tensor],
    ) -> None:
        """Write a readable dump of the transformed graph (``inputs`` is unused)."""
        with self.fopen("fx_graph_transformed.py") as fd:
            fd.write(gm.print_readable(print_output=False))

    def ir_pre_fusion(self, nodes: SchedulerNodeList) -> None:
        """Dump the scheduler nodes to ``ir_pre_fusion.txt``."""
        self._write_ir("ir_pre_fusion.txt", nodes)

    def ir_post_fusion(self, nodes: SchedulerNodeList) -> None:
        """Dump the scheduler nodes to ``ir_post_fusion.txt``."""
        self._write_ir("ir_post_fusion.txt", nodes)

    def _write_ir(
        self,
        filename: str,
        nodes: SchedulerNodeList,
    ) -> None:
        """Write ``debug_str()`` of every node to ``filename``, blank-line separated."""
        with self.fopen(filename) as fd:
            log.info("Writing debug ir to  %s", fd.name)
            for node in nodes:
                fd.write(node.debug_str())
                fd.write("\n\n\n")

    def graph_diagram(self, nodes: SchedulerNodeList) -> None:
        """Render the buffers of ``nodes`` to ``graph_diagram.svg``."""
        draw_buffers(nodes, fname=self.filename("graph_diagram.svg"))

    def draw_orig_fx_graph(
        self,
        gm: torch.fx.GraphModule,
        nodes: SchedulerNodeList,
    ) -> None:
        """Annotate ``gm`` with scheduler nodes and draw it to an SVG file."""
        annotate_orig_fx_with_snodes(gm, nodes)
        draw_graph(
            gm,
            fname=self.filename("orig_fx_graph_diagram.svg"),
            clear_meta=False,
            prog=GRAPHVIZ_COMMAND_SCALABLE,
            parse_stack_trace=True,
            dot_graph_shape=config.trace.dot_graph_shape,
        )

    def output_code(self, filename: str) -> None:
        """Copy the generated code file to ``output_code.py``."""
        shutil.copy(filename, self.filename("output_code.py"))

    def log_autotuning_results(
        self,
        name: str,
        input_nodes: List[ir.IRNode],
        timings: Dict["ChoiceCaller", float],  # type: ignore[name-defined] # noqa: F821
        elapse: float,
        precompile_elapse: float,
    ) -> None:
        """Append one JSON line per autotuning choice to the results file.

        Each line merges the choice's ``info_dict()`` with properties shared by
        the whole run (op name, CUDA device info, input-node descriptions,
        timings) and the choice's benchmark result.
        """
        import json

        from .ir import FixedLayout

        def static_offset(layout: FixedLayout) -> int:
            # Best effort: a plain int if possible, else a size hint, else 0.
            try:
                return int(layout.offset)
            except Exception:
                try:
                    return V.graph.sizevars.size_hint(layout.offset, fallback=0)
                except Exception:
                    return 0

        def build_node_info(node: ir.IRNode) -> Dict[str, Any]:
            node_info: Dict[str, Any] = {
                "name": getattr(node, "name", ""),
                "type": type(node).__name__,
            }

            # Every field below is optional: not all IR nodes support all
            # queries, so a failing query simply leaves its key out.
            try:
                layout = node.get_layout()
                if isinstance(layout, FixedLayout):
                    # Replace symbolic sizes/strides/offset by concrete hints.
                    layout = FixedLayout(
                        layout.device,
                        dtype=layout.dtype,
                        size=list(V.graph.sizevars.size_hints(layout.size)),
                        stride=list(V.graph.sizevars.size_hints(layout.stride)),
                        offset=static_offset(layout),
                    )
                node_info["layout"] = str(layout)
            except Exception:
                pass

            optional_fields: Dict[str, Callable[[], Any]] = {
                "dtype": lambda: node.get_dtype(),
                "device": lambda: node.get_device(),
                "stride": lambda: V.graph.sizevars.size_hints(node.get_stride()),
                "size": lambda: V.graph.sizevars.size_hints(node.get_size()),
                "numel": lambda: V.graph.sizevars.size_hint(node.get_numel()),
            }
            for key, compute in optional_fields.items():
                try:
                    node_info[key] = str(compute())
                except Exception:
                    pass

            if hasattr(node, "data") and isinstance(node.data, ir.IRNode):
                node_info["data"] = build_node_info(node.data)
            return node_info

        # Querying the device name without a usable CUDA device raises, which
        # would abort logging for CPU-only autotuning runs.
        cuda_device_name = (
            torch.cuda.get_device_name() if torch.cuda.is_available() else None
        )
        general_properties = {
            "op_name": name,
            "cuda_device_name": cuda_device_name,
            "cuda_device_count": torch.cuda.device_count(),
            "input_nodes": [build_node_info(node) for node in input_nodes],
            "autotuning_time": elapse,
            "precompile_time": precompile_elapse,
        }
        with self.fopen_context(
            "autotuning_result_json_list.txt", "at", encoding="utf-8"
        ) as fd:
            for caller, benchmark_time in timings.items():
                info_dict = dict(caller.info_dict())
                info_dict.update(general_properties)
                info_dict["benchmark_result"] = benchmark_time
                json.dump(info_dict, fd)
                fd.write("\n")
@dataclasses.dataclass
class TensorMetadataHolder:
    """Picklable stand-in for a tensor: its metadata plus its device."""

    # Metadata extracted from the tensor being replaced.
    tensor_metadata: TensorMetadata
    # Device the tensor being replaced lived on.
    device: torch.device
# Monotonic counter that gives every saved-arguments file a distinct name.
save_args_cnt = itertools.count()


def save_args_for_compile_fx_inner(*args: Any, **kwargs: Any) -> None:
    """
    This function is used to save arguments for a compile_fx_inner function call
    to the file system.  Later on one can replay the compile_fx_inner call
    with the saved arguments using load_args_and_run_compile_fx_inner.

    Tensors are replaced by ``TensorMetadataHolder`` objects before pickling.
    """
    folder = "/tmp/inductor_saved_args"
    # exist_ok avoids the check-then-create race of `exists()` + `mkdir()`
    # when several processes save arguments at the same time.
    os.makedirs(folder, exist_ok=True)

    def handle_tensor(x: Any) -> Any:
        """
        Pickle FakeTensor will result in error:
        AttributeError: Can't pickle local object 'WeakValueDictionary.__init__.<locals>.remove'
        Convert all Tensor to metadata. This may also makes pickle faster.
        """
        if isinstance(x, torch.Tensor):
            return TensorMetadataHolder(_extract_tensor_metadata(x), x.device)
        return x

    args_to_save, kwargs_to_save = tree_map(handle_tensor, (args, kwargs))

    fn_name = "compile_fx_inner"
    path = f"{folder}/{fn_name}_{next(save_args_cnt)}.pkl"
    with open(path, "wb") as f:
        pickle.dump((args_to_save, kwargs_to_save), f)

    if log.isEnabledFor(logging.DEBUG):
        message = f"""
Arguments for a compile_fx_inner call is saved to {path}. To replay the call,
run the following:
from torch._inductor.debug import load_args_and_run_compile_fx_inner
load_args_and_run_compile_fx_inner({path!r})
        """
        # call print rather than log.debug. log.debug will print message
        # prefix for each line which makes the code snippet harder to be
        # copied.
        # Not a big deal since the code is already been guarded by checking
        # the log level.
        print(message)
def load_args_and_run_compile_fx_inner(path: str) -> Any:
    """Replay a ``compile_fx_inner`` call from arguments pickled at ``path``.

    Each saved ``TensorMetadataHolder`` is turned back into a random strided
    tensor with the recorded shape, stride, dtype and device, under a
    ``FakeTensorMode``, and ``compile_fx_inner`` is invoked with the result.
    """
    from torch._inductor.compile_fx import compile_fx_inner

    # NOTE: unpickling executes arbitrary code; only load files you produced.
    with open(path, "rb") as f:
        saved_args, saved_kwargs = pickle.load(f)

    def handle_tensor(x: Any) -> Any:
        if not isinstance(x, TensorMetadataHolder):
            return x
        meta = x.tensor_metadata
        return torch._dynamo.testing.rand_strided(
            meta.shape,
            meta.stride,
            meta.dtype,
            x.device,
        )

    fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
    # save_args is patched off so the replay does not save its arguments again.
    with fake_mode, config.patch("save_args", False):
        args, kwargs = tree_map(handle_tensor, (saved_args, saved_kwargs))
        return compile_fx_inner(*args, **kwargs)

.venv/lib/python3.11/site-packages/torch/_inductor/decomposition.py ADDED Viewed

	@@ -0,0 +1,980 @@

+# mypy: allow-untyped-decorators
+import functools
+import logging
+import math
+import sys
+import typing
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import torch
+import torch._decomp as decomp
+import torch._prims_common as utils
+import torch.ao.quantization.fx._decomposed
+from torch._decomp import (
+    core_aten_decompositions,
+    get_decompositions,
+    remove_decompositions,
+)
+from torch._decomp.decompositions import (
+    _grid_sampler_2d as decomp_grid_sampler_2d,
+    pw_cast_for_opmath,
+)
+from torch._decomp.decompositions_for_rng import extra_random_decomps
+from torch._dynamo.utils import counters
+from torch._higher_order_ops.out_dtype import out_dtype
+from torch._inductor.utils import pad_listlike
+from torch._prims_common import (
+    elementwise_dtypes,
+    ELEMENTWISE_TYPE_PROMOTION_KIND,
+    type_to_dtype,
+)
+from torch.fx.experimental.symbolic_shapes import definitely_true, guard_size_oblivious
+from . import config, inductor_prims
+from .utils import (
+    is_gpu,
+    needs_fallback_due_to_atomic_add_limitations,
+    use_scatter_fallback,
+)
+log = logging.getLogger(__name__)
+aten = torch.ops.aten
+prims = torch.ops.prims
+quantized = torch.ops.quantized
+_quantized = torch.ops._quantized
+quantized_decomposed = torch.ops.quantized_decomposed
+inductor_decompositions = get_decompositions(
+    [
+        aten._adaptive_avg_pool2d_backward,
+        aten.addmv,
+        aten.arange,
+        aten.bitwise_and_,
+        aten.bitwise_or_,
+        aten.clamp_min_,
+        aten.dist,
+        aten.empty_like,
+        aten.flip,
+        aten.gelu,
+        aten.hardtanh,
+        aten.index_select,
+        aten.lcm,
+        aten.leaky_relu,
+        aten.linalg_vector_norm,
+        aten._log_softmax,
+        aten.max_pool2d_with_indices_backward,
+        aten._native_batch_norm_legit,
+        aten._native_batch_norm_legit_functional,
+        aten._native_batch_norm_legit_no_training,
+        aten._batch_norm_with_update,
+        aten._batch_norm_with_update_functional,
+        aten._batch_norm_no_update,
+        aten.batch_norm_backward,
+        aten.native_batch_norm,
+        aten.native_group_norm,
+        aten.native_layer_norm,
+        aten.nll_loss2d_backward,
+        aten._softmax,
+        aten.sin_,
+        aten.sqrt_,
+        out_dtype,
+        aten._to_copy,
+        aten.tril_indices,
+        aten.triu_indices,
+        aten.upsample_bilinear2d.vec,
+        quantized.linear_dynamic_fp16_unpacked_weight,
+        _quantized.wrapped_quantized_linear,
+    ]
+)
+decompositions = {**core_aten_decompositions(), **inductor_decompositions}
+# Remove unwanted decompositions included via the core ATen decompositions from
+# the Inductor decomp table.
+decomps_to_exclude = [
+    aten._unsafe_index,
+    aten._unsafe_masked_index,
+    aten._unsafe_masked_index_put_accumulate,
+    aten._scaled_dot_product_flash_attention_for_cpu.default,  # See comments in torch/_decomp/decompositions.py
+    aten._softmax_backward_data,
+    aten.clamp_max,
+    aten.clamp_min,
+    aten.glu,  # inductor lowers this directly
+    aten.select_scatter,  # need to be in the ATen graph in order for it to work with the re-inplacing pass
+    aten.slice_scatter,  # need to be in the ATen graph in order for it to work with the re-inplacing pass
+    aten.split.Tensor,  # inductor lowers this directly
+    aten.squeeze,  # inductor lowers this directly
+    aten.sum,  # inductor lowers this directly
+    aten.unbind,  # inductor lowers this directly
+]
+remove_decompositions(decompositions, decomps_to_exclude)
def register_decomposition(
    ops: List[Union[torch._ops.OperatorBase, torch._ops.OpOverloadPacket]]
) -> Callable[..., Any]:
    """Return a decorator registering a decomposition in the Inductor table.

    ``ops`` may be a single operator or a list of operators.  A warning is
    logged for every operator that already has an entry in ``decompositions``;
    registration still proceeds.
    """
    candidates = [ops] if callable(ops) else ops  # type: ignore[attr-defined]
    for op in candidates:
        if op in decompositions:
            # Report the offending operator, not the whole argument list.
            log.warning("duplicate decomp: %s", op)
    return decomp.register_decomposition(ops, decompositions)
# TODO: for now, inductor doesn't handle asserts
# because the condition is symbol -> tensor in the graph.
@register_decomposition([aten._assert_async.msg])
def assert_async_msg_decomp(tensor: torch.Tensor, msg: str) -> None:
    """Decompose ``_assert_async.msg`` into a no-op."""
    return None
# Mirrors `assert_async_msg_decomp`: implemented as a no-op.
@register_decomposition([aten._functional_assert_async.msg])
def functional_assert_async_msg_decomp(tensor: torch.Tensor, msg: str) -> None:
    """Decompose ``_functional_assert_async.msg`` into a no-op."""
    return None
@register_decomposition([aten.sym_constrain_range_for_size.default])
def sym_constrain_range_for_size(
    symbol: torch.SymInt,
    *,
    min: Optional[torch.types.Number] = None,
    max: Optional[torch.types.Number] = None,
) -> None:
    """Decompose ``sym_constrain_range_for_size`` into a no-op."""
    return None
@register_decomposition([aten.clamp])
@pw_cast_for_opmath
def clamp(
    x: torch.Tensor,
    min: Optional[torch.types.Number] = None,
    max: Optional[torch.types.Number] = None,
) -> torch.Tensor:
    """Clamp ``x`` by applying ``clamp_min`` then ``clamp_max``.

    A bound that is ``None`` is skipped.
    """
    lower_bounded = x if min is None else x.clamp_min(min)
    if max is None:
        return lower_bounded
    return lower_bounded.clamp_max(max)
@register_decomposition([aten.full])
def full(
    size: List[Union[int, torch.SymInt]],
    fill_value: torch.types.Number,
    **kwargs: Any,
) -> torch.Tensor:
    """Fill in a missing ``dtype`` for ``aten.full`` from the fill value's type.

    Returns ``NotImplemented`` when a dtype was supplied explicitly.
    """
    if kwargs.get("dtype") is not None:
        return NotImplemented
    kwargs["dtype"] = type_to_dtype(type(fill_value))
    return torch.full(size, fill_value, **kwargs)
# Not really sure how to put this into the main library.  PrimTorch wants
# empty_permuted to go to the prim, and typically users don't really want
# to decompose to empty_strided (but inductor is OK with it, because we are
# cool with strides and everything goes to empty_strided)
@register_decomposition([aten.empty_permuted.default])
def empty_permuted(
    size: List[Union[int, torch.SymInt]],
    physical_layout: List[int],
    **kwargs: Any,
) -> torch.Tensor:
    """Allocate in physical dimension order, then permute back to ``size``."""
    # perm is the inverse of physical_layout: perm[logical_dim] = physical_pos.
    perm = [0] * len(size)
    for physical_pos, logical_dim in enumerate(physical_layout):
        perm[logical_dim] = physical_pos
    physical_size = [size[logical_dim] for logical_dim in physical_layout]
    return torch.empty(physical_size, **kwargs).permute(perm)
@register_decomposition([aten.convolution_backward])
def convolution_backward(
    grad_output: torch.Tensor,
    input: torch.Tensor,
    weight: torch.Tensor,
    bias_sizes: List[int],
    stride: Union[int, List[int]],
    padding: Union[int, List[int]],
    dilation: Union[int, List[int]],
    transposed: bool,
    output_padding: List[int],
    groups: int,
    output_mask: List[bool],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Split the bias gradient out of ``convolution_backward`` on GPU.

    Only applies when the bias gradient is requested and ``grad_output`` is on
    a GPU device; otherwise ``NotImplemented`` is returned.  The bias gradient
    becomes a sum over every dimension except dim 1, and the remaining
    gradients come from ``convolution_backward`` with the bias mask disabled.
    """
    if not (output_mask[2] and is_gpu(grad_output.device.type)):
        return NotImplemented

    reduce_dims = [0, *range(2, grad_output.dim())]
    grad_bias = aten.sum(grad_output, reduce_dims)

    mask_without_bias = [output_mask[0], output_mask[1], False]
    grad_inp, grad_weight, _ = aten.convolution_backward(
        grad_output,
        input,
        weight,
        bias_sizes,
        stride,
        padding,
        dilation,
        transposed,
        output_padding,
        groups,
        mask_without_bias,
    )
    return (grad_inp, grad_weight, grad_bias)
@register_decomposition([aten.round.decimals])
def round_dec(x: torch.Tensor, decimals: int = 0) -> torch.Tensor:
    """Round to ``decimals`` places: scale up, round, then scale back down."""
    scale = 10.0**decimals
    rounded = aten.round(x * scale)
    return rounded * (1.0 / scale)
@register_decomposition([aten.bmm])
@pw_cast_for_opmath
def bmm(
    self: torch.Tensor,
    batch2: torch.Tensor,
) -> torch.Tensor:
    """Rewrite degenerate batched matmuls as multiply-and-sum.

    Two cases are handled; anything else returns ``NotImplemented``:
    with coordinate-descent tuning, when either product dimension is 1;
    on CPU, when both product dimensions are 1.
    """
    if config.coordinate_descent_tuning and (
        guard_size_oblivious(self.shape[1] == 1)
        or guard_size_oblivious(batch2.shape[2] == 1)
    ):
        return (self.unsqueeze(-1) * batch2.unsqueeze(1)).sum(dim=2)

    if (
        self.device.type == "cpu"
        and guard_size_oblivious(self.size(1) == 1)
        and guard_size_oblivious(batch2.size(-1) == 1)
    ):
        counters["inductor"]["decompose_bmm"] += 1
        products = self.squeeze(1) * batch2.squeeze(-1)
        return torch.sum(products, dim=1, keepdim=True).unsqueeze(1)

    return NotImplemented
@register_decomposition([aten.addmm])
@pw_cast_for_opmath
def addmm(
    self: torch.Tensor,
    mat1: torch.Tensor,
    mat2: torch.Tensor,
    beta: torch.types.Number = 1,
    alpha: torch.types.Number = 1,
) -> torch.Tensor:
    """Rewrite small CPU ``addmm`` calls as multiply-and-sum.

    Applies on CPU only, when ``mat1`` has a single row and either ``mat2``
    has a single column or both ``mat2`` dimensions are known to be at
    most 16.  Everything else returns ``NotImplemented``.
    """
    if self.device.type != "cpu":
        return NotImplemented

    if guard_size_oblivious(mat1.size(0) == 1) and guard_size_oblivious(
        mat2.size(-1) == 1
    ):
        counters["inductor"]["decompose_addmm"] += 1
        products = mat1.squeeze(0) * mat2.squeeze(-1)
        out = torch.sum(products, dim=0, keepdim=True).unsqueeze(0)
        return alpha * out + beta * self

    if (
        guard_size_oblivious(mat1.size(0) == 1)
        and definitely_true(mat2.size(0) <= 16)
        and definitely_true(mat2.size(1) <= 16)
    ):
        counters["inductor"]["decompose_addmm"] += 1
        out = (mat1.T * mat2).sum(dim=0, keepdim=True)
        return alpha * out + beta * self

    return NotImplemented
@register_decomposition([aten.mm])
@pw_cast_for_opmath
def mm(
    self: torch.Tensor,
    input2: torch.Tensor,
) -> torch.Tensor:
    """Rewrite degenerate matmuls as element-wise products and reductions.

    Returns ``NotImplemented`` when none of the special cases applies.
    """
    # Our matrix vector multiplies only achieve peak bandwidth with coordinate descent tuning.
    # todo: Look into why and fix it (hopefully)
    if config.coordinate_descent_tuning and (
        guard_size_oblivious(self.shape[0] == 1)
        or guard_size_oblivious(input2.shape[1] == 1)
    ):
        return (self.unsqueeze(2) * input2.unsqueeze(0)).sum(dim=1)

    if self.device.type != "cpu":
        return NotImplemented

    # Tiny outer product: (m, 1) @ (1, n) with at most 32 elements in total.
    if (
        guard_size_oblivious(self.size(-1) == 1)
        and guard_size_oblivious(self.size(0) > 0)
        and guard_size_oblivious(input2.size(0) == 1)
        and (self.dtype == input2.dtype)
        and definitely_true((torch.numel(self) + torch.numel(input2)) <= 32)
    ):
        counters["inductor"]["decompose_mm"] += 1
        rows = [self[i, :] * input2 for i in range(self.size(0))]
        return torch.cat(rows)

    # Dot product: (1, k) @ (k, 1).
    if guard_size_oblivious(self.size(0) == 1) and guard_size_oblivious(
        input2.size(-1) == 1
    ):
        counters["inductor"]["decompose_mm"] += 1
        products = self.squeeze(0) * input2.squeeze(-1)
        return torch.sum(products, dim=0, keepdim=True).unsqueeze(0)

    return NotImplemented
# This pass does two things:
# - Eliminate cat when there is only one tensor input
# - Normalize cat calls, so that legacy empty 1-D tensors are removed (NB: we
#   don't remove ALL empty tensors, only the naughty ones)
@register_decomposition([aten.cat.default])
def cat(
    tensors: List[torch.Tensor],
    dim: int = 0,
) -> torch.Tensor:
    """Simplify ``aten.cat``: drop empty inputs and collapse trivial cases.

    Returns ``NotImplemented`` when nothing can be simplified.
    """

    def non_empty_tensor(x: torch.Tensor) -> bool:
        # For better or worse, this is a valid cat:
        #
        #   torch.cat([torch.randn(2, 2, 4), torch.randn(0), torch.randn(3, 2, 4)])
        #
        # We'd like to eliminate naughtiness like this for downstream passes
        # like split_cat.  The easiest way is to just drop such inputs
        # (guarding that they are non-zero).
        #
        # Is it permissible for this filtering to be size-oblivious?  A case
        # where this could matter is cat([(2, 2), (u0,)], dim=0); if u0
        # happened to be zero, we would have liked to have filtered it out.
        # But actually, the ONLY way this could have passed is if u0 == 0,
        # so by the time we get here we have already installed a deferred
        # runtime assert forcing u0 to be zero.  So if this hasn't happened,
        # we know that the unbacked SymInt has appropriate size and there are
        # no problems.
        shape = x.shape
        if len(shape) == 1 and guard_size_oblivious(shape[0] == 0):
            return False
        is_empty_along_dim = dim < len(shape) and guard_size_oblivious(
            shape[dim] == 0
        )
        return not is_empty_along_dim

    kept = [t for t in tensors if non_empty_tensor(t)]
    num_kept = len(kept)

    if num_kept == 1:
        return kept[0].clone()
    if 1 < num_kept < len(tensors):
        # on the first call, when we remove empty tensors, we redispatch recursively
        return aten.cat.default(kept, dim)

    # optimization, avoid concat for single, repeated input
    if num_kept > 1 and all(t is kept[0] for t in kept):
        inp = kept[0]
        cat_dim = dim + len(inp.shape) if dim < 0 else dim
        expanded_shape = list(inp.shape)
        expanded_shape.insert(cat_dim, num_kept)
        repeated = inp.unsqueeze(cat_dim).expand(*expanded_shape)
        return repeated.flatten(cat_dim, cat_dim + 1).clone()

    # Nothing was filtered: return NotImplemented so the op is not decomposed
    # again (this prevents infinite recursion).
    return NotImplemented
@register_decomposition([aten.angle])
def angle(x: torch.Tensor) -> torch.Tensor:
    """Compute the element-wise angle of ``x``.

    Complex input: ``atan2(imag, real)``, or NaN where the real part is NaN.
    Real input: 0 for ``x >= 0``, pi for ``x < 0``, NaN for NaN.
    """
    nan = float("nan")
    if x.is_complex():
        phase = torch.atan2(x.imag, x.real)
        return torch.where(torch.isnan(x.real), nan, phase)

    # Real input: pi is built in the dtype given by int-to-float promotion.
    _, dtype = elementwise_dtypes(
        x,
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
    )
    pi = torch.scalar_tensor(math.pi, dtype=dtype, device=x.device)
    by_sign = torch.where(x < 0, pi, 0.0)
    return torch.where(torch.isnan(x), nan, by_sign)
@register_decomposition([aten.add])
def add(
    x: torch.Tensor,
    y: torch.Tensor,
    *,
    alpha: Optional[torch.types.Number] = None,
) -> torch.Tensor:
    """Add two complex tensors via their real-valued views.

    Returns ``NotImplemented`` unless both ``x`` and ``y`` are complex tensors.
    """
    # Require both x and y to be complex tensors.
    x_is_complex_tensor = torch.is_tensor(x) and x.is_complex()
    y_is_complex_tensor = torch.is_tensor(y) and y.is_complex()
    if not (x_is_complex_tensor and y_is_complex_tensor):
        return NotImplemented

    z = y if alpha is None else alpha * y
    complex_type = torch.promote_types(x.dtype, y.dtype)

    # For complex typed `x`, `x.view(x.real.dtype)` doubles the last dimension and can cause problem
    # when broadcasting the add.
    def reshape_tensor_complex(tensor: torch.Tensor) -> torch.Tensor:
        """Reshape tensor from [*initial_dims, last_dim] to [*initial_dims, last_dim/2, 2]"""
        *leading, last = tensor.shape

        # We should never get an odd size here, since `x.view(x.real.dtype)`
        # doubles the last dimension for complex numbers.
        if last % 2 != 0:
            raise AssertionError(
                "The size of the last dimension must be even to reshape it to [..., last_dim/2, 2]"
            )

        return tensor.view((*leading, last // 2, 2))

    x_reshaped = reshape_tensor_complex(x.view(x.real.dtype))
    z_reshaped = reshape_tensor_complex(z.view(y.real.dtype))
    real_sum = x_reshaped + z_reshaped
    return torch.flatten(real_sum, start_dim=-2).view(complex_type)
@register_decomposition([aten.conj_physical])
def conj_physical(self: torch.Tensor) -> torch.Tensor:
    """Return ``self`` unchanged; complex input is not implemented yet."""
    is_complex = self.is_complex()
    assert not is_complex, "TODO: implement this"
    return self
@register_decomposition([aten.lift, aten.detach_])
def lift(self: torch.Tensor) -> torch.Tensor:
    """Decompose ``lift`` and ``detach_`` into the identity."""
    result = self
    return result
@register_decomposition([aten.bernoulli.default])
def bernoulli(
    self: torch.Tensor,
    *,
    generator: Optional[torch.Generator] = None,
) -> torch.Tensor:
    """Sample by comparing float32 uniform noise against ``self``.

    A custom ``generator`` is not supported.
    """
    assert generator is None
    uniform = torch.rand_like(self, dtype=torch.float32)
    return (uniform < self).to(self.dtype)
@register_decomposition([aten.fmin, prims.fmin])
def fmin(self: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
    """Pick ``self`` where ``other`` is NaN or larger than ``self``, else ``other``."""
    take_self = torch.isnan(other) | (other > self)
    return torch.where(take_self, self, other)
@register_decomposition([aten.fmax, prims.fmax])
def fmax(self: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
    """Pick ``self`` where ``other`` is NaN or smaller than ``self``, else ``other``."""
    take_self = torch.isnan(other) | (other < self)
    return torch.where(take_self, self, other)
@register_decomposition(aten.amax)
def amax(
    self: torch.Tensor,
    dim: Optional[int] = None,
    keepdim: bool = False,
) -> torch.Tensor:
    """Compute ``amax`` of a bool tensor as ``any``; other dtypes are not handled."""
    if self.dtype != torch.bool:
        return NotImplemented
    return torch.any(self, dim=dim, keepdim=keepdim)
@register_decomposition(aten.amin)
def amin(
    self: torch.Tensor,
    dim: Optional[int] = None,
    keepdim: bool = False,
) -> torch.Tensor:
    """Compute ``amin`` of a bool tensor as ``all``; other dtypes are not handled."""
    if self.dtype != torch.bool:
        return NotImplemented
    return torch.all(self, dim=dim, keepdim=keepdim)
@register_decomposition([aten.narrow_copy])
def narrow_copy(
    self: torch.Tensor,
    dim: int,
    start: int,
    length: int,
) -> torch.Tensor:
    """Decompose ``narrow_copy`` into ``narrow`` followed by ``clone``."""
    window = torch.narrow(self, dim, start, length)
    return window.clone()
@register_decomposition([aten.view_copy.default])
def view_copy_default(
    self: torch.Tensor,
    size: List[Union[int, torch.SymInt]],
) -> torch.Tensor:
    """Decompose ``view_copy`` into ``view`` followed by ``clone``."""
    reshaped = aten.view(self, size)
    return reshaped.clone()
@register_decomposition([aten.view_copy.dtype])
def view_copy_dtype(
    self: torch.Tensor,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Decompose ``view_copy.dtype`` into a dtype view followed by ``clone``.

    ``view_copy.dtype`` is the copying form of ``Tensor.view(dtype)``, which
    reinterprets the underlying bytes.  ``Tensor.to(dtype)`` would convert
    the values instead, giving different results whenever ``dtype`` differs
    from ``self.dtype``.
    """
    return self.view(dtype).clone()
def get_like_layout(
    tensor: torch.Tensor,
    memory_format: Optional[torch.memory_format] = None,
) -> torch.memory_format:
    """Resolve the memory format a ``*_like`` result should use.

    An explicit format is returned as is.  ``None`` and ``preserve_format``
    fall back to the format suggested for ``tensor``.
    """
    # TODO: _to_copy tensor to stride permutation
    if memory_format is None or memory_format is torch.preserve_format:
        return utils.suggest_memory_format(tensor)
    return memory_format
@register_decomposition(aten.rand_like)
def rand_like(
    self: torch.Tensor,
    *,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
    memory_format: Optional[torch.memory_format] = None,
    **kwargs: Any,
) -> torch.Tensor:
    """Decompose ``rand_like`` into ``torch.rand`` plus a memory-format cast.

    ``dtype`` and ``device`` default to those of ``self``.
    """
    sample = torch.rand(
        list(self.size()),
        dtype=dtype or self.dtype,
        device=device or self.device,
        **kwargs,
    )
    return sample.to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randn_like)
def randn_like(
    self: torch.Tensor,
    *,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
    memory_format: Optional[torch.memory_format] = None,
    **kwargs: Any,
) -> torch.Tensor:
    """Decompose ``randn_like`` into ``torch.randn`` plus a memory-format cast.

    ``dtype`` and ``device`` default to those of ``self``.
    """
    sample = torch.randn(
        list(self.size()),
        dtype=dtype or self.dtype,
        device=device or self.device,
        **kwargs,
    )
    return sample.to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.full_like)
def full_like(
    self: torch.Tensor,
    fill_value: Union[int, float],
    *,
    dtype: Optional[torch.dtype] = None,
    layout: Optional[torch.layout] = None,
    device: Optional[torch.device] = None,
    pin_memory: bool = False,
    requires_grad: bool = False,
    memory_format: torch.memory_format = torch.preserve_format,
) -> torch.Tensor:
    """Decompose ``full_like`` into ``torch.full`` plus a memory-format cast.

    ``dtype``, ``layout`` and ``device`` default to those of ``self``.
    ``pin_memory`` is accepted but not forwarded to ``torch.full``.
    """
    filled = torch.full(
        list(self.size()),
        fill_value,
        dtype=dtype or self.dtype,
        layout=layout or self.layout,
        device=device or self.device,
        requires_grad=requires_grad,
    )
    return filled.to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randint_like.default)
def randint_like(
    self: torch.Tensor,
    high: int,
    *,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
    memory_format: Optional[torch.memory_format] = None,
    **kwargs: Any,
) -> torch.Tensor:
    """Decompose ``randint_like`` into ``randint.low`` with a lower bound of 0.

    ``dtype`` and ``device`` default to those of ``self``.
    """
    sample = aten.randint.low(
        0,
        high,
        list(self.size()),
        dtype=dtype or self.dtype,
        device=device or self.device,
        **kwargs,
    )
    return sample.to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randint_like.low_dtype)
def randint_like_low(
    self: torch.Tensor,
    low: int,
    high: int,
    *,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
    memory_format: Optional[torch.memory_format] = None,
    **kwargs: Any,
) -> torch.Tensor:
    """Decompose ``randint_like.low_dtype`` into ``randint.low``.

    ``dtype`` and ``device`` default to those of ``self``.
    """
    sample = aten.randint.low(
        low,
        high,
        list(self.size()),
        dtype=dtype or self.dtype,
        device=device or self.device,
        **kwargs,
    )
    return sample.to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randint.default)
def randint(
    high: int,
    size: List[Union[int, torch.SymInt]],
    **kwargs: Any,
) -> torch.Tensor:
    """Decompose ``randint`` into ``randint.low`` with a lower bound of 0."""
    low = 0
    return aten.randint.low(low, high, size, **kwargs)
@register_decomposition(quantized.linear_dynamic_fp16_unpacked_weight.default)
def linear_dynamic_fp16_unpacked_weight(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
) -> torch.Tensor:
    """Pack ``weight`` for fp16 GEMM, then run the packed-weight linear op."""
    fp16_ops = torch.ops._quantized
    packed_weight = fp16_ops.wrapped_fbgemm_pack_gemm_matrix_fp16(weight)
    out_channels = weight.size()[0]
    return fp16_ops.wrapped_fbgemm_linear_fp16_weight(
        input, packed_weight, bias, out_channels
    )
@register_decomposition(_quantized.wrapped_quantized_linear.default)
def wrapped_quantized_linear(
    input: torch.Tensor,
    input_scale: torch.Tensor,
    input_zero_point: torch.Tensor,
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    weight_zero_point: torch.Tensor,
    bias: torch.Tensor,
    out_scale: torch.Tensor,
    out_zero_point: torch.Tensor,
    out_channel: int,
) -> torch.Tensor:
    """Split the quantized linear op into weight prepacking and a prepacked call."""
    quantized_ops = torch.ops._quantized
    packed_weight = quantized_ops._wrapped_linear_prepack(
        weight, weight_scale, weight_zero_point, bias
    )
    return quantized_ops._wrapped_quantized_linear_prepacked(
        input,
        input_scale,
        input_zero_point,
        packed_weight,
        out_scale,
        out_zero_point,
        out_channel,
    )
@register_decomposition(torch.ops.quantized.embedding_bag_byte_unpack)
def q_embedding_bag_byte_unpack_decomp(packed: torch.Tensor) -> torch.Tensor:
    """Unpack byte-quantized rows: ``values * scale + offset``.

    The last 8 entries along the final dimension hold the scale (first four)
    and the offset (last four), each reinterpreted as a float32.
    """

    def bitcast_u8_to_f32(u8: torch.Tensor) -> torch.Tensor:
        # Assemble four bytes into one int32 in host byte order, then
        # reinterpret those bits as float32.
        b0, b1, b2, b3 = [u8[..., i].to(torch.int32) for i in range(4)]
        if sys.byteorder == "little":
            word = b0 + (b1 << 8) + (b2 << 16) + (b3 << 24)
        else:
            word = (b0 << 24) + (b1 << 16) + (b2 << 8) + b3
        return word.view(torch.float32)[..., None]

    scales = bitcast_u8_to_f32(packed[..., -8:-4])
    offsets = bitcast_u8_to_f32(packed[..., -4:])
    values = packed[..., :-8].to(torch.float32)
    return values * scales + offsets
+@register_decomposition([aten.grid_sampler_2d])
+@pw_cast_for_opmath
+def grid_sampler_2d(
+    a: torch.Tensor,
+    grid: torch.Tensor,
+    interpolation_mode: int = 0,
+    padding_mode: int = 0,
+    align_corners: bool = False,
+) -> torch.Tensor:
+    # We do not expand the grid (_expand_grid=False) on cpu for performance reasons
+    # Experimenting locally it was found that compiled CUDA code is accelerated by ~5x
+    # and CPU code by ~2x on bicubic mode, if we expand the grid from (N, H, W, 2) into (N, C, H, W, 2)
+    # However, this leads to a slowdown around ~0.8x on CPU bilinear mode, channels first.
+    # Thus we apply this hack to not expand the grid for this case.
+    _expand_grid = not (
+        a.device == torch.device("cpu")
+        and interpolation_mode == 0
+        and a.is_contiguous(memory_format=torch.contiguous_format)
+    )
+    output = decomp_grid_sampler_2d(
+        a,
+        grid=grid,
+        interpolation_mode=interpolation_mode,
+        padding_mode=padding_mode,
+        align_corners=align_corners,
+        _expand_grid=_expand_grid,
+    )
+    return output
+@register_decomposition(aten._foreach_addcmul.Scalar)
+def _foreach_addcmul_scalar(
+    self: List[torch.Tensor],
+    left_tensors: List[torch.Tensor],
+    right_tensors: List[torch.Tensor],
+    scalar: float = 1,
+) -> List[torch.Tensor]:
+    return aten._foreach_add.List(
+        self, aten._foreach_mul.List(left_tensors, right_tensors), alpha=scalar
+    )
+@register_decomposition(aten._foreach_addcdiv.Scalar)
+def _foreach_addcdiv_scalar(
+    self: List[torch.Tensor],
+    left_tensors: List[torch.Tensor],
+    right_tensors: List[torch.Tensor],
+    scalar: float = 1,
+) -> List[torch.Tensor]:
+    return aten._foreach_add.List(
+        self, aten._foreach_div.List(left_tensors, right_tensors), alpha=scalar
+    )
+@register_decomposition(aten._foreach_lerp.Scalar)
+def _foreach_lerp_scalar(
+    start_tensors: List[torch.Tensor],
+    end_tensors: List[torch.Tensor],
+    weight: torch.types.Number,
+) -> List[torch.Tensor]:
+    return aten._foreach_add.List(
+        start_tensors,
+        aten._foreach_mul.Scalar(
+            aten._foreach_sub.List(end_tensors, start_tensors), weight
+        ),
+    )
+@aten.miopen_batch_norm.default.py_impl(torch._C.DispatchKey.Autograd)
+@register_decomposition(aten.miopen_batch_norm)
+def miopen_batch_norm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: typing.Optional[torch.Tensor],
+    running_mean: typing.Optional[torch.Tensor],
+    running_var: typing.Optional[torch.Tensor],
+    training: bool,
+    exponential_average_factor: float,
+    epsilon: float,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    a, b, c = aten.native_batch_norm(
+        input,
+        weight,
+        bias,
+        running_mean,
+        running_var,
+        training,
+        exponential_average_factor,
+        epsilon,
+    )
+    if training:
+        return (a, b, c)
+    return (
+        a,
+        weight.new_zeros((0,)),
+        weight.new_zeros((0,)),
+    )
+@functools.lru_cache(None)
+def fast_random_decomps() -> Dict[Any, Callable[..., Any]]:
+    return {**decompositions, **extra_random_decomps}
+# TODO(aakhundov): replace this (and the above) Any by more
+# specific type and fix all the cascading mypy errors
+def select_decomp_table() -> Dict[Any, Callable[..., Any]]:
+    """decomps can change based on config"""
+    if config.fallback_random:
+        return decompositions
+    return fast_random_decomps()
+@register_decomposition(aten.masked_scatter)
+def masked_scatter(
+    self: torch.Tensor,
+    mask: torch.Tensor,
+    source: torch.Tensor,
+) -> torch.Tensor:
+    from .codegen.common import BackendFeature, has_backend_feature
+    if has_backend_feature(self.device, BackendFeature.MASKED_SCATTER_WITH_INDEX):
+        # This two-step algorithm is the same as eager CUDA, for eager CPU we
+        # use a 1-shot serial iteration.
+        self, mask = aten.broadcast_tensors([self, mask])
+        source_idx = mask.reshape(-1).cumsum(0) - 1
+        self_flat, mask_flat, source_flat = (x.flatten() for x in (self, mask, source))
+        result = aten._unsafe_masked_index(source_flat, mask_flat, [source_idx], 0)
+        return torch.where(mask_flat, result, self_flat).view(self.shape)
+    return NotImplemented
+@register_decomposition(quantized_decomposed.choose_qparams.tensor)
+def choose_qparams_tensor(
+    input: torch.Tensor,
+    quant_min: int,
+    quant_max: int,
+    eps: float,
+    dtype: torch.dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    min_val, max_val = torch.aminmax(input)
+    scale = (max_val - min_val) / float(quant_max - quant_min)
+    scale = torch.max(scale, torch.Tensor([eps]))
+    zero_point = quant_min - torch.round(min_val / scale).to(torch.int)
+    zero_point = torch.clamp(zero_point, quant_min, quant_max)
+    return scale.to(torch.float64), zero_point.to(torch.int64)
+@register_decomposition(aten.put)
+def put(
+    self: torch.Tensor,
+    index: torch.Tensor,
+    source: torch.Tensor,
+    accumulate: bool = False,
+) -> torch.Tensor:
+    flattened = self.flatten()
+    flattened = torch.index_put(
+        flattened, [index], source.reshape(index.shape), accumulate
+    )
+    return flattened.reshape(self.shape)
+@register_decomposition(aten.put_)
+def put_(
+    self: torch.Tensor,
+    index: torch.Tensor,
+    source: torch.Tensor,
+    accumulate: bool = False,
+) -> torch.Tensor:
+    out = aten.put(self, index, source, accumulate=accumulate)
+    return self.copy_(out)
+@register_decomposition(aten._softmax_backward_data.default)
+@pw_cast_for_opmath
+def _softmax_backward_data(
+    grad_output: torch.Tensor,
+    output: torch.Tensor,
+    dim: int,
+    input_dtype: torch.dtype,
+) -> torch.Tensor:
+    new_grad_output = grad_output * output
+    sum_new_grad = torch.sum(new_grad_output, dim=dim, keepdim=True)
+    # grad_input = new_grad_output - output * sum_new_grad
+    grad_input = inductor_prims.fma(-output, sum_new_grad, new_grad_output)
+    # CPU kernel doesn't respect input_dtype, but following check doesn't work for meta tensor
+    # if grad_output.device == torch.device("cpu"):
+    #     return grad_input.contiguous()
+    if grad_output.dtype != input_dtype:
+        grad_input = grad_input.to(input_dtype)
+    return grad_input.contiguous()
+@register_decomposition(aten.index_reduce)
+def index_reduce(
+    self: torch.Tensor,
+    dim: int,
+    index: torch.Tensor,
+    src: torch.Tensor,
+    reduction_type: str,
+    *,
+    include_self: bool = True,
+) -> torch.Tensor:
+    """Decompose ``aten.index_reduce``.
+    The ``"mean"`` reduction is built from ``index_add`` (sum divided by a
+    count) unless the atomic-add fallback check fires for ``self.dtype``.
+    Other cases are rewritten as ``scatter_reduce`` with an index expanded
+    to cover every element, or return ``NotImplemented`` when
+    ``use_scatter_fallback`` says a fallback is required.
+    """
+    if reduction_type == "mean" and not needs_fallback_due_to_atomic_add_limitations(
+        self.dtype
+    ):
+        # Floating/complex dtypes use true division; others use floor division.
+        true_division = self.dtype.is_floating_point or self.dtype.is_complex
+        ones = torch.ones_like(src)
+        if include_self:
+            out = self
+            # Start at one so the existing value of self counts as a contribution.
+            counts = torch.ones_like(self).index_add(dim, index, ones)
+        else:
+            # Indexed positions are zeroed first so only src contributes there.
+            out = self.index_fill(dim, index, 0)
+            counts = torch.zeros_like(self).index_add(dim, index, ones)
+            # Positions that received nothing keep a count of 1 to avoid dividing by zero.
+            counts = counts.masked_fill(counts < 1, 1)
+        out = out.index_add(dim, index, src)
+        return out / counts if true_division else out // counts
+    if use_scatter_fallback(
+        aten.scatter_reduce_.two,
+        reduction_type,
+        self.dtype,
+        src.dtype,
+        src.device.type,
+        True,
+    ):
+        return NotImplemented
+    # Number of elements of self per index entry, i.e. the product of all dims other than `dim`.
+    repeats = self.shape[dim + 1 :].numel() * self.shape[:dim].numel()
+    # Layout after the repeat: (index entries, dims after `dim`, dims before `dim`).
+    index_shape = (index.numel(), *self.shape[dim + 1 :], *self.shape[:dim])
+    # Permutation that moves the leading dims back in front of the index axis and
+    # the trailing dims behind it (NOTE(review): reasoning assumes a non-negative `dim`).
+    perm = (*range(self.ndim - dim, self.ndim), 0, *range(1, self.ndim - dim))
+    scatter_index = (
+        index.to(torch.int64)
+        .repeat_interleave(repeats)
+        .reshape(index_shape)
+        .permute(perm)
+    )
+    return self.scatter_reduce(
+        dim,
+        scatter_index,
+        src,
+        reduction_type,
+        include_self=include_self,
+    )
+@register_decomposition(aten.max_pool2d_with_indices)
+def max_pool2d_with_indices(
+    x: torch.Tensor,
+    kernel_size: List[int],
+    stride: Optional[Union[int, List[int]]] = None,
+    padding: Union[int, List[int]] = 0,
+    dilation: Union[int, List[int]] = 1,
+    ceil_mode: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if dilation == 1:
+        dilation = [1, 1]
+    if padding == 0:
+        padding = [0, 0]
+    if not stride:
+        stride = kernel_size
+    kernel_size = pad_listlike(kernel_size, 2)
+    dilation = pad_listlike(dilation, 2)
+    padding = pad_listlike(padding, 2)
+    stride = pad_listlike(stride, 2)
+    window_size = kernel_size[0] * kernel_size[1]
+    # We fallback when using non-default dilation or when the window size is too large
+    if (
+        torch._inductor.lowering.should_fallback_max_pool2d_with_indices(
+            kernel_size, dilation
+        )
+        or window_size > torch.iinfo(torch.int8).max
+    ):
+        return NotImplemented
+    vals, offsets = prims._low_memory_max_pool2d_with_offsets(
+        x,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        ceil_mode,
+    )
+    indices = prims._low_memory_max_pool2d_offsets_to_indices(
+        offsets,
+        kernel_size[1],
+        x.size(-1),
+        stride,
+        padding,
+    )
+    return vals, indices

.venv/lib/python3.11/site-packages/torch/_inductor/dependencies.py ADDED Viewed

	@@ -0,0 +1,745 @@

+# mypy: allow-untyped-defs
+import abc
+import dataclasses
+import itertools
+import logging
+import re
+import typing
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+from unittest.mock import patch
+import sympy
+import torch
+from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols
+from torch.utils._ordered_set import OrderedSet
+from .codegen.common import index_prevent_reordering
+from .utils import (
+    get_dtype_size,
+    reduction_num_outputs,
+    sympy_index_symbol,
+    sympy_str,
+    sympy_subs,
+    VarRanges,
+)
+from .virtualized import OpsHandler, ReductionType, V
+# Module-level logger named after this module.
+log = logging.getLogger(__name__)
+# Bound `search` of a compiled pattern: returns a match object when the given
+# string contains "indirect" or "tmp", and None otherwise.
+is_indirect = re.compile(r"indirect|tmp").search
+class Dep(abc.ABC):
+    name: str
+    index: sympy.Expr
+    @abc.abstractmethod
+    def rename(self, renames: Dict[str, str]) -> "Dep":
+        pass
+    @abc.abstractmethod
+    def get_numel(self) -> sympy.Expr:
+        pass
+    @abc.abstractmethod
+    def numbytes_hint(self):
+        pass
+    @abc.abstractmethod
+    def has_unbacked_symbols(self) -> bool:
+        pass
+    @abc.abstractmethod
+    def is_contiguous(self) -> bool:
+        pass
+    def normalize_with_stride_order(self, prefix="t"):
+        return self
+@dataclasses.dataclass(frozen=True)
+class MemoryDep(Dep):
+    """Dependency on a buffer accessed through an index expression over loop variables."""
+    # Name of the buffer being accessed.
+    name: str
+    # Index expression, written in terms of `var_names`.
+    index: sympy.Expr
+    # Loop variables; paired position-by-position with `size`.
+    var_names: Tuple[sympy.Symbol, ...]
+    # Range of each loop variable.
+    size: Tuple[sympy.Expr, ...]
+    # Store mode passed through from `store(...)`; None by default (loads never set it).
+    mode: Optional[str] = None
+    def __repr__(self) -> str:
+        return f"MemoryDep({self.name!r}, {self.index}, {self.ranges}, {self.mode})"
+    @property
+    def num_vars(self):
+        """Number of loop variables."""
+        return len(self.var_names)
+    def decide_loop_order_to_match(self, other):
+        """
+        Can return None if not able to decide loop orders.
+        Otherwise returns a list with one entry per stride of ``other``: the
+        position at which the same stride appears in ``self``.
+        """
+        assert self.num_vars == other.num_vars
+        # ignore broadcast for now since broadcast causes extra 0 strides
+        # which makes it hard to decide the correct loop orders.
+        if self.num_vars != len(self.index.free_symbols):
+            return None
+        if other.num_vars != len(other.index.free_symbols):
+            return None
+        # bail out if any size is 0 or 1
+        # For size == 0, it's an empty tensor, any strides for that dimension
+        # are equivalent. Skip for simplicity and it may not matter that much.
+        #
+        # For size == 1, it can cause a tie between the strides of different dimensions.
+        # Also, when we first create the LoopBody in ComputedBuffer.simplify_and_reorder
+        # we call dependencies.index_vars_squeeze, which should already squeeze
+        # the size == 1 dimensions.
+        if any(s == 0 or s == 1 for s in itertools.chain(self.size, other.size)):
+            return None
+        # Extract strides for both expression
+        self_strides = V.graph.sizevars.stride_hints(self.index, self.var_names)
+        other_strides = V.graph.sizevars.stride_hints(other.index, other.var_names)
+        # Even if the shape contains no 0/1, some complex index expression may
+        # still have duplicate stride values. Here is an example:
+        # https://gist.github.com/shunting314/511a7e1ec88aa2e1a8ec85d8445ab129
+        # We don't reorder the loop for these cases for now, but in theory
+        # we could improve the algorithm to detect the correct loop orders.
+        if len(set(self_strides)) != len(self_strides) or len(
+            set(other_strides)
+        ) != len(other_strides):
+            log.debug(
+                "unable to decide loop order. self_dep=%s v.s. other_dep=%s, self_strides=%s v.s. other_strides=%s",
+                self,
+                other,
+                self_strides,
+                other_strides,
+            )
+            return None
+        # May happen if self and other are as follows
+        # MemoryDep('addmm_6', 393216*d0 + 768*d1 + d2, {d0: 16, d1: 512, d2: 768}, None)
+        # MemoryDep('addmm_6', 98304*d0 + d1 + 768*d2, {d0: 64, d1: 768, d2: 128}, None)
+        if set(self_strides) != set(other_strides):
+            return None
+        # Strides are unique at this point, so each stride maps to exactly one position.
+        stride_to_index = {s: i for i, s in enumerate(self_strides)}
+        order = []
+        for s in other_strides:
+            order.append(stride_to_index[s])
+        assert set(order) == set(range(0, self.num_vars))
+        return order
+    def get_offset(self):
+        """
+        Return the offset by setting every variable to be 0.
+        """
+        return sympy_subs(self.index, dict.fromkeys(self.var_names, 0))
+    def normalize(self) -> "MemoryDep":
+        """
+        Normalize by merging loops. The difference from normalize_with_stride_order is
+        that this method does not reorder loops, while normalize_with_stride_order
+        reorders loops based on stride order.
+        """
+        return MemoryDep(
+            self.name,
+            *_RecordLoadStoreInner._normalize(self.index, self.ranges),  # type: ignore[arg-type]
+            self.mode,
+        )
+    def normalize_with_stride_order(self, prefix="t"):
+        r"""
+        Used to decide if two MemoryDep does not equal due to different loop orders.
+        More specifically, when dep1 and dep2 are not equal, we can normalize
+        both and check if they are equal after that. If yes, then the mismatch is
+        caused by different loop orders.
+        """
+        # import here to avoid circular import
+        from torch._inductor import ir
+        strides = V.graph.sizevars.stride_hints(self.index, self.var_names)
+        # pick a loop order with stride ordered decreasingly
+        order = sorted(range(len(strides)), key=strides.__getitem__, reverse=True)
+        stride_reorder = ir.same_reorder(order)
+        sizes = self.size
+        var_names = self.var_names
+        new_reordered_sizes = stride_reorder(sizes)
+        new_reordered_var_names = stride_reorder(var_names)
+        new_simplified_sizes, reindex, prune = V.graph.sizevars._simplify_loops(
+            new_reordered_var_names,
+            new_reordered_sizes,
+            index_prevent_reordering(
+                [self.index], new_reordered_var_names, new_reordered_sizes
+            ),
+        )
+        # now let's create new symbols with the passed in prefix
+        var_ranges, add_var = var_builder(prefix)
+        replacement = dict(
+            zip(
+                new_reordered_var_names,
+                reindex([add_var(x) for x in new_simplified_sizes]),
+            )
+        )
+        new_index = sympy_subs(sympy.expand(self.index), replacement)  # type: ignore[arg-type] # next PR
+        # Note: `mode` is not forwarded here, so the result uses the default mode (None).
+        out = MemoryDep(self.name, new_index, tuple(var_ranges.keys()), tuple(var_ranges.values()))  # type: ignore[arg-type]
+        return out
+    @property
+    def ranges(self) -> Dict[sympy.Symbol, sympy.Expr]:
+        """{c0: 128, c1: 512, ...}"""
+        return dict(zip(self.var_names, self.size))
+    def get_numel(self) -> sympy.Expr:
+        """Return the element count: the graph's numel for an indirect dep, else the
+        product of the sizes of the variables that actually appear in the index."""
+        if self.is_indirect():
+            numel = V.graph.get_numel(self.name)
+        else:
+            vars: OrderedSet[sympy.Basic] = OrderedSet(self.index.free_symbols)
+            numel = sympy.Integer(1)
+            for var, size in zip(self.var_names, self.size):
+                if var in vars:
+                    numel = numel * size
+        return numel  # type: ignore[return-value]
+    def rename(self, renames: Dict[str, str]) -> "MemoryDep":
+        """Return a copy with the buffer name mapped through ``renames``, or ``self`` if unmapped."""
+        if self.name in renames:
+            return MemoryDep(
+                renames[self.name],
+                self.index,
+                var_names=self.var_names,
+                size=self.size,
+                mode=self.mode,
+            )
+        return self
+    def numbytes_hint(self):
+        """Size hint of the element count multiplied by the buffer's dtype size."""
+        return V.graph.sizevars.size_hint(self.get_numel()) * get_dtype_size(
+            V.graph.get_dtype(self.name)
+        )
+    def has_unbacked_symbols(self):
+        """Whether the element count contains any free unbacked symbol."""
+        return len(free_unbacked_symbols(self.get_numel())) > 0
+    def is_contiguous(self) -> bool:
+        """True only when the index is exactly one of the loop variables."""
+        return isinstance(self.index, sympy.Symbol) and self.index in self.var_names
+    def stride1_for_last_dim(self, result_for_complex_expression=True) -> bool:
+        """
+        Whether the stride for the last dimension is 1.
+        ``result_for_complex_expression`` is returned when no term of the index
+        settles the question either way.
+        """
+        # python test/inductor/test_torchinductor_opinfo.py -k test_comprehensive_masked_scatter_cuda_float16
+        # will exercise thru this corner case.
+        if len(self.var_names) == 0:
+            return True
+        terms = self.index.args if isinstance(self.index, sympy.Add) else [self.index]
+        last_sym = self.var_names[-1]
+        for term in terms:
+            if term is last_sym:
+                return True
+            # Having a >1 stride for the last dimension is bad for perf
+            # return False.
+            if (
+                isinstance(term, sympy.Mul)
+                and len(term.args) == 2
+                and term.args[1] is last_sym
+                and isinstance(term.args[0], (int, sympy.Integer))
+                and term.args[0] > 1
+            ):
+                return False
+        return result_for_complex_expression
+    def is_scalar(self) -> bool:
+        """True for an integer index, or for a lone symbol that is neither a loop variable nor indirect."""
+        if isinstance(self.index, sympy.Symbol):
+            return self.index not in self.var_names and not self.is_indirect()
+        return isinstance(self.index, (int, sympy.Integer))
+    def is_indirect(self) -> bool:
+        """True when any free symbol of the index has a name matching the module-level ``is_indirect`` pattern."""
+        return any(is_indirect(v.name) for v in self.index.free_symbols)  # type: ignore[attr-defined]
+@dataclasses.dataclass(frozen=True)
+class StarDep(Dep):
+    name: str
+    mode: Optional[str] = None
+    # depends on the entire buffer
+    @property
+    def index(self):
+        raise NotImplementedError("StarDep does not have an index")
+    def get_numel(self) -> sympy.Expr:
+        return V.graph.get_numel(self.name)  # type: ignore[return-value]
+    def rename(self, renames: Dict[str, str]) -> "StarDep":
+        if self.name in renames:
+            return StarDep(renames[self.name], self.mode)
+        return self
+    def numbytes_hint(self):
+        return V.graph.sizevars.size_hint(self.get_numel()) * get_dtype_size(
+            V.graph.get_dtype(self.name)
+        )
+    def has_unbacked_symbols(self):
+        return len(free_unbacked_symbols(self.get_numel())) > 0
+    def is_contiguous(self) -> bool:
+        return False
+    def is_scalar(self) -> bool:
+        return False
+    def is_indirect(self) -> bool:
+        return False
+# Used for tracking mutation ordering
+# if A reads a buffer and B mutates it
+# B must be ordered after A
+#
+# This is useful for a variety of reasons.
+# For example, if A's read is never actually used, we can eliminate it.
+# Another case is if A's buffer ends up being fused away, we never need to
+# materialize that buffer
+@dataclasses.dataclass(frozen=True)
+class WeakDep(Dep):
+    # Fake dependency on unused buffer
+    name: str
+    # Buffer that is doing the mutation
+    mutating_buf: str
+    @property
+    def index(self):
+        raise NotImplementedError("WeakDep does not have an index")
+    def get_numel(self) -> sympy.Expr:
+        return sympy.Integer(1)
+    def rename(self, renames: Dict[str, str]) -> "WeakDep":
+        if self.name in renames:
+            return WeakDep(renames[self.name], self.mutating_buf)
+        return self
+    def numbytes_hint(self):
+        return 1  # Purely inserted for ordering, not an actual dep
+    def has_unbacked_symbols(self):
+        return False
+    def is_contiguous(self) -> bool:
+        return False
+@dataclasses.dataclass(frozen=True)
+class IndexExprDep:
+    # Immutable record of an index expression together with the loop variables
+    # and sizes it was canonicalized against (created from `index_expr` calls
+    # in `_RecordLoadStoreInner`).
+    index: sympy.Expr  # type: ignore[assignment]
+    # Loop variables, paired position-by-position with `size`.
+    var_names: Tuple[sympy.Symbol, ...]
+    size: Tuple[sympy.Expr, ...]
+@dataclasses.dataclass
+class ReadWrites:
+    reads: OrderedSet[Dep]
+    writes: OrderedSet[Dep]
+    index_exprs: OrderedSet[IndexExprDep]
+    range_vars: Optional[List[sympy.Expr]] = None
+    var_ranges: Optional[VarRanges] = None
+    def rename(self, renames: typing.Dict[str, str]) -> "ReadWrites":
+        return ReadWrites(
+            OrderedSet(dep.rename(renames) for dep in self.reads),
+            OrderedSet(dep.rename(renames) for dep in self.writes),
+            self.index_exprs,
+            self.range_vars,
+            self.var_ranges,
+        )
+    def with_read(self, dep: Union[Dep, Set[Dep]]) -> "ReadWrites":
+        assert isinstance(dep, (WeakDep, StarDep, set))
+        if not isinstance(dep, set):
+            dep = {dep}
+        return ReadWrites(
+            OrderedSet.union(self.reads, dep),
+            self.writes,
+            self.index_exprs,
+            self.range_vars,
+            self.var_ranges,
+        )
+    def merge(self, other: "ReadWrites"):
+        reads = OrderedSet.union(self.reads, other.reads)
+        writes = OrderedSet.union(self.writes, other.writes)
+        index_exprs = OrderedSet.union(self.index_exprs, other.index_exprs)
+        return ReadWrites(reads - writes, writes, index_exprs)
+    @staticmethod
+    def merge_list(read_writes: List["ReadWrites"]):
+        all_writes = OrderedSet.union(*[rw.writes for rw in read_writes])
+        all_reads = OrderedSet.union(*[rw.reads for rw in read_writes]) - all_writes
+        all_index_exprs = OrderedSet.union(*[rw.index_exprs for rw in read_writes])
+        return ReadWrites(all_reads, all_writes, all_index_exprs)
+    def remove_reads(self, rem_reads):
+        return ReadWrites(
+            self.reads - rem_reads,
+            self.writes,
+            self.index_exprs,
+            self.range_vars,
+            self.var_ranges,
+        )
+    def reads_and_writes(self):
+        return itertools.chain(self.reads, self.writes)
+    def buffer_names(self, ignore_integer_index=True):
+        """
+        Integer index is used for load_seed.
+        """
+        names: OrderedSet[str] = OrderedSet()
+        for dep in self.reads_and_writes():
+            if not isinstance(dep, MemoryDep):
+                continue
+            if not ignore_integer_index or not isinstance(
+                dep.index, (int, sympy.Integer)
+            ):
+                names.add(dep.name)
+        return names
+class _RecordLoadStoreInner(V.MockHandler):  # type: ignore[name-defined]
+    """Ops handler that records a dependency for each load / store / index_expr /
+    bucketize call and returns a string describing the call."""
+    def __init__(self, var_ranges: VarRanges, normalize: bool) -> None:
+        super().__init__()
+        self._reads: OrderedSet[Dep] = OrderedSet()
+        self._writes: OrderedSet[MemoryDep] = OrderedSet()
+        self._index_exprs: OrderedSet[IndexExprDep] = OrderedSet()
+        self._var_ranges: VarRanges = var_ranges
+        # When True, `canonicalize` goes through `_normalize`.
+        self._should_normalize: bool = normalize
+    @staticmethod
+    def drop_unused_symbols(index, var_names, sizes):
+        """
+        Reduction has last (reduced) dim in its sizes, but
+        downstream users won't.  Normalize this away.
+        Mutates ``var_names`` and ``sizes`` in place by popping trailing
+        entries whose variable does not appear in ``index``.
+        """
+        if not isinstance(index, sympy.Expr):
+            # index can be an int
+            return
+        free_symbols = index.free_symbols
+        while var_names and var_names[-1] not in free_symbols:
+            var_names.pop()
+            sizes.pop()
+    @classmethod
+    def _normalize(
+        cls, index: sympy.Expr, var_ranges: VarRanges
+    ) -> Tuple[sympy.Expr, Tuple[sympy.Symbol, ...], Tuple[sympy.Expr, ...]]:
+        """Simplify the loops of ``index`` and rewrite it over freshly numbered
+        variables; returns ``(index, var_names, sizes)``."""
+        # Try to further simplify the indexes even if simplify_loops didn't
+        # convert it to the simplest form because of the interference from
+        # different indexing formulas.
+        index_vars = [*var_ranges.keys()]
+        sizes = tuple(var_ranges.values())  # type: ignore[assignment]
+        new_sizes, reindex, prune = V.graph.sizevars._simplify_loops(
+            index_vars,
+            sizes,
+            index_prevent_reordering([index], index_vars, sizes),
+        )
+        # assign new variables each dimension to deal with numbering mismatches
+        # d0, d1, d2 could become d0, d2 -- which won't match d0, d1
+        new_vars, add_var = var_builder(canonicalization_prefix())
+        replacement = dict(zip(index_vars, reindex([add_var(x) for x in new_sizes])))
+        index = sympy_subs(sympy.expand(index), replacement)
+        # Convert to mutable lists because drop_unused_symbols pops in place.
+        new_vars = [*new_vars.keys()]
+        new_sizes = [*new_sizes]
+        cls.drop_unused_symbols(index, new_vars, new_sizes)
+        return index, tuple(new_vars), tuple(new_sizes)  # type: ignore[arg-type]
+    def canonicalize(
+        self, index: sympy.Expr
+    ) -> Tuple[sympy.Expr, Tuple[sympy.Symbol, ...], Tuple[sympy.Expr, ...]]:
+        """Return ``(index, var_names, sizes)`` for ``index``, normalized when the
+        handler was created with ``normalize=True``."""
+        if not self._should_normalize:
+            sizes = [V.graph.sizevars.simplify(x) for x in self._var_ranges.values()]
+            # Without normalization, only size-1 variables and unused trailing variables are dropped.
+            var_names = [k for k, v in zip(self._var_ranges.keys(), sizes) if v != 1]
+            sizes = [v for v in sizes if v != 1]
+            self.drop_unused_symbols(index, var_names, sizes)
+            return index, tuple(var_names), tuple(sizes)  # type: ignore[return-value, arg-type]
+        var_ranges = {
+            k: V.graph.sizevars.simplify(v)
+            for k, v in self._var_ranges.items()
+            # TODO(jansel): explore this further normalization
+            # if k in free_symbols
+        }
+        return self._normalize(index, var_ranges)
+    def load(self, name: str, index: sympy.Expr) -> str:
+        """Record a read of ``name`` at ``index``."""
+        self._reads.add(MemoryDep(name, *self.canonicalize(index)))
+        return f"load({name}, {sympy_str(index)})"
+    def load_seed(self, name: str, index: int):
+        """Record a read of ``name`` at a constant integer index."""
+        assert isinstance(index, int)
+        return self.load(name, sympy.Integer(index))
+    def store(self, name: str, index: sympy.Expr, value: str, mode=None) -> str:
+        """Record a write to ``name`` at ``index`` with the given ``mode``."""
+        self._writes.add(MemoryDep(name, *self.canonicalize(index), mode=mode))
+        return f"store({name}, {sympy_str(index)}, {value}, {mode})"
+    def store_reduction(self, name: str, index, value) -> str:
+        """Record a reduction store as an ordinary write with the default mode."""
+        return self.store(name, index, f"store_reduction({value})")
+    def index_expr(self, index: sympy.Expr, dtype) -> str:
+        """Record the use of ``index`` as an index expression."""
+        self._index_exprs.add(IndexExprDep(*self.canonicalize(index)))
+        return f"index_expr({sympy_str(index)}, {dtype})"
+    def bucketize(
+        self,
+        values,
+        offsets_name: str,
+        offsets_size: sympy.Expr,
+        indexing_dtype: torch.dtype,
+        right: bool,
+    ):
+        """Record a whole-buffer (StarDep) read of the offsets buffer."""
+        self._reads.add(StarDep(offsets_name))
+        return f"bucketize({values}, {offsets_name}, {sympy_str(offsets_size)}, {indexing_dtype}, {right})"
+class RecordLoadStore(V.KernelFormatterHandler):  # type: ignore[name-defined]
+    def __init__(self, var_ranges: VarRanges, normalize: bool) -> None:
+        parent_handler = _RecordLoadStoreInner(
+            var_ranges=var_ranges, normalize=normalize
+        )
+        super().__init__(parent_handler=parent_handler)
+# TODO: check call sites
+def var_builder(prefix: str) -> Tuple[VarRanges, Callable[[sympy.Expr], sympy.Symbol]]:
+    cnt = itertools.count()
+    var_ranges: VarRanges = {}
+    def add_var(length: sympy.Expr) -> sympy.Symbol:
+        v = sympy_index_symbol(f"{prefix}{next(cnt)}")
+        var_ranges[v] = length
+        return v
+    return var_ranges, add_var
+def index_vars_no_squeeze(*argsizes: Tuple[sympy.Expr, ...], prefix: str):
+    var_ranges, add_var = var_builder(prefix)
+    args: List[List[sympy.Symbol]] = []
+    for size in argsizes:
+        args.append(list(map(add_var, size)))
+    return args, var_ranges
+def index_vars_squeeze(*argsizes: Tuple[sympy.Expr, ...], prefix: str = "d"):
+    from .ir import SqueezeView
+    var_ranges, add_var = var_builder(prefix)
+    args: List[List[sympy.Expr]] = []
+    new_sizes: List[List[sympy.Expr]] = []
+    for size in argsizes:
+        new_size, reindex = SqueezeView.squeezer(size)
+        new_sizes.append(new_size)
+        args.append(reindex(list(map(add_var, new_size))))
+    return args, var_ranges
+def extract_read_writes(
+    fn: Callable[..., Any],
+    *argsizes: Tuple[sympy.Expr, ...],
+    normalize: bool = False,
+    prefix: str = "d",
+    hidden_args=(),
+):
+    """Collect the reads, writes and index expressions performed by ``fn``.
+    Index variables are created for ``argsizes`` with ``index_vars_squeeze``.
+    If ``fn`` is a ``LoopBody`` its recorded memory usage is replayed into a
+    recorder directly; otherwise ``fn`` is called under a ``RecordLoadStore``
+    ops handler. ``hidden_args`` are appended to the generated index arguments.
+    Returns a ``ReadWrites``; its ``range_vars`` is empty when ``normalize`` is true.
+    """
+    args, var_ranges = index_vars_squeeze(*argsizes, prefix=prefix)
+    from .loop_body import LoopBody, MemoryUsageType
+    if isinstance(fn, LoopBody):
+        # Fast path to avoid tracing when we already have a LoopBody
+        inner = _RecordLoadStoreInner(var_ranges=var_ranges, normalize=normalize)
+        name_to_index = fn.indexing_from_args([*args, *hidden_args])
+        if fn.indirect_vars:
+            # mimic the `tmpX` naming tracing gives us
+            repl = {v: sympy.Symbol(f"tmp{i}") for i, v in enumerate(fn.indirect_vars)}
+            name_to_index = {k: sympy_subs(v, repl) for k, v in name_to_index.items()}
+        # Replay each recorded memory-usage entry; value/dtype arguments the
+        # recorder only uses for its string result are passed as None.
+        for entry in fn.memory_usage[MemoryUsageType.LOAD]:
+            inner.load(entry.buffer_name, name_to_index[entry.index_name])
+        for entry in fn.memory_usage[MemoryUsageType.LOAD_SEED]:
+            inner.load_seed(entry.buffer_name, int(name_to_index[entry.index_name]))
+        for entry in fn.memory_usage[MemoryUsageType.STORE]:
+            inner.store(
+                entry.buffer_name, name_to_index[entry.index_name], None, entry.mode
+            )
+        for entry in fn.memory_usage[MemoryUsageType.STORE_REDUCTION]:
+            inner.store_reduction(
+                entry.buffer_name, name_to_index[entry.index_name], None
+            )
+        for entry in fn.memory_usage[MemoryUsageType.INDEX_EXPR]:
+            inner.index_expr(name_to_index[entry.index_name], None)
+        for entry in fn.memory_usage[MemoryUsageType.BUCKETIZE]:
+            inner.bucketize(
+                None, entry.buffer_name, name_to_index[entry.index_name], None, None
+            )
+        # fn.memory_usage[MemoryUsageType.CHECK_BOUNDS] intentionally skipped
+    else:
+        # Slow path tracing the function
+        rw = RecordLoadStore(var_ranges, normalize=normalize)
+        with V.set_ops_handler(rw):
+            fn(*args, *hidden_args)
+        inner = rw.parent_handler
+    if normalize:
+        range_vars = []  # Number of vars could differ due to normalization
+    else:
+        # Flatten the per-argument variable lists into one list.
+        range_vars = [*itertools.chain.from_iterable(args)]
+    return ReadWrites(
+        OrderedSet(inner._reads),
+        OrderedSet(inner._writes),
+        inner._index_exprs,
+        range_vars,
+        var_ranges,
+    )
+def extract_input_node_reduction_ranges(
+    input_node: "torch._inductor.ir.TensorBox",
+) -> Tuple[Optional[List[sympy.Expr]], Optional[List[sympy.Expr]]]:
+    """
+    Returns the size and reduction size of all inputs, if the sizes and reduction_sizes (if exist) are all the same.
+    It's possible that a node has multiple inputs, some are Reduction nodes and others are Pointwise nodes.
+    In this case, reduction_sizes of the Reduction nodes need to be the same.
+    Otherwise returns (None, None).
+    The result is a ``(size, reduction_size)`` pair. For an unrealized input the
+    reads are followed level by level through their defining ops until a
+    ComputedBuffer with a non-empty reduction size is found.
+    """
+    from .ir import ComputedBuffer, Loops
+    if isinstance(input_node.data, ComputedBuffer):
+        # Input node has already been realized. Return its size and reduction_size.
+        size = input_node.get_size()
+        reduction_size = input_node.get_reduction_size()
+        if len(reduction_size) > 0:
+            return (size, reduction_size)
+        else:
+            # Realized, but not a reduction: nothing to report.
+            return (None, None)
+    if not isinstance(input_node.data.data, Loops):  # type: ignore[attr-defined]
+        # Other IRNodes do not have reduction_ranges.
+        return (None, None)
+    # There is one issue: what if there are views / permutations between the input node and its dependent realized nodes?
+    # The current method still uses reduction ranges from the dependent realized node, which is not ideal.
+    # Is there a way to check whether there are permutations inbetween?
+    reads = input_node.get_reads()
+    reduction_size = None
+    size = None
+    # Each pass of the while loop inspects one "level" of reads; it ends as soon
+    # as a reduction has been found or there is nothing left to follow.
+    while reduction_size is None and len(reads) > 0:
+        # Buffer names already handled in this level (reset for every level).
+        seen: OrderedSet[str] = OrderedSet()
+        # Reads of the non-reduction producers, to be inspected in the next level.
+        new_reads = []
+        for read in reads:
+            if not isinstance(read, MemoryDep):
+                continue
+            if read.name in seen:
+                continue
+            seen.add(read.name)
+            buffer = V.graph.try_get_buffer(read.name)
+            if buffer is None:
+                continue
+            op = buffer.get_defining_op()
+            if op is None:
+                continue
+            if isinstance(op, ComputedBuffer) and len(op.get_reduction_size()) > 0:
+                if reduction_size is None:
+                    # First reduction producer found: remember its ranges.
+                    reduction_size = op.get_reduction_size()
+                    size = op.get_size()
+                elif reduction_size != op.get_reduction_size() or size != op.get_size():
+                    # Two reduction producers disagree, so there is no single answer.
+                    return (None, None)
+            else:
+                # Not a reduction: keep following this producer's own reads.
+                new_reads.extend(op.get_reads())
+        if reads == new_reads:
+            # The set of reads did not change, so another pass cannot make progress.
+            return (size, reduction_size)
+        else:
+            reads = new_reads
+    return (size, reduction_size)
+def canonicalization_prefix():
+    return "c"
+# ops handler which computes all the free unbacked symbols for an IR
+class FreeUnbackedSymbolsOpsHandler:
+    symbols: OrderedSet[sympy.Symbol]
+    def __init__(self) -> None:
+        self.symbols = OrderedSet()
+    def __getattr__(self, name: str) -> Callable[..., Any]:
+        def inner(*args, **kwargs):
+            for a in itertools.chain(args, kwargs.values()):
+                if isinstance(a, (sympy.Expr, sympy.logic.boolalg.Boolean)):
+                    self.symbols |= free_unbacked_symbols(a)
+        return inner
+    def indirect_indexing(
+        self, index_var, size, check=True, wrap_neg=True
+    ) -> sympy.Symbol:
+        assert not isinstance(index_var, (sympy.Expr, sympy.logic.boolalg.Boolean))
+        self.symbols |= free_unbacked_symbols(size)
+        return sympy_index_symbol(f"({str(index_var)})")
+    def frexp(self, x):
+        return (None,) * 2
+    def scan(self, dtypes, combine_fn, values):
+        return (None,) * len(values)
+    def sort(self, dtypes, values, stable, descending):
+        return (None,) * len(values)
+    def reduction(
+        self,
+        dtype: torch.dtype,
+        src_dtype: torch.dtype,
+        reduction_type: ReductionType,
+        value: Union[None, Tuple[None, ...]],
+    ) -> Union[None, Tuple[None, ...]]:
+        num_values = reduction_num_outputs(reduction_type)
+        return (None,) * num_values if num_values > 1 else None
+def _typecheck_FreeUnbackedSymbolsOpsHandler(
+    h: FreeUnbackedSymbolsOpsHandler,
+) -> OpsHandler[None]:
+    return h
+def extract_free_unbacked_symbols(fn: Callable[..., Any], index, rindex=None):
+    from .ir import FlexibleLayout
+    args = [index, rindex] if rindex is not None else [index]
+    handler = FreeUnbackedSymbolsOpsHandler()
+    # NB: I cargo culted the allow_indexing patch here, I don't understand why
+    # people do this all over
+    with V.set_ops_handler(handler), patch.object(
+        FlexibleLayout, "allow_indexing", True
+    ):
+        fn(*args)
+    return handler.symbols

.venv/lib/python3.11/site-packages/torch/_inductor/exc.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+import os
+import tempfile
+import textwrap
+from functools import lru_cache
+if os.environ.get("TORCHINDUCTOR_WRITE_MISSING_OPS") == "1":
+    @lru_cache(None)
+    def _record_missing_op(target):
+        with open(f"{tempfile.gettempdir()}/missing_ops.txt", "a") as fd:
+            fd.write(str(target) + "\n")
+else:
+    def _record_missing_op(target):  # type: ignore[misc]
+        pass
+class OperatorIssue(RuntimeError):
+    @staticmethod
+    def operator_str(target, args, kwargs):
+        lines = [f"target: {target}"] + [
+            f"args[{i}]: {arg}" for i, arg in enumerate(args)
+        ]
+        if kwargs:
+            lines.append(f"kwargs: {kwargs}")
+        return textwrap.indent("\n".join(lines), "  ")
+class MissingOperatorWithoutDecomp(OperatorIssue):
+    def __init__(self, target, args, kwargs) -> None:
+        _record_missing_op(target)
+        super().__init__(f"missing lowering\n{self.operator_str(target, args, kwargs)}")
+class MissingOperatorWithDecomp(OperatorIssue):
+    def __init__(self, target, args, kwargs) -> None:
+        _record_missing_op(target)
+        super().__init__(
+            f"missing decomposition\n{self.operator_str(target, args, kwargs)}"
+            + textwrap.dedent(
+                f"""
+                There is a decomposition available for {target} in
+                torch._decomp.get_decompositions().  Please add this operator to the
+                `decompositions` list in torch._inductor.decomposition
+                """
+            )
+        )
+class LoweringException(OperatorIssue):
+    def __init__(self, exc: Exception, target, args, kwargs) -> None:
+        super().__init__(
+            f"{type(exc).__name__}: {exc}\n{self.operator_str(target, args, kwargs)}"
+        )
+class SubgraphLoweringException(RuntimeError):
+    pass
+class InvalidCxxCompiler(RuntimeError):
+    def __init__(self) -> None:
+        from . import config
+        super().__init__(
+            f"No working C++ compiler found in {config.__name__}.cpp.cxx: {config.cpp.cxx}"
+        )
+class CppWrapperCodeGenError(RuntimeError):
+    def __init__(self, msg: str) -> None:
+        super().__init__(f"C++ wrapper codegen error: {msg}")
+class CppCompileError(RuntimeError):
+    def __init__(self, cmd: list[str], output: str) -> None:
+        if isinstance(output, bytes):
+            output = output.decode("utf-8")
+        super().__init__(
+            textwrap.dedent(
+                """
+                    C++ compile error
+                    Command:
+                    {cmd}
+                    Output:
+                    {output}
+                """
+            )
+            .strip()
+            .format(cmd=" ".join(cmd), output=output)
+        )
+class CUDACompileError(CppCompileError):
+    pass

.venv/lib/python3.11/site-packages/torch/_inductor/extern_node_serializer.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import json
+from typing import List
+from torch._export.serde.aoti_schema import ExternKernelNode, ExternKernelNodes, Node
+from torch._export.serde.serialize import _dataclass_to_dict, EnumEncoder
+from torch._inductor.ir import ExternKernelNode as inductor_ExternKernelNode
+def serialize_extern_kernel_node(
+    extern_kernel_node: inductor_ExternKernelNode,
+) -> ExternKernelNode:
+    assert isinstance(extern_kernel_node.node, Node)
+    return ExternKernelNode(
+        name=extern_kernel_node.name,
+        node=extern_kernel_node.node,
+    )
+def extern_node_json_serializer(
+    extern_kernel_nodes: List[inductor_ExternKernelNode],
+) -> str:
+    serialized_nodes = ExternKernelNodes(
+        nodes=[serialize_extern_kernel_node(node) for node in extern_kernel_nodes]
+    )
+    return json.dumps(_dataclass_to_dict(serialized_nodes), cls=EnumEncoder)

.venv/lib/python3.11/site-packages/torch/_inductor/freezing.py ADDED Viewed

	@@ -0,0 +1,269 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+import itertools
+import logging
+import weakref
+from typing import Any, List, Optional, Tuple
+import torch
+import torch.utils._pytree as pytree
+from torch._dynamo.utils import dynamo_timed, lazy_format_graph_code
+from torch._functorch.aot_autograd import MutationType
+from torch._functorch.compile_utils import fx_graph_cse
+from torch._inductor.constant_folding import constant_fold, replace_node_with_constant
+from torch._inductor.fx_passes.freezing_patterns import freezing_passes
+from torch._inductor.fx_passes.post_grad import view_to_reshape
+from . import config
+# Short aliases for the operator namespaces referenced throughout this module.
+aten = torch.ops.aten
+prims = torch.ops.prims
+# Module-level logger, named after this module.
+log = logging.getLogger(__name__)
+def replace_params_with_constants(
+    gm: torch.fx.GraphModule,
+    flat_params: list[Any],
+    fw_metadata: torch._functorch.aot_autograd.ViewAndMutationMeta,
+) -> List[int]:
+    """
+    Replaces the parameters of a PyTorch GraphModule with constants wherever possible.
+    Returns a list of indices representing the input parameters that were not converted to constants.
+    """
+    params = gm.graph.find_nodes(op="placeholder")
+    fake_inp_nodes = params[: len(params)]
+    preserved_arg_indices = []
+    aliased_input_args = [
+        out_info.base_idx
+        for out_info in fw_metadata.output_info
+        if out_info.base_idx is not None
+    ]
+    # TODO (tmanlaibaatar) figure out why this is different
+    # from mutated_inp_runtime_indices
+    mutated_inps = [
+        i
+        for i, m in enumerate(fw_metadata.input_info)
+        if m.mutation_type
+        in (MutationType.MUTATED_IN_GRAPH, MutationType.MUTATED_OUT_GRAPH)
+    ]
+    for i, (real_input, node) in enumerate(zip(flat_params, fake_inp_nodes)):
+        if i in mutated_inps or i in aliased_input_args:
+            preserved_arg_indices.append(i)
+            continue
+        replace_node_with_constant(gm, node, real_input)
+    # add on non param inputs
+    preserved_arg_indices.extend(range(len(flat_params), len(params)))
+    # is this necessary ?
+    gm.recompile()
+    return preserved_arg_indices
+def freeze(
+    dynamo_gm: torch.fx.GraphModule,
+    aot_autograd_gm: torch.fx.GraphModule,
+    example_inputs: List[torch._subclasses.FakeTensor],
+) -> Tuple[torch.fx.GraphModule, List[int]]:
+    """
+    Inlines parameters that are not mutated into constants and optimizes the graph through constant propagation
+    and other techniques. If enabled, the function also discards the original parameters of the module for memory efficiency.
+    Assumes that this function is run in dynamo tracing post aot_autograd.
+    Args:
+        dynamo_gm (torch.fx.GraphModule): The Dynamo constructed GraphModule.
+        aot_autograd_gm (torch.fx.GraphModule): The aot_autograd constructed GraphModule to be frozen.
+        example_inputs (List[torch.Tensor]): A list of example input tensors to be used in the freezing process.
+    Returns:
+        Tuple[torch.fx.GraphModule, List[int]]: A tuple containing the frozen GraphModule and a list of indices
+        of the inputs that were preserved (not turned into constants).
+    """
+    # We have convert conv's weight to channels last which may meet error for .view
+    # when doing fake_tensor_prop. So we need to convert view to reshape first.
+    # See the details in fx_codegen_and_compile of compile_fx.py.
+    view_to_reshape(aot_autograd_gm)
+    if tracing_context := torch._guards.TracingContext.try_get():
+        fw_metadata = tracing_context.fw_metadata
+        params_flat = tracing_context.params_flat
+        assert fw_metadata is not None and params_flat is not None
+        preserved_arg_indices = replace_params_with_constants(
+            aot_autograd_gm, params_flat, fw_metadata
+        )
+    else:
+        inputs = aot_autograd_gm.graph.find_nodes(op="placeholder")
+        preserved_arg_indices = list(range(len(inputs)))
+    # TODO - further restrict cse ? right now needed to dedup aliasing ops
+    cse_graph = fx_graph_cse(aot_autograd_gm.graph)
+    aot_autograd_gm.graph = cse_graph
+    aot_autograd_gm.recompile()
+    aot_example_inputs = [example_inputs[ind] for ind in preserved_arg_indices]
+    freezing_passes(aot_autograd_gm, aot_example_inputs)
+    constant_fold(aot_autograd_gm)
+    # invalidate nn Modules
+    if config.freezing_discard_parameters:
+        invalidate_eager_modules()
+        discard_traced_gm_params(dynamo_gm)
+    log.debug(
+        "%s", lazy_format_graph_code("FROZEN GRAPH", aot_autograd_gm, colored=True)
+    )
+    return aot_autograd_gm, preserved_arg_indices
+class ErasedTensor(torch.Tensor):
+    @staticmethod
+    def __new__(cls, elem, name, owning_mod):
+        return super().__new__(cls, elem.to(device="meta"))
+    def __init__(self, elem, name: Optional[str], mod) -> None:
+        self.erased_name = name
+        self.owning_mod_ref = weakref.ref(mod)
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        erased_tensors = [
+            e
+            for e in pytree.arg_tree_leaves(*args, **kwargs)
+            if isinstance(e, ErasedTensor)
+        ]
+        assert len(erased_tensors) > 0
+        e = erased_tensors[0]
+        raise RuntimeError(
+            f"Trying to run Pytorch Eager Module after Dynamo Freezing. "
+            "The original parameters have been discarded for memory efficiency. "
+            f"Found in op {func} for erased parameter {e.erased_name} of {e.owning_mod_ref()}"
+        )
+def invalidate_eager_modules():
+    with torch.utils._python_dispatch._disable_current_modes():
+        for (
+            mod
+        ) in torch._guards.TracingContext.get().module_context.nn_modules.values():
+            if not isinstance(mod, torch.nn.Module):
+                continue
+            for attr_name, tensor in list(
+                itertools.chain(
+                    mod.named_parameters(recurse=False),
+                    mod.named_buffers(recurse=False),
+                )
+            ):
+                with torch._dispatch.python.no_python_dispatcher():
+                    e_t = ErasedTensor(tensor, attr_name, mod)
+                if isinstance(tensor, torch.nn.Parameter):
+                    e_t.requires_grad_(True)
+                    e_t._is_param = True  # type: ignore[attr-defined]
+                setattr(mod, attr_name, e_t)
+def discard_traced_gm_params(mod: torch.fx.GraphModule):
+    with torch.utils._python_dispatch._disable_current_modes():
+        for attr_name, tensor in list(
+            itertools.chain(
+                mod.named_parameters(recurse=False), mod.named_buffers(recurse=False)
+            )
+        ):
+            with torch._dispatch.python.no_python_dispatcher():
+                e_t = ErasedTensor(tensor, attr_name, mod)
+            if isinstance(tensor, torch.nn.Parameter):
+                e_t.requires_grad_(True)
+                e_t._is_param = True  # type: ignore[attr-defined]
+            setattr(mod, attr_name, e_t)
+def enforce_output_layout(gm: torch.fx.GraphModule):
+    """
+    Make sure the output node's layout does not change due to compiler optimizations
+    by adding aten.as_strided nodes with the expected strides.
+    Only used for inference so we can assume all graph outputs are model outputs.
+    """
+    *_, output_node = gm.graph.nodes
+    out_list = output_node.args[0]
+    with gm.graph.inserting_before(output_node):
+        for n in out_list:
+            if not isinstance(
+                n.meta["val"], torch.Tensor
+            ) or not torch._prims_common.is_non_overlapping_and_dense(n.meta["val"]):
+                continue
+            # add a node to enforce eager layout
+            ft = n.meta["val"]
+            new_node = gm.graph.call_function(
+                prims.inductor_force_stride_order.default, (n, ft.stride())
+            )
+            # can not call
+            # n.replace_all_uses_with(new_node)
+            # since it will replace the usage of n in new_node itself.
+            output_node.replace_input_with(n, new_node)
+    gm.graph.lint()
+    gm.recompile()
+def enforce_as_strided_input_layout(gm: torch.fx.GraphModule):
+    """
+    Make sure the as_strided node's input's layout does not change due to compiler
+    optimizations, because the as_strided strides info depends on input tensor stride info.
+    """
+    as_strided_ops = [
+        torch.ops.aten.as_strided.default,
+        torch.ops.aten.as_strided_.default,
+        torch.ops.aten.as_strided_scatter.default,
+    ]
+    strided_nodes = [n for n in gm.graph.nodes if n.target in as_strided_ops]
+    for n in strided_nodes:
+        with gm.graph.inserting_before(n):
+            # add a node to enforce eager layout
+            ft = n.args[0].meta["val"]
+            new_node = gm.graph.call_function(
+                prims.inductor_force_stride_order.default, (n.args[0], ft.stride())
+            )
+            n.replace_input_with(n.args[0], new_node)
+    gm.graph.lint()
+    gm.recompile()
+def convert_conv_weights_to_channels_last(gm: torch.fx.GraphModule):
+    """
+    Convert 4d convolution weight tensor to channels last format.
+    This pass is performed before freezing so the added nodes can be constant
+    folded by freezing.
+    """
+    with dynamo_timed("convert_conv_weights_to_channels_last"):
+        convs = [n for n in gm.graph.nodes if n.target == aten.convolution.default]
+        for conv in convs:
+            weight_node = conv.args[1]
+            if len(weight_node.meta["val"].size()) != 4 or weight_node.meta[
+                "val"
+            ].is_contiguous(memory_format=torch.channels_last):
+                # not a 4d tensor or already channels last, skip
+                continue
+            with gm.graph.inserting_before(conv):
+                new_node = gm.graph.call_function(
+                    aten.clone.default,
+                    (weight_node,),
+                    {"memory_format": torch.channels_last},
+                )
+                conv.replace_input_with(weight_node, new_node)
+        enforce_as_strided_input_layout(gm)
+        enforce_output_layout(gm)

.venv/lib/python3.11/site-packages/torch/_inductor/fx_utils.py ADDED Viewed

	@@ -0,0 +1,251 @@

+# mypy: allow-untyped-defs
+import operator
+from collections import defaultdict
+from typing import Any, Callable, DefaultDict, Dict, Optional, Tuple, Type
+import sympy
+import torch
+import torch.fx
+from torch.fx.experimental.symbolic_shapes import (
+    compute_unbacked_bindings,
+    rebind_unbacked,
+    statically_known_true,
+    sym_eq,
+)
+from torch.utils import _pytree as pytree
+from torch.utils._pytree import tree_map
+from .virtualized import V
+# Check the pattern: (nn.module, F.function/torch.Tensor.method) matched.
+# Works for length 2 patterns with 1 module and 1 function/method.
+def matches_module_function_pattern(
+    pattern: Tuple[Type[torch.nn.modules.Module], Callable[..., Any]],
+    node: torch.fx.node.Node,
+    modules: Dict[str, torch.nn.modules.Module],
+) -> bool:
+    if len(node.args) == 0:
+        return False
+    if not isinstance(node.args[0], torch.fx.Node) or not isinstance(
+        node, torch.fx.Node
+    ):
+        return False
+    # the first node is call_module
+    if node.args[0].op != "call_module":
+        return False
+    if not isinstance(node.args[0].target, str):
+        return False
+    if node.args[0].target not in modules:
+        return False
+    if type(modules[node.args[0].target]) is not pattern[0]:
+        return False
+    # the second node is call_function or call_method
+    if node.op != "call_function" and node.op != "call_method":
+        return False
+    if node.target != pattern[1]:
+        return False
+    # make sure node.args[0] output is only used by current node.
+    if len(node.args[0].users) > 1:
+        return False
+    return True
+class FakeTensorUpdater:
+    """
+    The main idea here is that it's difficult to maintain accurate fake
+    tensors (our primary form of metadata) for each node in our graph as we
+    transform it.
+    The most reliable way to obtain this information is by rerunning
+    faketensor propagation. However, in general, faketensor propagation is
+    fairly expensive. So, instead we'd like to only rerun faketensor
+    propagation on nodes that have changed.
+    In order to detect which nodes have changed, we first hash its node,
+    target, and argument lists (which are immutable in FX).
+    Then, whenever we call incremental_update, we check which FX nodes have a
+    new hash, and recompute the faketensor metadata for that node. Then, we
+    continue to recursively compute the faketensors for all users until the
+    fake tensors stop changing.
+    """
+    def __init__(self, graph: torch.fx.Graph) -> None:
+        # Hashes (see hash_node) of the nodes whose metadata is considered current.
+        self.processed_hashes = set()
+        self.graph = graph
+        # Every node present at construction time counts as already up to date.
+        for node in self.graph.nodes:
+            self.processed_hashes.add(self.hash_node(node))
+    def hash_node(self, node: torch.fx.Node):
+        # Key made of the node, its target and the identities of its args/kwargs
+        # containers; a node whose args or kwargs object is replaced gets a new key.
+        # todo(chilli): Not a great hash function
+        return (node, node.target, id(node.args), id(node.kwargs))
+    def incremental_update(self):
+        """Recompute ``meta["val"]`` for new or changed nodes and for their users."""
+        # NOTE(review): ``processed`` is never read below.
+        processed = set()
+        # Number of graph nodes whose fake value uses each storage (key None
+        # collects the nodes without a tensor value or storage).
+        existing_storages: DefaultDict[Optional[int], int] = defaultdict(int)
+        for node in self.graph.nodes:
+            existing_storages[get_node_storage(node)] += 1
+        def is_intlist_same(new, old):
+            # True only when the two (possibly symbolic) int lists are known to be equal.
+            return statically_known_true(sym_eq(new, old))
+        def is_fake_tensor_same(new, old):
+            # Structural comparison of two fake values; recurses into lists/tuples.
+            if type(new) != type(old):
+                return False
+            if isinstance(new, (list, tuple)):
+                if len(new) != len(old):
+                    return False
+                return all(
+                    is_fake_tensor_same(new_i, old_i) for new_i, old_i in zip(new, old)
+                )
+            if new is None:
+                return old is None
+            if not isinstance(new, torch.Tensor):
+                # Symbolic scalars: equal only if the shape env can prove it statically.
+                assert isinstance(
+                    new, (torch.SymInt, torch.SymBool, torch.SymFloat)
+                ), f"Unknown type {type(new)} in {self.graph}"
+                return (
+                    new.node.shape_env._maybe_evaluate_static(
+                        sympy.Eq(new.node.expr, old.node.expr)
+                    )
+                    == sympy.true
+                )
+            if not is_intlist_same(new.shape, old.shape) or new.layout != old.layout:
+                return False
+            # Strides and storage offset are only compared for strided tensors.
+            if new.layout == torch.strided and (
+                not is_intlist_same(new.stride(), old.stride())
+                or not statically_known_true(
+                    new.storage_offset() == old.storage_offset()
+                )
+            ):
+                return False
+            if new.device != old.device:
+                return False
+            if get_storage(new) == get_storage(old):
+                return True
+            # This is the case where it returns a completely fresh storage that's used nowhere else.
+            if (
+                existing_storages[get_storage(old)] == 1
+                and get_storage(new) not in existing_storages
+            ):
+                return True
+            return False
+        def should_process_node(node):
+            # node.target for nodes returning true from this function
+            # are called under fake mode and does not work for inductor
+            # lowerings. We check if the node.target is an aten operator
+            # or operator.getitem which is used when returning multiple
+            # tensors from an op.
+            return node.op == "call_function" and (
+                isinstance(node.target, torch._ops.OpOverload)
+                or node.target == operator.getitem
+            )
+        # ids of nodes that must be recomputed because one of their inputs changed.
+        to_process = set()
+        for node in self.graph.nodes:
+            # Skip nodes that are unchanged and were not queued by an updated producer.
+            if (
+                self.hash_node(node) in self.processed_hashes
+                and id(node) not in to_process
+            ):
+                continue
+            if not should_process_node(node):
+                continue
+            is_valid, args, kwargs = get_fake_args_kwargs(node)
+            if not is_valid:
+                # At least one input node has no fake value yet.
+                continue
+            with V.fake_mode:
+                new_fake_tensor = node.target(*args, **kwargs)
+            # Nothing to propagate when the recomputed value matches the stored one.
+            if "val" in node.meta and is_fake_tensor_same(
+                new_fake_tensor, node.meta["val"]
+            ):
+                continue
+            rebind_unbacked(V.fake_mode.shape_env, node, new_fake_tensor)
+            node.meta["val"] = new_fake_tensor
+            if (shape_env := V.fake_mode.shape_env) and (
+                symbol_to_path := compute_unbacked_bindings(shape_env, new_fake_tensor)
+            ):
+                # Refresh the bindings to the new symbols
+                node.meta["unbacked_bindings"] = symbol_to_path
+            existing_storages[get_node_storage(node)] += 1
+            # Users appear later in the node list, so they are revisited by this same loop.
+            to_process.update([id(user) for user in node.users])
+            self.processed_hashes.add(self.hash_node(node))
+def get_storage(t: torch.Tensor) -> int:
+    return t.untyped_storage()._cdata
+def get_node_storage(node: torch.fx.Node) -> Optional[int]:
+    if "val" not in node.meta:
+        return None
+    if not isinstance(node.meta["val"], torch.Tensor):
+        return None
+    if not torch._C._has_storage(node.meta["val"]):
+        return None
+    return get_storage(node.meta["val"])
+def get_fake(x):
+    if isinstance(x, torch.fx.Node):
+        if "val" not in x.meta:
+            return x
+        return x.meta["val"]
+    return x
+def get_fake_args_kwargs(x: torch.fx.Node) -> Tuple[bool, Tuple[Any], Dict[str, Any]]:
+    """
+    First value returns a boolean if any of the input nodes don't have a faketensor.
+    """
+    args, kwargs = tree_map(get_fake, (x.args, x.kwargs))
+    if any(
+        isinstance(a, torch.fx.Node) for a in pytree.arg_tree_leaves(*args, **kwargs)
+    ):
+        return False, args, kwargs
+    return True, args, kwargs
+def is_node_realized(node: torch.fx.Node) -> bool:
+    """Returns true if a node is always realized when lowered to inductor IR.
+    NOTE: This may return some false negatives. e.g. it doesn't
+    handle buffers realized heuristically during lowering, or
+    buffers realized indirectly through view ops.
+    """
+    from torch._inductor.lowering import fallbacks, needs_realized_inputs
+    def is_buffer(node: torch.fx.Node) -> bool:
+        if node.op == "call_function" and node.target is operator.getitem:
+            # For nodes with multiple outputs, we get the fx graph:
+            #     foo = torch.ops.aten.foo(...)
+            #     getitem = foo[0]
+            #     getitem_1 = foo[1]
+            # where we need to check if foo is a fallback kernel
+            return is_buffer(node.args[0])  # type: ignore[arg-type]
+        return node.op in ("placeholder", "output") or node.target in fallbacks
+    if is_buffer(node):
+        return True
+    def realizes_inputs(node: torch.fx.Node) -> bool:
+        return node.op == "output" or node.target in needs_realized_inputs
+    if any(realizes_inputs(user) for user in node.users):
+        return True
+    # Otherwise, assume node isn't realized
+    return False

.venv/lib/python3.11/site-packages/torch/_inductor/graph.py ADDED Viewed

	@@ -0,0 +1,1930 @@

+import functools
+import itertools
+import logging
+import operator
+import os
+import re
+import sys
+import time
+from collections import defaultdict
+from contextlib import contextmanager
+from types import ModuleType
+from typing import (
+    Any,
+    Callable,
+    DefaultDict,
+    Dict,
+    Iterable,
+    List,
+    NoReturn,
+    Optional,
+    Sequence,
+    Tuple,
+    TYPE_CHECKING,
+    Union,
+)
+import sympy
+from sympy import Expr
+import torch
+import torch._logging
+import torch.fx
+from torch import device, Tensor
+from torch._decomp import get_decompositions
+from torch._dynamo.utils import defake, dynamo_timed
+from torch._logging import LazyString, trace_structured
+from torch._prims_common import make_channels_last_strides_for
+from torch._subclasses.fake_tensor import FakeTensor
+from torch.fx import GraphModule
+from torch.fx.experimental._backward_state import BackwardState
+from torch.fx.experimental.sym_node import magic_methods, method_to_operator
+from torch.fx.experimental.symbolic_shapes import (
+    free_unbacked_symbols,
+    has_free_symbols,
+    resolve_unbacked_bindings,
+    RuntimeAssert,
+    ShapeEnv,
+    SymTypes,
+)
+from torch.fx.graph import Graph
+from torch.fx.node import Node
+from torch.utils._mode_utils import no_dispatch
+from torch.utils._ordered_set import OrderedSet
+from torch.utils._sympy.numbers import int_oo
+from . import config, ir
+from .codegen.common import (
+    BackendFeature,
+    DeviceOpOverrides,
+    get_backend_features,
+    get_device_op_overrides,
+    get_wrapper_codegen_for_device,
+    init_backend_registration,
+)
+from .exc import (
+    CppWrapperCodeGenError,
+    LoweringException,
+    MissingOperatorWithDecomp,
+    MissingOperatorWithoutDecomp,
+)
+from .ir import (
+    Constant,
+    FixedLayout,
+    get_device_type,
+    InputBuffer,
+    Pointwise,
+    Reduction,
+    StorageBox,
+    TensorBox,
+    TorchBindObject,
+)
+from .lowering import (
+    FALLBACK_ALLOW_LIST,
+    fallback_handler,
+    fallback_node_due_to_unsupported_type,
+    lowerings,
+    make_fallback,
+    maybe_layout_constraints,
+    needs_realized_inputs,
+    unsupported_output_tensor,
+)
+from .scheduler import BaseSchedulerNode
+from .sizevars import SizeVarAllocator
+from .utils import (
+    convert_shape_to_inductor,
+    gather_origins,
+    get_cloned_parameter_buffer_name,
+    get_sympy_Expr_dtype,
+    maybe_get_suppress_shape_guards_ctx,
+    should_assume_input_aligned,
+)
+from .virtualized import NullHandler, V
+if TYPE_CHECKING:
+    from torch._higher_order_ops.effects import _EffectType
+    from .codegen.wrapper import WrapperCodeGen
+from torch._inductor.codecache import output_code_log
+# Module-level logger, named after this module.
+log = logging.getLogger(__name__)
+# Artifact logger registered under the "perf_hints" artifact name.
+perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints")
+# Short alias for the ATen operator namespace.
+aten = torch.ops.aten
+# Module-wide monotonically increasing counter.
+_post_grad_graph_counter = itertools.count()
+# Inside fbcode the real log_module_code is imported; elsewhere a no-op with a
+# permissive signature takes its place.
+if config.is_fbcode():
+    from torch._inductor.fb.utils import log_module_code
+else:
+    def log_module_code(*args: Any, **kwargs: Any) -> None:
+        pass
+def supported_dtype_of_cpp_wrapper(dtype: torch.device, cuda: bool) -> bool:
+    supported_dtype = {
+        torch.float32,
+        torch.float64,
+        torch.int64,
+        torch.int32,
+        torch.int16,
+        torch.int8,
+        torch.uint8,
+        torch.bool,
+        torch.bfloat16,
+        torch.complex32,
+        torch.complex64,
+        torch.complex128,
+        torch.float16,
+    }
+    if cuda:
+        supported_dtype.add(torch.float8_e4m3fn)
+        supported_dtype.add(torch.float8_e5m2)
+        supported_dtype.add(torch.float8_e4m3fnuz)
+        supported_dtype.add(torch.float8_e5m2fnuz)
+    return dtype in supported_dtype
+def may_get_constant_buffer_dtype(constant_buffer: sympy.Expr) -> Optional[torch.dtype]:
+    assert isinstance(
+        constant_buffer, (sympy.Symbol, sympy.Expr, sympy.core.numbers.Integer)
+    ), "get_constant_buffer_dtype only supports input of sympy.Symbol, sympy.Expr or sympy.core.numbers.Integer"
+    if isinstance(constant_buffer, sympy.core.numbers.Integer):
+        return torch.int64
+    if isinstance(constant_buffer, sympy.Expr):
+        return get_sympy_Expr_dtype(constant_buffer)
+    if constant_buffer.is_integer:
+        return torch.int64
+    elif constant_buffer.is_float:
+        return torch.float32
+    else:
+        return None
+def is_magic_method(op: Any) -> bool:
+    magic_ops = {method_to_operator(m) for m in magic_methods}
+    return op in magic_ops
+def getattr_recursive(
+    obj: GraphModule, target: str
+) -> Union[Tensor, torch._C.ScriptObject, GraphModule]:
+    target_atoms = target.split(".")
+    attr_itr = obj
+    for i, atom in enumerate(target_atoms):
+        if not hasattr(attr_itr, atom):
+            raise RuntimeError(
+                f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}"
+            )
+        attr_itr = getattr(attr_itr, atom)
+    return attr_itr
+def mark_nodes_dislike_padding(
+    g: Graph, user_visible_outputs: Optional[Dict[str, None]]
+) -> None:
+    """
+    Nodes like convolution/convolution_backward want its input to be dense.
+    If we pad their inputs, we result in extra calls to copy kernels!  On the other hand, padding usually helps reduction.
+    The pass finds nodes that dislike padding. These are nodes that can be reached
+    from a convolution/convolution_backward in the backward direction without
+    going thru a reduction.
+    """
+    if not config.comprehensive_padding:
+        return
+    ops_dislike_padding = {
+        aten.convolution,
+        aten.convolution_backward,
+    }
+    # what's a better way to collect the reduction ops?
+    ops_like_padding = {
+        aten.var_mean,
+        aten.sum,
+        aten.mean,
+        aten.prod,
+        aten.any,
+        aten.amin,
+        aten.amax,
+        aten.min,
+        aten.max,
+        aten.argmin,
+        aten.argmax,
+        aten.scatter_reduce,
+    }
+    def _get_overload_packet(
+        node: torch.fx.Node,
+    ) -> Optional[torch._ops.OpOverloadPacket]:
+        return (
+            node.target._overloadpacket
+            if node.op == "call_function"
+            # hasattr on OpOverloadPacket is slow, do isinstance first
+            and isinstance(node.target, torch._ops.OpOverload)
+            and hasattr(node.target, "_overloadpacket")
+            else None
+        )
+    for cur in reversed(g.nodes):
+        op = _get_overload_packet(cur)
+        if not op:
+            continue
+        if op in ops_dislike_padding:
+            cur.meta["dislike_padding"] = True
+        if cur.meta.get("dislike_padding", False):
+            # propagate
+            for prior in cur.all_input_nodes:
+                prior_op = _get_overload_packet(prior)
+                if not prior_op:
+                    continue
+                if prior_op not in ops_like_padding:
+                    prior.meta["dislike_padding"] = True
+        # We only want to mark output nodes. So, move it after the above prior nodes process.
+        if (
+            not config.pad_outputs
+            and user_visible_outputs
+            and cur.name in user_visible_outputs
+        ):
+            cur.meta["dislike_padding"] = True
+class GraphLowering(torch.fx.Interpreter):
+    graph_outputs: List[ir.IRNode]
+    def symbolic_sizes_strides(
+        self, ex: torch.Tensor
+    ) -> Tuple[Union[List[int], List[Expr]], Union[List[int], List[Expr]]]:
+        """
+        Support dynamic shapes and dynamic strides by assigning variables
+        to each dimension.  We duck-shape tensors, so if two tensors
+        have the same size they get assigned the same symbolic variable.
+        """
+        if self.reuse_shape_env:
+            return convert_shape_to_inductor(ex.size()), convert_shape_to_inductor(
+                ex.stride()
+            )
+        else:
+            from torch._dynamo.source import ConstantSource
+            # TODO: this should not be needed once #93059 lands
+            # https://github.com/pytorch/pytorch/pull/94031#discussion_r1096044816
+            # TODO: make a dedicated UnknownSource for this?
+            # NB: This is using the legacy default behavior from
+            # create_symbolic_sizes_strides_storage_offset but we hope we can
+            # just delete this entirely
+            source = ConstantSource(
+                f"__inductor_unknown_tensor_{len(self._shape_env.var_to_val)}"
+            )
+            (
+                size,
+                stride,
+                _,
+            ) = self._shape_env.create_symbolic_sizes_strides_storage_offset(
+                ex,
+                source,
+            )
+        size = [i.node.expr if isinstance(i, torch.SymInt) else i for i in size]
+        stride = [i.node.expr if isinstance(i, torch.SymInt) else i for i in stride]
+        return size, stride
+    def static_sizes_strides(
+        self, ex: torch.Tensor
+    ) -> Tuple[List[sympy.Expr], List[sympy.Expr]]:
+        """
+        Primarily used to weights
+        """
+        size = [sympy.Integer(i) for i in ex.size()]
+        stride = [sympy.Integer(i) for i in ex.stride()]
+        return size, stride
+    def __init__(
+        self,
+        gm: torch.fx.GraphModule,
+        example_inputs: Optional[List[torch.Tensor]] = None,
+        shape_env: Optional[ShapeEnv] = None,
+        graph_id: Optional[int] = None,
+        cpp_wrapper: bool = False,
+        aot_mode: bool = False,
+        user_visible_outputs: Optional[Dict[str, None]] = None,
+        layout_opt: Optional[bool] = None,
+        extern_node_serializer: Optional[
+            Callable[[List[ir.ExternKernelNode]], Any]
+        ] = None,
+        is_inference: bool = False,
+        is_const_graph: bool = False,
+        const_output_index: Optional[Dict[str, int]] = None,
+        const_code: Optional[str] = None,
+        const_module: Optional["GraphLowering"] = None,
+        name: Optional[str] = None,
+    ) -> None:
+        """
+        Set up the lowering state for ``gm``.
+        Args:
+            gm: graph module handed to the Interpreter base class; a ``__copy__``
+                of it is kept as ``orig_gm`` and its graph is passed to
+                ``mark_nodes_dislike_padding``.
+            example_inputs: stored as-is.
+            shape_env: reused when given (``reuse_shape_env`` is True); otherwise
+                a fresh ``ShapeEnv`` is created.
+            graph_id: stored as-is.
+            cpp_wrapper: stored, and also copied to ``record_multi_kernel_choice``.
+            aot_mode: stored as-is.
+            user_visible_outputs: stored (an empty dict when None); the raw
+                argument is what is passed to ``mark_nodes_dislike_padding``.
+            layout_opt: when None, the value is computed by ``decide_layout_opt``.
+            extern_node_serializer: only used when ``config.is_fbcode()`` is true
+                and the argument is truthy; otherwise
+                ``extern_node_json_serializer`` is used.
+            is_inference: stored, and forwarded to ``decide_layout_opt``.
+            is_const_graph: stored as-is.
+            const_output_index: stored (an empty dict when falsy); its keys seed
+                ``folded_constants``.
+            const_code: stored as-is.
+            const_module: when given, ``device_types``, ``device_idxs``,
+                ``constants`` and ``allocated_constant_name`` are taken from it
+                (the same objects are assigned, not copies).
+            name: stored as ``self.name``; ``qualify_name`` prepends it to
+                generated names when it is not None.
+        """
+        super().__init__(gm)
+        self.example_inputs = example_inputs
+        self.layout_opt = (
+            layout_opt
+            if layout_opt is not None
+            else self.decide_layout_opt(gm, is_inference=is_inference)
+        )
+        self.num_channels_last_conv = 0
+        self.is_inference = is_inference
+        self.is_const_graph = is_const_graph
+        self.const_code = const_code
+        self.const_module = const_module
+        self.extra_traceback = False  # we do our own error wrapping
+        if shape_env is None:
+            shape_env = ShapeEnv()
+            self.reuse_shape_env = False
+        else:
+            # NOTE(review): redundant with the unconditional assignment just below.
+            self._shape_env = shape_env
+            self.reuse_shape_env = True
+        self._shape_env = shape_env
+        # We are going to start code generating runtime asserts, so make sure
+        # you don't start adding new ones in the lowering process
+        shape_env.freeze_runtime_asserts()
+        # We're going to mutate ras_by_symbol as we finish generating them
+        self.ras_by_symbol: Dict[
+            sympy.Symbol, List[RuntimeAssert]
+        ] = shape_env.deferred_runtime_asserts.copy()
+        self.bound_unbacked_symbols: OrderedSet[sympy.Symbol] = OrderedSet()
+        self.sizevars = SizeVarAllocator(shape_env)
+        self.graph_input_names: List[str] = []
+        self.graph_inputs: Dict[str, TensorBox] = {}
+        self.graph_inputs_original: Dict[str, InputBuffer] = {}
+        self.zero_dim_cpu_tensor_list: OrderedSet[str] = OrderedSet()
+        # Device bookkeeping is taken from const_module when one is supplied.
+        self.device_types: OrderedSet[str] = (
+            const_module.device_types if const_module else OrderedSet()
+        )
+        self.device_idxs: OrderedSet[int] = (
+            const_module.device_idxs if const_module else OrderedSet()
+        )
+        self.cuda = False
+        self.buffers: List[ir.Buffer] = []
+        self.operations: List[ir.Operation] = []
+        self.const_output_index: Dict[str, int] = (
+            const_output_index if const_output_index else {}
+        )
+        self.folded_constants: OrderedSet[str] = (
+            OrderedSet(const_output_index.keys())
+            if const_output_index
+            else OrderedSet()
+        )
+        self.constants: Dict[str, torch.Tensor] = (
+            const_module.constants if const_module else {}
+        )
+        self.torchbind_constants: Dict[str, torch._C.ScriptObject] = {}
+        self.constant_reprs: Dict[str, str] = {}
+        self.removed_operations: OrderedSet[str] = OrderedSet()
+        self.removed_buffers: OrderedSet[str] = OrderedSet()
+        self.removed_inplace_buffers: OrderedSet[str] = OrderedSet()
+        self.mutated_buffers: OrderedSet[str] = OrderedSet()
+        self.never_reuse_buffers: OrderedSet[str] = OrderedSet()
+        self.inplaced_to_remove: OrderedSet[str] = OrderedSet()
+        self.device_ops: DeviceOpOverrides = None  # type: ignore[assignment]
+        self.wrapper_code: WrapperCodeGen = None  # type: ignore[assignment]
+        # See `ProxyExecutor Design Note` in ir.py for more details
+        self.extern_kernel_nodes: List[ir.ExternKernelNode] = []
+        from torch._inductor.extern_node_serializer import extern_node_json_serializer
+        # A caller-supplied serializer is only honored in fbcode.
+        self.extern_node_serializer: Callable[[List[ir.ExternKernelNode]], Any] = (
+            extern_node_serializer
+            if config.is_fbcode() and extern_node_serializer
+            else extern_node_json_serializer
+        )
+        self.current_node: torch.fx.Node = None  # type: ignore[assignment]
+        self.lists: Dict[str, List[str]] = {}
+        self.mutated_inputs: OrderedSet[str] = OrderedSet()
+        self.mutated_input_idxs: List[int] = []
+        self.name_to_buffer: Dict[str, ir.Buffer] = {}
+        self.name_to_users: DefaultDict[str, List[ir.IRNode]] = defaultdict(list)
+        self.name_to_op: Dict[str, ir.Operation] = {}
+        self.creation_time = time.time()
+        self.name = name  # type: ignore[assignment]
+        self.cpp_wrapper = cpp_wrapper
+        # record multi_kernel choice for cpp_wrapper so the second pass knows
+        # which sub-kernel is picked. Copy cpp_wrapper to another variable
+        # since cpp_wrapper flag is set to false for the first pass of codegen.
+        self.record_multi_kernel_choice = cpp_wrapper
+        self.multi_kernel_to_choice: Dict[str, int] = {}
+        self.aot_mode = aot_mode
+        self.graph_id = graph_id
+        self.post_grad_graph_id = next(_post_grad_graph_counter)
+        self.scheduler: torch._inductor.scheduler.Scheduler = None  # type: ignore[assignment]
+        # Only computed when layout optimization is enabled for this graph.
+        self.nodes_prefer_channels_last = (
+            self.find_nodes_prefer_channels_last() if self.layout_opt else OrderedSet()
+        )
+        # Pre-seeded, so warn_fallback never logs for aten.convolution_backward.
+        self._warned_fallback = {"aten.convolution_backward"}
+        self.user_visible_outputs = (
+            user_visible_outputs if user_visible_outputs is not None else {}
+        )
+        mark_nodes_dislike_padding(gm.graph, user_visible_outputs)
+        self.cache_key: str = ""  # This is the cache key for the compiled artifact
+        self.cache_path: str = ""  # This is the path in the filesystem where the compiled artifact is stored
+        self.cache_linemap: List[
+            Tuple[int, str]
+        ] = (
+            []
+        )  # This is the linemap used by the profiler to mark custom compiled kernels getting run
+        # Used if lowering encounters cases where cudagraphs are not supported
+        self.disable_cudagraphs_reason: Optional[str] = None
+        # only keeping one node per device for stack trace purposes
+        self.device_node_mapping: Dict[torch.device, torch.fx.Node] = {}
+        self.orig_gm: torch.fx.GraphModule = gm.__copy__()
+        self.dynamo_flat_name_to_original_fqn = self.module.meta.get(
+            "dynamo_flat_name_to_original_fqn", {}
+        )
+        self.allocated_constant_name: Dict[str, str] = (
+            const_module.allocated_constant_name if const_module is not None else {}
+        )
+        init_backend_registration()
+        # Per-instance memoized wrapper around get_backend_features.
+        self.get_backend_features = functools.lru_cache(None)(get_backend_features)
+        self.effectful_ops: Dict[_EffectType, ir.Buffer] = {}
+        self.aligned_inputs: OrderedSet[str] = OrderedSet()
+        self.no_fuse_buffer_names: OrderedSet[str] = OrderedSet()
+        # Below field is related to printing debug intermediate tensor values info for debugging
+        self.all_codegen_kernel_names: OrderedSet[str] = OrderedSet()
+    def has_feature(
+        self, device: Union[torch._inductor.ir.IRNode, device], feature: BackendFeature
+    ) -> bool:
+        assert isinstance(feature, BackendFeature), feature
+        return feature in self.get_backend_features(get_device_type(device))
+    @staticmethod
+    def decide_layout_opt(gm: GraphModule, *, is_inference: bool) -> bool:
+        """
+        Decide if we should enable layout optimization for this graph based on
+        heuristics.
+        Args:
+            gm: graph module whose ``aten.convolution.default`` nodes are inspected.
+            is_inference: when true, the decision is made by comparing a
+                flop-weighted estimate against the plain flop total; otherwise a
+                sequence of per-model heuristics on the convolutions is used.
+        Returns:
+            True if layout optimization should be enabled, False otherwise.
+        """
+        # Config switches take precedence over every heuristic below.
+        if not config.layout_optimization:
+            return False
+        if config.force_layout_optimization:
+            return True
+        conv_nodes = [
+            n for n in gm.graph.nodes if n.target == torch.ops.aten.convolution.default
+        ]
+        nconv = len(conv_nodes)
+        if nconv == 0:
+            return False
+        # For cpu backend and mkldnn enabled, we always use channels_last for better performance.
+        if (
+            torch.backends.mkldnn.enabled
+            and torch.backends.mkldnn.is_available()
+            and all(
+                n.args[idx].meta["val"].device == torch.device("cpu")
+                for n in conv_nodes
+                for idx in [0, 1]
+            )
+        ):
+            return True
+        # Following models are skipped due to this:
+        # jx_nest_base
+        # volo_d1_224
+        if len(list(gm.graph.nodes)) >= 300 * nconv:
+            log.debug("Skipped layout opt because only a few conv")
+            return False
+        # Bail out when the first two args of any convolution have free symbols.
+        if any(
+            has_free_symbols(n.args[idx].meta["val"])
+            for n in conv_nodes
+            for idx in [0, 1]
+        ):
+            log.debug(
+                "See perf regression with dynamic shape. Follow up in https://github.com/pytorch/pytorch/issues/102670"
+            )
+            return False
+        # The helpers below classify a convolution from args[1] (and, for
+        # is_grouped, the last positional argument).
+        def is_grouped(n: Any) -> bool:
+            meta_val = n.args[1].meta["val"]  # type: ignore[union-attr, operator]
+            assert isinstance(meta_val, torch.Tensor)
+            return n.args[-1] > 1 and meta_val.size(1) > 1  # type: ignore[union-attr, operator]
+        def is_in_out_channel(n: torch.fx.Node) -> bool:
+            return (
+                n.args[1].meta["val"].size(0) * 2 <= n.args[1].meta["val"].size(1)  # type: ignore[union-attr, operator]
+                and n.args[1].meta["val"].size(2) > 1  # type: ignore[union-attr, operator]
+            )
+        def is_small_channel(n: torch.fx.Node) -> bool:
+            return (
+                n.args[1].meta["val"].size(0) <= 64  # type: ignore[union-attr, operator]
+                and n.args[1].meta["val"].size(1) <= 64  # type: ignore[union-attr, operator]
+            )
+        # only grouped convolutions benchmarked as slower in conv samples for inference only
+        if is_inference:
+            from torch.utils.flop_counter import FlopCounterMode
+            # Flops accumulated per category: "grouped", "small", "in_out", "default".
+            flop_counts: Dict[str, float] = defaultdict(float)
+            for node in conv_nodes:
+                success, args, kwargs = torch._inductor.fx_utils.get_fake_args_kwargs(
+                    node
+                )
+                if success:
+                    # Run the op under fake mode purely to count its flops.
+                    with FlopCounterMode(display=False) as flop_counter_mode:
+                        with V.fake_mode:
+                            node.target(*args, **kwargs)
+                    counted_flops = flop_counter_mode.get_total_flops()
+                    if is_grouped(node):
+                        node_type = "grouped"
+                    elif is_small_channel(node):
+                        node_type = "small"
+                    elif is_in_out_channel(node):
+                        node_type = "in_out"
+                    else:
+                        node_type = "default"
+                    flop_counts[node_type] += counted_flops
+                else:
+                    log.debug("Conv inputs meta not found")
+            # average benchmarked channels last speedup / slowdown, < 1 is speedup.
+            # taken from the set of convolution inputs in benchmarks/dynamo/microbenchmarks/operator_inp_logs/torchbench_train/
+            # To regenerate these numbers follow https://gist.github.com/eellison/55d7a6ed6f39829d68ac56f95f4df5bb
+            GROUPED_MULTIPLIER = 1.358
+            DEFAULT_MULTIPLIER = 0.823
+            IN_OUT_MULTIPLIER = 0.725
+            SMALL_MULTIPLIER = 0.783
+            total_flops = sum(flop_counts.values())
+            # TODO - get different values per hardware
+            weighted_flops = (
+                flop_counts["grouped"] * GROUPED_MULTIPLIER
+                + flop_counts["small"] * SMALL_MULTIPLIER
+                + flop_counts["in_out"] * IN_OUT_MULTIPLIER
+                + flop_counts["default"] * DEFAULT_MULTIPLIER
+            )
+            # Enable only when the weighted estimate does not exceed the unweighted total.
+            do_layout_opt = weighted_flops <= total_flops
+            if not do_layout_opt:
+                log.debug(
+                    "Skipped layout opt in inference because weighted flops indicate slowdown, default: %d, channels last: %d",
+                    total_flops,
+                    weighted_flops,
+                )
+            return do_layout_opt
+        # Channels last layout can dramatically hurt grouped conv perf. E.g.
+        # Conv with arguments like
+        #   {"input_shape": [32, 224, 112, 112], "weight_shape": [224, 112, 3, 3],
+        #    "stride": [2, 2], "padding": [1, 1], "groups": 2}
+        # slows down 31x using channels last..
+        # But a lot of timm models use depthwise separable convolution which will
+        # result in grouped convolution with in-channel size == 1.
+        # For those grouped convolution, channels last still helps a lot.
+        # E.g.
+        # Conv with arguments
+        #   {"input_shape": [128, 58, 56, 56], "weight_shape": [58, 1, 3, 3],
+        #    "stride": [2, 2], "padding": [1, 1], "groups": 58}
+        # get 1.86x speedup with channels last layout.
+        #
+        # The following heuristics skip using channels-last if the model contains
+        # grouped convolution with in-channels > 1.
+        if any(map(is_grouped, conv_nodes)):
+            log.debug(
+                "Skip layout opt because found grouped convolution with >1 in_channels!"
+            )
+            return False
+        # For some models that contain convolution with larger in-channel than out-channel, applying
+        # channels last hurts performance.
+        # Following models are skipped due to this:
+        # - pytorch_unet
+        # - phlippe_densenet (slightly worse)
+        # - Background_Matting (1.22x -> 0.821x)
+        # - pytorch_CycleGAN_and_pix2pix (1.597x -> 1.294x)
+        if any(map(is_in_out_channel, conv_nodes)):
+            log.debug(
+                "Skip layout opt because some convolutions have smaller out_channel"
+            )
+            return False
+        # Following models are skipped due to this:
+        # - functorch_maml_omniglot
+        if all(map(is_small_channel, conv_nodes)):
+            log.debug("Skip layout opt because all convolution channels are too small")
+            return False
+        return True
+    def qualify_name(self, name: str) -> str:
+        """Prepend the given name with the graph name if any."""
+        if self.name is not None:
+            return f"{self.name}_{name}"
+        return name
+    def make_subgraph(
+        self,
+        gm: torch.fx.GraphModule,
+        example_inputs: List[torch.Tensor],
+        subgraph_name: str,
+    ) -> "GraphLowering":
+        """
+        Make a subgraph of the current graph with all inherited
+        parts, except the graph module (`gm`) and `example_inputs`.
+        The subgraphs are lowered separately, but intended to be
+        inlined in the parent graph's codegening. Hence the need
+        for maintaining the same `shape_env` and other properties.
+        The subgraph name is qualified by the parent graph's name.
+        """
+        return GraphLowering(
+            gm=gm,
+            example_inputs=example_inputs,
+            shape_env=self._shape_env,
+            cpp_wrapper=self.cpp_wrapper,
+            aot_mode=self.aot_mode,
+            extern_node_serializer=self.extern_node_serializer,
+            is_inference=self.is_inference,
+            name=self.qualify_name(subgraph_name),
+        )
+    def find_nodes_prefer_channels_last(self) -> OrderedSet[Node]:
+        """
+        The rule to decide if an node prefer channels last is simple.
+        1. if it's input/output of a convolution
+        2. if one of its user prefers channels last
+        We have rule 1 because cudnn runs a faster convolution kernel for channels last inputs;
+        Rule 2 is also important. It makes sure that indirect inputs to convolution also prefers
+        channels last.
+        Consider the scenario: conv -> batch-norm -> relu -> conv
+        Without rule 2, batch-norm output may use a contiguous layout. That will cause 2 extra copies:
+        1. the output of batch-norm should be channels last initially since its input is a conv's output.
+           Forcing the batch-norm's output to be contiguous results in the first copy
+        2. The second conv's input is initially contiguous. This layout is propagated from the batch-norm's output.
+           We need convert it to channels last layout which results in the second copy.
+        With rule 2, we makes sure all the tensors in the chain uses channels last layout. So both copies
+        can be saved.
+        """
+        output_set: OrderedSet[Node] = OrderedSet()
+        for n in reversed(self.module.graph.nodes):
+            if n.target == torch.ops.aten.convolution.default:
+                output_set.add(n)
+                continue
+            for user in n.users:
+                if user in output_set:
+                    output_set.add(n)
+                    break
+        # need a second pass to add downstream nodes of those channel last nodes to the sets.
+        # This pass is especially needed to avoid mix-layout kernel inputs in backward pass.
+        #
+        # Let's say a conv-batchnorm 's output is passed to relu whose output is in turn returned
+        # from the fwd graph. Without this second pass, we will force relu's output to be contiguous.
+        # Then in the kernel in backward pass, the contiguous output of relu may be mix with other channels last
+        # tensors and passed to a kernel.
+        #
+        # This pass improve yolov3 training speedup from 1.116x (worse than disabling layout optimization speedup 1.196x) to 1.457x.
+        # It also improves dla102 training speedup from 1.240x (worse than disabling layout optimization speedup 1.523x) to 1.835x .
+        # This also helps the following models:
+        # - res2net101_26w_4s
+        # - res2net50_14w_8s
+        # - sebotnet33ts_256
+        for n in self.module.graph.nodes:
+            if n in output_set:
+                output_set.update(n.users)
+        return output_set
+    def warn_fallback(self, name: str) -> None:
+        if name not in self._warned_fallback:
+            self._warned_fallback.add(name)
+            perf_hint_log.info("Using FallbackKernel: %s", name)
+    def add_device_info(self, device: torch.device) -> None:
+        self.device_types.add(device.type)
+        if device.index is not None:
+            self.device_idxs.add(device.index)
+        if V.graph.current_node and device not in self.device_node_mapping:
+            self.device_node_mapping[device] = V.graph.current_node
+    @property
+    def fake_mode(self) -> torch._subclasses.fake_tensor.FakeTensorMode:
+        # Read-only accessor; the value comes from the virtualized handler `V`,
+        # not from state stored on this instance.
+        return V.fake_mode
+    def try_get_buffer(
+        self, buffer_name: str
+    ) -> Optional[Union[ir.TensorBox, ir.Buffer]]:
+        if buffer_name in self.name_to_buffer:
+            return self.name_to_buffer[buffer_name]
+        if buffer_name in self.graph_inputs:
+            return self.graph_inputs[buffer_name]
+        if buffer_name in self.constants:
+            data = V.graph.constants[buffer_name]
+            return ir.ConstantBuffer(
+                buffer_name,
+                ir.FixedLayout(
+                    data.device, data.dtype, *V.graph.static_sizes_strides(data)
+                ),
+            )
+        return None
+    def get_buffer(self, buffer_name: str) -> Union[ir.TensorBox, ir.Buffer]:
+        buf = self.try_get_buffer(buffer_name)
+        if buf is not None:
+            return buf
+        raise RuntimeError(f"Failed to find buffer matching name {buffer_name}")
+    def get_dtype(self, buffer_name: str) -> torch.dtype:
+        if buffer_name in self.constants:
+            return self.constants[buffer_name].dtype
+        if buffer_name in self.name_to_buffer:
+            return self.name_to_buffer[buffer_name].get_dtype()
+        if buffer_name in self.graph_inputs:
+            return self.graph_inputs[buffer_name].get_dtype()
+        m = re.match(r"(as_strided|reinterpret_tensor)\(([a-zA-Z0-9_]+),", buffer_name)
+        if m:
+            return self.get_dtype(m.group(1))
+        raise KeyError(f"could not find {buffer_name}")
+    def get_numel(self, buffer_name: str) -> Union[int, Expr]:
+        from .ir import MultiOutputLayout
+        if buffer_name in self.constants:
+            return self.constants[buffer_name].numel()
+        if buffer_name in self.name_to_buffer:
+            buf = self.name_to_buffer[buffer_name]
+            if isinstance(getattr(buf, "layout", None), MultiOutputLayout):
+                return 1
+            return buf.get_numel()
+        if buffer_name in self.graph_inputs:
+            return self.graph_inputs[buffer_name].get_numel()
+        raise KeyError(f"could not find {buffer_name}")
+    def run(self, *args: Any) -> Any:  # type: ignore[override]
+        # Delegate to the base class run, wrapped in dynamo_timed under the
+        # label "GraphLowering.run".
+        with dynamo_timed("GraphLowering.run"):
+            return super().run(*args)
+    def register_operation(self, op: ir.Operation) -> str:
+        assert op.operation_name is None, f"Operation registered twice: {op}"
+        assert isinstance(op, ir.Operation)
+        name = self.qualify_name(f"op{len(self.operations)}")
+        self.operations.append(op)
+        self.name_to_op[name] = op
+        op.operation_name = name
+        return name
+    def register_buffer(self, buffer: ir.Buffer, *, set_name: bool = False) -> str:
+        name = self.qualify_name(f"buf{len(self.buffers)}")
+        self.buffers.append(buffer)
+        self.name_to_buffer[name] = buffer
+        if (
+            # Skip empty CPU tensor so that CUDA graphs can succeed, see https://github.com/pytorch/pytorch/pull/114144
+            not (isinstance(buffer, ir.ComputedBuffer) and buffer.is_zero_elements())
+            and buffer.get_device() is not None
+        ):
+            self.add_device_info(buffer.get_device())
+        if set_name:
+            buffer.name = name
+        return name
+    def register_operation_list(self, operation_names: List[str]) -> str:
+        name = self.qualify_name("list_" + "_".join(operation_names))
+        self.lists[name] = operation_names
+        return name
+    def register_users_of(
+        self, node_output: Union[Iterable[ir.IRNode], ir.IRNode]
+    ) -> None:
+        def register(value: Union[Iterable[ir.IRNode], ir.IRNode]) -> None:
+            if isinstance(value, (list, tuple)):
+                for x in value:
+                    register(x)
+            if isinstance(value, ir.TensorBox):
+                for read_name in value.get_read_names():
+                    self.name_to_users[read_name].append(value)
+        register(node_output)
+    def mark_buffer_mutated(self, name: str) -> None:
+        """
+        When a buffer is mutated we need to make sure all the reads to
+        the old version are realized before the mutation happens.
+        """
+        assert isinstance(name, str)
+        self.mutated_buffers.add(name)
+        if name not in self.name_to_users:
+            return
+        for user in self.name_to_users[name]:
+            user.realize()
+    def get_original_value_of_constant(self, name: str) -> torch.Tensor:
+        """
+        In AOTI, module buffers may have been mutated during the tracing and compilation.
+        Thus we need to read from previously stored original buffers, to make sure the
+        generated model.so uses correct initial values.
+        """
+        assert name in self.allocated_constant_name and name in self.constants, (
+            "Can not find the original value for " + name
+        )
+        orig_name = get_cloned_parameter_buffer_name(self.allocated_constant_name[name])
+        return (
+            self.module.meta[orig_name]
+            if orig_name in self.module.meta
+            else self.constants[name]
+        )
+    def allocate_non_dup_const_name(
+        self, name: Optional[str], data: Union[Tensor]
+    ) -> str:
+        orig_name = name
+        if not config.aot_inductor.use_runtime_constant_folding:
+            for constant_name, value in self.constants.items():
+                if (
+                    not data.is_mkldnn
+                    and data.size() == value.size()
+                    and data.stride() == value.stride()
+                    and data.dtype == value.dtype
+                    and data.device == value.device
+                    and data.untyped_storage().data_ptr()
+                    == value.untyped_storage().data_ptr()
+                    and data.storage_offset() == value.storage_offset()
+                ):
+                    return constant_name
+        if name is None:
+            name = f"constant{len(self.constants)}"
+        assert name is not None
+        if name[0].isdigit():
+            name = f"constant_{name}"
+        name = self.qualify_name(name)
+        # We may generate a var name for each constant in the codegen.
+        # Let's only keep sane characters.
+        prefix = re.sub(r"[^a-zA-Z0-9_]", "_", name)
+        name = prefix
+        cnt = 0
+        while name in self.constants:
+            name = f"{prefix}_{cnt}"
+            cnt += 1
+        self.constants[name] = data
+        self.constant_reprs[name] = (
+            f"{data.device!r} {data.dtype!r} "
+            f"{tuple(data.size())!r} {tuple(data.stride())!r} "
+            f"{hash(data):x}"
+        )
+        self.allocated_constant_name[name] = orig_name  # type: ignore[assignment]
+        return name
+    def add_tensor_constant(
+        self, data: Tensor, name: Optional[str] = None
+    ) -> TensorBox:
+        new_name = self.allocate_non_dup_const_name(name, data)
+        return TensorBox.create(
+            ir.ConstantBuffer(
+                new_name,
+                FixedLayout(data.device, data.dtype, *self.static_sizes_strides(data)),
+            )
+        )
+    def constant_name(self, name: str, device_override: Optional[torch.device]) -> str:
+        """
+        We AOT copy constants to the devices they are needed on.
+        If device_override doesn't match the constant's device, then
+        copy it and return a different name.
+        """
+        if self.constants[name].device == device_override or device_override is None:
+            return name
+        with torch.utils._python_dispatch._disable_current_modes():
+            # caller might have OrderedSet fake tensor mode which will create a fake tensor
+            # when calling .to, so unset modes here
+            return self.allocate_non_dup_const_name(
+                f"{name}_{device_override.type}{device_override.index or 0}",
+                self.constants[name].to(device_override),
+            )
+    def placeholder(
+        self, target: str, args: Tuple[object], kwargs: Dict[str, object]  # type: ignore[override]
+    ) -> Union[Expr, TensorBox, None]:
+        example = super().placeholder(target, args, kwargs)  # type: ignore[arg-type]
+        self.graph_input_names.append(target)
+        if isinstance(example, SymTypes):
+            expr = example.node.expr
+            self.graph_inputs[target] = expr
+            return expr
+        elif isinstance(example, (int, bool, float)):
+            expr = sympy.sympify(example)
+            self.graph_inputs[target] = expr
+            return expr
+        elif example is None:
+            return None
+        if isinstance(example, BackwardState):
+            # Ignored arg, must be unused
+            # Alternately we could filter this out in AotAutograd
+            return None
+        assert isinstance(example, torch.Tensor), example
+        # todo(chilli): We can remove the last check once we turn buffers into
+        # static shape tensors. That's a hack to workaround Inductor believing
+        # the buffer should be static but us passing in a fake tensor with
+        # symbolic shapes.
+        if not example._has_symbolic_sizes_strides:
+            # the first N inputs are weights
+            sizes, strides = self.static_sizes_strides(example)
+        else:
+            sizes, strides = self.symbolic_sizes_strides(example)  # type: ignore[assignment]
+        # TODO(jansel): handle input aliasing
+        target = self.qualify_name(target)
+        tensor = TensorBox.create(
+            InputBuffer(
+                target,
+                FixedLayout(example.device, example.dtype, sizes, strides),
+            )
+        )
+        self.graph_inputs[target] = tensor
+        self.graph_inputs_original[target] = tensor.data.data
+        if self.current_node.users:  # cudagraphs should work with an unused CPU input
+            self.add_device_info(example.device)
+        # Note: [Input Alignment handling in Inductor]
+        # Alignment matters for generating efficient code. Some operations,
+        # e.g. vectorized loads, can only be performed on aligned inputs.
+        #
+        # But if we codegen assuming aligned inputs and then get unaligned
+        # inputs at runtime, then we are forced to clone - which is bad for
+        # both perf and memory usage.
+        #
+        # One option would be to guard on storage_offset%ALIGNMENT, and then
+        # codegen based on this. But storage_offset guards turned out to be
+        # expensive and cause recompiles; Instead, we're generating code
+        # based on the alignment of the example input without guarding.
+        with maybe_get_suppress_shape_guards_ctx():
+            if should_assume_input_aligned(example):
+                self.aligned_inputs.add(target)
+        return tensor
+    def call_function(self, target: Callable, args: Any, kwargs: Dict[str, Any]) -> Any:  # type: ignore[type-arg, override]
+        if target is operator.getitem and isinstance(args[0], (list, tuple, dict)):
+            return super().call_function(target, args, kwargs)
+        # hasattr on OpOverloadPacket is slow, check isinstance first
+        if not isinstance(target, torch._ops.OpOverloadPacket) and hasattr(
+            target, "_inductor_lowering_function"
+        ):
+            # passthrough lowerings from .pattern_matcher
+            return target(*args, **kwargs)
+        if target not in lowerings:
+            assert isinstance(
+                target, torch._ops.OpOverload
+            ), f"{target} is not an OpOverload"
+            base_name = target.name().split(".")[0]
+            if base_name in FALLBACK_ALLOW_LIST:
+                make_fallback(target)
+            elif config.implicit_fallbacks:
+                error = (
+                    MissingOperatorWithDecomp
+                    if get_decompositions([target])
+                    else MissingOperatorWithoutDecomp
+                )
+                log.info(
+                    "Creating implicit fallback for:\n%s",
+                    error.operator_str(target, args, kwargs),
+                )
+                make_fallback(target)
+            elif get_decompositions([target]):
+                # There isn't a good way to dynamically patch this in
+                # since AOT Autograd already ran.  The error message tells
+                # the user how to fix it.
+                raise MissingOperatorWithDecomp(target, args, kwargs)
+            else:
+                raise MissingOperatorWithoutDecomp(target, args, kwargs)
+        try:
+            log.debug("  via %s", lowerings[target])  # type: ignore[index]
+            out = lowerings[target](*args, **kwargs)  # type: ignore[index]
+            return out
+        except Exception as e:
+            raise LoweringException(e, target, args, kwargs).with_traceback(
+                e.__traceback__
+            ) from None
+    @staticmethod
+    def can_inline_constant(t: torch.Tensor) -> bool:
+        """
+        True if this is a small constant attr that will be inlined.
+        """
+        return len(t.shape) == 1 and t.shape[0] <= 8
+    def get_attr(
+        self, target: str, args: Tuple[()], kwargs: Dict[str, object]  # type: ignore[override]
+    ) -> Union[Constant, TensorBox, ir.Subgraph, TorchBindObject]:
+        # this is a constant
+        value = getattr_recursive(self.module, target)  # type: ignore[arg-type]
+        if isinstance(value, torch.fx.GraphModule):
+            return ir.Subgraph(name=target, graph_module=value)
+        if isinstance(value, torch._C.ScriptObject):
+            self.torchbind_constants[target] = value
+            self.constant_reprs[target] = ""
+            return TorchBindObject(target, value)
+        assert isinstance(value, torch.Tensor)
+        if (
+            config.aot_inductor.use_runtime_constant_folding
+            or config.always_keep_tensor_constants
+            or unsupported_output_tensor(value)
+        ):
+            return self.add_tensor_constant(value, target)
+        with no_dispatch():
+            if value.shape == ():
+                return Constant(value.item(), value.dtype, value.device)
+            if self.can_inline_constant(value):
+                log.debug("Inlining constant: %s ", str(target))
+                # tensor lowering has constant inlining logic
+                from .lowering import tensor
+                return tensor(value.tolist(), dtype=value.dtype, device=value.device)
+        return self.add_tensor_constant(value, target)
+    def call_module(self, target: Any, args: Any, kwargs: Any) -> NoReturn:
+        raise AssertionError
+    def call_method(self, target: Any, args: Any, kwargs: Any) -> NoReturn:
+        raise AssertionError
+    def output(
+        self, target: str, args: Tuple[object], kwargs: Dict[str, object]  # type: ignore[override]
+    ) -> None:
+        """
+        Lower the graph's output node and record the result in ``self.graph_outputs``.
+        The outputs are normalized to a sequence, type-checked, realized via
+        ``ir.ExternKernel.realize_input`` and, for TensorBox/BaseView results,
+        passed through ``try_match_insignificant_strides`` against the strides in
+        the FX node's ``meta["val"]``. Afterwards every TensorBox graph input is
+        realized; an input whose storage no longer holds its own ``InputBuffer`` is
+        treated as mutated and written back into the original input buffer.
+        Finally ``finalize()`` is called.
+        """
+        result = super().output(target, args, kwargs)  # type: ignore[arg-type]
+        if not isinstance(result, (tuple, list)):
+            # nested subgraphs can have singleton outputs
+            result = (result,)
+        assert isinstance(result, (tuple, list)), type(result)
+        assert all(
+            isinstance(
+                x,
+                (
+                    TensorBox,
+                    ir.Constant,
+                    type(None),
+                    ir.ConstantBuffer,
+                    sympy.Expr,
+                    sympy.logic.boolalg.Boolean,
+                    int,
+                    ir.EffectfulKernel,
+                ),
+            )
+            for x in result
+        ), result
+        fx_node_args = V.graph.current_node.args[0]  # type: ignore[arg-type]
+        if not isinstance(fx_node_args, (tuple, list)):
+            # nested subgraphs can have singleton outputs
+            fx_node_args = (fx_node_args,)
+        result = [ir.ExternKernel.realize_input(x) for x in result]
+        result_correct_strides = []
+        assert len(fx_node_args) == len(result)
+        for r, fx_node in zip(result, fx_node_args):
+            if not isinstance(r, (ir.TensorBox, ir.BaseView)):
+                result_correct_strides.append(r)
+            else:
+                # AOT Autograd tries to detect stride divergence of inductor from output metadata.
+                # Here, we try to avoid spurious divergence by matching insignificant strides,
+                # i.e. the strides of dimensions of size 0 or 1.
+                result_correct_strides.append(
+                    self.try_match_insignificant_strides(
+                        r, fx_node.meta["val"].stride()
+                    )
+                )
+        self.graph_outputs = result_correct_strides
+        value: ir.IRNode
+        for name, value in self.graph_inputs.items():
+            assert isinstance(
+                value, (TensorBox, sympy.Expr)
+            ), f"Unsupported inductor graph input type: {type(value)}"
+            if not isinstance(value, TensorBox):
+                continue
+            value.realize()
+            assert isinstance(value, TensorBox)
+            # Unwrap TensorBox -> StorageBox -> underlying buffer; the StorageBox
+            # is kept so it can be looked up in graph_outputs below.
+            value = value.data
+            assert isinstance(value, ir.StorageBox)
+            value_storage_box = value
+            value = value.data
+            if not isinstance(value, InputBuffer) or value.get_name() != name:
+                # one of our inputs was mutated, need to turn that into a copy
+                ir.MutationLayoutSHOULDREMOVE.realize_into(
+                    value, self.graph_inputs_original[name]
+                )
+                # replace output with mutated input
+                try:
+                    ind = self.graph_outputs.index(value_storage_box)
+                    self.graph_outputs[ind] = self.graph_inputs_original[name]
+                except ValueError:
+                    # list.index raises ValueError when the mutated input is not
+                    # itself a graph output; nothing to replace in that case.
+                    pass
+        self.finalize()
+        log.debug(
+            "Force channels last inputs for %d conv for the current graph with id %d",
+            self.num_channels_last_conv,
+            self.graph_id if self.graph_id is not None else -1,
+        )
+    def finalize(self) -> None:
+        for buf in self.buffers:
+            buf.decide_layout()
+    @contextmanager
+    def set_current_node(self, node: torch.fx.Node):  # type: ignore[no-untyped-def]
+        old = self.current_node
+        try:
+            self.current_node = node
+            yield
+        finally:
+            self.current_node = old
+    def try_match_insignificant_strides(
+        self,
+        tensor: Union[ir.TensorBox, ir.BaseView],
+        meta_strides_inp: Tuple[Union[int, torch.SymInt], ...],
+    ) -> Union[ir.TensorBox, ir.BaseView]:
+        """
+        Tries to match the strides of the tensor to those in the meta_strides. Strides of insignificant
+        dimensions - size 0 or 1 - will be updated.
+        If there are real stride differences (NHWC vs NCHW) then the input will be returned.
+        """
+        # should have already been realized
+        assert torch._inductor.ir.is_storage_and_layout(tensor)
+        meta_strides = [
+            s.node.expr if isinstance(s, torch.SymInt) else s for s in meta_strides_inp
+        ]
+        if all(
+            self.sizevars.statically_known_equals(s1, s2)
+            for s1, s2 in zip(meta_strides, tensor.get_stride())
+        ):
+            return tensor  # type: ignore[arg-type]
+        def significant_strides_equal(
+            shape: Sequence[Union[Expr, int]],
+            meta_strides: Sequence[Union[Expr, int]],
+            tensor_strides: Sequence[Union[Expr, int]],
+        ) -> bool:
+            for dim, s1, s2 in zip(shape, meta_strides, tensor_strides):
+                if self.sizevars.statically_known_leq(dim, 1):  # type: ignore[arg-type]
+                    continue
+                if not self.sizevars.statically_known_equals(s1, s2):
+                    return False
+            return True
+        if not significant_strides_equal(
+            tensor.get_size(), meta_strides, tensor.get_stride()
+        ):
+            return tensor
+        storage, old_layout = torch._inductor.ir.as_storage_and_layout(tensor)
+        new_stride = list(old_layout.stride)
+        for i, s in enumerate(tensor.get_size()):
+            if self.sizevars.statically_known_leq(s, 1):  # type: ignore[arg-type]
+                new_stride[i] = meta_strides[i]
+        new_layout = torch._inductor.ir.FixedLayout(
+            old_layout.device,
+            old_layout.dtype,
+            old_layout.size,
+            new_stride,
+            old_layout.offset,
+        )
+        return ir.TensorBox(torch._inductor.ir.ReinterpretView(storage, new_layout))
+    def propagate_mutation(
+        self,
+        fx_node: torch.fx.Node,
+        old_args: Tuple[Any],
+        old_kwargs: Dict[str, Any],
+        new_args: Tuple[Any],
+        new_kwargs: Dict[str, Any],
+    ) -> None:
+        """Propagate mutations on new_args/new_kwargs back to old_args/old_kwargs.
+        Assumes we may have cloned old_args/old_kwargs into new_args/new_kwargs
+        and then called fx_node(*new_args, **new_kwargs).
+        If fx_node mutates any of new_args/new_kwargs, and they are different from
+        old_args/old_kwargs, then we need to update the original tensor.
+        """
+        assert isinstance(fx_node.target, torch._ops.OpOverload)
+        assert len(old_args) == len(new_args)
+        assert len(old_kwargs) == len(new_kwargs)
+        def maybe_propagate(
+            schema_arg: torch._C.Argument, old_arg: ir.IRNode, new_arg: ir.IRNode
+        ) -> None:
+            if old_arg is new_arg:
+                return
+            if schema_arg.alias_info is not None and schema_arg.alias_info.is_write:
+                # The lowering for copy_ is smart enough to "replace" old_arg with
+                # new_arg in all future uses so a copy_ kernel never gets emitted.
+                self.call_function(torch.ops.aten.copy_.default, (old_arg, new_arg), {})
+        schema = fx_node.target._schema
+        for idx, (old_arg, new_arg) in enumerate(zip(old_args, new_args)):
+            schema_arg = schema.arguments[idx]
+            maybe_propagate(schema_arg, old_arg, new_arg)
+        schema_kwargs = {arg.name: arg for arg in schema.arguments}
+        for key in old_kwargs.keys():
+            old_arg = old_kwargs[key]
+            new_arg = new_kwargs[key]
+            schema_arg = schema_kwargs[key]
+            maybe_propagate(schema_arg, old_arg, new_arg)
+    def run_node(self, n: torch.fx.Node) -> object:
+        def debug(msg: str) -> None:
+            log.debug("lowering %s %s", LazyString(n.format_node), msg)
+        buffer_watermark = len(self.buffers)
+        operation_watermark = len(self.operations)
+        origins = {n}
+        is_call_function = n.op == "call_function"
+        if is_call_function:
+            args, kwargs = self.fetch_args_kwargs_from_env(n)
+            origins |= gather_origins(args, kwargs)
+        with ir.IRNode.current_origins(origins), self.set_current_node(  # type: ignore[arg-type]
+            n
+        ), V.set_current_node(
+            n
+        ):
+            if (
+                n.op == "call_function"
+                and n.target is not operator.getitem
+                and fallback_node_due_to_unsupported_type(n)
+            ):
+                debug("fallback_handler")
+                result = fallback_handler(n.target, add_to_fallback_set=False)(
+                    *args, **kwargs  # type: ignore[possibly-undefined]
+                )
+            elif n.op == "call_function" and (
+                layout_constraints := maybe_layout_constraints(n.target)  # type: ignore[arg-type]
+            ):
+                debug("layout_constraints")
+                old_args = args  # type: ignore[possibly-undefined]
+                old_kwargs = kwargs  # type: ignore[possibly-undefined]
+                args, kwargs = layout_constraints(n, *args, **kwargs)  # type: ignore[index]
+                result = self.call_function(n.target, args, kwargs)  # type: ignore[arg-type]
+                # layout_constraints are allowed to make new copies of the inputs.
+                # if they do, and if the target is mutable, then we need to
+                # write the new values back into the original inputs.
+                self.propagate_mutation(n, old_args, old_kwargs, args, kwargs)  # type: ignore[possibly-undefined]
+            elif is_magic_method(n.target):
+                # TODO: this is sus, it probably should be handled in the
+                # lowerings themselves similarly to sym_size/sym-stride
+                # https://github.com/pytorch/pytorch/issues/127789
+                debug("is_magic_method")
+                if isinstance(
+                    n.meta["val"], (torch.SymInt, torch.SymFloat, torch.SymBool)
+                ):
+                    result = n.meta["val"].node.expr
+                else:
+                    result = super().run_node(n)
+            else:
+                debug("")
+                result = super().run_node(n)
+            # require the same stride order for dense outputs,
+            # 1. user-land view() will not throw because inductor
+            # output different strides than eager
+            # long term the solution is to make view() always succeed
+            # with infallible strides.
+            # 2: as_strided ops, we need make sure its input has same size/stride with
+            # eager model to align with eager behavior.
+            as_strided_ops = [
+                torch.ops.aten.as_strided.default,
+                torch.ops.aten.as_strided_.default,
+                torch.ops.aten.as_strided_scatter.default,
+                torch.ops.aten.resize.default,
+                torch.ops.aten.resize_as.default,
+            ]
+            is_output = any(user.op == "output" for user in n.users)
+            is_input_for_as_strided = any(
+                user.target in as_strided_ops for user in n.users
+            )
+            if n.meta.get("inductor_realize_to_strides", False) and isinstance(
+                result, TensorBox
+            ):
+                result.realize()
+                strides = n.meta["val"].stride()
+                sym_strides = torch._inductor.utils.any_is_symbolic(*strides)
+                if (
+                    not hasattr(result, "get_stride")
+                    or result.get_stride() != strides
+                    and not sym_strides
+                ):
+                    stride_order = ir.get_stride_order(strides)
+                    result = ir.ExternKernel.require_stride_order(result, stride_order)
+            if (
+                is_output
+                and isinstance(result, TensorBox)
+                and isinstance(result.data, ir.BaseView)
+            ):
+                # Realize so that outputs are correctly aliased
+                result.realize()
+            if (is_output or is_input_for_as_strided) and isinstance(
+                n.meta["val"], torch.Tensor
+            ):
+                strides = n.meta["val"].stride()
+                if len(strides):
+                    allow_padding = (
+                        config.pad_outputs or n.name not in self.user_visible_outputs
+                    ) and not is_input_for_as_strided
+                    dense = torch._prims_common.is_non_overlapping_and_dense(
+                        n.meta["val"]
+                    )
+                    unbacked_symbols_in_strides = (
+                        len(free_unbacked_symbols(strides)) > 0
+                    )
+                    if (
+                        not unbacked_symbols_in_strides
+                        and dense
+                        and len(result.get_size()) == 4
+                        and n in self.nodes_prefer_channels_last
+                        and n.name not in self.user_visible_outputs
+                        and not is_input_for_as_strided
+                    ):
+                        strides = ir.FlexibleLayout.stride_ordered_for_memory_format(
+                            result.get_size(), torch.channels_last
+                        )
+                    if not unbacked_symbols_in_strides and len(strides):
+                        # To avoid converting possible view ops to a copy kernel, we use the previous
+                        # require_exact_strides to handle views. But ultimately it's better to require
+                        # the right strides at the tensor definition.
+                        if n.meta["val"]._is_view() or isinstance(
+                            result.data, ir.BaseView
+                        ):
+                            result = ir.ExternKernel.require_stride_order(
+                                result,
+                                ir.get_stride_order(strides),
+                                allow_padding=allow_padding,
+                            )
+                        else:
+                            strides = [
+                                s.node.expr if isinstance(s, torch.SymInt) else s
+                                for s in strides
+                            ]
+                            result = ir.ExternKernel.require_exact_strides(
+                                result, strides, allow_padding=allow_padding
+                            )
+            # Realize if (1) any user need inputs realized, or (2) there is
+            # already too many reads and rematerializing can be bad.
+            num_users = len(OrderedSet(n.users))
+            if num_users > 1 and isinstance(result, TensorBox):
+                for user in n.users:
+                    if user.target in needs_realized_inputs:
+                        result.realize_hint()
+                        # This inclusion is somewhat controversial (from
+                        # discussion between Horace, Natalia, and Elias).
+                        # Currently, it's not very clear why this is helpful.
+                        # The general idea here is that even though a node may
+                        # have FlexibleLayout, we still often *treat* it as if
+                        # it was contiguous. This appears to sometimes result in
+                        # suboptimal behavior.
+                        #
+                        # When we do a better job selecting layout, we should
+                        # revisit this.
+                        need_fixed_layout = [
+                            torch.ops.aten.convolution_backward.default,
+                            torch.ops.aten.mm.default,
+                            torch.ops.aten._int_mm.default,
+                        ]
+                        need_fixed_channels_last_layout = []
+                        if not self.layout_opt:
+                            need_fixed_layout.append(torch.ops.aten.convolution.default)
+                        if torch._C._has_mkldnn:
+                            need_fixed_layout += [
+                                torch.ops.mkldnn._linear_pointwise.default,
+                                torch.ops.mkldnn._linear_pointwise.binary,
+                                torch.ops.aten.mkldnn_rnn_layer.default,
+                                torch.ops.onednn.qlinear_pointwise.default,
+                                torch.ops.onednn.qlinear_pointwise.tensor,
+                                torch.ops.onednn.qlinear_pointwise.binary,
+                                torch.ops.onednn.qlinear_pointwise.binary_tensor,
+                            ]
+                            need_fixed_channels_last_layout += [
+                                torch.ops.mkldnn._convolution_pointwise.default,
+                                torch.ops.mkldnn._convolution_pointwise.binary,
+                                torch.ops.mkldnn._convolution_pointwise_.binary,
+                                torch.ops.mkldnn._convolution_transpose_pointwise.default,
+                                torch.ops.onednn.qconv2d_pointwise.default,
+                                torch.ops.onednn.qconv2d_pointwise.binary,
+                            ]
+                            if torch._C.has_mkl:
+                                need_fixed_layout += [torch.ops.mkl._mkl_linear.default]
+                        if user.target in need_fixed_layout:
+                            result = ir.ExternKernel.require_stride_order(
+                                result,
+                                ir.get_stride_order(n.meta["val"].stride()),
+                                allow_padding=True,
+                            )
+                        if (
+                            user.target in need_fixed_channels_last_layout
+                            and n is user.args[0]
+                        ):
+                            result = ir.ExternKernel.require_stride_order(
+                                result,
+                                ir.get_stride_order(
+                                    make_channels_last_strides_for(n.meta["val"].shape)
+                                ),
+                            )
+                    if user.op == "output":
+                        if isinstance(result.data.data, (Pointwise, Reduction)):
+                            result.realize()
+                # TODO(jansel): introduce a store vs inline choice
+                result.mark_reuse(len(n.users))
+            # Realize if the IRNode already has accumulated lots of reads
+            if isinstance(result, TensorBox) and result.has_exceeded_max_reads():
+                # Prevent excessive accumulation in a computed buffer, when
+                # there are multiple branches each with small number of memory
+                # reads, but they converge to a user.
+                result.realize_hint()
+            # Realize if a Pointwise has too much stuff to be inlined.
+            # As this may cause RecursionError during Inductor's evaluation.
+            if isinstance(result, TensorBox) and isinstance(result.data, StorageBox):
+                curr = result.data.data
+                if isinstance(curr, Pointwise):
+                    # Use inner fn as a rough proxy. Good enough.
+                    if curr.has_large_inner_fn():
+                        result.realize()
+        # This is not complete, but it doesn't have to be: origin_node
+        # tracking is best effort.  The logic here critically relies on direct
+        # TensorBox -> StorageBox denoting a non-view; we don't bother trying
+        # to get views to work.  Feel free to add any extra cases as needed.
+        #
+        # Note: we can't YOLO tree_map over this result, because if there are
+        # buffers or a view involved, we might not be able to validly assign
+        # the origin_node here.
+        if isinstance(result, TensorBox) and isinstance(result.data, ir.StorageBox):
+            if isinstance(result.data.data, ir.Loops):
+                result.data.data.origin_node = n
+            elif isinstance(result.data.data, ir.Buffer):
+                result.data.data.origin_node = n
+                if isinstance(result.data.data, ir.ComputedBuffer) and isinstance(
+                    result.data.data.data, ir.Loops
+                ):
+                    result.data.data.data.origin_node = n
+                # Not really multi-output, can straightforwardly recurse in
+                elif (
+                    isinstance(result.data.data, ir.MultiOutput)
+                    and not result.data.data.indices
+                ):
+                    if isinstance(result.data.data.inputs[0], ir.Buffer):
+                        result.data.data.inputs[0].origin_node = n
+        self.register_users_of(result)
+        new_unbacked_defs: OrderedSet[sympy.Symbol] = OrderedSet()
+        for buf in self.buffers[buffer_watermark:]:
+            new_unbacked_defs |= buf.get_unbacked_symbol_defs()
+        for op in self.operations[operation_watermark:]:
+            new_unbacked_defs |= op.get_unbacked_symbol_defs()
+        def format_new_defs() -> str:
+            r = []
+            for buf in self.buffers[buffer_watermark:]:
+                r.append(
+                    f"unbacked_symbol_defs={buf.get_unbacked_symbol_defs()} in:\n{buf}\n"
+                )
+            for op in self.operations[operation_watermark:]:
+                r.append(
+                    f"unbacked_symbol_defs={op.get_unbacked_symbol_defs()} in:\n{op}\n"
+                )
+            return "***\n".join(r)
+        if n.op != "placeholder":
+            # Note [Backwards runtime asserts]
+            # Backwards poses an interesting problem for deferred runtime
+            # asserts.  In the easy case, we may solely close over data
+            # dependent sized tensors, and there are no binding sites for
+            # unbacked SymInts.  In this case, we can just drop all the
+            # runtime asserts on the floor: no non-placeholder bindings, no
+            # problem.
+            #
+            # However, it is *possible* for a fresh runtime assert to show up
+            # between forwards and backwards.  Right now, the freezing process
+            # that happens when we lower forwards means that we will freeze
+            # runtime asserts, and then the moment the backwards lowering
+            # process attempts to add a new deferred runtime assert, we will
+            # fail.  Let's say you remove that assert.  Now when we get here,
+            # we need to make sure we actually emit these asserts (because we
+            # can't emit them in forwards, we already compiled it).  So we
+            # have to do something here.  But we don't want to reemit ALL
+            # deferred runtime asserts, we only want to emit the NEW ones.
+            # Therefore needing some sort of stratification in the ShapeEnv.
+            # This is all doable, it just hasn't been done yet.
+            shape_env = V.graph.sizevars.shape_env
+            def make_assert(expr: Expr, msg: str) -> None:
+                assert_op = ir.AssertScalar(expr, msg)
+                self.register_buffer(assert_op, set_name=True)
+                self.register_operation(assert_op)
+            for i0 in new_unbacked_defs:
+                ras = self.ras_by_symbol.pop(i0, [])
+                # NB: size-like not needed, we won't retrace
+                vr = shape_env.var_to_range[i0]
+                if not shape_env._default_unspecified_value_range().issubset(vr):
+                    def is_convertible(s: Expr) -> bool:
+                        if s in (int_oo, -int_oo):
+                            return False
+                        try:
+                            int(s)
+                            return True
+                        except TypeError:
+                            return False
+                    if is_convertible(vr.lower):
+                        make_assert(i0 >= vr.lower, f"{i0} >= {vr.lower}")
+                    if is_convertible(vr.upper):
+                        make_assert(i0 <= vr.upper, f"{i0} <= {vr.upper}")
+                for ra in ras:
+                    fvs = free_unbacked_symbols(ra.expr)
+                    missing = fvs - self.bound_unbacked_symbols
+                    if missing:
+                        i1 = min(missing, key=str)
+                        self.ras_by_symbol.setdefault(i1, []).append(ra)
+                    else:
+                        make_assert(ra.expr, f"{ra.expr}")
+            self.bound_unbacked_symbols |= new_unbacked_defs
+            unbacked_bindings = resolve_unbacked_bindings(
+                V.graph.sizevars.shape_env, n.meta.get("unbacked_bindings", {})
+            )
+            # When we do lowering, it is possible we reallocate unbacked SymInts.
+            # So we need to line up the unbacked SymInts when performing the test
+            # here
+            #
+            # In principle, we could permit lowering to introduce MORE unbacked
+            # SymInts: as long as all the old unbacked ones are accounted for,
+            # it's fine for inductor to introduce extra calls to item()/unbacked()
+            # whatever.  This actually happens in practice when an unbacked SymInt
+            # gets memoized away; naively, when Inductor reprocesses a kernel, it
+            # doesn't know that the memo still applies, and ends up allocating a
+            # new symbol.  However, this is generally a bad thing: we may still
+            # end up needing to test equalities on the symbols, and a fresh
+            # symbol is likely to hit lots of GuardOnDataDependent errors that
+            # we already know facts for.
+            renamed_unbacked_bindings = OrderedSet(
+                V.fake_mode.shape_env.unbacked_renamings.get(s, s)
+                for s in unbacked_bindings.keys()
+            )
+            assert new_unbacked_defs >= renamed_unbacked_bindings, (
+                f"failed {new_unbacked_defs} >= {renamed_unbacked_bindings} (inductor >= fx)\n"
+                f"fx node is: {n.format_node()}\n"
+                f"new operations are:\n\n{format_new_defs()}"
+            )
+        return result
+    def validate_can_generate_cpp_wrapper(self) -> None:
+        if config.disable_cpp_codegen:
+            raise CppWrapperCodeGenError("C++ codegen is disabled")
+        if sys.platform not in ["linux", "darwin", "win32"]:
+            raise CppWrapperCodeGenError(f"Unsupported platform {sys.platform}")
+        for value in self.graph_inputs.values():
+            dtype = None
+            if isinstance(value, TensorBox):
+                dtype = value.get_dtype()
+            elif isinstance(
+                value, (sympy.Symbol, sympy.Expr, sympy.core.numbers.Integer)
+            ):
+                dtype = may_get_constant_buffer_dtype(value)
+            if not supported_dtype_of_cpp_wrapper(dtype, self.cuda):
+                raise CppWrapperCodeGenError(f"Unsupported input dtype {dtype}")
+    def init_wrapper_code(self) -> None:
+        self.cuda = "cuda" in self.device_types
+        if self.cpp_wrapper:
+            self.validate_can_generate_cpp_wrapper()
+        device_types = self.device_types.copy()
+        device_types.discard("cpu")
+        device_types.discard("meta")
+        # TODO(Eikan): Only support mixing cpu and other device now.
+        assert len(device_types) <= 1, "Does not support mixing {}".format(
+            "+".join(device_types)
+        )
+        only_cpu = len(device_types) == 0
+        device_type = "cpu" if only_cpu else device_types.pop()
+        self.device_ops = get_device_op_overrides(device_type)
+        wrapper_code_gen_cls = get_wrapper_codegen_for_device(
+            device_type, self.cpp_wrapper
+        )
+        assert wrapper_code_gen_cls is not None, f"Device {device_type} not supported"
+        self.wrapper_code = wrapper_code_gen_cls()
+        if self.const_module:
+            # If we have const module, we could reuse the kernels
+            # This could avoid duplication and save time on doing recompilation (if Triton.)
+            self.wrapper_code._names_iter = self.const_module.wrapper_code._names_iter
+            self.wrapper_code.src_to_kernel = (
+                self.const_module.wrapper_code.src_to_kernel
+            )
+    def codegen_with_cpp_wrapper(self) -> Tuple[str, List[Tuple[int, Node]]]:
+        """
+        For CPU, the cpp wrapper codegen is done in one pass.
+        For GPU, the cpp wrapper codegen is done in two steps: JIT-compile the model with python
+        wrapper code and run it to generate autotuned kernel binaries in the first pass; and then
+        generate cpp wrapper code and compile it to a dynamic library in the second pass.
+        Returns whatever self.codegen() returns for the final (cpp wrapper) pass.
+        """
+        if "cuda" in self.device_types:
+            # first pass
+            self.cpp_wrapper = False
+            # Although triton.store_cubin was set in compile_fx, the backward pass didn't pick
+            # that up. In theory it should work by only setting triton.store_cubin to True here,
+            # but that will cause a problem when use_runtime_constant_folding is set.
+            with config.patch({"triton.store_cubin": True}):
+                compiled = self.compile_to_module().call
+            if not config.triton.autotune_at_compile_time:
+                # Turn symbolic / fake example values into concrete ones so the
+                # python-wrapper module can actually be executed.
+                def materialize(
+                    x: Union[torch.SymInt, torch.SymFloat, torch.Tensor]
+                ) -> Union[int, float, torch.Tensor]:
+                    if x is None:
+                        return None
+                    elif isinstance(x, (torch.SymInt, torch.SymFloat)):
+                        # Need concrete value to run dynamic shapes and tune the result
+                        return x.node.hint
+                    elif isinstance(x, FakeTensor):
+                        return defake(x)
+                    else:
+                        assert isinstance(
+                            x, torch.Tensor
+                        ), "Unknown type when creating real inputs" + str(type(x))
+                        return x
+                tracing_context = torch._guards.TracingContext.try_get()
+                if tracing_context is not None and not isinstance(
+                    V.real_inputs, NullHandler
+                ):
+                    if tracing_context.output_strides:
+                        tracing_context.output_strides.clear()
+                    params_flat = [
+                        param
+                        for param in tracing_context.params_flat  # type: ignore[union-attr]
+                        if param is not None
+                    ]
+                    real_inputs = [
+                        materialize(x)
+                        for x in itertools.chain(params_flat, V.real_inputs)
+                    ]
+                else:
+                    # In the backward pass, V.real_inputs is not set.
+                    # Generating random inputs based on self.example_inputs sometimes can be problematic,
+                    # e.g. illegal memory access. A comprehensive fix is to autotune in a separate process.
+                    real_inputs = [
+                        materialize(x)
+                        for x in (
+                            self.example_inputs
+                            if isinstance(V.real_inputs, NullHandler)
+                            else V.real_inputs
+                        )
+                    ]
+                if self.mutated_inputs:
+                    from .compile_fx import clone_preserve_strides
+                    mutated_input_idxs = [
+                        idx
+                        for idx, name in enumerate(self.graph_inputs)
+                        if name in self.mutated_inputs
+                        and isinstance(real_inputs[idx], torch.Tensor)
+                    ]
+                    for idx in mutated_input_idxs:
+                        # clone mutated Tensor inputs to avoid mutating them in
+                        # the first pass of the CPP wrapper-based compilation, as
+                        # this will lead to a side effect on the example inputs:
+                        # e.g. if torch.compile(f)(x) is called on input-mutating
+                        # f, the inputs x will be mutated twice in the process:
+                        # once here, and again when running the compiled model;
+                        # this will also lead to a numerically incorrect output
+                        mutated_inp = real_inputs[idx]
+                        assert isinstance(mutated_inp, torch.Tensor)
+                        real_inputs[idx] = clone_preserve_strides(mutated_inp)
+                        del mutated_inp
+                # Run the first-pass module once on the concrete inputs.
+                with torch.utils._python_dispatch._disable_current_modes():
+                    compiled(real_inputs)
+                del real_inputs
+            # second pass
+            self.cpp_wrapper = True
+            # Reset the bookkeeping filled in by the first pass before
+            # running codegen again.
+            self.removed_buffers.clear()
+            self.removed_operations.clear()
+            self.inplaced_to_remove.clear()
+            V.graph.sizevars.precomputed_replacements.clear()
+            V.graph.sizevars.inv_precomputed_replacements.clear()
+            with config.patch({"triton.autotune_at_compile_time": False}):
+                return self.codegen()
+        else:
+            # cpu
+            return self.codegen()
+    def codegen(self) -> Tuple[str, List[Tuple[int, Node]]]:
+        """
+        Run codegen for this graph and return the generated wrapper output.
+        Initializes the wrapper code object, builds a Scheduler over
+        self.operations, lets the scheduler emit code, and returns the
+        result of wrapper_code.generate(self.is_inference).
+        """
+        from .scheduler import Scheduler
+        self.init_wrapper_code()
+        self.scheduler = Scheduler(self.operations)
+        V.debug.draw_orig_fx_graph(self.orig_gm, self.scheduler.nodes)
+        # This graph is pushed for the duration of scheduler codegen and
+        # wrapper generation, then popped again below.
+        self.wrapper_code.push_codegened_graph(self)
+        self.scheduler.codegen()
+        log.debug(
+            "Finished codegen for all nodes. The list of kernel names available: %s",
+            V.graph.all_codegen_kernel_names,
+        )
+        result = self.wrapper_code.generate(self.is_inference)
+        self.wrapper_code.pop_codegened_graph()
+        return result
+    def codegen_subgraph(self, parent_graph: "GraphLowering") -> None:
+        """
+        This is a more compact version of the `codegen()` above
+        where we codegen this graph as a subgraph of some parent
+        graph. The parent graph is passed as an argument: the
+        intention is to inline codegening of the subgraph in
+        the parent graph's wrapper code (including the generated
+        kernels). The wrapper code is not finalized (via `.generate()`
+        call), as this will be done in the parent graph's `codegen()`.
+        """
+        from .scheduler import Scheduler
+        # Share the parent's wrapper state instead of creating our own.
+        self.wrapper_code = parent_graph.wrapper_code
+        self.device_ops = parent_graph.device_ops
+        self.cpp_wrapper = parent_graph.cpp_wrapper
+        self.scheduler = Scheduler(self.operations)
+        self.scheduler.codegen()
+    def count_bytes(
+        self,
+    ) -> Tuple[
+        int, List[Tuple[BaseSchedulerNode, int]], List[Tuple[BaseSchedulerNode, float]]
+    ]:
+        total_bytes = 0
+        node_counts = []
+        node_runtimes = []
+        for node in self.scheduler.nodes:
+            num_bytes = node.get_read_write_buffers_sizes()
+            total_bytes += num_bytes
+            node_counts.append((node, num_bytes // 4))
+            node_runtimes.append((node, node.get_estimated_runtime()))
+        return total_bytes, node_counts, node_runtimes
+    @staticmethod
+    def save_output_code(code: str) -> None:
+        # No-op to be patched for unit tests
+        pass
+    def compile_to_module(self) -> ModuleType:
+        with dynamo_timed(
+            "GraphLowering.compile_to_module", phase_name="code_gen", fwd_only=False
+        ):
+            return self._compile_to_module()
+    def _compile_to_module(self) -> ModuleType:
+        """
+        Generate the wrapper source, write it through PyCodeCache, load it as
+        a module and return that module.
+        Also records the cache key, path and line map on self
+        (cache_key / cache_path / cache_linemap).
+        """
+        from .codecache import PyCodeCache
+        code, linemap = (
+            self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
+        )
+        GraphLowering.save_output_code(code)
+        output_code_log.debug("Output code: \n%s", code)
+        try:
+            # Replace each node in the line map by its stack trace.
+            linemap = [(line_no, node.stack_trace) for line_no, node in linemap]  # type: ignore[misc]
+            key, path = PyCodeCache.write(code)
+        except Exception:
+            # Still emit the structured trace on failure, then re-raise.
+            trace_structured(
+                "inductor_output_code",
+                # Just omit the filename, I still want the code though!
+                payload_fn=lambda: code,
+            )
+            raise
+        else:
+            trace_structured(
+                "inductor_output_code",
+                lambda: {"filename": path},
+                payload_fn=lambda: code,
+            )
+        mod = PyCodeCache.load_by_key_path(
+            key,
+            path,
+            linemap=linemap,  # type: ignore[arg-type]
+            attrs={**self.constants, **self.torchbind_constants},
+        )
+        self.cache_key = key
+        self.cache_path = path
+        self.cache_linemap = linemap  # type: ignore[assignment]
+        # Logged twice as per https://github.com/pytorch/pytorch/pull/99038#discussion_r1167826029
+        # TODO. Revisit this once the logging API is more mature
+        assert mod.__file__ is not None
+        log_module_code(mod.__file__)
+        log.debug("Output code written to: %s", mod.__file__)
+        output_code_log.info("Output code written to: %s", mod.__file__)
+        if config.benchmark_kernel:
+            print(f"Compiled module path: {mod.__file__}", file=sys.stderr)
+        V.debug.output_code(mod.__file__)
+        V.debug.copy(os.path.splitext(mod.__file__)[0] + ".debug")
+        return mod
+    def compile_to_fn(self) -> Any:
+        if self.aot_mode:
+            from .codecache import AotCodeCompiler
+            assert self.cpp_wrapper, "AOT mode only supports C++ wrapper"
+            code, linemap = self.codegen_with_cpp_wrapper()
+            output_code_log.debug("Output code: \n%s", code)
+            serialized_extern_kernel_nodes = None
+            if self.extern_kernel_nodes:
+                serialized_extern_kernel_nodes = self.extern_node_serializer(
+                    self.extern_kernel_nodes
+                )
+                output_code_log.debug(
+                    "Serialized Extern Kernel Nodes: \n%s",
+                    serialized_extern_kernel_nodes,
+                )
+            # Directly return the file path with the compiled code
+            return AotCodeCompiler.compile(
+                self, code, serialized_extern_kernel_nodes, cuda=self.cuda
+            )
+        else:
+            return self.compile_to_module().call
+    def get_output_names(self) -> List[str]:
+        return [
+            node.get_name()
+            for node in self.graph_outputs
+            if not isinstance(node, ir.NoneAsConstantBuffer)
+            and not isinstance(node, ir.ShapeAsConstantBuffer)
+        ]
+    def is_unspec_arg(self, name: str) -> bool:
+        # dynamo wraps unspec variable as 0d CPU tensor,
+        # need to convert to scalar during codegen (triton only)
+        return (
+            name in self.graph_inputs.keys()
+            and self.graph_inputs[name].get_numel() == 1
+            and self.graph_inputs[name].get_device().type == "cpu"
+        ) or name in self.zero_dim_cpu_tensor_list

.venv/lib/python3.11/site-packages/torch/_inductor/hooks.py ADDED Viewed

	@@ -0,0 +1,30 @@

+# mypy: allow-untyped-defs
+import contextlib
+from typing import Callable, List, TYPE_CHECKING
+if TYPE_CHECKING:
+    import torch
+# Executed in the order they're registered
+INTERMEDIATE_HOOKS: List[Callable[[str, "torch.Tensor"], None]] = []
+@contextlib.contextmanager
+def intermediate_hook(fn):
+    INTERMEDIATE_HOOKS.append(fn)
+    try:
+        yield
+    finally:
+        INTERMEDIATE_HOOKS.pop()
+def run_intermediate_hooks(name, val):
+    global INTERMEDIATE_HOOKS
+    hooks = INTERMEDIATE_HOOKS
+    INTERMEDIATE_HOOKS = []
+    try:
+        for hook in hooks:
+            hook(name, val)
+    finally:
+        INTERMEDIATE_HOOKS = hooks

.venv/lib/python3.11/site-packages/torch/_inductor/index_propagation.py ADDED Viewed

	@@ -0,0 +1,373 @@

+# mypy: allow-untyped-defs
+"""This file implements the IndexPropagation ops handler, which wraps an
+underlying handler to add a limited form of constant propagation, as well as
+propagation of sympy expressions downstream of ops.index_expr calls.
+For example, say we have the IR:
+   tmp0 = ops.index_expr(x, torch.int32)
+   tmp1 = ops.constant(2, torch.int32)
+   tmp2 = ops.mul(tmp0, tmp1)
+   tmp3 = ops.indirect_indexing(tmp2, x_size)
+   tmp4 = ops.load("buf0", tmp3)
+The underlying handler would just see:
+   ops.load("buf0", x * 2)
+This is limited by the set of operators handled in the sympy expression
+printers. So simple operations like minimum and maximum cannot be translated to
+SymPy expressions yet, despite sympy.Min and sympy.Max existing.
+"""
+import itertools
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Literal, Optional, overload, Tuple, Union
+from typing_extensions import TypeAlias
+import sympy
+import torch
+from torch._prims_common import dtype_to_type, is_integer_dtype
+from torch.utils._sympy.functions import FloorDiv, ModularIndexing, Where
+from torch.utils._sympy.value_ranges import bound_sympy, ValueRanges
+from .sizevars import evaluate_expr
+from .utils import generate_assert
+from .virtualized import V
+_ExprType = Union[sympy.Expr, float, int, bool]
+def _is_constant(val: _ExprType):
+    if isinstance(val, sympy.Basic):
+        return val.is_number
+    return isinstance(val, (int, float, bool))
+def upper_bound(val: _ExprType):
+    return bound_sympy(val).upper if isinstance(val, sympy.Expr) else val
+@dataclass
+class TypedExpr:
+    """A SymPy expression with associated type"""
+    expr: _ExprType
+    dtype: torch.dtype
+    def is_constant(self):
+        return _is_constant(self.expr)
+    def __post_init__(self):
+        if _is_constant(self.expr):
+            self.expr = dtype_to_type(self.dtype)(self.expr)
+class SymPyOps:
+    """An ops handler where all IR values are SymPy expressions
+    When a value cannot be represented as a SymPy expression, the method is
+    either not defined, or returns NotImplemented
+    """
+    @staticmethod
+    def identity(value: Any) -> Any:
+        return value
+    @staticmethod
+    def constant(value: Union[int, float, bool], dtype: torch.dtype) -> TypedExpr:
+        return TypedExpr(value, dtype)
+    @staticmethod
+    def index_expr(value: Union[sympy.Expr, int], dtype: torch.dtype) -> TypedExpr:
+        return TypedExpr(value, dtype)
+    @staticmethod
+    def to_dtype(
+        value: TypedExpr,
+        dtype: torch.dtype,
+        src_dtype: Optional[torch.dtype] = None,
+        use_compute_types: bool = False,
+    ) -> TypedExpr:
+        return TypedExpr(value.expr, dtype)
+    @staticmethod
+    def abs(x: TypedExpr) -> TypedExpr:
+        return TypedExpr(abs(x.expr), x.dtype)  # type: ignore[arg-type]
+    @staticmethod
+    def square(x: TypedExpr) -> TypedExpr:
+        return TypedExpr(x.expr * x.expr, x.dtype)
+    @staticmethod
+    def add(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        return TypedExpr(x.expr + y.expr, result_type)
+    @staticmethod
+    def sub(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        return TypedExpr(x.expr - y.expr, result_type)
+    @staticmethod
+    def mul(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        return TypedExpr(x.expr * y.expr, result_type)
+    @staticmethod
+    def neg(x: TypedExpr) -> TypedExpr:
+        return TypedExpr(-x.expr, x.dtype)
+    @staticmethod
+    def floordiv(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        if not is_integer_dtype(result_type):
+            return NotImplemented
+        return TypedExpr(FloorDiv(x.expr, y.expr), result_type)
+    @staticmethod
+    def mod(x: TypedExpr, y: TypedExpr) -> Optional[TypedExpr]:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        if not is_integer_dtype(result_type):
+            return NotImplemented
+        result_expr = ModularIndexing(x.expr, sympy.Integer(1), y.expr)
+        return TypedExpr(result_expr, result_type)
+    @staticmethod
+    def remainder(x: TypedExpr, y: TypedExpr) -> Optional[TypedExpr]:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        if not is_integer_dtype(result_type):
+            return NotImplemented
+        x_expr = sympy.sympify(x.expr)
+        y_expr = sympy.sympify(y.expr)
+        # In these cases, remainder in Python == remainder in C++, so this transformation
+        # is sound
+        if (
+            x_expr.is_nonnegative is not None
+            and x_expr.is_nonnegative == y_expr.is_positive
+        ):
+            result_expr = ModularIndexing(x.expr, sympy.Integer(1), y.expr)
+            return TypedExpr(result_expr, result_type)
+        return NotImplemented
+    @staticmethod
+    def minimum(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        return TypedExpr(sympy.Min(x.expr, y.expr), result_type)
+    @staticmethod
+    def maximum(x: TypedExpr, y: TypedExpr) -> TypedExpr:
+        result_type = torch.promote_types(x.dtype, y.dtype)
+        return TypedExpr(sympy.Max(x.expr, y.expr), result_type)
+@dataclass
+class IndexPropVar:
+    value: Any  # Either an IR value, or TypedExpr if is_symbolic is true
+    is_symbolic: bool = False
+    @staticmethod
+    def new_symbolic(expr: TypedExpr) -> "IndexPropVar":
+        return IndexPropVar(expr, is_symbolic=True)
+    def __post_init__(self):
+        assert not self.is_symbolic or isinstance(
+            self.value, TypedExpr
+        ), "Symbolic IndexPropVar must contain a TypedExpr"
+IndexPropResult: TypeAlias = Union[IndexPropVar, Tuple["IndexPropResult", ...]]
+class IndexPropagation:
+    """Ops wrapper that tries to propagate constant and index_expr values through the computation.
+    This aims to maximize the compile time simplification possible, and convert
+    indirect indexing from arange into normal static indexing.
+    """
+    def __init__(
+        self,
+        inner: Any,
+        iter_ranges: Dict[sympy.Symbol, sympy.Expr],
+        indirect_var_ranges: Dict[sympy.Symbol, sympy.Expr],
+    ) -> None:
+        # `inner` is the wrapped ops handler that receives every call which
+        # cannot be kept symbolic.
+        self._inner = inner
+        self.shape_env = V.graph.sizevars.shape_env
+        # Each iteration variable ranges over [0, upper_bound(size) - 1].
+        var_to_range = {
+            k: ValueRanges(0, upper_bound(v) - 1) for k, v in iter_ranges.items()
+        }
+        self.var_to_range = tuple(
+            itertools.chain(self.shape_env.var_to_range.items(), var_to_range.items())
+        )
+        # NOTE: this is intentionally kept as a reference so the caller can
+        # update it in-place
+        self.indirect_var_ranges = indirect_var_ranges
+        # Axioms: 0 <= x < size for every iteration variable, plus whatever
+        # the shape environment provides.
+        axioms = []
+        for x, s in iter_ranges.items():
+            axioms.append(0 <= x)
+            axioms.append(x < s)
+        self.axioms = tuple(axioms) + self.shape_env.get_axioms()
+    def materialize_expr(self, expr: sympy.Expr, dtype: torch.dtype) -> Any:
+        # Construct a new constant/index_expr from the SymPy expression
+        if _is_constant(expr):
+            val = dtype_to_type(dtype)(expr)
+            return self._inner.constant(val, dtype)
+        return self._inner.index_expr(expr, dtype)
+    def unwrap(self, a: Union[Any, IndexPropVar]) -> Any:
+        # Turn IndexPropVar values (recursively through lists/tuples) back
+        # into values of the wrapped handler.
+        if isinstance(a, (list, tuple)):
+            return tuple(self.unwrap(v) for v in a)
+        if not isinstance(a, IndexPropVar):
+            return a
+        # Prefer the sympy representation if possible
+        if a.is_symbolic:
+            return self.materialize_expr(a.value.expr, a.value.dtype)
+        return a.value
+    def wrap(self, a) -> IndexPropResult:
+        # Inverse of unwrap for results coming from the wrapped handler:
+        # every leaf becomes a non-symbolic IndexPropVar.
+        if isinstance(a, (list, tuple)):
+            return tuple(self.wrap(v) for v in a)
+        return IndexPropVar(a)
+    @overload
+    def fallback(
+        self,
+        name: Literal["indirect_indexing"],
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+    ) -> IndexPropVar:
+        ...
+    @overload
+    def fallback(
+        self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> IndexPropResult:
+        ...
+    def fallback(
+        self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> IndexPropResult:
+        # Fallback to the wrapped handler
+        new_args = [self.unwrap(a) for a in args]
+        new_kwargs = {k: self.unwrap(v) for k, v in kwargs.items()}
+        return self.wrap(getattr(self._inner, name)(*new_args, **new_kwargs))
+    def propagate_sympy(
+        self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> IndexPropResult:
+        # Build a new SymPy expression from this ops call
+        def unwrap(a: Union[Any, IndexPropVar]) -> Any:
+            if not isinstance(a, IndexPropVar):
+                return a
+            return a.value
+        new_args = [unwrap(a) for a in args]
+        new_kwargs = {k: unwrap(v) for k, v in kwargs.items()}
+        new_expr = getattr(SymPyOps, name)(*new_args, **new_kwargs)
+        is_valid_expr = new_expr is not NotImplemented and (
+            # Inductor doesn't expect floating point in sympy expressions, but
+            # allow floating point constants to be propagated
+            new_expr.is_constant()
+            or new_expr.expr.is_integer
+        )
+        if not is_valid_expr:
+            return self.fallback(name, args, kwargs)
+        return IndexPropVar.new_symbolic(new_expr)
+    def __getattr__(self, name: str) -> Callable[..., IndexPropResult]:
+        # Any op not defined explicitly on this class is dispatched here:
+        # it stays symbolic only if SymPyOps implements it and every
+        # IndexPropVar argument is symbolic; otherwise it falls back.
+        def inner(*args: Any, **kwargs: Any) -> IndexPropResult:
+            if not hasattr(SymPyOps, name):
+                return self.fallback(name, args, kwargs)
+            var_arguments = [
+                a
+                for a in itertools.chain(args, kwargs.values())
+                if isinstance(a, IndexPropVar)
+            ]
+            if not all(v.is_symbolic for v in var_arguments):
+                return self.fallback(name, args, kwargs)
+            return self.propagate_sympy(name, args, kwargs)
+        return inner
+    def statically_true(self, e):
+        """
+        Given some iter_ranges, return a function that given an expression, returns whether
+        it is true or false using value ranges, guard knowledge and runtime_asserts.
+        FIXME I think this may not be entirely right, as we may not be able to use all runtime_asserts
+              If this is an issue, just use guards in `self.axioms`.
+              The proper way of handling this would be to have a global shape_env that adds
+              runtime_asserts as they happen in the code. Then, it should be used in SimplifyIndexing
+              to perform wrap_expr and in CSEProxy.check_bounds to elide upper / lower bounds also
+              for indirect_indexing
+        """
+        # Ranges are rebuilt on every call because indirect_var_ranges may
+        # have been updated in-place by the caller since construction.
+        var_to_range = (
+            *self.var_to_range,
+            *(
+                (k, ValueRanges(0, upper_bound(v) - 1))
+                for k, v in self.indirect_var_ranges.items()
+            ),
+        )
+        return evaluate_expr(self.shape_env, e, self.axioms, var_to_range)
+    def indirect_indexing(
+        self,
+        index: Union[Any, IndexPropVar],
+        size: Any,
+        check: bool = True,
+        wrap_neg=True,
+    ) -> Any:
+        if isinstance(index, IndexPropVar) and index.is_symbolic:
+            # If we find something we can convert into a direct indexing we do so
+            # We still need to (perhaps) wrap the expression and add bound checks
+            # We want to do this "constant folding", as we don't allow to fuse
+            # kernels into indirect indexing
+            expr = sympy.sympify(index.value.expr)
+            # TODO Perhaps move this logic to the simplify indexing pass
+            def wrap_expr(expr):
+                # Positive, negative, mixed
+                if self.statically_true(0 <= expr):
+                    return expr
+                elif self.statically_true(expr < 0):
+                    return expr + size
+                else:
+                    return Where(expr < 0, expr + size, expr)
+            # Sometimes it's easier to prove 0 <= expr than the weaker -size <= expr
+            can_prove_lower = self.statically_true(0 <= expr) or self.statically_true(
+                -size <= expr
+            )
+            can_prove_upper = self.statically_true(expr < size)
+            # The bounds above are proven on the unwrapped expression; the
+            # wrapping happens only afterwards.
+            if wrap_neg:
+                expr = wrap_expr(expr)
+            if generate_assert(check):
+                # Only request the bound checks that could not be proven.
+                self.fallback(
+                    "check_bounds",
+                    (expr, size),
+                    dict(lower=not can_prove_lower, upper=not can_prove_upper),
+                )
+            return expr
+        # Non-symbolic index: defer to the wrapped handler.
+        indirect_var = self.fallback(
+            "indirect_indexing", (index, size, check, wrap_neg), {}
+        ).value
+        return indirect_var

.venv/lib/python3.11/site-packages/torch/_inductor/inductor_prims.py ADDED Viewed

	@@ -0,0 +1,179 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+import logging
+from typing import Optional, Sequence
+import torch
+from torch import _prims, Tensor
+log = logging.getLogger(__name__)
+def make_prim(
+    schema: str,
+    impl_aten,
+    return_type=_prims.RETURN_TYPE.NEW,
+    doc: str = "",
+    tags: Optional[Sequence[torch.Tag]] = None,
+):
+    if isinstance(return_type, tuple):
+        def meta(*args, **kwargs):
+            return tuple(_prims.TensorMeta(o) for o in impl_aten(*args, **kwargs))
+    else:
+        def meta(*args, **kwargs):
+            return _prims.TensorMeta(impl_aten(*args, **kwargs))
+    return _prims._make_prim(
+        schema=schema,
+        return_type=return_type,
+        meta=meta,
+        impl_aten=impl_aten,
+        doc=doc,
+        tags=tags,
+    )
+def eager_force_stride(input_tensor: Tensor, stride) -> Tensor:
+    if input_tensor.stride() == stride:
+        return input_tensor
+    new_tensor = input_tensor.clone().as_strided(
+        input_tensor.shape,
+        stride,
+    )
+    new_tensor.copy_(input_tensor)
+    return new_tensor
+# Custom prims used for handling randomness
+# Eager impl: a 0-d tensor holding one random integer, created on `device`.
+seed = make_prim(
+    "inductor_seed(Device device) -> Tensor",
+    lambda device: torch.randint(2**63 - 1, [], device=device),
+    doc="create a fresh seed (one per call) for use with inductor_rand",
+    tags=(torch.Tag.nondeterministic_seeded,),
+)
+# Eager impl: a 1-d tensor of `count` random integers, created on `device`.
+seeds = make_prim(
+    "inductor_seeds(int count, Device device) -> Tensor",
+    lambda count, device: torch.randint(2**63 - 1, [count], device=device),
+    doc="Horizontal fusion of many inductor_seed() calls",
+    tags=(torch.Tag.nondeterministic_seeded,),
+)
+# Eager impl: plain indexing into the `seeds` tensor.
+lookup_seed = make_prim(
+    # if inductor_lookup_seed changes, update partitioners.py
+    "inductor_lookup_seed(Tensor seeds, int index) -> Tensor",
+    lambda seeds, index: seeds[index],
+    doc="Extract a single seed from the result of inductor_seeds()",
+)
+# Eager impl: calls the torch function named by `mode` with `size`; the seed
+# tensor is only consulted for its device.
+random = make_prim(
+    "inductor_random(SymInt[] size, Tensor seed, str mode) -> Tensor",
+    lambda size, seed, mode: getattr(torch, mode)(size, device=seed.device),
+    doc="torch.rand()/torch.randn() using backend-specific RNG that can be fused",
+)
+# Eager impl: torch.randint on the seed's device; the seed value itself is
+# not used.
+randint = make_prim(
+    "inductor_randint(SymInt low, SymInt high, SymInt[] size, Tensor seed) -> Tensor",
+    lambda low, high, size, seed: torch.randint(low, high, size, device=seed.device),
+    doc="torch.randint() using backend-specific RNG that can be fused",
+)
+# Eager impl is eager_force_stride.
+force_stride_order = make_prim(
+    "inductor_force_stride_order(Tensor input, SymInt[] stride) -> Tensor",
+    eager_force_stride,
+    doc="Force the stride order for input tensor. No-op if the input tensor already has the stride. Do a copy otherwise",
+)
+# Eager impl forwards to aten.index_put_.
+_unsafe_index_put_ = make_prim(
+    "_unsafe_index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)",
+    lambda self, indices, values, accumulate=False: torch.ops.aten.index_put_(
+        self, indices, values, accumulate
+    ),
+    doc="Unsafe index_put_ (doesn't issue device asserts)",
+)
+# NOTE(review): the eager impl is a separate multiply followed by an add;
+# the "without rounding" wording in `doc` presumably describes the compiled
+# lowering rather than this eager fallback - confirm.
+fma = make_prim(
+    "fma(Tensor a, Tensor b, Tensor c) -> Tensor",
+    lambda a, b, c: (a * b) + c,
+    doc="Fused multiply add: fma(a, b, c) -> (a * b) + c without rounding after the multiplication",
+)
+def _low_memory_max_pool2d_with_offsets_aten(
+    self,
+    kernel_size,
+    stride,
+    padding,
+    dilation,
+    ceil_mode,
+):
+    vals, indices = torch.ops.aten.max_pool2d_with_indices(
+        self, kernel_size, stride, padding, dilation, ceil_mode
+    )
+    input_width = self.shape[-1]
+    kernel_width = kernel_size[1]
+    bh_shape = [1] * self.ndim
+    bh_shape[-2] = -1
+    bh = torch.arange(indices.shape[-2], dtype=torch.int64, device=self.device).view(
+        bh_shape
+    )
+    bw_shape = [1] * self.ndim
+    bw_shape[-1] = -1
+    bw = torch.arange(indices.shape[-1], dtype=torch.int64, device=self.device).view(
+        bw_shape
+    )
+    hbase = bh * stride[0] - padding[0]
+    wbase = bw * stride[1] - padding[1]
+    ih = indices // input_width
+    iw = indices - (ih * input_width)
+    h_inc = ih - hbase
+    w_inc = iw - wbase
+    offsets = h_inc * kernel_width + w_inc
+    return vals, offsets.to(torch.int8)
+def _low_memory_max_pool2d_offsets_to_indices_aten(
+    offsets, kernel_width, input_width, stride, padding
+):
+    offsets = offsets.to(torch.int64)
+    h_inc = offsets // kernel_width
+    w_inc = offsets - (h_inc * kernel_width)
+    bh_shape = [1] * offsets.ndim
+    bh_shape[-2] = -1
+    bh = torch.arange(offsets.shape[-2], dtype=torch.int64, device=offsets.device).view(
+        bh_shape
+    )
+    bw_shape = [1] * offsets.ndim
+    bw_shape[-1] = -1
+    bw = torch.arange(offsets.shape[-1], dtype=torch.int64, device=offsets.device).view(
+        bw_shape
+    )
+    hbase = bh * stride[0] - padding[0]
+    wbase = bw * stride[1] - padding[1]
+    ih = hbase + h_inc
+    iw = wbase + w_inc
+    return ih * input_width + iw
+# Prim returning two new tensors (values and int8 window offsets); the eager
+# implementation is _low_memory_max_pool2d_with_offsets_aten.
+_low_memory_max_pool2d_with_offsets = make_prim(
+    "_low_memory_max_pool2d_with_offsets(Tensor self, SymInt[2] kernel_size, SymInt[2] stride,  SymInt[2] padding, SymInt[2] dilation, bool ceil_mode) -> (Tensor, Tensor)",  # noqa: B950
+    _low_memory_max_pool2d_with_offsets_aten,
+    return_type=(_prims.RETURN_TYPE.NEW, _prims.RETURN_TYPE.NEW),
+    doc="Instead of returning indices, returns indices offsets.",
+)
+# Companion prim that turns those offsets back into flat indices; the eager
+# implementation is _low_memory_max_pool2d_offsets_to_indices_aten.
+_low_memory_max_pool2d_offsets_to_indices = make_prim(
+    "_low_memory_max_pool2d_offsets_to_indices(Tensor self, SymInt kernel_w, SymInt input_w, SymInt[2] stride, SymInt[2] padding) -> Tensor",  # noqa: B950
+    _low_memory_max_pool2d_offsets_to_indices_aten,
+    doc="Convert small int offsets to regular indices.",
+)

.venv/lib/python3.11/site-packages/torch/_inductor/ir.py ADDED Viewed

The diff for this file is too large to render. See raw diff

.venv/lib/python3.11/site-packages/torch/_inductor/jagged_lowerings.py ADDED Viewed

	@@ -0,0 +1,264 @@

+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+from typing import List, Optional, Tuple, Union
+import sympy
+import torch
+from .ir import Pointwise, TensorBox
+from .lowering import fallback_handler, is_integer_type, register_lowering
+from .virtualized import ops
+# pyre-ignore[2,3]
+def dense_idx_to_jagged_idx(batch_idx, seq_idx, offsets_loader, jagged_len):
+    # jagged_len + 1 is used as the upper bound,
+    # because the last sequence length may be zero
+    begin_idx = ops.indirect_indexing(
+        offsets_loader([batch_idx]),
+        jagged_len + 1,
+    )
+    end_idx = offsets_loader([batch_idx + 1])
+    jagged_idx = begin_idx + seq_idx
+    return jagged_idx, end_idx
+def get_inverse_offsets(
+    offsets: TensorBox,
+    jagged_len: Union[int, sympy.Expr],
+    realize: bool = True,
+) -> TensorBox:
+    """
+    Returns "inverse_offsets" - the inverse of the offsets array.
+    offsets maps batch index (dense) to jagged index (i.e. offset into jagged tensor).
+    inverse_offsets maps jagged index to batch index.
+    e.g. for offsets [0, 3, 4, 9, 10] this will return
+    inverse_offsets = [0, 0, 0, 1, 2, 2, 2, 2, 2, 3]
+    For the given offsets, the computed inverse_offsets are cached
+    on the first call and reused in the further calls.
+    Args:
+        offsets: the offsets tensor. It is realized by this function, and the
+            result is cached on it as the ``inverse_offsets`` attribute.
+        jagged_len: used as the range (length) of the computed inverse_offsets.
+        realize: when True, the computed inverse_offsets is realized before
+            it is cached and returned.
+    NOTE(review): the cache lookup below does not take ``jagged_len`` or
+    ``realize`` into account, so a cached result is returned unchanged even
+    if a later call passes different values - confirm callers always use
+    consistent arguments for a given offsets tensor.
+    """
+    if hasattr(offsets, "inverse_offsets"):
+        # inverse_offsets are already computed
+        # for these offsets: can reuse
+        return offsets.inverse_offsets
+    # ops.bucketize takes offsets.get_name() which doesn't exist on Pointwise
+    # kernels, i.e. we need to realize it before using. In other words, we need
+    # offsets to be in global memory so that we can binary search over the
+    # entire tensor
+    offsets.realize()
+    device: torch.device = offsets.get_device()
+    dtype: torch.dtype = offsets.get_dtype()
+    # pyre-ignore[2,3]
+    def inner_fn(index):
+        # Computes the batch index that owns jagged position index[0].
+        idx = index[0]
+        bucket = ops.bucketize(
+            values=ops.index_expr(idx, dtype),
+            offsets_name=offsets.get_name(),
+            offsets_size=offsets.get_size()[0],
+            indexing_dtype=dtype,
+            right=True,
+        )
+        # ops.bucketize above returns 1-based bucket indices,
+        # but we need 0-based, hence we subtract 1 from batch
+        return bucket - 1
+    # The result reuses the device and dtype of offsets.
+    inverse_offsets = Pointwise.create(
+        device=device,
+        dtype=dtype,
+        inner_fn=inner_fn,
+        ranges=[jagged_len],
+    )
+    if realize:
+        # "freeze" the node so that it doesn't get inlined downstream.
+        inverse_offsets.realize()
+    # cache inverse_offsets for further reuse
+    offsets.inverse_offsets = inverse_offsets  # type: ignore[attr-defined]
+    return inverse_offsets
+def jagged_idx_to_dense_idx(
+    jagged_idx,  # pyre-ignore[2]
+    inverse_offsets_loader,  # pyre-ignore[2]
+    offsets_loader,  # pyre-ignore[2]
+    batch_size: Union[int, sympy.Expr],
+    max_seq_len: Union[int, sympy.Expr],
+    offsets_dtype: torch.dtype,
+) -> Tuple[sympy.Expr, sympy.Expr]:
+    batch_idx = ops.indirect_indexing(
+        inverse_offsets_loader([jagged_idx]),
+        batch_size + 1,
+    )
+    batch_start = offsets_loader([batch_idx])
+    seq = ops.index_expr(jagged_idx, offsets_dtype) - batch_start
+    # check=False because there may be sequences longer than max_seq_len
+    seq_idx = ops.indirect_indexing(seq, max_seq_len, check=False)
+    return batch_idx, seq_idx
+def register_jagged_ops():
+    """
+    Register lowerings for the jagged <-> padded-dense conversion ops.
+    Each lowering builds a Pointwise node only for the case checked in its
+    guard (one jagged dimension, CUDA device, 1-D integer offsets on the same
+    device, expected ranks); every other case is routed to fallback_handler.
+    """
+    # pyre-ignore[56]
+    @register_lowering(torch.ops.aten._jagged_to_padded_dense_forward.default)
+    def _jagged_to_padded_dense_forward(
+        jagged_values: TensorBox,
+        jagged_offsets: List[TensorBox],
+        max_lengths: List[int],  # list of ints/SymInts
+        padding_value: float = 0.0,
+    ) -> TensorBox:
+        """
+        Lower jagged -> padded dense. The output has size
+        [batch_size, max_seq_len, embedding_len]; positions past the end of a
+        sequence are filled with padding_value.
+        """
+        device = jagged_values.get_device()
+        dtype = jagged_values.get_dtype()
+        jagged_values_size = jagged_values.get_size()
+        # only handle the common case of a single jagged dimension
+        if (
+            len(jagged_offsets) != 1
+            or device.type != "cuda"
+            or device != jagged_offsets[0].get_device()
+            or len(jagged_values_size) != 2
+            or len(jagged_offsets[0].get_size()) != 1
+            or len(max_lengths) != len(jagged_offsets)
+            or not is_integer_type(jagged_offsets[0])
+        ):
+            return fallback_handler(
+                torch.ops.aten._jagged_to_padded_dense_forward.default,
+                add_to_fallback_set=False,
+            )(
+                jagged_values,
+                jagged_offsets,
+                max_lengths,
+                padding_value,
+            )
+        offsets: TensorBox = jagged_offsets[0]
+        offsets_len = offsets.get_size()[0]
+        offsets_dtype = offsets.get_dtype()
+        # offsets has one more entry than there are batch elements
+        batch_size = offsets_len - 1
+        max_seq_len = max_lengths[0]
+        embedding_len = jagged_values_size[1]
+        jagged_len = jagged_values_size[0]
+        output_size = [batch_size, max_seq_len, embedding_len]
+        values_loader = jagged_values.make_loader()
+        offsets_loader = offsets.make_loader()
+        # pyre-ignore[2,3,53]
+        def inner_fn(index):
+            # dense tensor size: [B, N, D]
+            batch_idx, seq_idx, emb_idx = index
+            jagged_idx, end_idx = dense_idx_to_jagged_idx(
+                batch_idx=batch_idx,
+                seq_idx=seq_idx,
+                offsets_loader=offsets_loader,
+                jagged_len=jagged_len,
+            )
+            # Load the jagged value only while jagged_idx is before the end
+            # offset of this batch entry; otherwise emit padding_value.
+            return ops.masked(
+                ops.lt(
+                    ops.index_expr(jagged_idx, offsets_dtype),
+                    end_idx,
+                ),
+                lambda: values_loader([jagged_idx, emb_idx]),
+                padding_value,
+            )
+        return Pointwise.create(
+            device=device,
+            dtype=dtype,
+            inner_fn=inner_fn,
+            ranges=output_size,
+        )
+    def _dense_to_jagged_forward_impl(
+        fallback_op,  # pyre-ignore[2]
+        dense: TensorBox,
+        jagged_offsets: List[TensorBox],
+        jagged_len: Optional[int] = None,
+    ) -> TensorBox:
+        """
+        Shared implementation of the padded dense -> jagged lowering. The
+        output has size [jagged_len, embedding_len]; fallback_op is the op
+        handed to fallback_handler when the guard below does not hold
+        (including when jagged_len is None).
+        """
+        device = dense.get_device()
+        dtype = dense.get_dtype()
+        dense_size = dense.get_size()
+        # only handle the common case of a single jagged dimension
+        if (
+            len(jagged_offsets) != 1
+            or device.type != "cuda"
+            or device != jagged_offsets[0].get_device()
+            or len(jagged_offsets[0].get_size()) != 1
+            or len(dense_size) != 3
+            or jagged_len is None
+            or not is_integer_type(jagged_offsets[0])
+        ):
+            return fallback_handler(fallback_op, add_to_fallback_set=False)(
+                dense,
+                jagged_offsets,
+                jagged_len,
+            )
+        offsets: TensorBox = jagged_offsets[0]
+        offsets_dtype = offsets.get_dtype()
+        batch_size = dense_size[0]
+        max_seq_len = dense_size[1]
+        embedding_len = dense_size[-1]
+        output_size = [jagged_len, embedding_len]
+        dense_loader = dense.make_loader()
+        offsets_loader = offsets.make_loader()
+        # Maps each jagged index back to its batch index.
+        inverse_offsets = get_inverse_offsets(
+            offsets=offsets,
+            jagged_len=jagged_len,
+        )
+        inverse_offsets_loader = inverse_offsets.make_loader()
+        # pyre-ignore[2,3,53]
+        def inner_fn(index):
+            # jagged tensor size: [sum_B(N_B), D]
+            jagged_idx, emb_idx = index
+            batch_idx, seq_idx = jagged_idx_to_dense_idx(
+                jagged_idx=jagged_idx,
+                offsets_loader=offsets_loader,
+                inverse_offsets_loader=inverse_offsets_loader,
+                batch_size=batch_size,
+                max_seq_len=max_seq_len,
+                offsets_dtype=offsets_dtype,
+            )
+            # Read from the dense tensor only when seq_idx is within
+            # max_seq_len; otherwise produce 0.0.
+            return ops.masked(
+                ops.lt(
+                    ops.index_expr(seq_idx, offsets_dtype),
+                    ops.index_expr(max_seq_len, offsets_dtype),
+                ),
+                lambda: dense_loader([batch_idx, seq_idx, emb_idx]),
+                0.0,  # jagged sequence longer than max_seq_len
+            )
+        return Pointwise.create(
+            device=device,
+            dtype=dtype,
+            inner_fn=inner_fn,
+            ranges=output_size,
+        )
+    # pyre-ignore[56]
+    @register_lowering(torch.ops.aten._padded_dense_to_jagged_forward)
+    def _dense_to_jagged_forward(
+        dense: TensorBox,
+        jagged_offsets: List[TensorBox],
+        jagged_len: Optional[int] = None,
+    ) -> TensorBox:
+        """Lowering entry point; delegates to _dense_to_jagged_forward_impl."""
+        return _dense_to_jagged_forward_impl(
+            fallback_op=torch.ops.aten._padded_dense_to_jagged_forward.default,
+            dense=dense,
+            jagged_offsets=jagged_offsets,
+            jagged_len=jagged_len,
+        )

.venv/lib/python3.11/site-packages/torch/_inductor/lowering.py ADDED Viewed

The diff for this file is too large to render. See raw diff

.venv/lib/python3.11/site-packages/torch/_inductor/metrics.py ADDED Viewed

	@@ -0,0 +1,436 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+import csv
+import dataclasses
+import inspect
+import os
+import re
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Dict, List, Set, Tuple, TYPE_CHECKING
+from torch._inductor import config
+from torch._inductor.utils import get_benchmark_name
+# Prevent circular import
+if TYPE_CHECKING:
+    from torch._inductor.scheduler import BaseSchedulerNode
+# counter for tracking how many kernels have been generated
+generated_kernel_count = 0
+generated_cpp_vec_kernel_count = 0
+num_bytes_accessed = 0
+# (scheduler node, int) pairs - presumably the element count recorded for each
+# node; TODO confirm against the code that appends to this list.
+nodes_num_elem: List[
+    Tuple[
+        BaseSchedulerNode,
+        int,
+    ]
+] = []
+# (scheduler node, float) pairs - presumably a runtime measured or estimated
+# per node; TODO confirm units against the code that appends to this list.
+node_runtimes: List[Tuple[BaseSchedulerNode, float]] = []
+# counters for tracking fusions
+ir_nodes_pre_fusion = 0
+# counters for tracking to_dtype inserted
+cpp_to_dtype_count = 0
+@dataclasses.dataclass
+class CppOuterLoopFusedCount:
+    """One entry of cpp_outer_loop_fused_inner_counts (one per outer loop fusion)."""
+    # presumably the number of inner kernels in the fused outer loop - TODO confirm
+    inner_kernel_number: int
+    # presumably the number of local buffers used by the fusion - TODO confirm
+    local_buffer_number: int = 0
+# The length counts the number of outer loop fusions.
+cpp_outer_loop_fused_inner_counts: List[CppOuterLoopFusedCount] = []
+num_comprehensive_padding = 0
+num_matches_for_scatter_upon_const_tensor = 0
+num_loop_reordering = 0
+# reset all counters
+def reset():
+    global generated_kernel_count
+    global generated_cpp_vec_kernel_count
+    global num_bytes_accessed, nodes_num_elem
+    global ir_nodes_pre_fusion
+    global cpp_to_dtype_count
+    global cpp_outer_loop_fused_inner_counts
+    global num_comprehensive_padding
+    global num_matches_for_scatter_upon_const_tensor
+    global num_loop_reordering
+    generated_kernel_count = 0
+    generated_cpp_vec_kernel_count = 0
+    num_bytes_accessed = 0
+    nodes_num_elem.clear()
+    node_runtimes.clear()
+    ir_nodes_pre_fusion = 0
+    cpp_to_dtype_count = 0
+    cpp_outer_loop_fused_inner_counts.clear()
+    num_comprehensive_padding = 0
+    num_matches_for_scatter_upon_const_tensor = 0
+    num_loop_reordering = 0
+@dataclass
+class CachedMetricsDeltas:
+    """
+    The subset of metrics we want to update across cache hits, e.g., for the
+    FxGraphCache.
+    Each field name must match a module-level counter of the same name:
+    CachedMetricsHelper looks the counters up by field name via globals().
+    """
+    generated_kernel_count: int
+    generated_cpp_vec_kernel_count: int
+    ir_nodes_pre_fusion: int
+    cpp_to_dtype_count: int
+    num_bytes_accessed: int
+    num_matches_for_scatter_upon_const_tensor: int
+def get_metric_fields():
+    return [field.name for field in dataclasses.fields(CachedMetricsDeltas)]
+class CachedMetricsHelper:
+    """
+    A helper class to help calculate and apply counter deltas for those
+    metrics we want to save with cache entries (e.g., FxGraphCache) and
+    apply on a cache hit.
+    """
+    def __init__(self) -> None:
+        self.cached_metrics = {}
+        for metric in get_metric_fields():
+            self.cached_metrics[metric] = globals()[metric]
+    def get_deltas(self) -> CachedMetricsDeltas:
+        delta_metrics = {}
+        for metric in get_metric_fields():
+            delta_metrics[metric] = globals()[metric] - self.cached_metrics[metric]
+        return CachedMetricsDeltas(**delta_metrics)
+    @staticmethod
+    def apply_deltas(delta: CachedMetricsDeltas):
+        for metric in get_metric_fields():
+            globals()[metric] += getattr(delta, metric)
+REGISTERED_METRIC_TABLES: Dict[str, MetricTable] = {}
+@dataclass
+class MetricTable:
+    table_name: str
+    column_names: List[str]
+    num_rows_added: int = 0
+    def add_row(self, row_fn):
+        if self.table_name not in enabled_metric_tables():
+            return
+        row_dict = row_fn()
+        assert len(self.column_names) == len(
+            row_dict
+        ), f"{len(self.column_names)} v.s. {len(row_dict)}"
+        assert set(self.column_names) == set(
+            row_dict.keys()
+        ), f"{set(self.column_names)} v.s. {set(row_dict.keys())}"
+        row = [
+            get_benchmark_name(),
+        ]
+        row += [row_dict[column_name] for column_name in self.column_names]
+        self._write_row(row)
+    def output_filename(self):
+        return f"metric_table_{self.table_name}.csv"
+    def write_header(self):
+        filename = self.output_filename()
+        with open(filename, "w") as fd:
+            writer = csv.writer(fd, lineterminator="\n")
+            writer.writerow(["model_name"] + self.column_names)
+    def _write_row(self, row):
+        filename = self.output_filename()
+        if self.num_rows_added == 0 and not os.path.exists(filename):
+            self.write_header()
+        self.num_rows_added += 1
+        for idx, orig_val in enumerate(row):
+            if isinstance(orig_val, float):
+                new_val = f"{orig_val:.6f}"
+            elif orig_val is None:
+                new_val = ""
+            else:
+                new_val = orig_val
+            row[idx] = new_val
+        with open(filename, "a") as fd:
+            writer = csv.writer(fd, lineterminator="\n")
+            writer.writerow(row)
+    @staticmethod
+    def register_table(name, column_names):
+        table = MetricTable(name, column_names)
+        REGISTERED_METRIC_TABLES[name] = table
+# Columns cover two kernels, their fused counterpart and a slow-down ratio;
+# presumably rows are logged when the fused kernel turns out slower - TODO confirm.
+MetricTable.register_table(
+    "slow_fusion",
+    [
+        "kernel1_path",
+        "kernel1_latency",
+        "kernel2_path",
+        "kernel2_latency",
+        "fused_kernel_path",
+        "fused_kernel_latency",
+        "slow_down_ratio",
+    ],
+)
+# track the fusion statistics for each graph
+MetricTable.register_table(
+    "graph_stats",
+    [
+        "graph_id",
+        "num_nodes_before_fusion",
+        "num_nodes_after_fusion",
+    ],
+)
+# track the perf difference between persistent reduction and non-persistent
+# reductions
+MetricTable.register_table(
+    "persistent_red_perf",
+    [
+        "kernel1_name",
+        "kernel2_name",
+        "kernel1_latency",
+        "kernel2_latency",
+        "size_hints",
+        "reduction_hint",
+        "speedup",
+    ],
+)
+# Log the fusion failures due to indexing mismatch
+MetricTable.register_table(
+    "fusion_failure_due_to_indexing_mismatch",
+    [
+        "pre_grad_graph_id",
+        "post_grad_graph_id",
+        "node1_name",
+        "node2_name",
+        "node1_debug_str",
+        "node2_debug_str",
+        "common_buffer_names",
+        "failure_reason",
+    ],
+)
+# Log metadata for pointwise/reduction kernels. E.g., model name, kernel path, numel, rnumel, reduction hint
+# (rows for this table are produced by log_kernel_metadata)
+MetricTable.register_table(
+    "kernel_metadata",
+    [
+        "kernel_name",
+        "kernel_path",
+        "kernel_category",  # pointwise/reduction/foreach etc.
+        "size_hints",
+        "reduction_hint",
+        "line_of_code",
+        "num_load",
+        "num_store",
+        "num_for_loop",
+        "num_atomic_add",
+        "num_args",
+        # xyz numel can be different to size_hints since size_hints are rounded
+        # up to the nearest power of 2.
+        # Inductor kernel will burn in the xyz numel in kernel code for static
+        # shape kernels.
+        # Logging them will be helpful to find unaligned shape for reduction
+        "xnumel",
+        "ynumel",
+        "rnumel",
+        "kernel_args_num_gb",
+    ],
+)
+def _parse_kernel_fn_code(kernel_module_code):
+    """
+    The kernel_module_code is the python module that contains kernel function code.
+    kernel function is the proper triton kernel function annotated with
+    @triton.jit
+    """
+    from .codecache import PyCodeCache
+    from .wrapper_benchmark import get_triton_kernel
+    mod = PyCodeCache.load(kernel_module_code)
+    kernel = get_triton_kernel(mod)
+    # kernel is a CachingAutotune; kernel.fn is the JITFunction;
+    # kernel.fn.fn is the function being decorate by triton.jit
+    return inspect.getsource(kernel.fn.fn)
+def _parse_kernel_line_of_code(proper_kernel_fn_code):
+    """
+    Return the line of code for the kernel excluding the decorators.
+    """
+    return len(proper_kernel_fn_code.splitlines())
+def _parse_size_hints(kernel_module_code, kernel_category):
+    if kernel_category == "foreach":
+        # foreach kernel does not have size_hints
+        return None
+    m = re.search(r"size_hints=(\[[0-9, ]*\]),", kernel_module_code)
+    assert m, "size_hints missing!"
+    return m.group(1)
+def _parse_reduction_hint(kernel_category, kernel_module_code):
+    if kernel_category not in ("reduction", "persistent_reduction"):
+        return None
+    m = re.search(r"reduction_hint=ReductionHint\.(\w*),", kernel_module_code)
+    assert m, "reduction_hint not found in kernel source code!"
+    return m.group(1)
+def _count_pattern(proper_kernel_fn_code, pattern):
+    return proper_kernel_fn_code.count(pattern)
+def _count_args(proper_kernel_fn_code):
+    def_line = proper_kernel_fn_code.splitlines()[0]
+    assert def_line.startswith("def ")
+    start_idx = def_line.index("(")
+    end_idx = def_line.index("):")
+    decl_csv = def_line[start_idx + 1 : end_idx]
+    comps = decl_csv.split(",")
+    return len(comps)
+def _parse_proper_kernel_fn_code(kernel_fn_code):
+    """
+    Skip decorators.
+    """
+    start_pos = kernel_fn_code.index("def ")
+    return kernel_fn_code[start_pos:]
+def _parse_numel(proper_kernel_fn_code, numel_arg_name):
+    m = re.search(f"{numel_arg_name} = ([\\d]+)", proper_kernel_fn_code)
+    if m:
+        return int(m.group(1))
+    else:
+        return None
+def _parse_kernel_args_num_gb(kernel_fn_code, kernel_category):
+    """
+    inductor meta looks like:
+        inductor_meta={... 'mutated_arg_names': [], 'no_x_dim': False, 'kernel_num_gb': 2.0},
+    """
+    m = re.search(r".kernel_num_gb.:\s*([0-9.]+)", kernel_fn_code)
+    if m:
+        return float(m.group(1))
+    else:
+        """
+        There are a few cases that kernel_num_gdb field can be missing:
+        1. the field will be missing if config.benchmark_kernel and
+           config.profile_bandwidth are false
+        2. even if config.benchmark_kernel or config.profile_bandwidth is true.
+           foreach kernel does not have kernel_num_gb field in the metadata
+        """
+        return None
+def log_kernel_metadata(kernel_name, kernel_path, kernel_module_code):
+    """
+    An utility to log kernel metadata. We may parse metadata from kernel source code here.
+    It's fine to parse the generated kernel code here since the logging is
+    disabled by default. It would hurt compilation time.
+    """
+    from .wrapper_benchmark import get_kernel_category_by_source_code
+    kernel_category = get_kernel_category_by_source_code(kernel_module_code)
+    reduction_hint = _parse_reduction_hint(kernel_category, kernel_module_code)
+    size_hints = _parse_size_hints(kernel_module_code, kernel_category)
+    kernel_fn_code = _parse_kernel_fn_code(kernel_module_code)
+    proper_kernel_fn_code = _parse_proper_kernel_fn_code(kernel_fn_code)
+    # the line of code excluding the decortors
+    kernel_line_of_code = _parse_kernel_line_of_code(proper_kernel_fn_code)
+    get_metric_table("kernel_metadata").add_row(
+        lambda: {
+            "kernel_name": kernel_name,
+            "kernel_path": kernel_path,
+            "kernel_category": kernel_category,
+            "size_hints": size_hints,
+            "reduction_hint": reduction_hint,
+            "line_of_code": kernel_line_of_code,
+            "num_load": _count_pattern(proper_kernel_fn_code, "tl.load"),
+            "num_store": _count_pattern(proper_kernel_fn_code, "tl.store"),
+            "num_for_loop": _count_pattern(proper_kernel_fn_code, "for "),
+            "num_atomic_add": _count_pattern(proper_kernel_fn_code, "tl.atomic_add"),
+            "num_args": _count_args(proper_kernel_fn_code),
+            "xnumel": _parse_numel(proper_kernel_fn_code, "xnumel"),
+            "ynumel": _parse_numel(proper_kernel_fn_code, "ynumel"),
+            "rnumel": _parse_numel(proper_kernel_fn_code, "rnumel"),
+            "kernel_args_num_gb": _parse_kernel_args_num_gb(
+                kernel_fn_code, kernel_category
+            ),
+        }
+    )
+def purge_old_log_files():
+    """
+    Purge the old log file at the beginning when the benchmark script runs.
+    Should do it in the parent process rather than the child processes running
+    each individual model.
+    """
+    for name, table in REGISTERED_METRIC_TABLES.items():
+        if name in enabled_metric_tables():
+            filename = table.output_filename()
+            if os.path.exists(filename):
+                os.unlink(filename)
+            table.write_header()
+@lru_cache
+def enabled_metric_tables() -> Set[str]:
+    config_str = config.enabled_metric_tables
+    enabled = set()
+    for name in config_str.split(","):
+        name = name.strip()
+        if not name:
+            continue
+        assert (
+            name in REGISTERED_METRIC_TABLES
+        ), f"Metric table name {name} is not registered"
+        enabled.add(name)
+    return enabled
+def is_metric_table_enabled(name):
+    return name in enabled_metric_tables()
+def get_metric_table(name):
+    assert name in REGISTERED_METRIC_TABLES, f"Metric table {name} is not defined"
+    return REGISTERED_METRIC_TABLES[name]

.venv/lib/python3.11/site-packages/torch/_inductor/mkldnn_ir.py ADDED Viewed

	@@ -0,0 +1,1881 @@

+# mypy: allow-untyped-defs
+from typing import Any, List, Optional
+import sympy
+import torch
+from torch._prims_common import make_channels_last_strides_for
+from torch.utils._ordered_set import OrderedSet
+from .ir import (
+    ExternKernelAlloc,
+    FixedLayout,
+    FlexibleLayout,
+    ir_node_to_tensor,
+    IRNode,
+    is_contiguous_storage_and_layout,
+    Layout,
+    may_convert_to_optional,
+    MultiOutput,
+    MultiOutputLayout,
+    MutationOutput,
+    NoneLayout,
+    TensorBox,
+)
+from .utils import convert_shape_to_inductor, pad_listlike
+from .virtualized import V
+def _prepare_convolution_fusion_create(
+    cls,
+    x: "TensorBox",
+    weight: "TensorBox",
+    bias: "TensorBox",
+    padding: List[int],
+    stride: List[int],
+    dilation: List[int],
+    groups: int,
+    transposed: bool = False,
+    output_padding: Optional[List[int]] = None,
+):
+    """
+    This function is a helper function to prepare inputs, layout and constant args
+    for convolution post-op fusion's create function, including deciding the output
+    layout (channels first or channels last), realizing inputs and make them etc. The
+    function only supports the CPU device since conv post-op fusion kernel is only
+    supported on CPU right now.
+    Returns:
+        A 4-tuple ``(inputs, constant_args, kernel_layout, req_stride_order)``:
+        - inputs: ``[x, weight]`` plus ``bias`` when it is not None.
+        - constant_args: ``[padding, stride, dilation, groups]``, with
+          ``output_padding`` inserted after ``padding`` when ``transposed``,
+          and ``bias`` (None) inserted at the front when there is no bias.
+        - kernel_layout: FixedLayout describing the output.
+        - req_stride_order: the stride order that was required of ``x``.
+    """
+    # Port from aten/src/ATen/native/ConvUtils.h: _conv_input_size
+    def _conv_input_size(
+        output_size, weight_size, padding, output_padding, stride, dilation, groups
+    ):
+        assert len(output_size) == len(weight_size), "Expect input dim == weight dim"
+        dim = len(output_size)
+        assert dim > 2, "Expect input dim > 2"
+        BATCH_DIM = 0
+        WEIGHT_INPUT_CHANNELS_DIM = 1
+        input_size = []
+        input_size.append(output_size[BATCH_DIM])
+        input_size.append(weight_size[WEIGHT_INPUT_CHANNELS_DIM] * groups)
+        # Spatial dimensions start at index 2; the per-dimension parameter
+        # lists are indexed from 0, hence the d - 2.
+        for d in range(2, dim):
+            kernel = (weight_size[d] - 1) * dilation[d - 2] + 1
+            input_size_d = (
+                (output_size[d] - 1) * stride[d - 2]
+                - (padding[d - 2] * 2)
+                + kernel
+                + output_padding[d - 2]
+            )
+            input_size.append(input_size_d)
+        return list(map(int, input_size))
+    # The size of prepacked_weight is the prepacked weight size of deconv:
+    #   Groups > 1:  [g*o, i/g, ...]
+    #   Groups == 1: [o, i, ...]
+    # Returns original weight size in [i, o, ...]
+    def _original_deconv_weight_size(
+        prepacked_weight,
+        groups,
+    ):
+        prepacked_weight_size = prepacked_weight.size()
+        dim = len(prepacked_weight_size)
+        assert dim > 2, "Expect weight dim > 2"
+        if groups > 1:
+            weight_size = []
+            weight_size.append(prepacked_weight_size[1] * groups)
+            # NOTE: true division, so this entry is not an int here; the
+            # result of _conv_input_size is converted with int() at the end.
+            weight_size.append(prepacked_weight_size[0] / groups)
+            for d in range(2, dim):
+                weight_size.append(prepacked_weight_size[d])
+        else:
+            weight_size = prepacked_weight.transpose(0, 1).size()
+        return weight_size
+    x.realize()
+    weight.realize()
+    if bias is not None:
+        bias.realize()
+    with V.graph.fake_mode:
+        # TODO <Leslie> cleaned up the fake_tensor trace as Linear implementation
+        x_fake = ir_node_to_tensor(x, guard_shape=True)
+        weight_fake = ir_node_to_tensor(weight, guard_shape=True)
+        # Number of spatial dimensions (everything except batch and channels).
+        dims = len(x_fake.size()) - 2
+        assert 0 < len(padding) <= dims
+        assert 0 < len(dilation) <= dims
+        assert 0 < len(stride) <= dims
+        # Expand each parameter list to one entry per spatial dimension.
+        padding = pad_listlike(padding, dims)
+        dilation = pad_listlike(dilation, dims)
+        stride = pad_listlike(stride, dims)
+        if output_padding is None:
+            output_padding = pad_listlike([0], dims)
+        else:
+            assert 0 < len(output_padding) <= dims
+            output_padding = pad_listlike(output_padding, dims)
+        assert isinstance(groups, (int, sympy.core.numbers.Integer))
+        if transposed:
+            # When transposed, the size of the prepacked oneDNN weight is different
+            # from the PyTorch weight. We're not able to run aten conv with such
+            # size. We infer the output size from the input params here:
+            weight_size = _original_deconv_weight_size(weight_fake, groups)
+            input_size = x_fake.size()
+            # The first argument of _conv_input_size is named output_size;
+            # here it receives the size of x, and the returned value is used
+            # as the output size of the transposed convolution.
+            output_size = _conv_input_size(
+                input_size,
+                weight_size,
+                padding,
+                output_padding,
+                stride,
+                dilation,
+                groups,
+            )
+        else:
+            bias_fake = (
+                ir_node_to_tensor(bias, guard_shape=True) if bias is not None else bias
+            )
+            # Run the aten convolution on the fake tensors just to obtain
+            # the output size.
+            output = torch.ops.aten.convolution(
+                x_fake,
+                weight_fake,
+                bias_fake,
+                stride,
+                padding,
+                dilation,
+                transposed,
+                output_padding,
+                groups,
+            )
+            output_size = output.size()
+        # E.g. with two spatial dimensions this evaluates to [3, 0, 2, 1].
+        req_stride_order = [0] + list(reversed(range(1, len(stride) + 1)))
+        req_stride_order = [len(req_stride_order)] + req_stride_order
+    x = cls.require_stride_order(x, req_stride_order)
+    # We won't do weight prepack for Conv if dynamic_shapes.
+    # In static shape cases, since weight is prepacked, we'll always force output to be channels last in the Conv kernel.
+    # In dynamic shape cases, for input with channels = 1, like tensor of size (s0, 1, 28, 28) and stride (784, 784, 28, 1),
+    # x = cls.require_stride_order(x, req_stride_order) where req_stride_order is in the channels last order
+    # won't change the stride of this tensor since stride for dimensions of size 1 is ignored. While in Conv kernel,
+    # this tensor is considered as channels first and the output will be in contiguous format.
+    # To align the behavior of the Conv kernel, we set the output_stride in such case to be contiguous instead of channels last.
+    dynamic_shapes = not all(isinstance(i, int) for i in (output_size))
+    if dynamic_shapes and is_contiguous_storage_and_layout(x):
+        output_stride = FlexibleLayout.contiguous_strides(output_size)
+    else:
+        output_stride = make_channels_last_strides_for(output_size)
+    assert x.get_device().type == "cpu" and weight.get_device().type == "cpu"
+    inputs = [x, weight]
+    # The output uses the device and dtype of x.
+    kernel_layout = FixedLayout(
+        x.get_device(),
+        x.get_dtype(),
+        convert_shape_to_inductor(output_size),
+        convert_shape_to_inductor(output_stride),
+    )
+    constant_args = [padding, stride, dilation, groups]
+    if transposed:
+        constant_args.insert(1, output_padding)
+    # A present bias is passed as a tensor input; a missing bias is passed
+    # as a leading None constant argument instead.
+    if bias is not None:
+        inputs.append(bias)
+    else:
+        constant_args.insert(0, bias)
+    return inputs, constant_args, kernel_layout, req_stride_order
+def _prepare_linear_fusion_create(
+    cls,
+    x: "TensorBox",
+    weight: "TensorBox",
+    bias: "TensorBox",
+):
+    """
+    Assemble the inputs, constant args and output layout consumed by the create
+    function of a linear post-op fusion node. Both x and weight are asserted to
+    be on the CPU device, since the fused linear kernel is CPU-only right now.
+    Returns the tuple (inputs, constant_args, kernel_layout, req_stride_order).
+    """
+    x.realize()
+    weight.realize()
+    if bias is not None:
+        bias.realize()
+    # Every dimension of x except the last one carries over to the output.
+    *leading_dims, _ = x.get_size()
+    # The weight has been transposed during the qlinear weight prepack process.
+    # https://github.com/pytorch/pytorch/blob/4979f9c0d72490970e2019bb1d2284f83d93f76b/
+    # aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp#L291
+    _, out_channels = weight.get_size()
+    output_size = [*leading_dims, out_channels]
+    # Descending dimension order [rank - 1, ..., 1, 0] over the dims of x.
+    rank = len(x.get_size())
+    req_stride_order = list(range(rank - 1, -1, -1))
+    x = cls.require_stride_order(x, req_stride_order)
+    assert x.get_device().type == "cpu"
+    assert weight.get_device().type == "cpu"
+    contiguous = FlexibleLayout.contiguous_strides(output_size)
+    kernel_layout = FixedLayout(
+        x.get_device(),
+        x.get_dtype(),
+        output_size,
+        contiguous,
+    )
+    inputs = [x, weight]
+    constant_args: List[Any] = []
+    if bias is None:
+        # A missing bias travels as a leading None constant, not as an input.
+        constant_args.append(None)
+    else:
+        inputs.append(bias)
+    return inputs, constant_args, kernel_layout, req_stride_order
+class ConvolutionUnary(ExternKernelAlloc):
+    """
+    Extern kernel node that calls torch.ops.mkldnn._convolution_pointwise.default.
+    """
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ) -> None:
+        super().__init__(
+            layout,
+            inputs,
+            constant_args,
+            None,
+            op_overload=torch.ops.mkldnn._convolution_pointwise.default,
+        )
+        # C++ signature of the op; codegen() hands it to the wrapper.
+        self.cpp_op_schema = """
+            at::Tensor(
+                const at::Tensor& input_t,
+                const at::Tensor& weight_t,
+                const std::optional<at::Tensor>& bias_opt,
+                at::IntArrayRef padding,
+                at::IntArrayRef stride,
+                at::IntArrayRef dilation,
+                int64_t groups,
+                c10::string_view attr,
+                torch::List<std::optional<at::Scalar>> scalars,
+                std::optional<c10::string_view> algorithm)"""
+    def codegen(self, wrapper):
+        """Emit the extern kernel call, then size asserts if self.layout is a Layout."""
+        wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            self.codegen_args(),
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+            op_overload=self.op_overload,
+            raw_args=[*self.inputs, *self.constant_args],
+        )
+        if not isinstance(self.layout, Layout):
+            return
+        self.codegen_size_asserts(wrapper)
+    @classmethod
+    def create(
+        cls,
+        x: "TensorBox",
+        weight: "TensorBox",
+        bias: "TensorBox",
+        padding_: List[int],
+        stride_: List[int],
+        dilation_: List[int],
+        groups: int,
+        attr,
+        scalars: Optional[List[Any]],
+        algorithm,
+    ):
+        """
+        Build a ConvolutionUnary node. The convolution inputs, constant args and
+        layout come from _prepare_convolution_fusion_create; the post-op
+        description (attr, scalars, algorithm) is appended to the constant args.
+        """
+        node_inputs, base_args, kernel_layout, _ = _prepare_convolution_fusion_create(
+            cls, x, weight, bias, padding_, stride_, dilation_, groups
+        )
+        post_op_args = [attr, may_convert_to_optional(scalars), algorithm]
+        return ConvolutionUnary(
+            layout=kernel_layout,
+            inputs=node_inputs,
+            constant_args=base_args + post_op_args,
+        )
+class ConvolutionBinary(ExternKernelAlloc):
+    """
+    Extern kernel node that calls torch.ops.mkldnn._convolution_pointwise.binary.
+    """
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+        cpp_constant_args=(),
+    ) -> None:
+        super().__init__(
+            layout,
+            inputs,
+            constant_args,
+            None,
+            op_overload=torch.ops.mkldnn._convolution_pointwise.binary,
+        )
+        # C++ signature of the op; codegen() hands it to the wrapper.
+        self.cpp_op_schema = """
+            at::Tensor(
+                const at::Tensor& input_t,
+                const at::Tensor& other_t,
+                const at::Tensor& weight_t,
+                const std::optional<at::Tensor>& bias_opt,
+                at::IntArrayRef padding,
+                at::IntArrayRef stride,
+                at::IntArrayRef dilation,
+                int64_t groups,
+                c10::string_view binary_attr,
+                std::optional<at::Scalar> alpha,
+                std::optional<c10::string_view> unary_attr,
+                torch::List<std::optional<at::Scalar>> unary_scalars,
+                std::optional<c10::string_view> unary_algorithm)"""
+        self.cpp_constant_args = cpp_constant_args
+    def codegen(self, wrapper):
+        """Emit the extern kernel call, then size asserts if self.layout is a Layout."""
+        wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            self.codegen_args(),
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+            self.cpp_kernel_overload_name,
+            self.op_overload,
+            [*self.inputs, *self.constant_args],
+        )
+        if not isinstance(self.layout, Layout):
+            return
+        self.codegen_size_asserts(wrapper)
+    @classmethod
+    def create(
+        cls,
+        x: "TensorBox",
+        other: "TensorBox",
+        weight: "TensorBox",
+        bias: "TensorBox",
+        padding_: List[int],
+        stride_: List[int],
+        dilation_: List[int],
+        groups: int,
+        binary_attr: str,
+        binary_alpha: Optional[float],
+        unary_attr: Optional[str],
+        unary_scalars: Optional[List[Any]],
+        unary_algorithm: Optional[str],
+    ):
+        """
+        Build a ConvolutionBinary node. `other` gets the same required stride
+        order as x and is placed right after x in the input list; the binary and
+        unary post-op descriptions are appended to the constant args.
+        """
+        prepared = _prepare_convolution_fusion_create(
+            cls, x, weight, bias, padding_, stride_, dilation_, groups
+        )
+        node_inputs, base_args, kernel_layout, req_stride_order = prepared
+        other = cls.require_stride_order(other, req_stride_order)
+        node_inputs.insert(1, other)
+        post_op_args = [
+            binary_attr,
+            binary_alpha,
+            unary_attr,
+            may_convert_to_optional(unary_scalars),
+            unary_algorithm,
+        ]
+        return ConvolutionBinary(
+            layout=kernel_layout,
+            inputs=node_inputs,
+            constant_args=base_args + post_op_args,
+        )
+class ConvolutionBinaryInplace(ExternKernelAlloc):
+    """
+    Extern kernel node that calls torch.ops.mkldnn._convolution_pointwise_.binary.
+    The node records a MutationOutput for each of its first two inputs, and
+    create() returns the `other` input instead of the node itself.
+    """
+    def __init__(
+        self,
+        kernel_layout,
+        inputs,
+        constant_args=(),
+    ) -> None:
+        # Due to a constraint of op.call, other (Tensor&) should be at input[0],
+        # so the first two inputs trade places before reaching the base class.
+        first_in, second_in = inputs[0], inputs[1]
+        reordered_inputs = [second_in, first_in] + inputs[2:]
+        super().__init__(
+            kernel_layout,
+            reordered_inputs,
+            constant_args,
+            None,
+            op_overload=torch.ops.mkldnn._convolution_pointwise_.binary,
+        )
+        # TODO: op.call: input[0] should be at::Tensor&
+        self.cpp_op_schema = """
+            at::Tensor&(
+                at::Tensor& other_t,
+                const at::Tensor& input_t,
+                const at::Tensor& weight_t,
+                const std::optional<at::Tensor>& bias_opt,
+                at::IntArrayRef padding,
+                at::IntArrayRef stride,
+                at::IntArrayRef dilation,
+                int64_t groups,
+                c10::string_view binary_attr,
+                std::optional<at::Scalar> alpha,
+                std::optional<c10::string_view> unary_attr,
+                torch::List<std::optional<at::Scalar>> unary_scalars,
+                std::optional<c10::string_view> unary_algorithm)"""
+        # One MutationOutput per leading input, in the caller's original order.
+        self.mutation_outputs = [
+            MutationOutput(NoneLayout(tensor.get_device()), tensor, self)
+            for tensor in (first_in, second_in)
+        ]
+    def codegen(self, wrapper):
+        """Emit the extern kernel call through the wrapper."""
+        wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            self.codegen_args(),
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+            self.cpp_kernel_overload_name,
+            self.op_overload,
+            [*self.inputs, *self.constant_args],
+        )
+    def get_unbacked_symbol_defs(self) -> OrderedSet[sympy.Symbol]:
+        """Return an empty set: this node reports no unbacked symbol definitions."""
+        return OrderedSet()
+    @classmethod
+    def create(
+        cls,
+        x: "TensorBox",
+        other: "TensorBox",
+        weight: "TensorBox",
+        bias: "TensorBox",
+        padding_: List[int],
+        stride_: List[int],
+        dilation_: List[int],
+        groups: int,
+        binary_attr: str,
+        binary_alpha: Optional[float],
+        unary_attr: Optional[str],
+        unary_scalars: Optional[List[Any]],
+        unary_algorithm: Optional[str],
+    ):
+        """
+        Build the in-place node and return the input it mutates. The layout
+        produced by _prepare_convolution_fusion_create is discarded; the node is
+        given a NoneLayout on the device of `other` instead.
+        """
+        prepared = _prepare_convolution_fusion_create(
+            cls, x, weight, bias, padding_, stride_, dilation_, groups
+        )
+        node_inputs, base_args, _, req_stride_order = prepared
+        other = cls.require_stride_order(other, req_stride_order)
+        node_inputs.insert(1, other)
+        post_op_args = [
+            binary_attr,
+            binary_alpha,
+            unary_attr,
+            may_convert_to_optional(unary_scalars),
+            unary_algorithm,
+        ]
+        packed = ConvolutionBinaryInplace(
+            kernel_layout=NoneLayout(node_inputs[1].get_device()),  # type: ignore[arg-type]
+            inputs=node_inputs,
+            constant_args=base_args + post_op_args,
+        )
+        # The op mutates in place, so the result is the mutated input rather
+        # than the node. __init__ swaps the first two inputs, which is why the
+        # `other` tensor passed at index 1 is found at packed.inputs[0].
+        return packed.inputs[0]
+class ConvolutionTransposeUnary(ExternKernelAlloc):
+    """
+    Extern kernel node that calls
+    torch.ops.mkldnn._convolution_transpose_pointwise.default.
+    """
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ) -> None:
+        super().__init__(
+            layout,
+            inputs,
+            constant_args,
+            None,
+            op_overload=torch.ops.mkldnn._convolution_transpose_pointwise.default,
+        )
+        # C++ signature of the op; codegen() hands it to the wrapper.
+        self.cpp_op_schema = """
+            at::Tensor(
+                const at::Tensor& input_t,
+                const at::Tensor& weight_t,
+                const std::optional<at::Tensor>& bias_opt,
+                at::IntArrayRef padding,
+                at::IntArrayRef output_padding,
+                at::IntArrayRef stride,
+                at::IntArrayRef dilation,
+                int64_t groups,
+                c10::string_view attr,
+                torch::List<std::optional<at::Scalar>> scalars,
+                std::optional<c10::string_view> algorithm)"""
+    def codegen(self, wrapper):
+        """Emit the extern kernel call through the wrapper."""
+        wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            self.codegen_args(),
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+        )
+    @classmethod
+    def create(
+        cls,
+        x: "TensorBox",
+        weight: "TensorBox",
+        bias: "TensorBox",
+        padding_: List[int],
+        output_padding_: List[int],
+        stride_: List[int],
+        dilation_: List[int],
+        groups_: int,
+        attr,
+        scalars: Optional[List[Any]],
+        algorithm,
+    ):
+        """
+        Build a ConvolutionTransposeUnary node. The shared convolution helper is
+        invoked with transposed=True and the given output padding; the post-op
+        description (attr, scalars, algorithm) is appended to the constant args.
+        """
+        prepared = _prepare_convolution_fusion_create(
+            cls,
+            x,
+            weight,
+            bias,
+            padding_,
+            stride_,
+            dilation_,
+            groups_,
+            True,  # transposed
+            output_padding_,
+        )
+        node_inputs, base_args, kernel_layout, _ = prepared
+        post_op_args = [attr, may_convert_to_optional(scalars), algorithm]
+        return ConvolutionTransposeUnary(
+            layout=kernel_layout,
+            inputs=node_inputs,
+            constant_args=base_args + post_op_args,
+        )
+class QConvPointWisePT2E(ExternKernelAlloc):
+    # Extern kernel node that calls torch.ops.onednn.qconv2d_pointwise.default.
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ) -> None:
+        """
+        Expected argument layout, as assembled by create():
+        if bias is not None
+            - inputs = [x, w, b, weight_scale, weight_zp]
+            - const_args is: [x_scale, x_zp, stride, padding, dilation, groups, o_scale, o_zp,
+              output_dtype, unary_attr, unary_scalars, unary_algorithm]
+        else
+            - inputs = [x, w, weight_scale, weight_zp]
+            - const_args is: [x_scale, x_zp, bias, stride, padding, dilation, groups, o_scale, o_zp,
+              output_dtype, unary_attr, unary_scalars, unary_algorithm]
+        """
+        # Five inputs means the bias tensor is among them (see the layout above).
+        self.has_bias = len(inputs) == 5
+        super().__init__(
+            layout,
+            inputs,
+            constant_args,
+            None,
+            op_overload=torch.ops.onednn.qconv2d_pointwise.default,
+        )
+        # C++ signature of the op; codegen() hands it to the wrapper.
+        self.cpp_op_schema = """
+            at::Tensor(
+                at::Tensor act,
+                double act_scale,
+                int64_t act_zero_point,
+                at::Tensor weight,
+                at::Tensor weight_scales,
+                at::Tensor weight_zero_points,
+                std::optional<at::Tensor> bias,
+                torch::List<int64_t> stride,
+                torch::List<int64_t> padding,
+                torch::List<int64_t> dilation,
+                int64_t groups,
+                double output_scale,
+                int64_t output_zero_point,
+                std::optional<c10::ScalarType> output_dtype,
+                c10::string_view attr,
+                torch::List<std::optional<at::Scalar>> scalars,
+                std::optional<c10::string_view> algorithm)"""
+    def codegen(self, wrapper):
+        # Parse the inputs and constants, then re-order them into the argument
+        # order of cpp_op_schema before handing them to the wrapper.
+        # The raw_args setup can be skipped if there is a C shim implementation
+        args = [x.codegen_reference() for x in self.inputs]
+        const_arg_names = [
+            "x_scale",
+            "x_zero_point",
+            "stride",
+            "padding",
+            "dilation",
+            "groups",
+            "output_scale",
+            "output_zero_point",
+            "output_dtype",
+            "attr",
+            "scalars",
+            "algorithm",
+        ]
+        # Without a bias input, the (None) bias sits in constant_args right
+        # after x_scale and x_zero_point, i.e. at index 2.
+        if not self.has_bias:
+            const_arg_names.insert(2, "bias")
+        const_args = list(self.codegen_const_args(const_arg_names))
+        x = args[0]
+        x_raw = self.inputs[0]
+        packed_weight = args[1]
+        packed_weight_raw = self.inputs[1]
+        bias = args[2] if self.has_bias else const_args[2]
+        bias_raw = self.inputs[2] if self.has_bias else self.constant_args[2]
+        # The weight scale and zero point are always the last two inputs.
+        w_scale, w_zp = args[-2], args[-1]
+        w_scale_raw, w_zp_raw = self.inputs[-2], self.inputs[-1]
+        (
+            x_scale,
+            x_zp,
+        ) = const_args[:2]
+        (
+            x_scale_raw,
+            x_zp_raw,
+        ) = self.constant_args[:2]
+        # The trailing ten constants are the same with or without a bias,
+        # so they are sliced from the end.
+        (
+            stride,
+            padding,
+            dilation,
+            groups,
+            o_scale,
+            o_zp,
+            output_dtype,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
+        ) = const_args[-10:]
+        (
+            stride_raw,
+            padding_raw,
+            dilation_raw,
+            groups_raw,
+            o_scale_raw,
+            o_zp_raw,
+            output_dtype_raw,
+            unary_attr_raw,
+            unary_scalars_raw,
+            unary_algorithm_raw,
+        ) = self.constant_args[-10:]
+        # Code-generated arguments, in the order of cpp_op_schema.
+        codegen_args = (
+            x,
+            x_scale,
+            x_zp,
+            packed_weight,
+            w_scale,
+            w_zp,
+            bias,
+            stride,
+            padding,
+            dilation,
+            groups,
+            o_scale,
+            o_zp,
+            output_dtype,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
+        )
+        # The matching un-generated (raw) arguments, in the same order.
+        raw_args = (
+            x_raw,
+            x_scale_raw,
+            x_zp_raw,
+            packed_weight_raw,
+            w_scale_raw,
+            w_zp_raw,
+            bias_raw,
+            stride_raw,
+            padding_raw,
+            dilation_raw,
+            groups_raw,
+            o_scale_raw,
+            o_zp_raw,
+            output_dtype_raw,
+            unary_attr_raw,
+            unary_scalars_raw,
+            unary_algorithm_raw,
+        )
+        wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            codegen_args,
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+            op_overload=self.op_overload,
+            raw_args=raw_args,
+        )
+        if isinstance(self.layout, Layout):
+            self.codegen_size_asserts(wrapper)
+    @classmethod
+    def create(
+        cls,
+        qx: "TensorBox",
+        x_scale: float,
+        x_zero_point: int,
+        qw: "TensorBox",  # qw
+        w_scale: "TensorBox",
+        w_zero_point: "TensorBox",
+        bias: "TensorBox",
+        stride: List[int],
+        padding: List[int],
+        dilation: List[int],
+        groups: int,
+        output_scale: float,
+        output_zero_point: int,
+        output_dtype,
+        attr,
+        scalars,
+        algorithm,
+    ):
+        # Builds the node: the shared convolution helper supplies inputs,
+        # constant args and layout, which are then extended with the
+        # quantization parameters and the post-op description.
+        transposed = False
+        output_padding = None
+        (inputs, constant_args, kernel_layout, _) = _prepare_convolution_fusion_create(
+            cls,
+            qx,
+            qw,
+            bias,
+            padding,
+            stride,
+            dilation,
+            groups,
+            transposed,
+            output_padding,
+        )
+        # swap padding and stride to align with functional conv arg order
+        # (with no bias, constant_args[0] holds the None bias, so the pair
+        # to swap is one position further along)
+        if bias is None:
+            constant_args[1], constant_args[2] = constant_args[2], constant_args[1]
+        else:
+            constant_args[0], constant_args[1] = constant_args[1], constant_args[0]
+        w_scale.realize()
+        w_zero_point.realize()
+        inputs = inputs + [w_scale, w_zero_point]
+        # Final order: activation qparams, conv constants, output qparams,
+        # then the post-op description.
+        constant_args = (
+            [
+                x_scale,
+                x_zero_point,
+            ]
+            + constant_args
+            + [
+                output_scale,
+                output_zero_point,
+                output_dtype,
+                attr,
+                may_convert_to_optional(scalars),
+                algorithm,
+            ]
+        )
+        assert output_dtype is not None
+        if output_dtype in [torch.float32, torch.bfloat16]:
+            # in _prepare_convolution_fusion_create, we use x.dtype (uint8) to create kernel_layout
+            # if we set output_dtype is not None, the output buf should be output_dtype instead of uint8.
+            kernel_layout.dtype = output_dtype
+        return QConvPointWisePT2E(
+            layout=kernel_layout,
+            inputs=inputs,
+            constant_args=constant_args,
+        )
+class QConvPointWiseBinaryPT2E(ExternKernelAlloc):
+    # Extern kernel node that calls torch.ops.onednn.qconv2d_pointwise.binary.
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ) -> None:
+        """
+        Needs input/weight/output qparams. Expected argument layout, as
+        assembled by create():
+        if bias is not None
+            - inputs = [x, w, b, accum, w_scale, w_zp]
+            - const_args = [x_scale, x_zp, accum_scale, accum_zp, stride, padding, dilation, groups,
+            o_scale, o_zp, output_dtype, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm]
+        else
+            - inputs = [x, w, accum, w_scale, w_zp]
+            - const_args = [x_scale, x_zp, accum_scale, accum_zp, bias, stride, padding, dilation, groups,
+            o_scale, o_zp, output_dtype, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm]
+        """
+        # Six inputs means the bias tensor is among them (see the layout above).
+        self.has_bias = len(inputs) == 6
+        # Position of accum in inputs: it follows the bias when there is one.
+        self.idx_for_inplace_sum = 3 if self.has_bias else 2
+        super().__init__(
+            layout,
+            inputs,
+            constant_args,
+            None,
+            op_overload=torch.ops.onednn.qconv2d_pointwise.binary,
+        )
+        # C++ signature of the op; codegen() hands it to the wrapper.
+        self.cpp_op_schema = """
+            at::Tensor(
+                at::Tensor act,
+                double act_scale,
+                int64_t act_zero_point,
+                at::Tensor accum,
+                double accum_scale,
+                int64_t accum_zero_point,
+                at::Tensor weight,
+                at::Tensor weight_scales,
+                at::Tensor weight_zero_points,
+                std::optional<at::Tensor> bias,
+                torch::List<int64_t> stride,
+                torch::List<int64_t> padding,
+                torch::List<int64_t> dilation,
+                int64_t groups,
+                double output_scale,
+                int64_t output_zero_point,
+                std::optional<c10::ScalarType> output_dtype,
+                c10::string_view binary_attr,
+                std::optional<at::Scalar> alpha,
+                std::optional<c10::string_view> attr,
+                torch::List<std::optional<at::Scalar>> scalars,
+                std::optional<c10::string_view> algorithm)"""
+    def codegen(self, wrapper):
+        # Parse the inputs and constants, then re-order them into the argument
+        # order of cpp_op_schema before handing them to the wrapper.
+        # The raw_args setup can be skipped if there is a C shim implementation
+        args = [x.codegen_reference() for x in self.inputs]
+        const_arg_names = [
+            "x_scale",
+            "x_zero_point",
+            "accum_scale",
+            "accum_zero_point",
+            "stride",
+            "padding",
+            "dilation",
+            "groups",
+            "output_scale",
+            "output_zero_point",
+            "output_dtype",
+            "binary_attr",
+            "alpha",
+            "unary_attr",
+            "unary_scalars",
+            "unary_algorithm",
+        ]
+        # Without a bias input, the (None) bias sits in constant_args right
+        # after the four activation/accum qparams, i.e. at index 4.
+        if not self.has_bias:
+            const_arg_names.insert(4, "bias")
+        const_args = list(self.codegen_const_args(const_arg_names))
+        x = args[0]
+        x_raw = self.inputs[0]
+        packed_weight = args[1]
+        packed_weight_raw = self.inputs[1]
+        bias = args[2] if self.has_bias else const_args[4]
+        bias_raw = self.inputs[2] if self.has_bias else self.constant_args[4]
+        # accum, weight scale and weight zero point are always the last three inputs.
+        accum, w_scale, w_zp = args[-3], args[-2], args[-1]
+        accum_raw, w_scale_raw, w_zp_raw = (
+            self.inputs[-3],
+            self.inputs[-2],
+            self.inputs[-1],
+        )
+        (
+            x_scale,
+            x_zp,
+            accum_scale,
+            accum_zp,
+        ) = const_args[:4]
+        (
+            x_scale_raw,
+            x_zp_raw,
+            accum_scale_raw,
+            accum_zp_raw,
+        ) = self.constant_args[:4]
+        # The trailing twelve constants are the same with or without a bias,
+        # so they are sliced from the end.
+        (
+            stride,
+            padding,
+            dilation,
+            groups,
+            o_scale,
+            o_zp,
+            output_dtype,
+            binary_attr,
+            alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
+        ) = const_args[-12:]
+        (
+            stride_raw,
+            padding_raw,
+            dilation_raw,
+            groups_raw,
+            o_scale_raw,
+            o_zp_raw,
+            output_dtype_raw,
+            binary_attr_raw,
+            alpha_raw,
+            unary_attr_raw,
+            unary_scalars_raw,
+            unary_algorithm_raw,
+        ) = self.constant_args[-12:]
+        # Code-generated arguments, in the order of cpp_op_schema.
+        conv_args = (
+            x,
+            x_scale,
+            x_zp,
+            accum,
+            accum_scale,
+            accum_zp,
+            packed_weight,
+            w_scale,
+            w_zp,
+            bias,
+            stride,
+            padding,
+            dilation,
+            groups,
+            o_scale,
+            o_zp,
+            output_dtype,
+            binary_attr,
+            alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
+        )
+        # The matching un-generated (raw) arguments, in the same order.
+        raw_args = (
+            x_raw,
+            x_scale_raw,
+            x_zp_raw,
+            accum_raw,
+            accum_scale_raw,
+            accum_zp_raw,
+            packed_weight_raw,
+            w_scale_raw,
+            w_zp_raw,
+            bias_raw,
+            stride_raw,
+            padding_raw,
+            dilation_raw,
+            groups_raw,
+            o_scale_raw,
+            o_zp_raw,
+            output_dtype_raw,
+            binary_attr_raw,
+            alpha_raw,
+            unary_attr_raw,
+            unary_scalars_raw,
+            unary_algorithm_raw,
+        )
+        wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            conv_args,
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+            self.cpp_kernel_overload_name,
+            op_overload=self.op_overload,
+            raw_args=raw_args,
+        )
+        if isinstance(self.layout, Layout):
+            self.codegen_size_asserts(wrapper)
+    def get_mutation_names(self):
+        # The only buffer reported as mutated is the accum input.
+        return [self.inputs[self.idx_for_inplace_sum].get_name()]
+    def get_unbacked_symbol_defs(self) -> OrderedSet[sympy.Symbol]:
+        # This node reports no unbacked symbol definitions.
+        return OrderedSet()
+    @classmethod
+    def create(
+        cls,
+        qx: "TensorBox",
+        x_scale,
+        x_zero_point,
+        qaccum: "TensorBox",
+        accum_scale,
+        accum_zero_point,
+        qw: "TensorBox",  # packed_weight
+        w_scale,
+        w_zero_point,
+        bias: "TensorBox",
+        stride: List[int],
+        padding: List[int],
+        dilation: List[int],
+        groups: int,
+        # NOTE(review): the two output qparams are annotated "TensorBox" here,
+        # while QConvPointWisePT2E.create annotates them float / int - confirm.
+        output_scale: "TensorBox",
+        output_zero_point: "TensorBox",
+        output_dtype,
+        binary_attr,
+        alpha,
+        unary_attr,
+        unary_scalars,
+        unary_algorithm,
+    ):
+        # Builds the node and returns the accum input, which the op changes
+        # in place; only binary_attr == "sum" is accepted (asserted below).
+        transposed = False
+        output_padding = None
+        (
+            inputs,
+            constant_args,
+            kernel_layout,
+            req_stride_order,
+        ) = _prepare_convolution_fusion_create(
+            cls,
+            qx,
+            qw,
+            bias,
+            padding,
+            stride,
+            dilation,
+            groups,
+            transposed,
+            output_padding,
+        )
+        # accum gets the same required stride order as the activation and is
+        # appended after the inputs prepared by the helper.
+        qaccum = cls.require_stride_order(qaccum, req_stride_order)
+        inputs.append(qaccum)
+        # swap padding and stride to align with functional conv arg order
+        # (with no bias, constant_args[0] holds the None bias, so the pair
+        # to swap is one position further along)
+        if bias is None:
+            constant_args[1], constant_args[2] = constant_args[2], constant_args[1]
+        else:
+            constant_args[0], constant_args[1] = constant_args[1], constant_args[0]
+        w_scale.realize()
+        w_zero_point.realize()
+        inputs = inputs + [w_scale, w_zero_point]
+        # Final order: activation and accum qparams, conv constants, output
+        # qparams, then the binary and unary post-op descriptions.
+        constant_args = (
+            [
+                x_scale,
+                x_zero_point,
+                accum_scale,
+                accum_zero_point,
+            ]
+            + constant_args
+            + [
+                output_scale,
+                output_zero_point,
+                output_dtype,
+                binary_attr,
+                alpha,
+                unary_attr,
+                may_convert_to_optional(unary_scalars),
+                unary_algorithm,
+            ]
+        )
+        assert (
+            binary_attr == "sum"
+        ), "For now, only post op sum is supported in QConvPointWiseBinaryPT2E."
+        V.graph.mark_buffer_mutated(qaccum.get_name())
+        # The layout computed by the helper is not used; the node itself gets
+        # a NoneLayout on the device of accum.
+        packed = QConvPointWiseBinaryPT2E(
+            layout=NoneLayout(qaccum.get_device()),
+            inputs=inputs,
+            constant_args=constant_args,
+        )
+        # Return accum since it has been inplace changed.
+        return packed.inputs[packed.idx_for_inplace_sum]
+class MKLPackedLinear(ExternKernelAlloc):
+    """
+    Extern kernel node that calls torch.ops.mkl._mkl_linear.default.
+    """
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ) -> None:
+        super().__init__(
+            layout,
+            inputs,
+            constant_args,
+            None,
+            op_overload=torch.ops.mkl._mkl_linear.default,
+        )
+        # C++ signature of the op; codegen() hands it to the wrapper.
+        self.cpp_op_schema = """
+            at::Tensor(
+                const at::Tensor& self,
+                const at::Tensor& mkl_weight_t,
+                const at::Tensor& origin_weight_t,
+                const std::optional<at::Tensor>& bias_opt,
+                const int64_t prepack_batch_size)"""
+    def codegen(self, wrapper):
+        """Emit the extern kernel call through the wrapper."""
+        wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            self.codegen_args(),
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+        )
+    @classmethod
+    def create(cls, x, packed_w, orig_w, B, batch_size):
+        """
+        Build an MKLPackedLinear node with a contiguous FixedLayout whose size
+        is the leading dims of x followed by the first dim of orig_w. A missing
+        bias B is passed as a leading None constant instead of an input.
+        """
+        x = cls.require_stride1(cls.realize_input(x))
+        orig_w = cls.require_stride1(cls.realize_input(orig_w))
+        *leading_dims, _ = x.get_size()
+        out_features, _ = orig_w.get_size()
+        output_size = [*leading_dims, out_features]
+        output_stride = FlexibleLayout.contiguous_strides(output_size)
+        inputs = [x, packed_w, orig_w]
+        if B is None:
+            constant_args = [None, batch_size]
+        else:
+            inputs.append(B)
+            constant_args = [batch_size]
+        kernel_layout = FixedLayout(
+            x.get_device(), x.get_dtype(), output_size, output_stride
+        )
+        return MKLPackedLinear(
+            layout=kernel_layout,
+            inputs=inputs,
+            constant_args=constant_args,
+        )
+class LinearUnary(ExternKernelAlloc):
+    """
+    Extern kernel node that calls torch.ops.mkldnn._linear_pointwise.default.
+    """
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ) -> None:
+        super().__init__(
+            layout,
+            inputs,
+            constant_args,
+            None,
+            op_overload=torch.ops.mkldnn._linear_pointwise.default,
+        )
+        self.cpp_kernel_key = "linear_pointwise"
+        # C++ signature of the op; codegen() hands it to the wrapper.
+        self.cpp_op_schema = """
+            at::Tensor(
+                const at::Tensor& input_t,
+                const at::Tensor& weight_t,
+                const std::optional<at::Tensor>& bias_opt,
+                c10::string_view attr,
+                torch::List<std::optional<at::Scalar>> scalars,
+                std::optional<c10::string_view> algorithm)"""
+    def codegen(self, wrapper):
+        """Emit the extern kernel call through the wrapper."""
+        wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            self.codegen_args(),
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+        )
+    @classmethod
+    def create(cls, x, w, B, attr, scalars, algorithm):
+        """
+        Build a LinearUnary node with a FlexibleLayout whose size is the leading
+        dims of x followed by the first dim of w. All tensor operands are
+        realized and required to be contiguous; a missing bias B is passed as a
+        leading None constant instead of an input.
+        """
+        x = cls.require_contiguous(cls.realize_input(x))
+        w = cls.require_contiguous(cls.realize_input(w))
+        *leading_dims, _ = x.get_size()
+        out_features, _ = w.get_size()
+        # An empty or missing scalars list is replaced by the placeholder [-1].
+        post_op_args = [attr, scalars or [-1], algorithm]
+        if B is None:
+            inputs = [x, w]
+            constant_args = [None, *post_op_args]
+        else:
+            inputs = [x, w, cls.require_contiguous(cls.realize_input(B))]
+            constant_args = post_op_args
+        kernel_layout = FlexibleLayout(
+            device=x.get_device(),
+            dtype=x.get_dtype(),
+            size=[*leading_dims, out_features],
+        )
+        return LinearUnary(
+            layout=kernel_layout,
+            inputs=inputs,
+            constant_args=constant_args,
+        )
+    def apply_constraint(self):
+        """Do nothing."""
+        pass
+class LinearBinary(ExternKernelAlloc):
+    """Extern-kernel IR node bound to ``torch.ops.mkldnn._linear_pointwise.binary``."""
+    kernel = "torch.ops.mkldnn._linear_pointwise.binary"
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ) -> None:
+        """Register the op overload and the C++ schema string."""
+        binary_overload = torch.ops.mkldnn._linear_pointwise.binary
+        super().__init__(layout, inputs, constant_args, None, op_overload=binary_overload)
+        self.cpp_op_schema = """
+            at::Tensor(
+                const at::Tensor& input_t,
+                const at::Tensor& other_t,
+                const at::Tensor& weight_t,
+                const std::optional<at::Tensor>& bias_opt,
+                c10::string_view attr)
+        """
+    def codegen(self, wrapper):
+        """Emit the extern-kernel call, including the overload name, through ``wrapper``."""
+        emit = wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed
+        emit(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            self.codegen_args(),
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+            self.cpp_kernel_overload_name,
+        )
+    @classmethod
+    def create(cls, x, y, w, B, attr):
+        """Build a ``LinearBinary`` node with a flexible output layout.
+        ``x``, ``y``, ``w`` and (when given) ``B`` are realized and required to
+        be contiguous. When ``B`` is None it is put first in the constant args
+        as a placeholder instead of being a tensor input.
+        """
+        x = cls.require_contiguous(cls.realize_input(x))
+        y = cls.require_contiguous(cls.realize_input(y))
+        w = cls.require_contiguous(cls.realize_input(w))
+        *leading_dims, _ = x.get_size()
+        out_features, _ = w.get_size()
+        tensor_inputs = [x, y, w]
+        const_inputs = [attr]
+        if B is None:
+            const_inputs.insert(0, B)
+        else:
+            B = cls.require_contiguous(cls.realize_input(B))
+            tensor_inputs.append(B)
+        out_layout = FlexibleLayout(
+            device=x.get_device(),
+            dtype=x.get_dtype(),
+            size=[*leading_dims, out_features],
+        )
+        return LinearBinary(
+            layout=out_layout,
+            inputs=tensor_inputs,
+            constant_args=const_inputs,
+        )
+    def apply_constraint(self):
+        """Intentionally a no-op for this node."""
+class QLinearPointwisePT2E(ExternKernelAlloc):
+    """Extern-kernel IR node for ``torch.ops.onednn.qlinear_pointwise``.
+    The ``tensor`` overload is used when the activation scale and zero point
+    are tensors, the ``default`` overload otherwise.
+    """
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+        has_bias=True,
+        x_scale_zp_are_tensors=False,
+    ) -> None:
+        """
+        Argument layout read back by ``codegen``:
+            - inputs = [x, w, (bias when has_bias), (x_scale, x_zp when they
+              are tensors), weight_scale, weight_zp]
+            - constant_args = [(bias slot when not has_bias), (x_scale, x_zp
+              when they are scalars), o_scale, o_zp, output_dtype, unary_attr,
+              unary_scalars, unary_algorithm]
+        """
+        self.has_bias = has_bias
+        self.x_scale_zp_are_tensors = x_scale_zp_are_tensors
+        if x_scale_zp_are_tensors:
+            overload = torch.ops.onednn.qlinear_pointwise.tensor
+        else:
+            overload = torch.ops.onednn.qlinear_pointwise.default
+        super().__init__(layout, inputs, constant_args, None, op_overload=overload)
+        if x_scale_zp_are_tensors:
+            x_scale_type_str, x_zp_type_str = "at::Tensor", "at::Tensor"
+        else:
+            x_scale_type_str, x_zp_type_str = "double", "int64_t"
+        self.cpp_op_schema = f"""
+            at::Tensor(
+                at::Tensor act,
+                {x_scale_type_str} act_scale,
+                {x_zp_type_str} act_zero_point,
+                at::Tensor weight,
+                at::Tensor weight_scales,
+                at::Tensor weight_zero_points,
+                std::optional<at::Tensor> bias,
+                double output_scale,
+                int64_t output_zero_point,
+                std::optional<c10::ScalarType> output_dtype,
+                c10::string_view post_op_name,
+                torch::List<std::optional<at::Scalar>> post_op_args,
+                c10::string_view post_op_algorithm)"""
+    def codegen(self, wrapper):
+        """Reorder inputs and constants into the op's positional order and emit the call."""
+        # Parse the inputs and constants.
+        # The raw_args setup can be skipped if there is a C shim implementation
+        refs = [node.codegen_reference() for node in self.inputs]
+        consts = list(self.codegen_const_args())
+        act, act_raw = refs[0], self.inputs[0]
+        weight, weight_raw = refs[1], self.inputs[1]
+        # Without a tensor bias, the first constant holds the bias slot.
+        bias = refs[2] if self.has_bias else consts[0]
+        bias_raw = self.inputs[2] if self.has_bias else self.constant_args[0]
+        # Weight scale / zero point are always the last two tensor inputs.
+        w_scale, w_zp = refs[-2], refs[-1]
+        w_scale_raw, w_zp_raw = self.inputs[-2], self.inputs[-1]
+        if self.x_scale_zp_are_tensors:
+            assert len(refs) >= 4
+            # Activation scale / zero point sit right before the weight ones.
+            act_scale, act_zp = refs[-4], refs[-3]
+            act_scale_raw, act_zp_raw = self.inputs[-4], self.inputs[-3]
+            (
+                out_scale,
+                out_zp,
+                out_dtype,
+                post_op,
+                post_op_scalars,
+                post_op_algo,
+            ) = consts[-6:]
+            (
+                out_scale_raw,
+                out_zp_raw,
+                out_dtype_raw,
+                post_op_raw,
+                post_op_scalars_raw,
+                post_op_algo_raw,
+            ) = self.constant_args[-6:]
+        else:
+            assert len(consts) >= 8
+            # Scalar activation scale / zero point lead the trailing constants.
+            (
+                act_scale,
+                act_zp,
+                out_scale,
+                out_zp,
+                out_dtype,
+                post_op,
+                post_op_scalars,
+                post_op_algo,
+            ) = consts[-8:]
+            (
+                act_scale_raw,
+                act_zp_raw,
+                out_scale_raw,
+                out_zp_raw,
+                out_dtype_raw,
+                post_op_raw,
+                post_op_scalars_raw,
+                post_op_algo_raw,
+            ) = self.constant_args[-8:]
+        # Same order as the parameters in ``cpp_op_schema``.
+        codegen_args = (
+            act,
+            act_scale,
+            act_zp,
+            weight,
+            w_scale,
+            w_zp,
+            bias,
+            out_scale,
+            out_zp,
+            out_dtype,
+            post_op,
+            post_op_scalars,
+            post_op_algo,
+        )
+        raw_args = (
+            act_raw,
+            act_scale_raw,
+            act_zp_raw,
+            weight_raw,
+            w_scale_raw,
+            w_zp_raw,
+            bias_raw,
+            out_scale_raw,
+            out_zp_raw,
+            out_dtype_raw,
+            post_op_raw,
+            post_op_scalars_raw,
+            post_op_algo_raw,
+        )
+        emit = wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed
+        emit(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            codegen_args,
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+            self.cpp_kernel_overload_name,
+            self.op_overload,
+            raw_args,
+        )
+        if isinstance(self.layout, Layout):
+            self.codegen_size_asserts(wrapper)
+    @classmethod
+    def create(
+        cls,
+        qx: "TensorBox",
+        x_scale: float,
+        x_zero_point: int,
+        qw: "TensorBox",  # packed_weight
+        w_scale: "TensorBox",
+        w_zero_point: "TensorBox",
+        bias: "TensorBox",
+        output_scale: float,
+        output_zero_point: int,
+        output_dtype,
+        post_op_name,
+        post_op_args,
+        post_op_algorithm,
+    ):
+        """Build a ``QLinearPointwisePT2E`` node.
+        ``x_scale`` / ``x_zero_point`` may both be ``TensorBox`` (appended to
+        the tensor inputs) or a ``float`` / ``int`` pair (appended to the
+        constant args). Any other combination fails the assertion.
+        """
+        inputs, constant_args, kernel_layout, _ = _prepare_linear_fusion_create(
+            cls,
+            qx,
+            qw,
+            bias,
+        )
+        x_scale_zp_are_tensors = isinstance(x_scale, TensorBox) and isinstance(
+            x_zero_point, TensorBox
+        )
+        if x_scale_zp_are_tensors:
+            x_scale.realize()
+            x_zero_point.realize()
+            inputs = inputs + [x_scale, x_zero_point]
+        else:
+            assert isinstance(x_scale, float) and isinstance(x_zero_point, int)
+            constant_args = constant_args + [x_scale, x_zero_point]
+        w_scale.realize()
+        w_zero_point.realize()
+        inputs = inputs + [w_scale, w_zero_point]
+        trailing_consts = [
+            output_scale,
+            output_zero_point,
+            output_dtype,
+            post_op_name,
+            may_convert_to_optional(post_op_args),
+            post_op_algorithm,
+        ]
+        constant_args = constant_args + trailing_consts
+        assert output_dtype is not None
+        if output_dtype in (torch.float32, torch.bfloat16):
+            # in _prepare_linear_fusion_create, we use x.dtype (uint8) to create kernel_layout
+            # if we set fp32_output, the output buf should be dtype float32 instead of uint8.
+            kernel_layout.dtype = output_dtype
+        return QLinearPointwisePT2E(
+            layout=kernel_layout,
+            inputs=inputs,
+            constant_args=constant_args,
+            has_bias=(bias is not None),
+            x_scale_zp_are_tensors=x_scale_zp_are_tensors,
+        )
+class QLinearPointwiseBinaryPT2E(ExternKernelAlloc):
+    """Extern-kernel IR node for the binary overloads of
+    ``torch.ops.onednn.qlinear_pointwise``.
+    ``binary_tensor`` is used when the activation scale and zero point are
+    tensors, ``binary`` otherwise.
+    """
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+        has_bias=True,
+        x_scale_zp_are_tensors=False,
+    ) -> None:
+        """
+        Argument layout read back by ``codegen``:
+            - inputs = [x, w, (bias when has_bias), (x_scale, x_zp when they
+              are tensors), weight_scale, weight_zp, other]
+            - constant_args = [(bias slot when not has_bias), (x_scale, x_zp
+              when they are scalars), o_scale, o_zp, output_dtype, other_scale,
+              other_zp, binary_attr, alpha, unary_attr, unary_scalars,
+              unary_algorithm]
+        """
+        self.has_bias = has_bias
+        self.x_scale_zp_are_tensors = x_scale_zp_are_tensors
+        super().__init__(
+            layout,
+            inputs,
+            constant_args,
+            None,
+            op_overload=torch.ops.onednn.qlinear_pointwise.binary_tensor
+            if x_scale_zp_are_tensors
+            else torch.ops.onednn.qlinear_pointwise.binary,
+        )
+        x_scale_type_str, x_zp_type_str = (
+            ("at::Tensor", "at::Tensor")
+            if x_scale_zp_are_tensors
+            else ("double", "int64_t")
+        )
+        self.cpp_op_schema = f"""
+            at::Tensor(
+                at::Tensor act,
+                {x_scale_type_str} act_scale,
+                {x_zp_type_str} act_zero_point,
+                at::Tensor weight,
+                at::Tensor weight_scales,
+                at::Tensor weight_zero_points,
+                std::optional<at::Tensor> other,
+                std::optional<at::Tensor> bias,
+                double inv_output_scale,
+                int64_t output_zero_point,
+                std::optional<c10::ScalarType> output_dtype,
+                double other_scale,
+                int64_t other_zero_point,
+                c10::string_view binary_post_op,
+                double binary_alpha,
+                c10::string_view unary_post_op,
+                torch::List<std::optional<at::Scalar>> unary_post_op_args,
+                c10::string_view unary_post_op_algorithm)"""
+    def codegen(self, wrapper):
+        """Reorder inputs and constants into the op's positional order and emit the call."""
+        # Parse the inputs and constants.
+        # The raw_args setup can be skipped if there is a C shim implementation
+        args = [x.codegen_reference() for x in self.inputs]
+        const_args = []
+        const_args.extend(self.codegen_const_args())
+        x = args[0]
+        x_raw = self.inputs[0]
+        packed_weight = args[1]
+        packed_weight_raw = self.inputs[1]
+        # Without a tensor bias, the first constant holds the bias slot.
+        bias = args[2] if self.has_bias else const_args[0]
+        bias_raw = self.inputs[2] if self.has_bias else self.constant_args[0]
+        # The last three tensor inputs are weight scale, weight zero point, other.
+        w_scale, w_zp, other = args[-3], args[-2], args[-1]
+        w_scale_raw, w_zp_raw, other_raw = (
+            self.inputs[-3],
+            self.inputs[-2],
+            self.inputs[-1],
+        )
+        if self.x_scale_zp_are_tensors:
+            assert len(args) >= 5
+            # Ten trailing constants are unpacked below.
+            assert len(const_args) >= 10
+            x_scale, x_zp = args[-5], args[-4]
+            x_scale_raw, x_zp_raw = self.inputs[-5], self.inputs[-4]
+            (
+                o_scale,
+                o_zp,
+                output_dtype,
+                other_scale,
+                other_zp,
+                binary_attr,
+                alpha,
+                unary_attr,
+                unary_scalars,
+                unary_algorithm,
+            ) = const_args[-10:]
+            (
+                o_scale_raw,
+                o_zp_raw,
+                output_dtype_raw,
+                other_scale_raw,
+                other_zp_raw,
+                binary_attr_raw,
+                alpha_raw,
+                unary_attr_raw,
+                unary_scalars_raw,
+                unary_algorithm_raw,
+            ) = self.constant_args[-10:]
+        else:
+            # Twelve trailing constants are unpacked below (scalar x_scale and
+            # x_zp plus the ten shared ones), so the bound must be 12, not 8.
+            assert len(const_args) >= 12
+            (
+                x_scale,
+                x_zp,
+                o_scale,
+                o_zp,
+                output_dtype,
+                other_scale,
+                other_zp,
+                binary_attr,
+                alpha,
+                unary_attr,
+                unary_scalars,
+                unary_algorithm,
+            ) = const_args[-12:]
+            (
+                x_scale_raw,
+                x_zp_raw,
+                o_scale_raw,
+                o_zp_raw,
+                output_dtype_raw,
+                other_scale_raw,
+                other_zp_raw,
+                binary_attr_raw,
+                alpha_raw,
+                unary_attr_raw,
+                unary_scalars_raw,
+                unary_algorithm_raw,
+            ) = self.constant_args[-12:]
+        # Same order as the parameters in ``cpp_op_schema``.
+        codegen_args = (
+            x,
+            x_scale,
+            x_zp,
+            packed_weight,
+            w_scale,
+            w_zp,
+            other,
+            bias,
+            o_scale,
+            o_zp,
+            output_dtype,
+            other_scale,
+            other_zp,
+            binary_attr,
+            alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
+        )
+        raw_args = (
+            x_raw,
+            x_scale_raw,
+            x_zp_raw,
+            packed_weight_raw,
+            w_scale_raw,
+            w_zp_raw,
+            other_raw,
+            bias_raw,
+            o_scale_raw,
+            o_zp_raw,
+            output_dtype_raw,
+            other_scale_raw,
+            other_zp_raw,
+            binary_attr_raw,
+            alpha_raw,
+            unary_attr_raw,
+            unary_scalars_raw,
+            unary_algorithm_raw,
+        )
+        wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed(
+            self.get_name(),
+            self.python_kernel_name,
+            self.cpp_kernel_name,
+            codegen_args,
+            self.cpp_op_schema,
+            self.cpp_kernel_key,
+            self.cpp_kernel_overload_name,
+            self.op_overload,
+            raw_args,
+        )
+        if isinstance(self.layout, Layout):
+            self.codegen_size_asserts(wrapper)
+    def get_mutation_names(self):
+        """Return the name of ``other`` when the binary post op is "sum", else nothing."""
+        # constant_args ends with [..., binary_post_op, binary_alpha,
+        # unary_post_op, unary_post_op_args, unary_post_op_algorithm].
+        binary_post_op = self.constant_args[-5]
+        if binary_post_op == "sum":
+            return [self.inputs[-1].get_name()]
+        else:
+            return []
+    @classmethod
+    def create(
+        cls,
+        qx: "TensorBox",
+        x_scale: float,
+        x_zero_point: int,
+        qw: "TensorBox",  # packed_weight
+        w_scale: "TensorBox",
+        w_zero_point: "TensorBox",
+        other: "TensorBox",
+        bias: "TensorBox",
+        output_scale: float,
+        output_zero_point: int,
+        output_dtype,
+        other_scale,
+        other_zp,
+        binary_post_op,
+        binary_alpha,
+        unary_post_op,
+        unary_post_op_args,
+        unary_post_op_algorithm,
+    ):
+        """Build a ``QLinearPointwiseBinaryPT2E`` node.
+        ``x_scale`` / ``x_zero_point`` may both be ``TensorBox`` (appended to
+        the tensor inputs) or a ``float`` / ``int`` pair (appended to the
+        constant args). For the "sum" post op, ``other`` is marked as mutated
+        and returned in place of a new output buffer.
+        """
+        (
+            inputs,
+            constant_args,
+            kernel_layout,
+            req_stride_order,
+        ) = _prepare_linear_fusion_create(
+            cls,
+            qx,
+            qw,
+            bias,
+        )
+        if isinstance(x_scale, TensorBox) and isinstance(x_zero_point, TensorBox):
+            x_scale.realize()
+            x_zero_point.realize()
+            inputs = inputs + [x_scale, x_zero_point]
+            x_scale_zp_are_tensors = True
+        else:
+            assert isinstance(x_scale, float) and isinstance(x_zero_point, int)
+            constant_args = constant_args + [x_scale, x_zero_point]
+            x_scale_zp_are_tensors = False
+        w_scale.realize()
+        w_zero_point.realize()
+        inputs = inputs + [w_scale, w_zero_point]
+        if binary_post_op == "sum":
+            other = cls.require_stride_order(other, req_stride_order)
+        # ``other`` must stay the last tensor input; codegen and
+        # get_mutation_names both read it from inputs[-1].
+        inputs.append(other)
+        constant_args = constant_args + [
+            output_scale,
+            output_zero_point,
+            output_dtype,
+            other_scale,
+            other_zp,
+            binary_post_op,
+            binary_alpha,
+            unary_post_op,
+            may_convert_to_optional(unary_post_op_args),
+            unary_post_op_algorithm,
+        ]
+        if binary_post_op == "sum":
+            V.graph.mark_buffer_mutated(other.get_name())
+            packed = QLinearPointwiseBinaryPT2E(
+                layout=NoneLayout(other.get_device()),
+                inputs=inputs,
+                constant_args=constant_args,
+                has_bias=(bias is not None),
+                x_scale_zp_are_tensors=x_scale_zp_are_tensors,
+            )
+            # Return other since it has been inplace changed.
+            return packed.inputs[-1]
+        assert output_dtype is not None
+        if output_dtype in [torch.float32, torch.bfloat16]:
+            # in _prepare_linear_fusion_create, we use x.dtype (uint8) to create kernel_layout
+            # if we set fp32_output, the output buf should be dtype float32 instead of uint8.
+            kernel_layout.dtype = output_dtype
+        return QLinearPointwiseBinaryPT2E(
+            layout=kernel_layout,
+            inputs=inputs,
+            constant_args=constant_args,
+            has_bias=(bias is not None),
+            x_scale_zp_are_tensors=x_scale_zp_are_tensors,
+        )
+class MkldnnRnnLayer(ExternKernelAlloc):
+    """Extern-kernel IR node bound to ``torch.ops.aten.mkldnn_rnn_layer.default``."""
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ) -> None:
+        """Register the op overload on the base extern-kernel node."""
+        super().__init__(
+            layout,
+            inputs,
+            constant_args,
+            None,
+            op_overload=torch.ops.aten.mkldnn_rnn_layer.default,
+        )
+    @classmethod
+    def create(
+        cls,
+        x: "TensorBox",
+        w0: "TensorBox",
+        w1: "TensorBox",
+        w2: "TensorBox",
+        w3: "TensorBox",
+        hx: "TensorBox",
+        cx: "TensorBox",
+        reverse: bool,
+        batch_sizes: List[int],
+        mode: int,
+        hidden_size: int,
+        num_layers: int,
+        has_biases: bool,
+        bidirectional: bool,
+        batch_first: bool,
+        train: bool,
+    ):
+        """Build the multi-output RNN layer node and return its three outputs.
+        Returns a list of three ``MultiOutput`` nodes with fixed contiguous
+        layouts: the layer output of size ``[seq_length, mini_batch,
+        hidden_size]``, then one output sized like ``hx`` and one sized like
+        ``cx``. Raises ``AssertionError`` when ``x`` is not 3D.
+        """
+        x = cls.require_stride1(cls.realize_input(x))
+        # If batch_first, x has been permuted in lstm before entering the mkldnn_rnn_layer.
+        # Make sure x is contiguous in batch_first case.
+        x.freeze_layout()
+        w0 = cls.require_stride1(cls.realize_input(w0))
+        w1 = cls.require_stride1(cls.realize_input(w1))
+        w2 = cls.require_stride1(cls.realize_input(w2))
+        w3 = cls.require_stride1(cls.realize_input(w3))
+        hx = cls.require_stride1(cls.realize_input(hx))
+        hx.freeze_layout()
+        cx = cls.require_stride1(cls.realize_input(cx))
+        cx.freeze_layout()
+        x_shape = x.get_size()
+        assert len(x_shape) == 3, "Expect lstm input to be 3D"
+        # batch_first is handled in the lstm OP. When entering
+        # rnn_layer here, we'll always have batch_first = False
+        seq_length, mini_batch, _ = x_shape
+        output_shape = [seq_length, mini_batch, hidden_size]
+        hy_shape = hx.get_size()
+        cy_shape = cx.get_size()
+        inputs = [x, w0, w1, w2, w3, hx, cx]
+        constant_args = [
+            reverse,
+            batch_sizes,
+            mode,
+            hidden_size,
+            num_layers,
+            has_biases,
+            bidirectional,
+            batch_first,
+            train,
+        ]
+        packed = MkldnnRnnLayer(
+            MultiOutputLayout(x.get_device()),
+            inputs=inputs,
+            constant_args=constant_args,
+        )
+        # All three outputs are laid out contiguously, regardless of batch_first.
+        output_sizes = [output_shape, hy_shape, cy_shape]
+        output_strides = [
+            FlexibleLayout.contiguous_strides(size) for size in output_sizes
+        ]
+        # Output ``i`` is addressed as element ``i`` of the tuple returned by the kernel.
+        output_ir = [
+            MultiOutput(
+                FixedLayout(
+                    x.get_device(),
+                    x.get_dtype(),
+                    output_size,
+                    output_stride,
+                ),
+                packed,
+                [(tuple, i)],
+            )
+            for i, (output_size, output_stride) in enumerate(
+                zip(output_sizes, output_strides)
+            )
+        ]
+        return output_ir

.venv/lib/python3.11/site-packages/torch/_inductor/mkldnn_lowerings.py ADDED Viewed

	@@ -0,0 +1,1087 @@

+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+import functools
+from typing import List, Optional
+import torch
+import torch.utils._pytree as pytree
+from torch._inductor.kernel.mm_common import mm_args
+from . import ir
+from .codegen.cpp_gemm_template import CppPackedGemmTemplate
+from .codegen.cpp_utils import create_epilogue_with_attr
+from .ir import TensorBox
+from .lowering import (
+    add,
+    add_needs_realized_inputs,
+    aten,
+    permute,
+    register_lowering,
+    to_dtype,
+    view,
+)
+from .select_algorithm import (
+    autotune_select_algorithm,
+    ChoiceCaller,
+    ExternKernelChoice,
+)
+from .utils import use_aten_gemm_kernels, use_cpp_packed_gemm_template, use_max_autotune
+from .virtualized import ops, V
+def register_onednn_fusion_ops():
+    if torch._C._has_mkldnn:
+        from . import mkldnn_ir
+        aten_mkldnn_linear_unary = ExternKernelChoice(
+            torch.ops.mkldnn._linear_pointwise,
+            "mkldnn::_linear_pointwise",
+            has_out_variant=False,
+            kernel_creator=mkldnn_ir.LinearUnary.create,
+        )
+        aten_mkldnn_linear_binary = ExternKernelChoice(
+            torch.ops.mkldnn._linear_pointwise.binary,
+            "mkldnn::_linear_pointwise",
+            has_out_variant=False,
+            kernel_creator=mkldnn_ir.LinearBinary.create,
+        )
+        aten_mkldnn_qlinear_unary = ExternKernelChoice(
+            torch.ops.onednn.qlinear_pointwise,
+            "onednn::qlinear_pointwise",
+            has_out_variant=False,
+            kernel_creator=mkldnn_ir.QLinearPointwisePT2E.create,
+        )
+        aten_mkldnn_qlinear_binary = ExternKernelChoice(
+            torch.ops.onednn.qlinear_pointwise.binary,
+            "onednn::qlinear_pointwise",
+            has_out_variant=False,
+            kernel_creator=mkldnn_ir.QLinearPointwiseBinaryPT2E.create,
+        )
+        cpu_needs_realized_inputs = [
+            torch.ops.mkldnn._convolution_pointwise,
+            torch.ops.mkldnn._convolution_pointwise_,
+            torch.ops.mkldnn._convolution_transpose_pointwise,
+            torch.ops.mkldnn._linear_pointwise,
+            aten.mkldnn_rnn_layer.default,
+            torch.ops.onednn.qconv2d_pointwise,
+        ]
+        @register_lowering(torch.ops.mkldnn._convolution_pointwise)
+        def convolution_unary(
+            x: TensorBox,
+            weight: TensorBox,
+            bias: TensorBox,
+            padding,
+            stride,
+            dilation,
+            groups,
+            attr,
+            scalars,
+            algorithm,
+        ):
+            """Lower ``mkldnn._convolution_pointwise`` to a ``ConvolutionUnary`` IR node."""
+            conv_node = mkldnn_ir.ConvolutionUnary.create(
+                x,
+                weight,
+                bias,
+                padding,
+                stride,
+                dilation,
+                groups,
+                attr,
+                scalars,
+                algorithm,
+            )
+            return TensorBox.create(conv_node)
+        @register_lowering(torch.ops.mkldnn._convolution_pointwise.binary)
+        def convolution_binary(
+            x: TensorBox,
+            other: TensorBox,
+            weight: TensorBox,
+            bias: TensorBox,
+            padding,
+            stride,
+            dilation,
+            groups,
+            binary_attr,
+            binary_alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
+        ):
+            """Lower ``mkldnn._convolution_pointwise.binary`` to a ``ConvolutionBinary`` IR node."""
+            conv_node = mkldnn_ir.ConvolutionBinary.create(
+                x,
+                other,
+                weight,
+                bias,
+                padding,
+                stride,
+                dilation,
+                groups,
+                binary_attr,
+                binary_alpha,
+                unary_attr,
+                unary_scalars,
+                unary_algorithm,
+            )
+            return TensorBox.create(conv_node)
+        @register_lowering(torch.ops.mkldnn._convolution_pointwise_.binary)
+        def convolution_binary_inplace(
+            x: TensorBox,
+            other: TensorBox,
+            weight: TensorBox,
+            bias: TensorBox,
+            padding,
+            stride,
+            dilation,
+            groups,
+            binary_attr,
+            binary_alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
+        ):
+            """Lower ``mkldnn._convolution_pointwise_.binary`` to a ``ConvolutionBinaryInplace`` IR node."""
+            conv_node = mkldnn_ir.ConvolutionBinaryInplace.create(
+                x,
+                other,
+                weight,
+                bias,
+                padding,
+                stride,
+                dilation,
+                groups,
+                binary_attr,
+                binary_alpha,
+                unary_attr,
+                unary_scalars,
+                unary_algorithm,
+            )
+            return TensorBox.create(conv_node)
+        @register_lowering(torch.ops.mkldnn._linear_pointwise)
+        def linear_unary(
+            x: TensorBox,
+            w: TensorBox,
+            b: TensorBox,
+            attr,
+            scalars,
+            algorithm,
+            layout=None,
+        ):
+            """Lower ``mkldnn._linear_pointwise`` through ``autotune_select_algorithm``.
+            Under max-autotune the C++ packed GEMM template may be added as a
+            choice; the mkldnn extern kernel is added when no choice exists yet
+            or when aten GEMM kernels are enabled. Inputs with more than two
+            dims are flattened to 2D first and the result is reshaped back.
+            """
+            orig_size = x.get_size()
+            needs_flatten = len(orig_size) > 2
+            if needs_flatten:
+                # GEMM template needs 2D input, normalize input shape here
+                x = view(x, [-1, orig_size[-1]])
+            if b is not None:
+                b = ir.ExternKernel.realize_input(b)
+            def operands():
+                # Fresh list on every call; ``x`` may be rebound by mm_args below.
+                return [x, w] if b is None else [x, w, b]
+            choices: List[ChoiceCaller] = []
+            if use_max_autotune():
+                w_t = permute(w, [1, 0])
+                *_, layout, x, w_t = mm_args(x, w_t, layout=layout)
+                if use_cpp_packed_gemm_template(layout, x, w_t):
+                    def epilogue_creator(buf):
+                        return create_epilogue_with_attr(
+                            buf, attr, scalars=scalars, algorithm=algorithm
+                        )
+                    template_kwargs = {
+                        "has_bias": b is not None,
+                        "trans_w": True,
+                        "epilogue_creator": None if attr == "none" else epilogue_creator,
+                    }
+                    if b is not None:
+                        template_kwargs["input_indices"] = [2, 0, 1]  # type: ignore[assignment]
+                    CppPackedGemmTemplate.add_choices(
+                        choices,
+                        layout,
+                        operands(),
+                        **template_kwargs,  # type: ignore[arg-type]
+                    )
+            if not choices or use_aten_gemm_kernels():
+                extern_kwargs = {"attr": attr, "scalars": scalars, "algorithm": algorithm}
+                if b is None:
+                    extern_kwargs["B"] = None
+                extern_choice = aten_mkldnn_linear_unary.bind(
+                    operands(),
+                    layout,
+                    **extern_kwargs,
+                )
+                choices.append(extern_choice)
+            assert w.get_name() in V.graph.constants
+            # Input 1 (the weight) is taken from the graph constants.
+            input_gen_fns = {
+                1: lambda x: V.graph.constants[x.get_name()],
+            }
+            result = autotune_select_algorithm(
+                "linear_unary",
+                choices,
+                operands(),
+                layout,
+                input_gen_fns=input_gen_fns,
+            )
+            if needs_flatten:
+                result = view(result, (*orig_size[:-1], result.get_size()[-1]))
+            return result
+        @register_lowering(torch.ops.mkldnn._linear_pointwise.binary)
+        def linear_binary(
+            x: TensorBox, y: TensorBox, w: TensorBox, b: TensorBox, attr, layout=None
+        ):
+            """Lower ``mkldnn._linear_pointwise.binary`` through ``autotune_select_algorithm``.
+            Under max-autotune the C++ packed GEMM template may be added as a
+            choice with ``y`` applied in the epilogue; the mkldnn extern kernel
+            is added when no choice exists yet or when aten GEMM kernels are
+            enabled. ``x`` and ``y`` with more than two dims are flattened to
+            2D first and the result is reshaped back to the leading dims of ``x``.
+            """
+            orig_size = x.get_size()
+            needs_flatten = len(orig_size) > 2
+            if needs_flatten:
+                # GEMM template needs 2D input, normalize input shape here
+                x = view(x, [-1, orig_size[-1]])
+            other_size = y.get_size()
+            if len(other_size) > 2:
+                y = view(y, [-1, other_size[-1]])
+            if b is not None:
+                b = ir.ExternKernel.realize_input(b)
+            def operands():
+                # Fresh list on every call; ``x`` and ``y`` may be rebound by mm_args below.
+                return [x, y, w] if b is None else [x, y, w, b]
+            choices: List[ChoiceCaller] = []
+            if use_max_autotune():
+                w_t = permute(w, [1, 0])
+                *_, layout, x, w_t, y = mm_args(x, w_t, y, layout=layout)
+                if use_cpp_packed_gemm_template(layout, x, w_t):
+                    def epilogue_creator(buf):
+                        return create_epilogue_with_attr(buf, attr, other=y)
+                    template_kwargs = {
+                        "has_bias": b is not None,
+                        "trans_w": True,
+                        "epilogue_creator": epilogue_creator,
+                    }
+                    template_kwargs["input_indices"] = (
+                        [0, 2, 1] if b is None else [3, 0, 2, 1]
+                    )
+                    CppPackedGemmTemplate.add_choices(
+                        choices,
+                        layout,
+                        operands(),
+                        **template_kwargs,  # type: ignore[arg-type]
+                    )
+            if not choices or use_aten_gemm_kernels():
+                extern_kwargs = {"attr": attr}
+                if b is None:
+                    extern_kwargs["B"] = None
+                extern_choice = aten_mkldnn_linear_binary.bind(
+                    operands(),
+                    layout,
+                    **extern_kwargs,
+                )
+                choices.append(extern_choice)
+            assert w.get_name() in V.graph.constants
+            # Input 2 (the weight) is taken from the graph constants.
+            input_gen_fns = {
+                2: lambda x: V.graph.constants[x.get_name()],
+            }
+            result = autotune_select_algorithm(
+                "linear_binary",
+                choices,
+                operands(),
+                layout,
+                input_gen_fns=input_gen_fns,
+            )
+            if needs_flatten:
+                result = view(result, (*orig_size[:-1], result.get_size()[-1]))
+            return result
+        @register_lowering(torch.ops.mkldnn._convolution_transpose_pointwise)
+        def convolution_transpose_unary(
+            x: TensorBox,
+            weight: TensorBox,
+            bias: TensorBox,
+            padding,
+            output_padding,
+            stride,
+            dilation,
+            groups,
+            attr,
+            scalars,
+            algorithm,
+        ):
+            return TensorBox.create(
+                mkldnn_ir.ConvolutionTransposeUnary.create(
+                    x,
+                    weight,
+                    bias,
+                    padding,
+                    output_padding,
+                    stride,
+                    dilation,
+                    groups,
+                    attr,
+                    scalars,
+                    algorithm,
+                )
+            )
+        @register_lowering(aten.mkldnn_rnn_layer.default)
+        def mkldnn_rnn_layer(
+            x: TensorBox,
+            w0: TensorBox,
+            w1: TensorBox,
+            w2: TensorBox,
+            w3: TensorBox,
+            hx: TensorBox,
+            cx: TensorBox,
+            reverse: bool,
+            batch_sizes: List[int],
+            mode: int,
+            hidden_size: int,
+            num_layers: int,
+            has_biases: bool,
+            bidirectional: bool,
+            batch_first: bool,
+            train: bool,
+        ):
+            return pytree.tree_map(
+                TensorBox.create,
+                mkldnn_ir.MkldnnRnnLayer.create(
+                    x,
+                    w0,
+                    w1,
+                    w2,
+                    w3,
+                    hx,
+                    cx,
+                    reverse,
+                    batch_sizes,
+                    mode,
+                    hidden_size,
+                    num_layers,
+                    has_biases,
+                    bidirectional,
+                    batch_first,
+                    train,
+                ),
+            )
+        @register_lowering(torch.ops.onednn.qconv2d_pointwise, type_promotion_kind=None)
+        def qconvolution_unary(
+            x: TensorBox,
+            x_scale,
+            x_zp,
+            packed_weight: TensorBox,
+            w_scale: TensorBox,
+            w_zp: TensorBox,
+            bias: TensorBox,
+            stride,
+            padding,
+            dilation,
+            groups,
+            o_inv_scale,
+            o_zero_point,
+            output_dtype,
+            attr,
+            scalars,
+            algorithm,
+        ):
+            return TensorBox.create(
+                mkldnn_ir.QConvPointWisePT2E.create(
+                    x,
+                    x_scale,
+                    x_zp,
+                    packed_weight,
+                    w_scale,
+                    w_zp,
+                    bias,
+                    stride,
+                    padding,
+                    dilation,
+                    groups,
+                    o_inv_scale,
+                    o_zero_point,
+                    output_dtype,
+                    attr,
+                    scalars,
+                    algorithm,
+                )
+            )
+        @register_lowering(
+            torch.ops.onednn.qconv2d_pointwise.binary, type_promotion_kind=None
+        )
+        def qconvolution_binary(
+            x: TensorBox,
+            x_scale,
+            x_zp,
+            accum: TensorBox,
+            accum_scale,
+            accum_zp,
+            packed_weight: TensorBox,
+            w_scale: TensorBox,
+            w_zp: TensorBox,
+            bias: TensorBox,
+            stride,
+            padding,
+            dilation,
+            groups,
+            o_inv_scale,
+            o_zero_point,
+            output_dtype,
+            binary_attr,
+            alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithmm,
+        ):
+            if (
+                binary_attr == "sum"
+                and output_dtype in [torch.float32, torch.bfloat16]
+                and accum.get_dtype() in [torch.float32, torch.bfloat16]
+                and accum.get_dtype() != output_dtype
+            ):
+                # For int8-mixed-bf16 quantization and inplace add,
+                # there is case when accum dtype is float32 but output dtype is bfloat16.
+                # Since the accum will be inplaced changed with post op sum,
+                # we will do accum dtype convertion here.
+                accum = to_dtype(accum, output_dtype)
+            return TensorBox.create(
+                mkldnn_ir.QConvPointWiseBinaryPT2E.create(
+                    x,
+                    x_scale,
+                    x_zp,
+                    accum,
+                    accum_scale,
+                    accum_zp,
+                    packed_weight,
+                    w_scale,
+                    w_zp,
+                    bias,
+                    stride,
+                    padding,
+                    dilation,
+                    groups,
+                    o_inv_scale,
+                    o_zero_point,
+                    output_dtype,
+                    binary_attr,
+                    alpha,
+                    unary_attr,
+                    unary_scalars,
+                    unary_algorithmm,
+                )
+            )
+        @register_lowering(torch.ops.onednn.qlinear_pointwise, type_promotion_kind=None)
+        def qlinear_unary(
+            x: TensorBox,
+            x_scale,
+            x_zp,
+            packed_weight: TensorBox,
+            w_scale: TensorBox,
+            w_zp: TensorBox,
+            bias: TensorBox,
+            o_scale,
+            o_zero_point,
+            output_dtype,
+            attr,
+            scalars,
+            algorithm,
+            layout=None,
+        ):
+            # Lowering for onednn.qlinear_pointwise (quantized linear + unary post-op).
+            # Builds a list of candidate kernels and lets autotune_select_algorithm
+            # pick one:
+            #   * the C++ packed GEMM template with a dequant/bias/post-op/requant
+            #     epilogue, only when use_max_autotune() is true and the zero-point
+            #     conditions checked below hold;
+            #   * the ATen extern kernel (aten_mkldnn_qlinear_unary), when no other
+            #     choice was added or use_aten_gemm_kernels() is true.
+            # x_scale / x_zp may be Python scalars or TensorBox values; bias may be
+            # None. Inputs with more than two dimensions are viewed as 2D and the
+            # leading dimensions of x are restored on the result.
+            x_size = x.get_size()
+            if len(x_size) > 2:
+                # GEMM template needs 2D input, normalize input shape here
+                x = view(x, [-1, x_size[-1]])
+            # Scalar activation scale / zero point are registered as graph tensor
+            # constants; tensor-valued ones are realized instead.
+            if not isinstance(x_scale, ir.TensorBox):
+                assert type(x_scale) == float
+                x_scale = V.graph.add_tensor_constant(
+                    torch.tensor(x_scale, dtype=torch.float32), name="x_scale"
+                )
+            else:
+                x_scale.realize()
+            if not isinstance(x_zp, ir.TensorBox):
+                assert type(x_zp) == int
+                x_zp = V.graph.add_tensor_constant(
+                    torch.tensor(x_zp, dtype=torch.int32), name="x_zp"
+                )
+            else:
+                x_zp.realize()
+            # When channels less than 8, w_scale/w_zp is Pointwise instead of ConstantBuffer
+            # Refer to https://github.com/pytorch/pytorch/blob
+            # /f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577
+            w_scale.realize()
+            w_zp.realize()
+            if w_zp.get_dtype() != torch.int32 and isinstance(
+                ir.InputsKernel.unwrap_storage_for_input(w_zp),
+                ir.ConstantBuffer,
+            ):
+                # W_zp might be a ConstantBuffer with int64, convert it to int32
+                w_zp_tensor = V.graph.constants[w_zp.get_name()].to(torch.int32)
+                # NOTE(review): w_zp_tensor is already int32 here; torch.tensor()
+                # on an existing tensor makes another copy and emits PyTorch's
+                # copy-construct UserWarning. Passing w_zp_tensor directly looks
+                # sufficient - confirm before changing.
+                w_zp = V.graph.add_tensor_constant(
+                    torch.tensor(w_zp_tensor, dtype=torch.int32), name=w_zp.get_name()
+                )
+            # Captured by the epilogue closure below; None when there is no bias.
+            bias_dtype = None if bias is None else bias.get_dtype()
+            choices: List[ChoiceCaller] = []
+            if use_max_autotune():
+                *_, layout, x, packed_weight = mm_args(
+                    x, packed_weight, layout=layout, out_dtype=output_dtype
+                )
+                # The template is only offered when x_zp is a 0-dim constant,
+                # w_zp is a constant equal to all zeros, and the packed GEMM
+                # template accepts this layout / operand pair.
+                if (
+                    isinstance(
+                        ir.InputsKernel.unwrap_storage_for_input(x_zp),
+                        ir.ConstantBuffer,
+                    )
+                    and len(x_zp.get_layout().size) == 0  # Per tensor quant of act
+                    and isinstance(
+                        ir.InputsKernel.unwrap_storage_for_input(w_zp),
+                        ir.ConstantBuffer,
+                    )
+                    and torch.equal(
+                        torch.zeros_like(V.graph.constants[w_zp.get_name()]),
+                        V.graph.constants[w_zp.get_name()],
+                    )  # We only compensate MatrixB and assume B_zp is 0 to avoid the compensation of MatrixA
+                    and use_cpp_packed_gemm_template(layout, x, packed_weight)
+                ):
+                    # Weight compensation term: the dense weight summed along
+                    # dim 0 (in float), registered as a new graph constant.
+                    W_tensor = V.graph.constants[packed_weight.get_name()].to_dense()
+                    weight_compens_tensor = torch.sum(W_tensor.to(torch.float), dim=0)
+                    weight_compens = V.graph.add_tensor_constant(
+                        weight_compens_tensor,
+                        name=packed_weight.get_name() + "_BMatrixCompens",
+                    )
+                    def epilogue_creator(input_buffer):
+                        # Builds the Pointwise epilogue applied to the GEMM output:
+                        # dequantize with zero-point compensation, add bias, apply
+                        # the unary post-op, then cast / requantize to output_dtype.
+                        # Epilogue to convert from s32 to f32 for u8s8f32
+                        assert output_dtype in [
+                            torch.float32,
+                            torch.bfloat16,
+                            torch.uint8,
+                        ]
+                        input_loader = input_buffer.make_loader()
+                        weight_compens_loader = weight_compens.make_loader()
+                        x_scale_loader = x_scale.make_loader()
+                        w_scale_loader = w_scale.make_loader()
+                        x_zp_loader = x_zp.make_loader()
+                        # bias is only read (never rebound) inside this closure.
+                        nonlocal bias
+                        bias_loader = None
+                        if bias is not None:
+                            bias_loader = bias.make_loader()
+                        def inner_fn(index):
+                            # Per-element computation:
+                            #   acc * x_scale * w_scale
+                            #   - x_scale * w_scale * x_zp * weight_compens
+                            #   (+ bias)
+                            nonlocal bias
+                            input = input_loader(index)
+                            # MicroKernel Output is with int32
+                            # cvt to FP32 before doing compensation
+                            input = ops.to_dtype(input, torch.float32)
+                            # w_scale, the compensation term and bias are indexed
+                            # by the last index component only.
+                            weight_compens_index = (index[-1],)
+                            _x_scale = x_scale_loader(())
+                            _x_zp = x_zp_loader(())
+                            _w_scale = w_scale_loader(weight_compens_index)
+                            _weight_compo = weight_compens_loader(weight_compens_index)
+                            # Step 1: Doing compensation to cvt fp32
+                            temp = ops.mul(
+                                ops.mul(
+                                    input,
+                                    _x_scale,
+                                ),
+                                _w_scale,
+                            )
+                            temp = ops.sub(
+                                temp,
+                                ops.mul(
+                                    ops.mul(
+                                        ops.mul(
+                                            _x_scale,
+                                            _w_scale,
+                                        ),
+                                        _x_zp,
+                                    ),
+                                    _weight_compo,
+                                ),
+                            )
+                            # Step 2: add Bias if applicable
+                            if bias is not None:
+                                _bias = bias_loader(weight_compens_index)
+                                nonlocal bias_dtype
+                                assert bias_dtype in [torch.float32, torch.bfloat16]
+                                # bf16 bias is upcast so the add happens in fp32.
+                                if bias_dtype == torch.bfloat16:
+                                    _bias = ops.to_dtype(_bias, torch.float32)
+                                temp = ops.add(temp, _bias)
+                            return temp
+                        output_buf = ir.Pointwise(
+                            device=input_buffer.get_device(),
+                            dtype=torch.float32,  # Hardcode to FP32 for u8s8f32
+                            inner_fn=inner_fn,
+                            ranges=input_buffer.get_size(),
+                        )
+                        # Step 3: Doing the unary post op fusion
+                        if attr != "none":
+                            output_buf = create_epilogue_with_attr(
+                                output_buf, attr, scalars=scalars, algorithm=algorithm
+                            )
+                        # Step 4: Cast output to Target Dtype
+                        # (float32 output needs no further conversion.)
+                        if output_dtype == torch.bfloat16:
+                            output_cast_loader = output_buf.make_loader()
+                            def inner_fn_cast_output_to_bf16(index):
+                                input = output_cast_loader(index)
+                                return ops.to_dtype(input, output_dtype)
+                            output_buf = ir.Pointwise(
+                                device=output_buf.get_device(),
+                                dtype=output_dtype,
+                                inner_fn=inner_fn_cast_output_to_bf16,
+                                ranges=output_buf.get_size(),
+                            )
+                        elif output_dtype == torch.uint8:
+                            from .lowering import _create_constants
+                            requant_input_loader = output_buf.make_loader()
+                            def inner_fn_requant(index, scale, zero_point):
+                                # Requantize: round(value * (1 / scale)) + zero_point,
+                                # clamped to [0, 255], then cast to uint8.
+                                input = requant_input_loader(index)
+                                inv_scale, zero_point = _create_constants(
+                                    1.0 / scale, zero_point, dtype=torch.float32
+                                )
+                                val = ops.round(input * inv_scale) + zero_point
+                                qmin, qmax = _create_constants(
+                                    0, 255, dtype=torch.float32
+                                )
+                                clamped = ops.minimum(ops.maximum(val, qmin), qmax)
+                                return ops.to_dtype(clamped, torch.uint8)
+                            # o_scale / o_zero_point are converted to Python
+                            # float / int and bound into the inner function.
+                            output_buf = ir.Pointwise(
+                                device=output_buf.get_device(),
+                                dtype=output_dtype,
+                                inner_fn=functools.partial(
+                                    inner_fn_requant,
+                                    scale=float(o_scale),
+                                    zero_point=int(o_zero_point),
+                                ),
+                                ranges=output_buf.get_size(),
+                            )
+                        return output_buf
+                    assert x.get_dtype() == torch.uint8
+                    # input_indices lists positions in the input list in the order
+                    # bias (if any), x, packed_weight, x_scale, x_zp, w_scale, w_zp;
+                    # presumably the order the template expects - confirm against
+                    # CppPackedGemmTemplate.add_choices.
+                    CppPackedGemmTemplate.add_choices(
+                        choices,
+                        layout,
+                        [x, x_scale, x_zp, packed_weight, w_scale, w_zp]
+                        if bias is None
+                        else [x, x_scale, x_zp, packed_weight, w_scale, w_zp, bias],
+                        has_bias=bias is not None,
+                        epilogue_creator=epilogue_creator,
+                        input_indices=[0, 3, 1, 2, 4, 5]
+                        if bias is None
+                        else [6, 0, 3, 1, 2, 4, 5],
+                    )
+            if len(choices) == 0 or use_aten_gemm_kernels():
+                # Extern-kernel candidate; bias is passed as an explicit
+                # bias=None keyword when it is absent from the input tuple.
+                kwargs = dict(
+                    output_scale=o_scale,
+                    output_zero_point=o_zero_point,
+                    output_dtype=output_dtype,
+                    post_op_name=attr,
+                    post_op_args=scalars,
+                    post_op_algorithm=algorithm,
+                )
+                if bias is None:
+                    kwargs["bias"] = None
+                choices.append(
+                    aten_mkldnn_qlinear_unary.bind(
+                        (x, x_scale, x_zp, packed_weight, w_scale, w_zp)
+                        if bias is None
+                        else (x, x_scale, x_zp, packed_weight, w_scale, w_zp, bias),
+                        layout,
+                        **kwargs,
+                    )
+                )
+            assert packed_weight.get_name() in V.graph.constants
+            # Keys are positions in the input list passed below (3 = packed_weight,
+            # 4 = w_scale, 5 = w_zp, 6 = bias); each generator looks the tensor up
+            # in V.graph.constants by buffer name. The entry for 6 is registered
+            # even when bias is None and the list has only six inputs.
+            input_gen_fns = {
+                3: lambda x: V.graph.constants[x.get_name()],
+                4: lambda x: V.graph.constants[x.get_name()],
+                5: lambda x: V.graph.constants[x.get_name()],
+                6: lambda x: V.graph.constants[x.get_name()],  # For bias
+            }
+            result = autotune_select_algorithm(
+                "qlinear_unary",
+                choices,
+                [x, x_scale, x_zp, packed_weight, w_scale, w_zp]
+                if bias is None
+                else [x, x_scale, x_zp, packed_weight, w_scale, w_zp, bias],
+                layout,
+                input_gen_fns=input_gen_fns,
+            )
+            if len(x_size) > 2:
+                # Restore the leading dimensions that were flattened on entry.
+                result = view(result, (*x_size[:-1], result.get_size()[-1]))
+            return result
+        @register_lowering(
+            torch.ops.onednn.qlinear_pointwise.binary, type_promotion_kind=None
+        )
+        @register_lowering(
+            torch.ops.onednn.qlinear_pointwise.binary_tensor, type_promotion_kind=None
+        )
+        def qlinear_binary(
+            x: TensorBox,
+            x_scale,
+            x_zp,
+            packed_weight: TensorBox,
+            w_scale: TensorBox,
+            w_zp: TensorBox,
+            x2: TensorBox,
+            bias: TensorBox,
+            o_scale,
+            o_zero_point,
+            output_dtype,
+            x2_scale,
+            x2_zp,
+            binary_attr,
+            alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithmm,
+            layout=None,
+        ):
+            # Lowering for onednn.qlinear_pointwise.binary and .binary_tensor
+            # (quantized linear + binary post-op with x2 + optional unary post-op).
+            # Builds a list of candidate kernels and lets autotune_select_algorithm
+            # pick one:
+            #   * the C++ packed GEMM template with a fused epilogue, only when
+            #     use_max_autotune() is true, binary_attr == "add", and the
+            #     zero-point conditions checked below hold;
+            #   * the ATen extern kernel (aten_mkldnn_qlinear_binary), when no other
+            #     choice was added or use_aten_gemm_kernels() is true.
+            # x_scale / x_zp may be Python scalars or TensorBox values; bias may be
+            # None. For binary_attr == "add", inputs with more than two dimensions
+            # are viewed as 2D and the leading dimensions of x are restored on the
+            # result.
+            x_size = x.get_size()
+            x2_size = x2.get_size()
+            assert len(x_size) == len(x2_size)
+            if len(x_size) > 2 and binary_attr == "add":
+                # GEMM template needs 2D input, normalize input shape here
+                x = view(x, [-1, x_size[-1]])
+                x2 = view(x2, [-1, x2_size[-1]])
+            # Scalar activation scale / zero point are registered as graph tensor
+            # constants; tensor-valued ones are realized instead.
+            if not isinstance(x_scale, ir.TensorBox):
+                assert type(x_scale) == float
+                x_scale = V.graph.add_tensor_constant(
+                    torch.tensor(x_scale, dtype=torch.float32), name="x_scale"
+                )
+            else:
+                x_scale.realize()
+            if not isinstance(x_zp, ir.TensorBox):
+                assert type(x_zp) == int
+                x_zp = V.graph.add_tensor_constant(
+                    torch.tensor(x_zp, dtype=torch.int32), name="x_zp"
+                )
+            else:
+                x_zp.realize()
+            # When channels less than 8, w_scale/w_zp is Pointwise instead of ConstantBuffer
+            # Refer to https://github.com/pytorch/pytorch/blob
+            # /f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577
+            w_scale.realize()
+            w_zp.realize()
+            if w_zp.get_dtype() != torch.int32 and isinstance(
+                ir.InputsKernel.unwrap_storage_for_input(w_zp),
+                ir.ConstantBuffer,
+            ):
+                # w_zp is a ConstantBuffer with a non-int32 dtype: re-register it
+                # as an int32 constant under the same name.
+                w_zp_tensor = V.graph.constants[w_zp.get_name()].to(torch.int32)
+                # NOTE(review): w_zp_tensor is already int32 here; torch.tensor()
+                # on an existing tensor makes another copy and emits PyTorch's
+                # copy-construct UserWarning. Passing w_zp_tensor directly looks
+                # sufficient - confirm before changing.
+                w_zp = V.graph.add_tensor_constant(
+                    torch.tensor(w_zp_tensor, dtype=torch.int32), name=w_zp.get_name()
+                )
+            if binary_attr == "sum":
+                if output_dtype in [
+                    torch.float32,
+                    torch.bfloat16,
+                ] and x2.get_dtype() in [torch.float32, torch.bfloat16]:
+                    if x2.get_dtype() != output_dtype:
+                        # For int8-mixed-bf16 quantization and inplace add,
+                        # there is a case where the accum dtype is float32 but the
+                        # output dtype is bfloat16. Since accum is changed in place
+                        # by the post-op sum, convert the accum dtype here.
+                        x2 = to_dtype(x2, output_dtype)
+                else:
+                    assert (
+                        x2.get_dtype() == output_dtype
+                    ), "dtype of accum for qlinear post op sum should be the same as output"
+            # Both are captured by the epilogue closure below; bias_dtype is None
+            # when there is no bias.
+            x2_dtype = x2.get_dtype()
+            bias_dtype = bias.get_dtype() if bias is not None else None
+            choices: List[ChoiceCaller] = []
+            if (
+                use_max_autotune() and binary_attr == "add"
+            ):  # <TODO> Support inplace sum fusion
+                *_, layout, x, packed_weight, x2 = mm_args(
+                    x, packed_weight, x2, layout=layout, out_dtype=output_dtype
+                )
+                # The template is only offered when x_zp is a 0-dim constant,
+                # w_zp is a constant equal to all zeros, and the packed GEMM
+                # template accepts this layout / operand pair.
+                if (
+                    isinstance(
+                        ir.InputsKernel.unwrap_storage_for_input(x_zp),
+                        ir.ConstantBuffer,
+                    )
+                    and len(x_zp.get_layout().size) == 0  # Per tensor quant of act
+                    and isinstance(
+                        ir.InputsKernel.unwrap_storage_for_input(w_zp),
+                        ir.ConstantBuffer,
+                    )
+                    and torch.equal(
+                        torch.zeros_like(V.graph.constants[w_zp.get_name()]),
+                        V.graph.constants[w_zp.get_name()],
+                    )  # We only compensate MatrixB and assume B_zp is 0 to avoid the compensation of MatrixA
+                    and use_cpp_packed_gemm_template(layout, x, packed_weight)
+                ):
+                    # Weight compensation term: the dense weight summed along
+                    # dim 0 (in float), registered as a new graph constant.
+                    W_tensor = V.graph.constants[packed_weight.get_name()]
+                    W_tensor = W_tensor.to_dense()
+                    weight_compens_tensor = torch.sum(W_tensor.to(torch.float), dim=0)
+                    weight_compens = V.graph.add_tensor_constant(
+                        weight_compens_tensor,
+                        name=packed_weight.get_name() + "_BMatrixCompens",
+                    )
+                    def epilogue_creator(input_buffer):
+                        # Builds the Pointwise epilogue applied to the GEMM output:
+                        # dequantize with zero-point compensation, add bias, add
+                        # x2, apply the unary post-op, then cast / requantize to
+                        # output_dtype.
+                        # Epilogue to convert from s32 to f32 for u8s8f32
+                        assert output_dtype in [
+                            torch.float32,
+                            torch.bfloat16,
+                            torch.uint8,
+                        ]
+                        input_loader = input_buffer.make_loader()
+                        x2_loader = x2.make_loader()
+                        weight_compens_loader = weight_compens.make_loader()
+                        x_scale_loader = x_scale.make_loader()
+                        w_scale_loader = w_scale.make_loader()
+                        x_zp_loader = x_zp.make_loader()
+                        # bias is only read (never rebound) inside this closure.
+                        nonlocal bias
+                        bias_loader = None
+                        if bias is not None:
+                            bias_loader = bias.make_loader()
+                        def inner_fn(index):
+                            # Per-element computation:
+                            #   acc * x_scale * w_scale
+                            #   - x_scale * w_scale * x_zp * weight_compens
+                            #   (+ bias) + x2
+                            nonlocal bias
+                            input = input_loader(index)
+                            _x2 = x2_loader(index)
+                            _x_scale = x_scale_loader(())
+                            _x_zp = x_zp_loader(())
+                            # MicroKernel Output is with int32
+                            # cvt to FP32 before doing compensation
+                            input = ops.to_dtype(input, torch.float32)
+                            # w_scale, the compensation term and bias are indexed
+                            # by the last index component only.
+                            weight_compens_index = (index[-1],)
+                            _w_scale = w_scale_loader(weight_compens_index)
+                            _weight_compens = weight_compens_loader(
+                                weight_compens_index
+                            )
+                            # Step 1: Doing compensation to cvt fp32
+                            temp = ops.mul(
+                                ops.mul(
+                                    input,
+                                    _x_scale,
+                                ),
+                                _w_scale,
+                            )
+                            temp = ops.sub(
+                                temp,
+                                ops.mul(
+                                    ops.mul(
+                                        ops.mul(
+                                            _x_scale,
+                                            _w_scale,
+                                        ),
+                                        _x_zp,
+                                    ),
+                                    _weight_compens,
+                                ),
+                            )
+                            # Step 2: add Bias if applicable
+                            if bias is not None:
+                                _bias = bias_loader(weight_compens_index)
+                                nonlocal bias_dtype
+                                assert bias_dtype in [torch.float32, torch.bfloat16]
+                                # bf16 bias is upcast so the add happens in fp32.
+                                if bias_dtype == torch.bfloat16:
+                                    _bias = ops.to_dtype(_bias, torch.float32)
+                                temp = ops.add(temp, _bias)
+                            # Step 3: Binary add
+                            nonlocal x2_dtype
+                            assert x2_dtype in [torch.float32, torch.bfloat16]
+                            # bf16 x2 is upcast so the add happens in fp32.
+                            if x2_dtype == torch.bfloat16:
+                                _x2 = ops.to_dtype(_x2, torch.float32)
+                            temp = ops.add(temp, _x2)
+                            return temp
+                        output_buf = ir.Pointwise(
+                            device=input_buffer.get_device(),
+                            dtype=torch.float32,  # Hardcode to FP32 for u8s8f32
+                            inner_fn=inner_fn,
+                            ranges=input_buffer.get_size(),
+                        )
+                        # Step 4: Unary post op, if any
+                        if unary_attr != "none":
+                            output_buf = create_epilogue_with_attr(
+                                output_buf,
+                                unary_attr,
+                                scalars=unary_scalars,
+                                algorithm=unary_algorithmm,
+                            )
+                        # Step 5: Cast output to Target Dtype
+                        # (float32 output needs no further conversion.)
+                        if output_dtype == torch.bfloat16:
+                            output_cast_loader = output_buf.make_loader()
+                            def inner_fn_cast_output_to_bf16(index):
+                                input = output_cast_loader(index)
+                                return ops.to_dtype(input, output_dtype)
+                            output_buf = ir.Pointwise(
+                                device=output_buf.get_device(),
+                                dtype=output_dtype,
+                                inner_fn=inner_fn_cast_output_to_bf16,
+                                ranges=output_buf.get_size(),
+                            )
+                        elif output_dtype == torch.uint8:
+                            from .lowering import _create_constants
+                            requant_input_loader = output_buf.make_loader()
+                            def inner_fn_requant(index, scale, zero_point):
+                                # Requantize: round(value * (1 / scale)) + zero_point,
+                                # clamped to [0, 255], then cast to uint8.
+                                input = requant_input_loader(index)
+                                inv_scale, zero_point = _create_constants(
+                                    1.0 / scale, zero_point, dtype=torch.float32
+                                )
+                                val = ops.round(input * inv_scale) + zero_point
+                                qmin, qmax = _create_constants(
+                                    0, 255, dtype=torch.float32
+                                )
+                                clamped = ops.minimum(ops.maximum(val, qmin), qmax)
+                                return ops.to_dtype(clamped, torch.uint8)
+                            # o_scale / o_zero_point are converted to Python
+                            # float / int and bound into the inner function.
+                            output_buf = ir.Pointwise(
+                                device=output_buf.get_device(),
+                                dtype=torch.uint8,
+                                inner_fn=functools.partial(
+                                    inner_fn_requant,
+                                    scale=float(o_scale),
+                                    zero_point=int(o_zero_point),
+                                ),
+                                ranges=output_buf.get_size(),
+                            )
+                        return output_buf
+                    # input_indices lists positions in the input list in the order
+                    # bias (if any), x, packed_weight, x_scale, x_zp, w_scale,
+                    # w_zp, x2; presumably the order the template expects -
+                    # confirm against CppPackedGemmTemplate.add_choices.
+                    CppPackedGemmTemplate.add_choices(
+                        choices,
+                        layout,
+                        [x, x_scale, x_zp, packed_weight, w_scale, w_zp, x2]
+                        if bias is None
+                        else [x, x_scale, x_zp, packed_weight, w_scale, w_zp, x2, bias],
+                        has_bias=bias is not None,
+                        epilogue_creator=epilogue_creator,
+                        # Reorder bias and x2
+                        input_indices=[0, 3, 1, 2, 4, 5, 6]
+                        if bias is None
+                        else [7, 0, 3, 1, 2, 4, 5, 6],
+                    )
+            if len(choices) == 0 or use_aten_gemm_kernels():
+                # Extern-kernel candidate; bias is passed as an explicit
+                # bias=None keyword when it is absent from the input tuple.
+                kwargs = dict(
+                    output_scale=o_scale,
+                    output_zero_point=o_zero_point,
+                    output_dtype=output_dtype,
+                    other_scale=x2_scale,
+                    other_zp=x2_zp,
+                    binary_post_op=binary_attr,
+                    binary_alpha=alpha,
+                    unary_post_op=unary_attr,
+                    unary_post_op_args=unary_scalars,
+                    unary_post_op_algorithm=unary_algorithmm,
+                )
+                if bias is None:
+                    kwargs["bias"] = None
+                choices.append(
+                    aten_mkldnn_qlinear_binary.bind(
+                        (x, x_scale, x_zp, packed_weight, w_scale, w_zp, x2)
+                        if bias is None
+                        else (x, x_scale, x_zp, packed_weight, w_scale, w_zp, x2, bias),
+                        layout,
+                        **kwargs,
+                    )
+                )
+            assert packed_weight.get_name() in V.graph.constants
+            # Keys are positions in the input list passed below (3 = packed_weight,
+            # 4 = w_scale, 5 = w_zp, 7 = bias); each generator looks the tensor up
+            # in V.graph.constants by buffer name. No generator is registered for
+            # x2 (position 6).
+            input_gen_fns = {
+                3: lambda x: V.graph.constants[x.get_name()],
+                4: lambda x: V.graph.constants[x.get_name()],
+                5: lambda x: V.graph.constants[x.get_name()],
+            }
+            if bias is not None:
+                input_gen_fns[7] = lambda x: V.graph.constants[x.get_name()]  # For bias
+            result = autotune_select_algorithm(
+                "qlinear_binary",
+                choices,
+                [x, x_scale, x_zp, packed_weight, w_scale, w_zp, x2]
+                if bias is None
+                else [x, x_scale, x_zp, packed_weight, w_scale, w_zp, x2, bias],
+                layout,
+                input_gen_fns=input_gen_fns,
+            )
+            if len(x_size) > 2 and binary_attr == "add":
+                # Restore the leading dimensions that were flattened on entry.
+                result = view(result, (*x_size[:-1], result.get_size()[-1]))
+            return result
+        if torch._C.has_mkl:
+            aten_mkl_linear = ExternKernelChoice(
+                torch.ops.mkl._mkl_linear,
+                "mkl::_mkl_linear",
+                has_out_variant=False,
+                kernel_creator=mkldnn_ir.MKLPackedLinear.create,
+            )
+            cpu_needs_realized_inputs.append(torch.ops.mkl._mkl_linear)
+            @register_lowering(torch.ops.mkl._mkl_linear)
+            def mkl_packed_linear(
+                x: TensorBox,
+                packed_w: TensorBox,
+                orig_w: TensorBox,
+                b: Optional[TensorBox],
+                batch_size,
+                *,
+                layout=None,
+            ):
+                """
+                Lowering for torch.ops.mkl._mkl_linear.
+                Collects candidate kernels (the C++ packed GEMM template and/or the
+                ATen MKL extern kernel), lets autotune_select_algorithm choose among
+                them, and adds the bias `b` to the result when one is given.
+                """
+                choices: List[ChoiceCaller] = []
+                if use_max_autotune():
+                    # mm_args is fed the transposed original weight. Note that it
+                    # rebinds `layout` and `x`, which everything below relies on.
+                    transposed_w = permute(orig_w, [1, 0])
+                    *_, layout, x, transposed_w = mm_args(
+                        x, transposed_w, layout=layout
+                    )
+                    if use_cpp_packed_gemm_template(layout, x, transposed_w):
+                        CppPackedGemmTemplate.add_choices(
+                            choices,
+                            layout,
+                            [x, packed_w, orig_w],
+                            trans_w=True,
+                            input_indices=[0, 2],
+                        )
+                # Use the ATen MKL kernel when no template choice was added, or offer
+                # it as an extra candidate when ATen GEMM kernels are enabled. It is
+                # bound with B=None; the bias is applied separately after autotuning.
+                if len(choices) == 0 or use_aten_gemm_kernels():
+                    choices.append(
+                        aten_mkl_linear.bind(
+                            (x, packed_w, orig_w), layout, B=None, batch_size=batch_size
+                        )
+                    )
+                assert packed_w.get_name() in V.graph.constants
+                assert orig_w.get_name() in V.graph.constants
+                # packed_w is a mkldnn tensor which we can't generate directly
+                # so we use the weights from the original tensor in autotune.
+                # Keys are positions in the input list passed to autotuning below
+                # (1 -> packed_w, 2 -> orig_w); both are looked up in graph constants.
+                input_gen_fns = {
+                    1: lambda x: V.graph.constants[x.get_name()],
+                    2: lambda x: V.graph.constants[x.get_name()],
+                }
+                result: TensorBox = autotune_select_algorithm(
+                    "packed_linear",
+                    choices,
+                    [x, packed_w, orig_w],
+                    layout,
+                    input_gen_fns=input_gen_fns,
+                )
+                # The bias is not part of the autotuned kernel; add it pointwise.
+                if b is not None:
+                    result = add(result, b)
+                return result
+        add_needs_realized_inputs(cpu_needs_realized_inputs)
+    else:
+        pass

.venv/lib/python3.11/site-packages/torch/_inductor/package/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .package import load_package, package_aoti

.venv/lib/python3.11/site-packages/torch/_inductor/package/__pycache__/build_package.cpython-311.pyc ADDED Viewed

Binary file (532 Bytes). View file

.venv/lib/python3.11/site-packages/torch/_inductor/package/__pycache__/package.cpython-311.pyc ADDED Viewed

Binary file (15.5 kB). View file

.venv/lib/python3.11/site-packages/torch/_inductor/package/build_package.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# Source text of a standalone Python script (not code executed on import of
+# this module). The script collects every file under its own directory and
+# passes them to compile_so, using that directory both as the source location
+# and as the output location.
+build_package_contents = """
+import os
+from pathlib import Path
+from torch._inductor.package.package import compile_so
+curr_dir = Path(__file__).parent
+aoti_files = [
+    os.path.join(root, file)
+    for root, dirs, files in os.walk(curr_dir)
+    for file in files
+]
+output_so = compile_so(curr_dir, aoti_files, curr_dir)
+"""

.venv/lib/python3.11/site-packages/torch/_inductor/package/package.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import glob
+import json
+import os
+import shlex
+import subprocess
+import tempfile
+import zipfile
+from pathlib import Path
+from typing import Callable, List, Optional, Union
+import torch
+import torch._inductor
+import torch.utils._pytree as pytree
+from torch._inductor import config, exc
+from torch._inductor.cpp_builder import BuildOptionsBase, CppBuilder
+from torch.export._tree_utils import reorder_kwargs
+from .build_package import build_package_contents
+from .pt2_archive_constants import AOTINDUCTOR_DIR, ARCHIVE_VERSION
+class PT2ArchiveWriter:
+    def __init__(self, archive_path: str) -> None:
+        self.archive_path: str = archive_path
+        self.archive_file: Optional[zipfile.ZipFile] = None
+    def __enter__(self) -> "PT2ArchiveWriter":
+        assert self.archive_file is None
+        self.archive_file = zipfile.ZipFile(
+            self.archive_path, "w", compression=zipfile.ZIP_STORED
+        )
+        self.writestr("version", str(ARCHIVE_VERSION))
+        self.writestr("archive_format", "pt2")
+        return self
+    def __exit__(self, *args) -> None:  # type: ignore[no-untyped-def]
+        assert self.archive_file is not None
+        self.archive_file.close()
+        self.archive_file = None
+        return None
+    def writestr(self, name: str, data: Union[bytes, str]) -> None:
+        assert self.archive_file is not None
+        self.archive_file.writestr(name, data)
+    def write_file(self, name: str, file_path: str) -> None:
+        """
+        Copy a file into the archive.
+        name: The destination file inside the archive.
+        file_path: The source file on disk.
+        """
+        assert Path(file_path).is_file(), f"{file_path} is not a valid file path"
+        assert self.archive_file is not None
+        self.archive_file.write(file_path, arcname=name)
+class PT2ArchiveReader:
+    def __init__(self, archive_path: str) -> None:
+        self.archive_path: str = archive_path
+        self.archive_file: Optional[zipfile.ZipFile] = None
+    def __enter__(self) -> "PT2ArchiveReader":
+        self.archive_file = zipfile.ZipFile(
+            self.archive_path, "r", compression=zipfile.ZIP_STORED
+        )
+        return self
+    def __exit__(self, *args) -> None:  # type: ignore[no-untyped-def]
+        if self.archive_file is not None:
+            self.archive_file.close()
+        return None
+    def read(self, name: str) -> bytes:
+        assert self.archive_file is not None
+        return self.archive_file.read(name)
+    def extract_to_path(self, member: str, path: str) -> str:
+        assert self.archive_file is not None
+        return self.archive_file.extract(member, path)
+    def extractall(self, path: str) -> None:
+        assert self.archive_file is not None
+        self.archive_file.extractall(path)
+    def get_file_names(self) -> List[str]:
+        assert self.archive_file is not None
+        return self.archive_file.namelist()
+def _run_command_and_check(cmd: str) -> None:
+    cmd = shlex.split(cmd)
+    try:
+        subprocess.run(cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        raise exc.CppCompileError(cmd, e.output) from e
+def compile_so(aoti_dir: str, aoti_files: List[str], so_path: str) -> str:
+    def get_aoti_file_with_suffix(suffix: str) -> str:
+        for file in aoti_files:
+            if file.endswith(suffix):
+                return file
+        raise RuntimeError(f"Unable to find file with suffix {suffix}")
+    # Compile all the files into a .so
+    cpp_file = os.path.join(aoti_dir, get_aoti_file_with_suffix(".cpp"))
+    consts_o = os.path.join(aoti_dir, get_aoti_file_with_suffix(".o"))
+    file_name = os.path.splitext(cpp_file)[0]
+    # Parse compile flags and build the .o file
+    with open(file_name + "_compile_flags.json") as f:
+        compile_flags = json.load(f)
+    compile_options = BuildOptionsBase(**compile_flags)
+    object_builder = CppBuilder(
+        name=file_name,
+        sources=cpp_file,
+        BuildOption=compile_options,
+    )
+    compile_cmd = object_builder.get_command_line()
+    output_o = object_builder.get_target_file_path()
+    _run_command_and_check(compile_cmd)
+    # Parse linker flags and build the .so file
+    with open(file_name + "_linker_flags.json") as f:
+        linker_flags = json.load(f)
+    linker_options = BuildOptionsBase(**linker_flags)
+    so_builder = CppBuilder(
+        name=os.path.split(so_path)[-1],
+        sources=[output_o, consts_o],
+        BuildOption=linker_options,
+        output_dir=so_path,
+    )
+    link_cmd = so_builder.get_command_line()
+    output_so = so_builder.get_target_file_path()
+    _run_command_and_check(link_cmd)
+    # mmapped weights
+    serialized_weights_filename = file_name + "_serialized_weights.bin"
+    if serialized_weights_filename in aoti_files:
+        with open(serialized_weights_filename, "rb") as f_weights:
+            serialized_weights = f_weights.read()
+        with open(output_so, "a+b") as f_so:
+            so_size = f_so.tell()
+            # Page align the weights
+            f_so.write(b" " * (16384 - so_size % 16384))
+            f_so.write(serialized_weights)
+    return output_so
+def package_aoti(aoti_output_dir: str) -> str:
+    """
+    Saves the AOTInductor generated files to the PT2Archive format.
+    """
+    # Add a makefile and python script
+    build_package_filename = "build_package.py"
+    with open(os.path.join(aoti_output_dir, build_package_filename), "w") as f:
+        f.write(build_package_contents)
+    with open(os.path.join(aoti_output_dir, "Makefile"), "w") as f:
+        f.write(f"all:\n\tpython3 {build_package_filename}\n")
+    if config.aot_inductor.output_path.endswith(".so"):
+        raise RuntimeError(
+            "Unable to save package as a .so. It should be a .pt2 format or a directory."
+        )
+    elif config.aot_inductor.output_path.endswith(".pt2"):
+        # Save using the PT2 packaging format
+        # (https://docs.google.com/document/d/1jLPp8MN8Whs0-VW9PmJ93Yg02W85tpujvHrTa1pc5x8/edit#heading=h.v2y2jgnwc56a)
+        archive_path = config.aot_inductor.output_path
+        with PT2ArchiveWriter(archive_path) as archive_writer:
+            package_files = glob.glob(f"{aoti_output_dir}/*")
+            for path in package_files:
+                filename = os.path.basename(path)
+                archive_writer.write_file(f"{AOTINDUCTOR_DIR}{filename}", path)
+        return archive_path
+    else:
+        # Directly put the files into the directory, without any archiving
+        return aoti_output_dir
+def load_package(path: str, device: str) -> Callable:  # type: ignore[type-arg]
+    if path.endswith(".so"):
+        raise RuntimeError(
+            "Unable to load .so. It should be a .pt2 format or a directory."
+        )
+    elif path.endswith(".pt2"):
+        so_path = os.path.splitext(path)[0]
+        with PT2ArchiveReader(path) as archive_reader:
+            file_names = archive_reader.get_file_names()
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                archive_reader.extractall(tmp_dir)
+                file_names = archive_reader.get_file_names()
+                aoti_files = [
+                    file for file in file_names if file.startswith(AOTINDUCTOR_DIR)
+                ]
+                so_path = compile_so(tmp_dir, aoti_files, so_path)
+    else:
+        assert os.path.isdir(path), "Must specify a directory or a .pt2 file"
+        aoti_files = [
+            os.path.join(root, file)
+            for root, dirs, files in os.walk(path)
+            for file in files
+        ]
+        so_path = compile_so(path, aoti_files, path)
+    if device == "cpu":
+        runner = torch._C._aoti.AOTIModelContainerRunnerCpu(so_path, 1)  # type: ignore[call-arg]
+    elif device == "cuda" or device.startswith("cuda:"):
+        runner = torch._C._aoti.AOTIModelContainerRunnerCuda(so_path, 1, device)  # type: ignore[assignment, call-arg]
+    else:
+        raise RuntimeError("Unsupported device " + device)
+    def optimized(*args, **kwargs):  # type: ignore[no-untyped-def]
+        call_spec = runner.get_call_spec()  # type: ignore[attr-defined]
+        in_spec = pytree.treespec_loads(call_spec[0])
+        out_spec = pytree.treespec_loads(call_spec[1])
+        flat_inputs = pytree.tree_flatten((args, reorder_kwargs(kwargs, in_spec)))[0]
+        flat_outputs = runner.run(flat_inputs)  # type: ignore[attr-defined]
+        return pytree.tree_unflatten(flat_outputs, out_spec)
+    return optimized

.venv/lib/python3.11/site-packages/torch/_inductor/package/pt2_archive_constants.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# String constants naming entries and directory prefixes of a PT2 archive.
+# Names ending in "_DIR" are prefixes that already end with "/", and names
+# ending in "_FORMAT" are str.format templates with one "{}" placeholder.
+ARCHIVE_ROOT_NAME = "package"
+ARCHIVE_FORMAT_PATH = "archive_format"
+MODELS_DIR = "models/"
+MODELS_FILENAME_FORMAT = "models/{}.json"
+AOTINDUCTOR_DIR = "data/aotinductor/"
+WEIGHTS_DIR = "data/weights/"
+WEIGHT_FILENAME_PREFIX = "weight_"
+CONSTANTS_DIR = "data/constants/"
+TENSOR_CONSTANT_FILENAME_PREFIX = "tensor_"
+CUSTOM_OBJ_FILENAME_PREFIX = "custom_obj_"
+SAMPLE_INPUTS_DIR = "data/sample_inputs/"
+SAMPLE_INPUTS_FILENAME_FORMAT = "data/sample_inputs/{}.pt"
+EXTRA_DIR = "extra/"
+MODULE_INFO_PATH = "extra/module_info.json"
+# Archive format version number.
+ARCHIVE_VERSION = 0

.venv/lib/python3.11/site-packages/torch/_inductor/pattern_matcher.py ADDED Viewed

	@@ -0,0 +1,2005 @@

+# mypy: allow-untyped-decorators
+"""
+# Inductor Pattern Matcher
+The pattern matcher enables search/replace within an FX graph.
+The main entrypoint to the pattern matcher is register_replacement(). Given a
+search function and a replacement function this will register a replacement with
+a pass (such as torch._inductor.fx_passes.joint_graph.patterns).
+Internally the pattern matcher represents patterns as a graph (a DAG). Creating
+new patterns manually as a graph is cumbersome and error-prone so the standard
+way to create patterns (using register_replacement()) is to provide a search
+function and a replacement function which is traced and converted into a graph.
+Because the search functions are built somewhat generic (they tend to ignore
+tensor sizes, for example) register_replacement() allows you to specify an
+`extra_check` function which performs additional checks to verify that the
+matched pattern fully matches before returning it.
+## Precompiled Patterns
+New patterns are added using register_replacement(). Patterns added in this way
+can have a compile-time overhead because they need to be traced before
+use. To avoid that overhead, patterns can be precompiled by calling
+gen_register_replacement() instead of register_replacement(). The arguments
+are the same except for an additional unique name which is used as a lookup
+key.
+## Internals
+The match DAG is represented by a graph of `PatternExpr` nodes. Each PatternExpr
+implements a `_match` method which returns either a `Match` object for a
+successful match or a `FailedMatch` object for a failure to match.
+"""
+from __future__ import annotations
+import contextlib
+import dataclasses
+import functools
+import importlib
+import inspect
+import itertools
+import logging
+import operator
+import os
+import re
+import textwrap
+import typing
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from pathlib import Path
+from typing import (
+    Any,
+    Callable,
+    DefaultDict,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Mapping,
+    NoReturn,
+    Optional,
+    Protocol,
+    Sequence,
+    Set,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)
+from typing_extensions import Self, TypeGuard
+import torch
+import torch._guards
+import torch.fx
+import torch.utils._pytree as pytree
+from torch._dispatch.python import enable_python_dispatcher
+from torch._dynamo.utils import counters
+from torch._inductor.config import trace as trace_config
+from torch._prims_common import is_integer_dtype
+from torch._subclasses.fake_tensor import unset_fake_temporarily
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch.fx.experimental.symbolic_shapes import guard_size_oblivious
+from torch.fx.immutable_collections import immutable_dict, immutable_list
+from torch.fx.passes.graph_transform_observer import GraphTransformObserver
+from .._functorch import config as functorch_config
+from .._functorch.aot_autograd import aot_function, make_boxed_func
+from .._functorch.partitioners import default_partition
+from .._subclasses import FakeTensor, FakeTensorMode
+from ..fx import Transformer
+from . import config
+from .decomposition import select_decomp_table
+from .lowering import fallback_node_due_to_unsupported_type
+# Module-level logger.
+log = logging.getLogger(__name__)
+# Short aliases for the aten and prims operator namespaces.
+aten = torch.ops.aten
+prims = torch.ops.prims
+# `Constant` is an alias of Any, so `NodeOrConstant` documents intent (an FX
+# node or some non-node value) rather than constraining anything for a type
+# checker.
+Constant = Any
+NodeOrConstant = Union[Constant, torch.fx.Node]
+class SearchFn(Protocol):
+    """
+    Structural type of a search function: any callable that also exposes a
+    `__name__` attribute.
+    """
+    __name__: str
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        ...
+class ReplaceFn(Protocol):
+    """
+    Structural type of a replacement function: any callable. Unlike SearchFn,
+    no `__name__` attribute is required.
+    """
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        ...
+class TraceFn(Protocol):
+    """
+    Structural type of a tracer: a callable that takes a search or replacement
+    function plus further arguments and returns a torch.fx.GraphModule.
+    """
+    def __call__(
+        self, fn: Union[SearchFn, ReplaceFn], *args: Any, **kwargs: Any
+    ) -> torch.fx.GraphModule:
+        ...
+# Generic type variable for helpers in this module.
+T = TypeVar("T")
+# What's a better name for this?
+# A single match target: either an FX node target or a string.
+FnsType = Union[torch.fx.node.Target, str]
+class Multiple:
+    def __init__(self) -> None:
+        # Ensure we're really a singleton.
+        assert "MULTIPLE" not in globals() or self is MULTIPLE
+# Sentinel indicating multiple quantities can be matched
+MULTIPLE = Multiple()
+class Match:
+    """
+    Represents a successfully matched pattern.
+    The `Match` object is returned to represent a successfully matched
+    pattern. Included in the Match are the pattern that was matched, the graph
+    nodes matched, and any args that were used during the matching.
+    The args and kwargs are specific to the type of pattern that was matched and
+    provide hints about what was matched.
+    """
+    pattern: PatternExpr
+    args: List[Any]
+    kwargs: Dict[str, Any]
+    nodes: List[torch.fx.Node]
+    targets: Dict[_TargetExpr, torch.fx.node.Target]
+    ctx: MatchContext
+    replacement_graph: Optional[torch.fx.Graph]
+    def __init__(
+        self,
+        ctx: MatchContext,
+        pattern: PatternExpr,
+        args: Optional[Sequence[Any]] = None,
+        kwargs: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        super().__init__()
+        self.pattern = pattern
+        # The input nodes that must be passed in to the result
+        self.args = list(args or [])
+        self.kwargs = kwargs or {}
+        # The nodes matched in this expression
+        self.nodes = []
+        # Mapping CallFunction to the node.target
+        self.targets = {}
+        self.ctx = ctx
+        self.replacement_graph = None
+    @property
+    def graph(self) -> torch.fx.Graph:
+        return self.ctx.graph
+    def extend(self, other: Match) -> None:
+        if self.kwargs:
+            for key in set(self.kwargs.keys()) & set(other.kwargs.keys()):
+                if self.kwargs[key] != other.kwargs[key]:
+                    raise FailedMatch("kwarg mismatch: {}", key)
+        self.args.extend(other.args)
+        self.nodes.extend(other.nodes)
+        self.kwargs.update(other.kwargs)
+        self.targets.update(other.targets)
+    def bundle(self) -> Match:
+        # Wrap args in an extra list
+        self.args = [tuple(self.args)] if self.args else []
+        return self
+    def __repr__(self) -> str:
+        return f"Match(..., {self.args}, {self.kwargs})"
+    def erase_nodes(self) -> None:
+        graph = self.graph
+        for n in reversed(self.nodes):
+            if not n._erased and not n.users:
+                graph.erase_node(n)
+    def output_nodes(self) -> List[Optional[torch.fx.Node]]:
+        return [
+            (self.ctx.pattern_to_node[p] if p is not None else None)
+            for p in self.ctx.outputs
+        ]
+    def output_node(self) -> torch.fx.Node:
+        return next(p for p in self.output_nodes() if p)
+    def replace_with_graph(
+        self, replacement_graph: torch.fx.Graph, args: Sequence[Any]
+    ) -> None:
+        ReplacementPatternEntry.replace_with_graph(
+            self, self.ctx.graph, replacement_graph, args
+        )
+    def replace_by_example(
+        self,
+        replacement_fn: ReplaceFn,
+        args: Sequence[Any],
+        trace_fn: Optional[TraceFn] = None,
+        run_functional_passes: bool = True,
+    ) -> None:
+        """Replace with a graph generated by tracing the replacement_fn.
+        Args:
+            run_functional_passes (bool). If we should run passes that
+                assume functional IR (like DCE, remove_noop_ops), on the
+                replacement graph.
+        """
+        from torch._inductor.virtualized import NullHandler, V
+        context = (
+            V.fake_mode
+            if (not isinstance(V.fake_mode, NullHandler) or (V.fake_mode is None))
+            else contextlib.nullcontext()
+        )
+        with context:
+            if trace_fn is None:
+                trace_fn = functools.partial(
+                    fwd_only, run_functional_passes=run_functional_passes
+                )
+            replacement = trace_fn(
+                replacement_fn, torch.fx.map_arg(args, lambda arg: arg.meta["val"])  # type: ignore[arg-type]
+            )
+            ReplacementPatternEntry.replace_with_graph(
+                self,
+                self.ctx.graph,
+                replacement,
+                args,
+            )
+class FailedMatch(RuntimeError):
+    """
+    Represents a unsuccessful match.
+    The `FailedMatch` object is returned to represent a failure to match a
+    pattern.
+    """
+    format_string: str
+    def __init__(self, format_string: str, *args: Any, **kwargs: Any) -> None:
+        self.format_string = format_string
+        # We want to construct error messages lazily instead of eagerly, as
+        # constructing them eagerly can significantly worsen compile times.
+        if len(format_string) > 200:
+            raise RuntimeError(
+                f"Format string too long - use lazy construction of strings instead. Format string is\n {format_string}"
+            )
+        self.args = args
+        self.kwargs = kwargs
+    def __str__(self) -> str:
+        return self.format_string.format(*self.args, **self.kwargs)
+    def __bool__(self) -> bool:
+        return False
+MatchResult = Union[Match, FailedMatch]
+def is_match(m: MatchResult) -> TypeGuard[Match]:
+    """
+    TypeGuards cannot act on `self`. Thus this function exists to let mypy
+    recognize FailedMatch.__bool__ as a TypeGuard.
+    """
+    return bool(m)
+class MatchContext:
+    """
+    Internal state needed while running PatternExpr._match().
+    """
+    outputs: List[Optional[PatternExpr]]
+    pattern_to_node: Dict[PatternExpr, Optional[torch.fx.Node]]
+    graph: torch.fx.Graph
+    exclusive_node_set: List[NodeOrConstant]
+    def __init__(
+        self,
+        outputs: List[Optional[PatternExpr]],
+        pattern_to_node: Optional[Dict[PatternExpr, torch.fx.Node]] = None,
+        *,
+        graph: torch.fx.Graph,
+    ) -> None:
+        self.outputs = outputs
+        self.pattern_to_node = {} if pattern_to_node is None else dict(pattern_to_node)
+        self.graph = graph
+        self.exclusive_node_set = []
+    def match(self, pattern: PatternExpr, node: NodeOrConstant) -> MatchResult:
+        """wrapper to check reused nodes in patterns"""
+        if pattern in self.pattern_to_node:
+            if self.pattern_to_node[pattern] == node:
+                return Match(self, pattern)  # already checked this node
+            else:
+                return FailedMatch("repeated pattern differs")
+        m = pattern._match(node, self)
+        assert pattern not in self.pattern_to_node
+        self.pattern_to_node[pattern] = node if m else None
+        return m
+    def filter_multi_user_patterns(self) -> Dict[PatternExpr, torch.fx.Node]:
+        return {
+            pattern: node
+            for pattern, node in self.pattern_to_node.items()
+            if pattern.has_multiple_users() and node is not None
+        }
+class PatternExpr(ABC):
+    """
+    Base class for types of patterns.
+    """
+    @abstractmethod
+    def _match(self, node: torch.fx.Node, ctx: MatchContext) -> MatchResult:
+        ...
+    def match(self, node: torch.fx.Node) -> MatchResult:
+        try:
+            return MatchContext([self], graph=node.graph).match(self, node)
+        except FailedMatch as e:
+            return e
+    def has_multiple_users(self) -> bool:
+        return False
+    def __repr__(self) -> str:
+        return self.__class__.__name__ + "()"
+    def find_anchor_nodes(
+        self, ctx: MatchContext, searched: Set[torch.fx.Node]
+    ) -> Generator[Optional[torch.fx.Node], None, None]:
+        if self in ctx.pattern_to_node:
+            yield ctx.pattern_to_node[self]
+    def pattern_eq(self, other: Any) -> bool:
+        """
+        Compare two `PatternExpr`s and return true if they are the
+        same. Note this is NOT matching a pattern - it is comparing the pattern
+        structures (for debugging).
+        """
+        return isinstance(other, self.__class__)
+class Arg(PatternExpr):
+    """
+    Capture an arg which will become an input to the handler.  Args are
+    passed in depth first order.
+    """
+    def _match(self, node: NodeOrConstant, ctx: MatchContext) -> MatchResult:
+        return Match(ctx, self, args=[node])  # matches anything
+class Ignored(PatternExpr):
+    """
+    Match an arg, but don't pass it to handler
+    """
+    def _match(self, node: NodeOrConstant, ctx: MatchContext) -> MatchResult:
+        return Match(ctx, self)  # matches anything
+    def __repr__(self) -> str:
+        return "*"
+    def pretty_print(self, pp: PatternPrettyPrinter) -> str:
+        return "Ignored()"
+class KeywordArg(PatternExpr):
+    """
+    Capture a kwarg which will become an input to the handler.
+    """
+    def __init__(self, name: str) -> None:
+        super().__init__()
+        self.name = name
+    def __repr__(self) -> str:
+        return f"KeywordArg({self.name!r})"
+    def _match(self, node: NodeOrConstant, ctx: MatchContext) -> MatchResult:
+        return Match(ctx, self, kwargs={self.name: node})  # matches anything
+    def pattern_eq(self, other: Any) -> bool:
+        other = typing.cast(Self, other)  # super makes sure this is true
+        return super().pattern_eq(other) and self.name == other.name
+class ExclusiveKeywordArg(PatternExpr):
+    """
+    Capture a kwarg which will become an input to the handler.
+    """
+    name: str
+    def __init__(self, name: str) -> None:
+        super().__init__()
+        self.name = name
+    def __repr__(self) -> str:
+        return f"ExclusiveKeywordArg({self.name!r})"
+    def _match(self, node: NodeOrConstant, ctx: MatchContext) -> MatchResult:
+        if node in ctx.exclusive_node_set:
+            return FailedMatch("exclusive arg appears twice")
+        ctx.exclusive_node_set.append(node)
+        return Match(ctx, self, kwargs={self.name: node})  # matches anything
+    def pattern_eq(self, other: Any) -> bool:
+        other = typing.cast(Self, other)  # super makes sure this is true
+        return super().pattern_eq(other) and self.name == other.name
+class _TargetExpr(PatternExpr):
+    """
+    Base class for filtering match by node.target
+    """
+    fns: List[FnsType]
+    fns_set: Set[FnsType]
+    def __init__(
+        self, fns: Union[FnsType, Sequence[FnsType]], users: Union[Multiple, int] = 1
+    ) -> None:
+        super().__init__()
+        fns = [fns] if callable(fns) or isinstance(fns, str) else list(fns)
+        for fn in fns:
+            if isinstance(fn, torch._ops.OpOverloadPacket):
+                fns.extend(getattr(fn, overload) for overload in fn.overloads())
+        self.fns = fns
+        self.fns_set = set(fns)
+        self.users = users
+    @property
+    @abstractmethod
+    def op(self) -> str:
+        ...
+    def fns_repr(self) -> str:
+        first_repr = self.fns[0]
+        if not isinstance(first_repr, str):
+            first_repr = first_repr.__name__
+        if len(self.fns) > 1:
+            return f"[{first_repr}, ...]"
+        elif self.fns[0] is getattr(torch, first_repr, None):
+            return f"torch.{first_repr}"
+        elif isinstance(self.fns[0], torch._ops.OpOverload):
+            return str(self.fns[0])
+        else:
+            return first_repr
+    def __repr__(self) -> str:
+        if self.users is MULTIPLE:
+            comma_users = ", MULTIPLE"
+        elif self.users != 1:
+            comma_users = f", {self.users})"
+        else:
+            comma_users = ""
+        return f"{self.__class__.__name__}({self.fns_repr()}{comma_users})"
+    def has_multiple_users(self) -> bool:
+        return isinstance(self.users, Multiple) or self.users > 1
+    def find_anchor_nodes(
+        self, ctx: MatchContext, searched: Set[torch.fx.Node]
+    ) -> Generator[Optional[torch.fx.Node], None, None]:
+        raise NotImplementedError
+    def _match_fns(self, node: torch.fx.Node) -> bool:
+        return (
+            isinstance(node, torch.fx.Node)
+            and node.op == self.op
+            and extract_target(node) in self.fns_set
+        )
+    def _match_users(self, node: torch.fx.Node, ctx: MatchContext) -> bool:
+        return (
+            self in ctx.outputs
+            or self.users is MULTIPLE
+            or len(node.users) == self.users
+        )
+    def pattern_eq(self, other: Any) -> bool:
+        other = typing.cast(Self, other)  # super makes sure this is true
+        return (
+            super().pattern_eq(other)
+            and self.op == other.op
+            and self.fns == other.fns
+            and self.users == other.users
+        )
+_SimpleSpec = Tuple[Any, ...]
+class _TargetArgsExpr(_TargetExpr):
+    """
+    Base class for filtering match by node.{target,args,kwargs}
+    """
+    def __init__(
+        self,
+        fns: Union[torch.fx.node.Target, str, Sequence[Any]],
+        *args: Any,
+        _users: Union[int, Multiple] = 1,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(fns, _users)
+        self.args = tuple(args)
+        self.kwargs = dict(kwargs)
+        if any(
+            isinstance(x, (dict, list, tuple))
+            for x in itertools.chain(args, kwargs.values())
+        ):
+            self.flatten = self.pytree_flatten
+        else:
+            self.flatten = self.simple_flatten
+        self.flat_args_kwargs = self.flatten(self.args, self.kwargs)
+    @staticmethod
+    def simple_flatten(
+        args: Sequence[Any], kwargs: Mapping[Any, Any]
+    ) -> Tuple[Sequence[Any], Union[_SimpleSpec, pytree.TreeSpec]]:
+        values = (*args, *kwargs.values())
+        spec = (len(args), *kwargs.keys())
+        return values, spec
+    @staticmethod
+    def pytree_flatten(
+        args: Sequence[Any], kwargs: Mapping[Any, Any]
+    ) -> Tuple[Sequence[Any], Union[_SimpleSpec, pytree.TreeSpec]]:
+        def norm_spec(s: pytree.TreeSpec) -> pytree.TreeSpec:
+            if s.type is None:
+                return s
+            mapping = {immutable_list: list, tuple: list, immutable_dict: dict}
+            return pytree.TreeSpec(
+                mapping.get(s.type, s.type),
+                s.context,
+                list(map(norm_spec, s.children_specs)),
+            )
+        flat, spec = pytree.tree_flatten([args, kwargs])
+        spec = norm_spec(spec)
+        return flat, spec
+    def __repr__(self) -> str:
+        args = [
+            self.fns_repr(),
+            *map(repr, self.args),
+            *[f"{k}={v}" for k, v in self.kwargs.items()],
+        ]
+        if self.users is MULTIPLE:
+            args.append("_users=MULTIPLE")
+        elif self.users != 1:
+            args.append(f"_users={self.users}")
+        return f"{self.__class__.__name__}({', '.join(args)})"
+    def pretty_print(self, pp: PatternPrettyPrinter) -> str:
+        args = [
+            self.fns_repr(),
+            *(pp.pretty_print(x) for x in self.args),
+            *[f"{k}={pp.pretty_print(v)}" for k, v in self.kwargs.items()],
+        ]
+        if self.users is MULTIPLE:
+            args.append("_users=MULTIPLE")
+        elif self.users != 1:
+            args.append(f"_users={self.users}")
+        joiner_str = ", "
+        return f"{self.__class__.__name__}({joiner_str.join(args)})"
+    def _match(self, node: torch.fx.Node, ctx: MatchContext) -> MatchResult:
+        if not self._match_fns(node) or len(node.args) != len(self.args):
+            return FailedMatch("function_mismatch: node={}, pattern={}", node, self)
+        if not self._match_users(node, ctx):
+            return FailedMatch("multiple_users {}", self)
+        _args = node.args
+        _kwargs = node.kwargs
+        if len(_kwargs) < len(self.kwargs):
+            from torch.fx.operator_schemas import normalize_function
+            normalized_args_and_kwargs = normalize_function(
+                node.target, node.args, node.kwargs  # type: ignore[arg-type]
+            )
+            if normalized_args_and_kwargs is None:
+                return FailedMatch("function_mismatch: node={}, pattern={}", node, self)
+            else:
+                _args, _kwargs = normalized_args_and_kwargs
+                if len(_args) == len(self.args) and len(_kwargs) >= len(self.kwargs):
+                    _kwargs = {i: _kwargs[i] for i in _kwargs if i in self.kwargs}
+                else:
+                    return FailedMatch(
+                        "function_mismatch: node={}, pattern={}", node, self
+                    )
+        else:
+            _kwargs = {i: _kwargs[i] for i in _kwargs if i in self.kwargs}
+        node_items, node_spec = self.flatten(_args, _kwargs)
+        self_items, self_spec = self.flat_args_kwargs
+        if node_spec != self_spec:
+            return FailedMatch("args_structure {} {}", node_spec, self_spec)
+        assert len(node_items) == len(self_items)
+        m = Match(ctx, self)
+        for i, pattern, child_node in zip(itertools.count(), self_items, node_items):
+            if isinstance(pattern, PatternExpr):
+                child_match = ctx.match(pattern, child_node)
+                if not is_match(child_match):
+                    return child_match
+                m.extend(child_match)
+            elif isinstance(child_node, torch.fx.Node) or child_node != pattern:
+                return FailedMatch(
+                    "constant_args: {} {!r}!={pattern!r}", node, child_node
+                )
+        m.nodes.append(node)
+        m.targets[self] = node.target
+        return m
+    def find_anchor_nodes(
+        self, ctx: MatchContext, searched: Set[torch.fx.Node]
+    ) -> Generator[Optional[torch.fx.Node], None, None]:
+        """
+        This is used when we are matching a pattern with multiple outputs.
+        There is a partial match (stored in ctx) and we want to walk
+        this pattern to find a connection to an already-matched node.
+        Yields candidate nodes that `self._match` might like.
+        """
+        if self in ctx.pattern_to_node:
+            yield ctx.pattern_to_node[self]
+            return
+        for pattern in self.flat_args_kwargs[0]:
+            if isinstance(pattern, PatternExpr):
+                for other_node in pattern.find_anchor_nodes(ctx, searched):
+                    if not isinstance(other_node, torch.fx.Node):
+                        continue
+                    for node in other_node.users:
+                        if node not in searched:
+                            if self._match_fns(node):
+                                yield node
+                                searched.add(node)
+    def pattern_eq(self, other: Any) -> bool:
+        other = typing.cast(Self, other)  # super makes sure this is true
+        return (
+            super().pattern_eq(other)
+            and self.flat_args_kwargs[1] == other.flat_args_kwargs[1]
+            and all(
+                a.pattern_eq(b) if isinstance(a, PatternExpr) else a == b
+                for a, b in zip(self.flat_args_kwargs[0], other.flat_args_kwargs[0])
+            )
+        )
+class CallFunction(_TargetArgsExpr):
+    """
+    Matches a call_function node in the FX graphs: `fns[i](*args, **kwargs)`
+    """
+    # Node opcode this pattern accepts; compared with `node.op` in `_match_fns`.
+    op = "call_function"
+class CallMethod(_TargetArgsExpr):
+    """
+    Matches a call_method node in the FX graphs: `fns[i].method(*args, **kwargs)`
+    """
+    # Node opcode this pattern accepts; compared with `node.op` in `_match_fns`.
+    op = "call_method"
+class CallModule(_TargetArgsExpr):
+    """
+    Matches a call_module node in the FX graphs: `module(*args, **kwargs)`
+    """
+    # Node opcode this pattern accepts; compared with `node.op` in `_match_fns`.
+    op = "call_module"
+class _TargetExprVarArgs(_TargetExpr):
+    """
+    Matches a call_function node with any arguments which are passed into the pattern
+    """
+    def _match(self, node: torch.fx.Node, ctx: MatchContext) -> MatchResult:
+        if not self._match_fns(node):
+            return FailedMatch("function_mismatch")
+        if not self._match_users(node, ctx):
+            return FailedMatch("multiple_users")
+        m = Match(ctx, self)
+        m.nodes.append(node)
+        m.targets[self] = node.target
+        m.args.extend(node.args)
+        m.kwargs.update(node.kwargs)
+        return m
+class CallFunctionVarArgs(_TargetExprVarArgs):
+    """
+    Matches a call_function node with any arguments; they are captured in the match
+    """
+    op = "call_function"
+class CallMethodVarArgs(_TargetExprVarArgs):
+    """
+    Matches a call_method node with any arguments; they are captured in the match
+    """
+    op = "call_method"
+class CallModuleVarArgs(_TargetExprVarArgs):
+    """
+    Matches a call_module node with any arguments; they are captured in the match
+    """
+    op = "call_module"
+class ListOf(PatternExpr):
+    """
+    Matches a repeated pattern
+    """
+    def __init__(self, pattern: PatternExpr, partial: bool = False) -> None:
+        super().__init__()
+        assert isinstance(pattern, PatternExpr)
+        self.pattern = pattern
+        self.partial = partial
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.pattern})"
+    def _match(self, node: List[torch.fx.Node], ctx: MatchContext) -> MatchResult:  # type: ignore[override]
+        if not isinstance(node, (list, tuple)) or len(node) == 0:
+            return FailedMatch("non_list")
+        m = Match(ctx, self)
+        # Propagating patterns with multiple users will ensure we don't revisit
+        # the same nodes
+        pattern_to_node = ctx.filter_multi_user_patterns()
+        matched = False
+        for i, child_node in enumerate(node):
+            child_ctx = MatchContext(
+                ctx.outputs, pattern_to_node, graph=child_node.graph
+            )
+            child_match = child_ctx.match(self.pattern, child_node)
+            pattern_to_node = child_ctx.filter_multi_user_patterns()
+            if not is_match(child_match):
+                if not self.partial:
+                    return FailedMatch("list[{}]: {}", i, child_match)
+                continue
+            matched = True
+            m.extend(child_match.bundle())
+        if not matched:
+            return FailedMatch("list: no_match")
+        return m.bundle()
+    def pattern_eq(self, other: Any) -> bool:
+        other = typing.cast(Self, other)  # super makes sure this is true
+        return (
+            super().pattern_eq(other)
+            and self.pattern.pattern_eq(other.pattern)
+            and self.partial == other.partial
+        )
+class MultiOutputPattern(PatternExpr):
+    outputs: List[Optional[PatternExpr]]
+    def __init__(self, outputs: Sequence[Optional[PatternExpr]]) -> None:
+        super().__init__()
+        assert isinstance(outputs[0], _TargetExpr)
+        assert all(x is None or isinstance(x, PatternExpr) for x in outputs), outputs
+        self.outputs = list(outputs)
+        self.op = outputs[0].op
+    @property
+    def fns(self) -> Union[Callable[..., Any], str, Sequence[Any]]:
+        # This cast is checked above in __init__()
+        output = typing.cast(_TargetExpr, self.outputs[0])
+        return output.fns
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.outputs})"
+    def pretty_print(self, pp: PatternPrettyPrinter) -> str:
+        args = [pp.pretty_print(x) for x in self.outputs]
+        joiner_str = f",\n{'  '}"
+        str_out = f"{self.__class__.__name__}([{joiner_str.join(args)}"
+        str_out = f"{str_out}\n])"
+        return str_out
+    def _match(self, node: torch.fx.Node, ctx: MatchContext) -> MatchResult:
+        output = typing.cast(_TargetExpr, self.outputs[0])
+        m = ctx.match(output, node)
+        if not is_match(m):
+            return m
+        for pattern in self.outputs[1:]:
+            if pattern is None:
+                continue
+            child_match = self._match_from_anchors(pattern, ctx)
+            if not is_match(child_match):
+                return child_match
+            m.extend(child_match)
+        return m
+    def _match_from_anchors(
+        self, pattern: PatternExpr, ctx: MatchContext
+    ) -> MatchResult:
+        prior = dict(ctx.pattern_to_node)
+        m: MatchResult = FailedMatch("no anchor found")
+        for node in pattern.find_anchor_nodes(ctx, set()):
+            m = ctx.match(pattern, node)
+            if is_match(m):
+                return m
+            # revert any partial matches
+            ctx.pattern_to_node = dict(prior)
+        return m
+    def match(self, node: torch.fx.Node) -> MatchResult:
+        try:
+            return MatchContext(self.outputs, graph=node.graph).match(self, node)
+        except FailedMatch as e:
+            return e
+    def pattern_eq(self, other: Any) -> bool:
+        other = typing.cast(Self, other)  # super makes sure this is true
+        return (
+            super().pattern_eq(other)
+            and len(self.outputs) == len(other.outputs)
+            and all(
+                a.pattern_eq(b) if isinstance(a, PatternExpr) else a == b
+                for a, b in zip(self.outputs, other.outputs)
+            )
+        )
+class RepeatedExpr(PatternExpr):
+    """
+    Checks for a repeated pattern. Useful for repeated operations after a node such as `split` or `unbind`
+    """
+    def __init__(self, inner_pattern: _TargetExpr) -> None:
+        super().__init__()
+        self.inner_pattern = inner_pattern
+        self.op = inner_pattern.op
+    @property
+    def fns(self) -> Sequence[FnsType]:
+        return self.inner_pattern.fns
+    def _match(self, node: torch.fx.Node, ctx: MatchContext) -> MatchResult:
+        m = ctx.match(self.inner_pattern, node)
+        if not is_match(m):
+            return m
+        ctx.pattern_to_node.pop(
+            self.inner_pattern,
+        )
+        # Check all anchor nodes match the pattern
+        for anchor_node in self.inner_pattern.find_anchor_nodes(ctx, set()):
+            anchor_m = MatchContext([self], graph=node.graph).match(
+                self.inner_pattern, anchor_node
+            )
+            if not is_match(anchor_m):
+                return anchor_m
+            m.extend(anchor_m)
+        return m
+    def pattern_eq(self, other: Any) -> bool:
+        other = typing.cast(Self, other)  # super makes sure this is true
+        return super().pattern_eq(other) and self.inner_pattern.pattern_eq(
+            other.inner_pattern
+        )
+class PatternPrettyPrinter:
+    """
+    Serializes Patterns to executable python.
+    XXX: currently only used and tested for fuse attention patterns. May not cover
+    all patterns.
+    """
+    def __init__(self) -> None:
+        self.namespace = torch.fx.graph._Namespace()
+        self.memoized_objs_names: Dict[PatternExpr, str] = {}
+        self.memoized_objs_pp: Dict[PatternExpr, str] = {}
+    @staticmethod
+    @functools.lru_cache(None)
+    def run(obj: PatternExpr, output_name: str = "output") -> str:
+        """
+        Serializes obj to python code with obj written out to `output_name`
+        """
+        pp = PatternPrettyPrinter()
+        assert hasattr(obj, "pretty_print")
+        out_str = obj.pretty_print(pp=pp)
+        output = []
+        for key in pp.memoized_objs_names:
+            output.append(f"{pp.memoized_objs_names[key]} = {pp.memoized_objs_pp[key]}")
+        output.append(f"{output_name} = {out_str}")
+        return "\n".join(output)
+    def pretty_print(self, obj: Any) -> str:
+        if isinstance(obj, _TargetArgsExpr):
+            if memoized_name := self.memoized_objs_names.get(obj):
+                return memoized_name
+            else:
+                return self.memoize(obj)
+        if hasattr(obj, "pretty_print"):
+            return obj.pretty_print(self)
+        return repr(obj)
+    def memoize(self, obj: _TargetArgsExpr) -> str:
+        obj_str = obj.pretty_print(self)
+        obj_name = obj.fns_repr()
+        for prefix in ("aten.", "torch.", "prims."):
+            obj_name = obj_name.replace(prefix, "")
+        tmp_name = self.namespace.create_name(obj_name, None)
+        self.memoized_objs_names[obj] = tmp_name
+        self.memoized_objs_pp[obj] = obj_str
+        return tmp_name
+class _PassDictsType(Protocol):
+    """
+    Structural type for pass dictionaries: indexing with an `(op, target)`
+    key returns the list of `PatternEntry` objects registered for that key.
+    """
+    def __getitem__(self, k: Tuple[str, torch.fx.node.Target]) -> List[PatternEntry]:
+        ...
+@dataclasses.dataclass
+class PatternEntry:
+    pattern: PatternExpr
+    extra_check: Callable[[Match], bool]
+    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node) -> None:
+        raise NotImplementedError
+    def register(
+        self,
+        pass_dicts: Union[_PassDictsType, Sequence[_PassDictsType]],
+        target: Union[torch.fx.node.Target, None] = None,
+        prepend: bool = False,
+    ) -> None:
+        if target is None:
+            assert hasattr(self.pattern, "fns")
+            for fn in self.pattern.fns:
+                self.register(pass_dicts, fn, prepend=prepend)
+        elif isinstance(pass_dicts, (dict, PatternMatcherPass)):
+            assert hasattr(self.pattern, "op")
+            if prepend:
+                pass_dicts[(self.pattern.op, target)].insert(0, self)
+            else:
+                pass_dicts[(self.pattern.op, target)].append(self)
+        else:
+            pass_dicts = typing.cast(Sequence[_PassDictsType], pass_dicts)
+            for x in pass_dicts:
+                self.register(x, target, prepend=prepend)
+@dataclasses.dataclass
+class LoweringPatternEntry(PatternEntry):
+    handler: Callable[..., Any]
+    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node) -> None:
+        handler = functools.wraps(self.handler)(functools.partial(self.handler, match))
+        with graph.inserting_before(node):
+            replacement = graph.call_function(handler, tuple(match.args), match.kwargs)
+            replacement.meta.update(node.meta)
+            node.replace_all_uses_with(replacement)
+        assert match.nodes[-1] is node
+        match.erase_nodes()
+@dataclasses.dataclass
+class GraphPatternEntry(PatternEntry):
+    """
+    A pattern that runs a function on the FX graph
+    """
+    # Called as handler(match, *match.args, **match.kwargs).
+    handler: Callable[..., Any]
+    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node) -> None:
+        """
+        Run the handler with the graph's insertion point set before `node`.
+        Nothing else is done here; any graph edits are left to the handler.
+        """
+        with graph.inserting_before(node):
+            self.handler(match, *match.args, **match.kwargs)
+@dataclasses.dataclass
+class ReplacementPatternEntry(PatternEntry):
+    """
+    Pattern entry that replaces the matched nodes with the contents of
+    `match.replacement_graph`.
+    """
+    # Converts the match's args/kwargs into the positional argument list
+    # that is fed to the replacement graph (see `apply`).
+    normalize_args: Callable[..., List[Any]]
+    @staticmethod
+    def replace_with_graph(
+        match: Match,
+        graph: torch.fx.Graph,
+        replacement_graph: Union[torch.fx.Graph, torch.fx.GraphModule],
+        args: Sequence[torch.fx.Node],
+    ) -> None:
+        """
+        Copy the call_function nodes of `replacement_graph` into `graph`,
+        rewire the users of the match's output nodes to the copied results,
+        and erase the matched nodes.
+        Args:
+            match: the match being replaced
+            graph: graph that is modified in place
+            replacement_graph: graph whose nodes are copied into `graph`
+            args: values bound to the replacement graph's placeholders
+        """
+        # Interpreter that, instead of executing call_function nodes, creates
+        # an equivalent node in the enclosing `graph` and returns it.
+        class Replacer(torch.fx.Interpreter):
+            # These handlers are disabled; `run_node` below raises
+            # NotImplementedError for any op other than placeholder, output
+            # and call_function.
+            call_method = None  # type: ignore[assignment]
+            call_module = None  # type: ignore[assignment]
+            get_attr = None  # type: ignore[assignment]
+            def run_node(self, node: torch.fx.Node) -> Any:
+                if node.op in ("placeholder", "output"):
+                    return super().run_node(node)
+                if node.op == "call_function":
+                    target = node.target
+                    args, kwargs = self.fetch_args_kwargs_from_env(node)
+                    result = graph.call_function(target, args, kwargs)  # type: ignore[arg-type]
+                    # Carry over "val" (and "tensor_meta" for tensors) when
+                    # the new node does not already have a "val".
+                    if "val" in node.meta and "val" not in result.meta:
+                        result.meta["val"] = node.meta["val"]
+                        if isinstance(node.meta["val"], torch.Tensor):
+                            assert "tensor_meta" in node.meta
+                            result.meta["tensor_meta"] = node.meta["tensor_meta"]
+                    return result
+                raise NotImplementedError(f"unhandled {node}")
+        output_nodes = match.output_nodes()
+        if len(output_nodes) == 1:
+            last_node = output_nodes[0]
+        else:
+            assert output_nodes[0]
+            nodes = list(output_nodes[0].graph.nodes)
+            indices = [
+                (nodes.index(n), n)
+                for n in output_nodes
+                if isinstance(n, torch.fx.Node)
+            ]
+            # NOTE: despite the name, `min` picks the output node with the
+            # smallest index in the graph's node list, i.e. the earliest one;
+            # the replacement is inserted before it.
+            last_node = min(indices, key=operator.itemgetter(0))[1]
+        def percolate_tags(
+            node: torch.fx.Node,
+            tag_name: str,
+            tag_value: str,
+            input_stops: Set[torch.fx.Node],
+        ) -> None:
+            # Set meta[tag_name] = tag_value on `node` and on everything
+            # reachable through its inputs, not going past `input_stops`.
+            queue = [node]
+            visited = set()
+            while queue:
+                arg = queue.pop()
+                if (
+                    arg not in visited
+                    and arg not in input_stops
+                    and hasattr(arg, "meta")
+                ):
+                    visited.add(arg)
+                    arg.meta[tag_name] = tag_value
+                    queue.extend(arg.all_input_nodes)
+        with graph.inserting_before(last_node):
+            replacement = Replacer(replacement_graph).run(*args)  # type: ignore[arg-type]
+            # Normalize a single returned node to a one-element list.
+            if isinstance(replacement, torch.fx.Node):
+                replacement = [replacement]
+            def maybe_getitem(node: torch.fx.Node) -> Any:
+                # Return the index of an `operator.getitem` call node,
+                # or None when `node` is not such a call.
+                if node.op != "call_function":
+                    return None
+                if node.target != operator.getitem:
+                    return None
+                assert len(node.args) == 2
+                return node.args[1]
+            def replace(
+                old: Union[torch.fx.Node, None],
+                new: Union[torch.fx.Node, Sequence[torch.fx.Node], None],
+            ) -> None:
+                # Redirect the users of `old` to `new` and erase `old`.
+                if old is None:
+                    assert new is None
+                    return
+                assert isinstance(old, torch.fx.Node)
+                if new is None:
+                    old.replace_all_uses_with(None)  # type: ignore[arg-type]
+                    graph.erase_node(old)
+                    return
+                if isinstance(new, torch.fx.Node):
+                    if "val" not in new.meta:
+                        new.meta.update(old.meta)
+                    # Preserve the recompute tags in the replacement graph. We
+                    # look at the recompute tags of the original output node to
+                    # propagate the tag from the output all the way to the input
+                    # args (named as args in the replace_with_graph).
+                    # Note that this is best effort. Since patterns are from
+                    # many to many, there is no easy way to correctly map the
+                    # recomputable tags. It is possible in some scenarios that we
+                    # incorrectly tag some nodes as recomputables.
+                    for tag_name in ["recompute", "ac_graph_id"]:
+                        if tag_name in old.meta:
+                            percolate_tags(new, tag_name, old.meta[tag_name], set(args))
+                    old.replace_all_uses_with(new)
+                    graph.erase_node(old)
+                    return
+                # `new` is not a node: it's a list of nodes.
+                #
+                # This happens when we want to replace a node that has a single
+                # packed return with multiple unpacked returns. We need to do
+                # some graph surgery here.
+                #
+                # Example:
+                #   def original_graph(x):
+                #      a = op(x)
+                #      b = a[0]
+                #      c = a[1]
+                #      ...
+                #
+                # Assume that we want to replace op(x) with the graph
+                #   def new_op(x):
+                #      w = x + 1
+                #      z = x + 2
+                #      return (w, z)
+                #
+                # We need to replace `op` with the contents of `new_op`,
+                # and then rewrite a[0] to be w and a[1] to be z, as so:
+                #   def new_graph(x):
+                #     w = x + 1
+                #     z = x + 2
+                #     b = w
+                #     c = z
+                #     ...
+                # Snapshot the users first: `replace` erases them as it goes.
+                old_uses = list(old.users.keys())
+                for user in old_uses:
+                    idx = maybe_getitem(user)
+                    if idx is None:
+                        raise AssertionError("can't handle")
+                    replace(user, new[idx])  # type: ignore[index]
+                graph.erase_node(old)
+            if len(output_nodes) == len(replacement):
+                for old, new in zip(output_nodes, replacement):
+                    replace(old, new)
+            else:
+                # A single matched output stands for several replacement
+                # results; `replace` rewrites its getitem users one by one.
+                assert len(output_nodes) == 1
+                replace(output_nodes[0], replacement)
+        match.erase_nodes()
+    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node) -> None:
+        """
+        Replace the matched nodes using `match.replacement_graph`, which
+        must already be set on the match.
+        """
+        assert match.replacement_graph is not None
+        self.replace_with_graph(
+            match,
+            graph,
+            match.replacement_graph,
+            self.normalize_args(*match.args, **match.kwargs),
+        )
+def _return_true(match: Match) -> bool:
+    """Accept every match; default `extra_check` of `register_replacement`."""
+    return True
+def log_trace_failure(search_fn: Callable[..., Any], e: RuntimeError) -> None:
+    log.info(
+        "Replacement pattern %s failed to apply due to shape mismatch: %s",
+        search_fn.__name__,
+        e,
+    )
+def register_replacement(
+    search_fn: SearchFn,
+    replace_fn: ReplaceFn,
+    example_inputs: Iterable[Any],
+    trace_fn: TraceFn,
+    pass_dicts: Union[_PassDictsType, Sequence[_PassDictsType]],
+    extra_check: Callable[[Match], bool] = _return_true,
+    scalar_workaround: Union[Dict[str, Union[float, int]], None] = None,
+    exclusive_arg_names: Sequence[str] = (),
+    search_fn_pattern: Union[PatternExpr, None] = None,
+) -> bool:
+    """
+    Create a replacement rule based on example functions that get traced
+    to create patterns.  This supports both training and inference when
+    run on a joint forward+backward graph.
+    Args:
+        search_fn: traced to give original pattern
+        replace_fn: traced to give replacement graph
+        example_inputs: example inputs for initial trace
+        trace_fn: fwd_only or joint_fwd_bwd
+        pass_dicts: dict of passes to register to
+        extra_check: additional check to run on match(using real shapes)
+        scalar_workaround: forwarded to gen_pattern / fx_to_pattern
+        exclusive_arg_names: forwarded to gen_pattern / fx_to_pattern
+        search_fn_pattern: pre-traced pattern to use instead of tracing
+            search_fn
+    Returns:
+        False when trace_fn is joint_fwd_bwd and inference mode is enabled;
+        otherwise the pattern that was registered (note that this is not a
+        bool, despite the annotation).
+    """
+    # Parameter names of search_fn; match.kwargs is expected to hold one
+    # entry per name.
+    argnames_static = [*inspect.signature(search_fn).parameters.keys()]
+    def check_fn(match: Match) -> bool:
+        """
+        Often shapes get burned into the pattern, so our initial match ran with
+        `ignore_types=(int, ...)`.
+        Recheck the match with the correct shapes.
+        """
+        argnames = list(argnames_static)
+        for name in argnames:
+            if name not in match.kwargs:
+                raise RuntimeError(
+                    f"Not all inputs to pattern found in match.kwargs. Perhaps one "
+                    f"of the inputs is unused? argnames={argnames}, match.kwargs={match.kwargs}"
+                )
+        # Replace each matched node with the value stored in its meta["val"].
+        args = list(
+            torch.fx.map_arg(  # type: ignore[arg-type]
+                [match.kwargs[name] for name in argnames], lambda n: n.meta["val"]
+            )
+        )
+        sym_args: List[torch.SymInt] = []
+        with torch._dynamo.utils.detect_fake_mode(args):
+            # `requires_grad` is assigned further down in the enclosing
+            # function, before this closure can run.
+            for i, grad in enumerate(requires_grad):
+                if isinstance(args[i], torch.Tensor):
+                    # An integer-dtype tensor cannot take the place of an
+                    # example input that required grad: reject the match.
+                    if grad and is_integer_dtype(args[i].dtype):
+                        return False
+                    # Fresh tensor with the same size/stride/dtype/device and
+                    # the example input's requires_grad setting.
+                    args[i] = torch.empty_strided(
+                        args[i].size(),
+                        args[i].stride(),
+                        dtype=args[i].dtype,
+                        device=args[i].device,
+                        requires_grad=grad,
+                    )
+                    # Collect the symbolic sizes/strides, skipping ones that
+                    # are already present in sym_args.
+                    for v in itertools.chain(args[i].shape, args[i].stride()):
+                        if isinstance(v, torch.SymInt) and all(
+                            guard_size_oblivious(v != a) for a in sym_args
+                        ):
+                            sym_args.append(v)
+            # If we were given a pre-traced pattern then use that instead of
+            # retracing. Note that this means the pattern has to be independent
+            # of its args.
+            specific_pattern = search_fn_pattern
+            if not specific_pattern:
+                if sym_args:
+                    # AOT Autograd and make fx will dedupe symbolic shape size
+                    # accesses of sym ints that appear as inputs
+                    # We don't want the sym_size uses to interfere with pattern matching
+                    # so we provide them as inputs.
+                    # Later, when we actually do the replacement, the symbolic shape
+                    # sizes will get re-traced and added to the graph.
+                    def search_fn_new(*args_new: Any) -> Any:
+                        # Drop the leading sym args; pass only the real args.
+                        return search_fn(*args_new[len(args_new) - len(args) :])
+                    try:
+                        specific_graph = trace_fn(search_fn_new, sym_args + args)
+                    except RuntimeError as e:
+                        log_trace_failure(search_fn, e)
+                        return False
+                    # correct argnames in the graph
+                    sym_arg_names = []
+                    for i, placeholder in zip(
+                        range(len(sym_args) + len(args)),
+                        specific_graph.graph.nodes,
+                    ):
+                        if i < len(sym_args):
+                            sym_arg_names.append(placeholder.target)
+                            continue
+                        # Swap the traced placeholder for one named after the
+                        # corresponding search_fn parameter.
+                        with specific_graph.graph.inserting_after(placeholder):
+                            new_node = specific_graph.graph.placeholder(
+                                argnames[i - len(sym_args)]
+                            )
+                            new_node.target = new_node.name
+                            placeholder.replace_all_uses_with(new_node)
+                            specific_graph.graph.erase_node(placeholder)
+                    argnames = sym_arg_names + argnames
+                else:
+                    try:
+                        specific_graph = trace_fn(search_fn, args)
+                    except RuntimeError as e:
+                        log_trace_failure(search_fn, e)
+                        return False
+                specific_pattern = fx_to_pattern(
+                    specific_graph,
+                    argnames=argnames,
+                    exclusive_arg_names=exclusive_arg_names,
+                    scalar_workaround=scalar_workaround,
+                )
+            node = match.output_nodes()[0]
+            assert node is not None
+            specific_pattern_match = specific_pattern.match(node)
+            if is_match(specific_pattern_match) and extra_check(specific_pattern_match):
+                # trace the pattern using the shapes from the user program
+                match.replacement_graph = trace_fn(replace_fn, args)  # type: ignore[assignment]
+                return True
+            return False
+    def normalize_args(**kwargs: Any) -> List[Any]:
+        """
+        Order the match kwargs as search_fn's parameters, followed by any
+        consecutively numbered `tangents_<i>` entries starting at 1; asserts
+        that nothing else is left over.
+        """
+        args = []
+        for name in argnames_static:
+            args.append(kwargs.pop(name))
+        for i in range(1, len(kwargs) + 1):
+            if f"tangents_{i}" not in kwargs:
+                break
+            args.append(kwargs.pop(f"tangents_{i}"))
+        assert not kwargs, f"leftover kwargs: {kwargs!r}"
+        return args
+    if trace_fn is joint_fwd_bwd:
+        # If inference mode is enabled during compilation, assume that we don't
+        # want to match on any training graph patterns
+        if torch.is_inference_mode_enabled():
+            return False
+    # TODO: Revisit the functionalize_rng_ops for lowmem dropout
+    with functorch_config.patch(functionalize_rng_ops=False):
+        # Per example input: True for tensors that require grad. Read by
+        # check_fn above.
+        requires_grad: List[bool] = [
+            isinstance(x, torch.Tensor) and x.requires_grad for x in example_inputs
+        ]
+        if search_fn_pattern is None:
+            pattern = gen_pattern(
+                search_fn,
+                example_inputs,
+                trace_fn,
+                scalar_workaround,
+                exclusive_arg_names,
+            )
+        else:
+            pattern = search_fn_pattern
+        # The pretty-printed form serves as the key for detecting a pattern
+        # that has already been registered.
+        pattern_repr = PatternPrettyPrinter.run(pattern)
+        assert pattern_repr not in _seen_patterns
+        _seen_patterns.add(pattern_repr)
+        pattern = ReplacementPatternEntry(
+            pattern=pattern,
+            extra_check=check_fn,
+            normalize_args=normalize_args,
+        )
+        pattern.register(pass_dicts)
+        # NOTE(review): this returns the PatternExpr, not a bool as annotated.
+        return pattern.pattern
+# Names of search functions already serialized in this process;
+# `_serialize_pattern` overwrites the output file the first time it sees a
+# name and appends to it afterwards.
+_serialized_patterns: Set[str] = set()
+def _serialize_pattern(
+    unique_name: str,
+    search_fn: SearchFn,
+    example_inputs: Iterable[Any],
+    trace_fn: TraceFn,
+    scalar_workaround: Union[Dict[str, Union[float, int]], None],
+) -> PatternExpr:
+    """
+    Generate the pattern for `search_fn`, pretty-print it as Python source
+    assigned to `unique_name`, and write it to
+    `SERIALIZED_PATTERN_PATH / "<search_fn.__name__>.py"`.
+    The first time a given search function name is seen in this process the
+    file is overwritten and starts with the header template; later calls
+    for the same name append to it.
+    Returns:
+        The generated pattern.
+    Raises:
+        RuntimeError: if SERIALIZED_PATTERN_PATH is not a directory.
+    """
+    def get_file_template() -> str:
+        # Build the file header: the auto-generated notice, the torch
+        # imports/aliases, and an import of the pattern classes.
+        auto_generated_msg = textwrap.dedent(
+            """\
+            # This is an auto-generated file. Please do not modify it by hand.
+            # To re-generate, run:
+            # cd ~/pytorch && python torchgen/fuse/gen_patterns.py
+            """
+        )
+        file_template = textwrap.dedent(
+            """\
+            # mypy: ignore-errors
+            # noqa: F401, E501
+            {msg}
+            import torch
+            import torch._inductor
+            aten = torch.ops.aten
+            prims = torch.ops.prims
+            """
+        ).format(msg=auto_generated_msg)
+        # Import every PatternExpr / _TargetExpr subclass that the
+        # pattern_matcher module exposes.
+        pattern_matcher_imports = []
+        for name in dir(torch._inductor.pattern_matcher):
+            attr = getattr(torch._inductor.pattern_matcher, name)
+            if isinstance(attr, type) and issubclass(attr, (PatternExpr, _TargetExpr)):
+                pattern_matcher_imports.append(name)
+        formatted_imports = ",\n   ".join(pattern_matcher_imports)
+        formatted_imports = f"from torch._inductor.pattern_matcher import (\n   {formatted_imports},\n)\n"
+        return f"{file_template}{formatted_imports}"
+    if not SERIALIZED_PATTERN_PATH.is_dir():
+        raise RuntimeError(
+            f"Could not find serialized patterns directory at {SERIALIZED_PATTERN_PATH}"
+        )
+    pattern_name = search_fn.__name__
+    from torch._functorch import config as functorch_config
+    with functorch_config.patch(functionalize_rng_ops=False):
+        pattern = gen_pattern(search_fn, example_inputs, trace_fn, scalar_workaround)
+    serialized_pattern = PatternPrettyPrinter.run(pattern, output_name=unique_name)
+    # First pattern for this search function: start a fresh file. Further
+    # patterns for the same function are appended.
+    if pattern_name not in _serialized_patterns:
+        write_mode = "w"
+        _serialized_patterns.add(pattern_name)
+    else:
+        write_mode = "a"
+    file_template = get_file_template()
+    with open(SERIALIZED_PATTERN_PATH / f"{pattern_name}.py", write_mode) as f:
+        if write_mode == "w":
+            f.write(file_template)
+        else:
+            # Separate appended patterns from the previous content.
+            f.write("\n\n")
+        f.write(serialized_pattern)
+        f.write("\n")
+    return pattern
+# Directory containing the generated pattern modules (one ``<search_fn name>.py``
+# file per search function), located next to this module under fx_passes/.
+SERIALIZED_PATTERN_PATH = Path(__file__).parent / "fx_passes" / "serialized_patterns"
+# This is the set of serialized patterns that we've registered.  Used by
+# test_serialized_patterns_up_to_date() to ensure the patterns are up
+# to date.
+# Each entry is (search_fn, example_inputs, trace_fn, scalar_workaround, pattern),
+# as appended by gen_register_replacement().
+_known_precompiled_patterns: List[
+    Tuple[
+        Any,
+        Iterable[Any],
+        Callable[[Callable[..., Any], Iterable[Any]], torch.fx.GraphModule],
+        Any,
+        PatternExpr,
+    ]
+] = []
+def gen_register_replacement(
+    unique_name: str,
+    search_fn: SearchFn,
+    replace_fn: ReplaceFn,
+    example_inputs: Iterable[Any],
+    trace_fn: TraceFn,
+    pass_dicts: Union[_PassDictsType, Sequence[_PassDictsType]],
+    extra_check: Callable[[Match], bool] = _return_true,
+    scalar_workaround: Union[Dict[str, Union[float, int]], None] = None,
+    exclusive_arg_names: Sequence[str] = (),
+    skip_duplicates: bool = False,
+) -> None:
+    # Make sure the example_inputs is materialized.
+    example_inputs = tuple(example_inputs)
+    if "PYTORCH_GEN_PATTERNS" in os.environ:
+        pat = _serialize_pattern(
+            unique_name, search_fn, example_inputs, trace_fn, scalar_workaround
+        )
+    else:
+        pattern_name = search_fn.__name__
+        m = importlib.import_module(
+            f"torch._inductor.fx_passes.serialized_patterns.{pattern_name}"
+        )
+        if not m or not hasattr(m, unique_name):
+            log.warning(
+                "Precompiled pattern %r not found. Run torchgen/fuse/gen_patterns.py.",
+                unique_name,
+            )
+        pat = getattr(m, unique_name)
+    for arg in pytree.tree_iter(example_inputs):
+        if isinstance(arg, FakeTensor) and arg.constant is not None:
+            # This can be a problem - small fake tensors (e.g. `tensor(2)`) will
+            # hold onto their original constant value - and by stashing it here
+            # will cause a memory leak if the constant value is on GPU.
+            # Since this is just an optimization we can clear it out.
+            arg.constant = None
+    if PatternPrettyPrinter.run(pat) in _seen_patterns and skip_duplicates:
+        return
+    _known_precompiled_patterns.append(
+        (search_fn, example_inputs, trace_fn, scalar_workaround, pat)
+    )
+    register_replacement(
+        search_fn,
+        replace_fn,
+        example_inputs,
+        trace_fn,
+        pass_dicts,
+        extra_check,
+        scalar_workaround,
+        exclusive_arg_names,
+        search_fn_pattern=pat,
+    )
+@functorch_config.patch(functionalize_rng_ops=False)
+def gen_pattern(
+    search_fn: SearchFn,
+    example_inputs: Sequence[Any],
+    trace_fn: TraceFn,
+    scalar_workaround: Union[Dict[str, Union[float, int]], None] = None,
+    exclusive_arg_names: Sequence[str] = (),
+) -> PatternExpr:
+    argnames = [*inspect.signature(search_fn).parameters.keys()]
+    if scalar_workaround is None:
+        scalar_workaround = {}
+    flat_inputs = []
+    input_idx = 0  # Positional arguments index
+    for argname in argnames:
+        if argname in scalar_workaround:
+            flat_inputs.append(scalar_workaround[argname])
+        else:
+            flat_inputs.append(example_inputs[input_idx])
+            input_idx += 1
+    search_gm = trace_fn(search_fn, flat_inputs)
+    return fx_to_pattern(
+        search_gm,
+        ignore_types=(int, float, list, torch.device, torch.dtype),
+        argnames=argnames,
+        scalar_workaround=scalar_workaround,
+        exclusive_arg_names=exclusive_arg_names,
+    )
+def register_lowering_pattern(
+    pattern: PatternExpr,
+    extra_check: Callable[[Match], bool] = _return_true,
+    *,
+    pass_dict: _PassDictsType,
+    prepend: bool = False,
+) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+    """
+    Register an aten to inductor IR replacement pattern.  The decorated
+    function is saved and then called a lowering time allowing direct
+    pattern to inductor IR conversion.
+    """
+    def decorator(handler: Callable[..., Any]) -> Callable[..., Any]:
+        assert callable(handler)
+        LoweringPatternEntry(
+            pattern=pattern, extra_check=extra_check, handler=handler
+        ).register(pass_dict, prepend=prepend)
+        handler._inductor_lowering_function = True  # type: ignore[attr-defined]
+        return handler
+    return decorator
+def register_graph_pattern(
+    pattern: PatternExpr,
+    extra_check: Callable[[Match], bool] = _return_true,
+    *,
+    pass_dict: _PassDictsType,
+    prepend: bool = False,
+) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+    """
+    Register a pattern that runs a function on the FX graph, allowing
+    custom transformation code.
+    """
+    def decorator(handler: Callable[..., Any]) -> Callable[..., Any]:
+        assert callable(handler)
+        GraphPatternEntry(
+            pattern=pattern, extra_check=extra_check, handler=handler
+        ).register(pass_dict, prepend=prepend)
+        return handler
+    return decorator
+def is_start_of_fx_graph(graph: torch.fx.Graph, node: torch.fx.Node) -> bool:
+    # first node in the graph
+    return node is next(iter(graph.nodes))
+# match: copy_, relu_, _set_grad_enabled, manual_seed, _enter_autocast, etc
+# doesn't match: __rshift__, etc
+_mutation_op_re = re.compile(r"(?<!_)(_$|_[.]|(\b|_)(set|enter|exit|seed)(\b|_))(?!_)")
+def is_mutation_op(node: torch.fx.Node) -> bool:
+    if node.op == "call_function":
+        if _mutation_op_re.search(node.target.__name__):  # type: ignore[union-attr]
+            return True
+    elif node.op == "call_method":
+        if _mutation_op_re.search(node.target):  # type: ignore[union-attr, arg-type]
+            return True
+    return node.kwargs.get("out") is not None
+def same_mutation_regions(a: torch.fx.Node, b: torch.fx.Node) -> bool:
+    assert "mutation_region_id" in a.meta
+    assert "mutation_region_id" in b.meta
+    return a.meta["mutation_region_id"] == b.meta["mutation_region_id"]
+def get_mutation_region_id(graph: torch.fx.Graph, node: torch.fx.Node) -> int:
+    """
+    Return the mutation region id of ``node``, filling in
+    ``meta["mutation_region_id"]`` for it and for any preceding nodes that were
+    visited on the way and did not have one yet.
+    """
+    n = node
+    # Walk backwards to the nearest node that already has an id, stopping at the
+    # first node of the graph.
+    while "mutation_region_id" not in n.meta and not is_start_of_fx_graph(graph, n):
+        n = n.prev
+    # Start counting from that node's id (0 when it has none).
+    mutation_region_id = n.meta.get("mutation_region_id", 0)
+    # Walk forward again up to ``node``; every mutation op opens a new region.
+    while n is not node:
+        n = n.next
+        if is_mutation_op(n):
+            mutation_region_id += 1
+        n.meta["mutation_region_id"] = mutation_region_id
+    return mutation_region_id
+def should_compute_mutation_region_ids(graph: torch.fx.GraphModule) -> bool:
+    return "mutation_region_id" not in next(iter(graph.nodes)).meta
+def compute_mutation_region_ids(graph: torch.fx.GraphModule) -> None:
+    mutation_region_id = 0
+    for nd in graph.nodes:
+        if is_mutation_op(nd):
+            mutation_region_id += 1
+        nd.meta["mutation_region_id"] = mutation_region_id
+class PatternMatcherPass:
+    """
+    A registry of PatternEntry lists keyed by ``(op, target)`` together with an
+    ``apply`` method that runs the registered entries over an FX graph.
+    """
+    def __init__(
+        self,
+        pass_name: Optional[str] = None,
+    ) -> None:
+        super().__init__()
+        # (node.op, target) -> entries tried on nodes with that op/target.
+        self.patterns: DefaultDict[
+            Tuple[str, torch.fx.node.Target], List[PatternEntry]
+        ] = defaultdict(list)
+        # Name given to GraphTransformObserver in apply(); "pattern_matcher" when None.
+        self.pass_name = pass_name
+    def __getitem__(self, item: Tuple[str, torch.fx.node.Target]) -> List[PatternEntry]:
+        # Indexing the defaultdict creates an empty list for keys not seen before.
+        return self.patterns[item]
+    def apply(self, gm: torch.fx.GraphModule) -> int:
+        """
+        Match the registered patterns against ``gm`` and apply every entry whose
+        pattern matches and whose ``extra_check`` passes.
+        ``gm`` may be a GraphModule or a Graph (its owning module is then used).
+        Returns the number of entries applied.  Raises RuntimeError for any
+        other input type.
+        """
+        if not self.patterns:
+            return 0
+        if isinstance(gm, torch.fx.GraphModule):
+            graph = gm.graph
+        elif isinstance(gm, torch.fx.Graph):
+            graph = gm
+            gm = graph.owning_module
+        else:
+            raise RuntimeError(
+                f"The input to PatternMatcherPass must be a GraphModule or a Graph, but got {type(gm)}"
+            )
+        # Region ids are computed once per graph; later lookups go through
+        # get_mutation_region_id, which also fills in ids for nodes lacking one.
+        if should_compute_mutation_region_ids(graph):  # type: ignore[arg-type]
+            compute_mutation_region_ids(graph)  # type: ignore[arg-type]
+        get_mutation_region_id_partial = functools.partial(
+            get_mutation_region_id, graph
+        )
+        count = 0
+        # Collect candidate nodes: one find_nodes() result per registered
+        # (op, target); call_module nodes are fetched once and filtered below.
+        nodes = []
+        has_call_module = False
+        for op, target in self.patterns:
+            if op == "call_module":
+                has_call_module = True
+            else:
+                nodes.append(graph.find_nodes(op=op, target=target, sort=False))
+        if has_call_module:
+            nodes.append(graph.find_nodes(op="call_module", sort=False))
+        pass_name = self.pass_name if self.pass_name is not None else "pattern_matcher"
+        with GraphTransformObserver(
+            gm, pass_name, trace_config.log_url_for_graph_xform
+        ):
+            # Candidates are visited in descending sorted order (relies on the
+            # ordering torch.fx.Node defines - presumably graph position).
+            for node in sorted(itertools.chain.from_iterable(nodes), reverse=True):
+                target = extract_target(node)
+                if node.op == "call_module":
+                    # For call_module the key uses the value from extract_target();
+                    # skip modules nobody registered a pattern for.
+                    if (node.op, target) not in self.patterns:
+                        continue
+                # conservatively not applying pattern for cpu input,
+                # since some of the patterns induce codegen and split nodes.
+                # Note: we will only skip cpu compute if disable_cpp_codegen=True
+                if fallback_node_due_to_unsupported_type(node, allow_cpu_inputs=False):
+                    continue
+                for entry in self.patterns[(node.op, target)]:
+                    # A previously applied entry may have erased this node.
+                    if node._erased:
+                        break
+                    m = entry.pattern.match(node)
+                    # pattern match crosses mutation barrier - discard
+                    if (
+                        is_match(m)
+                        and len(set(map(get_mutation_region_id_partial, m.nodes))) != 1  # type: ignore[possibly-undefined]
+                    ):
+                        continue
+                    # Debug aid: log the match result for the node named in the env var.
+                    if os.environ.get("TORCHINDUCTOR_PATTERN_MATCH_DEBUG") == node.name:
+                        log.warning("%s%s %s %s", node, node.args, m, entry.pattern)
+                    if is_match(m) and entry.extra_check(m):
+                        count += 1
+                        entry.apply(m, graph, node)  # type: ignore[arg-type]
+                        counters["inductor"]["pattern_matcher_count"] += 1
+                        counters["inductor"]["pattern_matcher_nodes"] += len(m.nodes)
+        return count
+    def clear(self) -> None:
+        # Drop every registered pattern.
+        self.patterns.clear()
+def _not_implemented(*args: Any, **kwargs: Any) -> NoReturn:
+    raise NotImplementedError
+def fx_to_pattern(
+    gm: Union[torch.fx.GraphModule, torch.fx.Graph],
+    ignore_types: Sequence[Type[Any]] = (),
+    argnames: Sequence[str] = (),
+    scalar_workaround: Union[Dict[str, Union[float, int]], None] = None,
+    exclusive_arg_names: Sequence[str] = (),
+) -> PatternExpr:
+    """
+    Convert an FX graph into a PatternExpr.  This is useful for simple
+    patterns that can only match single functions and fixed-length lists.
+    """
+    # scalar_workaround is a hack to capture dropout_p
+    # see https://github.com/pytorch/pytorch/issues/97894
+    scalar_workaround = scalar_workaround or {}
+    # Reverse lookup value -> argument name; the assert requires the values to be distinct.
+    inv_scalar_workaround = {v: k for k, v in scalar_workaround.items()}
+    assert len(inv_scalar_workaround) == len(scalar_workaround)
+    def process_arg(x: T) -> Union[T, KeywordArg, Ignored]:
+        # A scalar equal to one of the workaround values becomes a KeywordArg
+        # named after the corresponding argument.
+        if isinstance(x, (float, int)) and x in inv_scalar_workaround:
+            return KeywordArg(inv_scalar_workaround[x])
+        # Exact-type check: values of an ignored type match anything.
+        if type(x) in ignore_types:
+            return Ignored()
+        # A non-empty list made only of Ignored() collapses to a single Ignored().
+        if isinstance(x, list) and all(isinstance(y, Ignored) for y in x) and x:
+            return Ignored()
+        return x
+    # Counts placeholders in the order the interpreter visits them.
+    argnum = itertools.count()
+    class Converter(torch.fx.Interpreter):
+        # Only placeholder/call_function/output nodes are supported.
+        call_method = _not_implemented
+        call_module = _not_implemented
+        get_attr = _not_implemented
+        def placeholder(
+            self, target: str, args: Sequence[Any], kwargs: Mapping[str, Any]  # type: ignore[override]
+        ) -> Union[ExclusiveKeywordArg, KeywordArg]:
+            n = next(argnum)
+            if n < len(argnames):
+                # The n-th placeholder takes the n-th supplied argument name.
+                name = argnames[n]
+            elif argnames:
+                # Extra placeholders beyond argnames must be named "tangent...".
+                assert target.startswith("tangent")
+                name = target
+            else:
+                target = re.sub(r"_\d+$", "", target)  # de-mangle arg name
+                name = target
+            if name in exclusive_arg_names:
+                return ExclusiveKeywordArg(name)
+            else:
+                return KeywordArg(name)
+        def call_function(
+            self, target: str, args: Sequence[Any], kwargs: Mapping[str, Any]  # type: ignore[override]
+        ) -> PatternExpr:
+            args, kwargs = pytree.tree_map(process_arg, (args, kwargs))
+            if list in ignore_types:
+                # Handle a burned in tensor size which are now [Ignored(), Ignored(), ...]
+                args = [process_arg(a) for a in args]
+                kwargs = {k: process_arg(a) for k, a in kwargs.items()}
+            return CallFunction(target, *args, **kwargs)
+        def run_node(self, n: torch.fx.Node) -> Any:
+            rv = super().run_node(n)
+            # Record on each resulting pattern how many users the originating node has.
+            if n.op == "output" and isinstance(rv, tuple):
+                assert len(rv) == len(n.args[0])  # type: ignore[arg-type]
+                for r, arg in zip(rv, n.args[0]):  # type: ignore[arg-type]
+                    r.users = len(arg.users)
+            else:
+                rv.users = len(n.users)
+            return rv
+    pattern = Converter(gm).run()  # type: ignore[arg-type]
+    # A result that is not a single PatternExpr is wrapped from its pytree leaves.
+    if not isinstance(pattern, PatternExpr):
+        return MultiOutputPattern(pytree.tree_leaves(pattern))
+    return pattern
+@torch.no_grad()
+def fwd_only(
+    fn: Callable[..., Any],
+    args: Sequence[Any],
+    *,
+    run_functional_passes: bool = True,
+    get_decomp_fn: Optional[Callable[..., Any]] = None,
+) -> torch.fx.GraphModule:
+    """Build a normalized inference graph, for use with fx_to_pattern"""
+    # TODO - look into using aot autograd, asserting no mutating ops here
+    with enable_python_dispatcher():
+        decompositions = (
+            get_decomp_fn() if get_decomp_fn is not None else select_decomp_table()
+        )
+        gm = make_fx(fn, decompositions, tracing_mode="real")(*args)
+    from .fx_passes.post_grad import remove_noop_ops
+    if run_functional_passes:
+        remove_noop_ops(gm.graph)
+        gm.graph.eliminate_dead_code()
+    gm.recompile()
+    return gm
+@torch.enable_grad()
+def joint_fwd_bwd(fn: Callable[..., Any], args: Sequence[Any]) -> torch.fx.GraphModule:
+    """Build a normalized training graph, for use with fx_to_pattern"""
+    # Filled in by record_joint_graph with a clone of the joint graph.
+    gm: Optional[torch.fx.GraphModule] = None
+    def record_joint_graph(
+        joint_graph: torch.fx.GraphModule, inputs: Sequence[Any], **kwargs: Any
+    ) -> Tuple[torch.fx.GraphModule, torch.fx.GraphModule]:
+        # Passed as partition_fn: capture the joint graph (expected to happen
+        # exactly once), then delegate to the default partitioner.
+        nonlocal gm
+        assert not gm
+        gm = clone_graph(joint_graph)
+        return default_partition(joint_graph, inputs, **kwargs)
+    with torch._guards.tracing(None):
+        # The call is made only for its side effect of invoking record_joint_graph;
+        # its return value is discarded.
+        aot_function(
+            fn,
+            lambda g, i: make_boxed_func(g),
+            partition_fn=record_joint_graph,
+            decompositions=select_decomp_table(),
+            keep_inference_input_mutations=True,
+            enable_log=False,
+        )(*args)
+    assert gm
+    from .fx_passes.post_grad import remove_noop_ops
+    remove_noop_ops(gm.graph)
+    from .fx_passes.joint_graph import pointless_view
+    # Run the pointless_view handler over every aten.view.default node using a
+    # throwaway single-pattern pass.
+    matcher_pass = PatternMatcherPass()
+    pattern = CallFunction(
+        torch.ops.aten.view.default, KeywordArg("arg"), KeywordArg("size")
+    )
+    GraphPatternEntry(
+        pattern=pattern, handler=pointless_view, extra_check=_return_true
+    ).register(matcher_pass.patterns)
+    matcher_pass.apply(gm.graph)  # type: ignore[arg-type]
+    # remove in/out specs
+    gm.graph._codegen = torch.fx.graph.CodeGen()
+    gm.graph.eliminate_dead_code()
+    gm.recompile()
+    return gm
+def _args(n: torch.fx.Node) -> List[torch.fx.node.Argument]:
+    args: List[torch.fx.node.Argument] = []
+    torch.fx.map_arg((n.args, n.kwargs), args.append)
+    return args
+def stable_topological_sort(graph: torch.fx.Graph) -> None:
+    """
+    Reorder the nodes of ``graph`` in place so that each node comes after the
+    nodes it takes as arguments.  A node is only moved when it does not already
+    directly follow the previously placed node.  Asserts that every node could
+    be placed.
+    """
+    # Nodes are in exactly one of these three collections:
+    # - Nodes in `pending` are waiting to be processed (in reverse order):
+    pending = list(reversed(graph.nodes))
+    # - Nodes in `ready` have been processed and are already in the correct
+    #   order.
+    ready = set()
+    # - `waiting` is a mapping from a dependency to nodes which depend on that
+    #   dependency.
+    waiting = defaultdict(list)
+    # The cursor indicates the last processed node so we can add new nodes
+    # after it.
+    cursor = None
+    while pending:
+        node = pending.pop()
+        waiting_for = [x for x in _args(node) if x not in ready]
+        if waiting_for:
+            # We have unprocessed input nodes. Might as well wait for the last
+            # arg so an already sorted list will only recheck this node once.
+            waiting[waiting_for[-1]].append(node)
+        else:
+            ready.add(node)
+            # Move the node right after the cursor unless it is already there.
+            if cursor and cursor.next is not node:
+                cursor.append(node)
+            cursor = node
+            # Mark the nodes that have been waiting for this node to finish as
+            # ready to check again.
+            pending.extend(reversed(waiting.pop(node, ())))
+    # Leftover waiters would mean some dependency was never placed.
+    assert not waiting and len(ready) == len(graph.nodes)
+def init_once_fakemode(fn: Callable[..., Any]) -> Callable[[], Any]:
+    """Wrapper around lazy init functions in fx_passes/"""
+    @functools.lru_cache(None)
+    @functools.wraps(fn)
+    def lazy_init() -> Any:
+        counters_ref = counters["inductor"].copy()
+        with torch._guards.tracing(None), unset_fake_temporarily(), FakeTensorMode():
+            result = fn()
+        # clear view matches encountered during tracing
+        counters["inductor"] = counters_ref
+        return result
+    return lazy_init
+def config_flag(name: str) -> Callable[[Match], Any]:
+    """Function for extra_check to put pass behind a flag"""
+    def flag_check(match: Match) -> Any:
+        return getattr(config, name)
+    return flag_check
+def clone_graph(input_graph: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    """
+    Return a copy of ``input_graph`` produced by a Transformer, carrying over
+    each node's ``meta`` and a name derived from the original node name.
+    """
+    class CopyGraph(Transformer):
+        def run_node(self, old_node: torch.fx.Node) -> torch.fx.Node:
+            new_node = super().run_node(old_node)
+            # Only Proxy results wrap a node in the new graph that can be annotated.
+            if isinstance(new_node, torch.fx.Proxy):
+                new_node.node.meta.update(old_node.meta)
+                # Ask the new graph's namespace for a name based on the old one.
+                new_node.node.name = self.new_graph._graph_namespace.create_name(
+                    old_node.name, None
+                )
+            return new_node
+    return CopyGraph(input_graph).transform()
+# Pretty-printed form of every pattern registered through register_replacement();
+# used to detect duplicate registrations.
+_seen_patterns: Set[str] = set()
+def get_arg_value(
+    node: torch.fx.Node, arg_number: int, kwarg_name: Optional[str] = None
+) -> Any:
+    return (
+        node.args[arg_number]
+        if len(node.args) > arg_number
+        else node.kwargs.get(kwarg_name)  # type: ignore[arg-type]
+    )
+def filter_nodes(nodes: Iterable[torch.fx.Node], fn: Any) -> List[torch.fx.Node]:
+    fns = [fn]
+    if isinstance(fn, torch._ops.OpOverloadPacket):
+        fns.extend([getattr(fn, overload) for overload in fn.overloads()])
+    return [node for node in nodes if node.target in fns]
+def extract_target(node: torch.fx.Node) -> torch.fx.node.Target:
+    """For call_function and call_method, we directly use the target function;
+    For call_module, the target is string, and we treat the module class
+     as a function.
+    """
+    if node.op == "call_module":
+        return getattr(node.graph.owning_module, node.target).__class__  # type: ignore[arg-type]
+    return node.target

.venv/lib/python3.11/site-packages/torch/_inductor/quantized_lowerings.py ADDED Viewed

	@@ -0,0 +1,92 @@

+# mypy: allow-untyped-defs
+import logging
+import torch
+from torch._inductor.kernel.mm_common import mm_args
+from . import config as inductor_config, lowering
+from .codegen.cpp_gemm_template import CppPackedGemmTemplate
+from .codegen.cpp_utils import create_epilogue_with_attr
+from .lowering import expand, register_lowering
+from .select_algorithm import (
+    autotune_select_algorithm,
+    ExternKernelChoice,
+    realize_inputs,
+)
+from .utils import use_aten_gemm_kernels, use_cpp_packed_gemm_template
+log = logging.getLogger(__name__)
+# Extern-kernel choice wrapping torch._weight_int8pack_mm (C++ name
+# at::_weight_int8pack_mm); declared without an out= variant.
+aten__weight_int8pack_mm = ExternKernelChoice(
+    torch._weight_int8pack_mm, "at::_weight_int8pack_mm", has_out_variant=False
+)
+# Short aliases for the op namespaces used below.
+quantized = torch.ops.quantized
+_quantized = torch.ops._quantized
+aten = torch.ops.aten
+def register_quantized_ops():
+    lowering.add_needs_realized_inputs(
+        [
+            quantized.max_pool2d,
+            _quantized.wrapped_fbgemm_pack_gemm_matrix_fp16,
+            _quantized.wrapped_fbgemm_linear_fp16_weight,
+        ]
+    )
+    lowering.make_fallback(quantized.max_pool2d)
+    lowering.make_fallback(_quantized.wrapped_fbgemm_pack_gemm_matrix_fp16)
+    lowering.make_fallback(_quantized.wrapped_fbgemm_linear_fp16_weight)
+def register_woq_mm_ops():
+    """Register the lowering for ``aten._weight_int8pack_mm``."""
+    @register_lowering(aten._weight_int8pack_mm, type_promotion_kind=None)
+    def int8pack_mm(input, weight, scale, *, layout=None):
+        # Only layout and the two matrices are used from mm_args(); the first
+        # three returned values are discarded (presumably the problem sizes).
+        _, _, _, layout, mat1, mat2 = mm_args(
+            input, weight, layout=layout, mat2_transposed=True
+        )
+        # Activations must be bf16/fp16/fp32 and the weight int8.
+        assert (
+            mat1.get_dtype() in [torch.bfloat16, torch.float16, torch.float]
+            and mat2.get_dtype() == torch.int8
+        )
+        aten_layout = layout
+        # options to tune from
+        choices = (
+            [aten__weight_int8pack_mm.bind((mat1, mat2, scale), aten_layout)]
+            if use_aten_gemm_kernels()
+            else []
+        )
+        # scale is applied as an epilogue, and the scale tensor is expanded (with a view op)
+        # for broadcasting, as it's 1D.
+        def _mul_epilogue(buf):
+            return create_epilogue_with_attr(
+                buf, "mul", other=realize_inputs(expand(scale, layout.size))
+            )
+        if use_cpp_packed_gemm_template(aten_layout, mat1, mat2, mat2_transposed=True):
+            CppPackedGemmTemplate.add_choices(
+                choices,
+                aten_layout,
+                [mat1, mat2, scale],
+                trans_w=True,
+                epilogue_creator=_mul_epilogue,
+            )
+        # Nothing to tune and ATen kernels were not enabled as a choice: use the
+        # ATen kernel directly when the config allows falling back to it.
+        if (
+            len(choices) == 0
+            and inductor_config.autotune_fallback_to_aten
+            and not use_aten_gemm_kernels()
+        ):
+            log.warning("No choices for GEMM, using ATen backend as fallback")
+            return aten__weight_int8pack_mm.bind(
+                (mat1, mat2, scale), aten_layout
+            ).output_node()
+        return autotune_select_algorithm(
+            "_weight_int8pack_mm", choices, [mat1, mat2, scale], aten_layout
+        )

.venv/lib/python3.11/site-packages/torch/_inductor/remote_cache.py ADDED Viewed

	@@ -0,0 +1,198 @@

+from __future__ import annotations
+import json
+import os
+import typing
+from abc import abstractmethod
+from typing import Any, Callable, Dict, Generic, List, Optional, Type, TypeVar, Union
+from typing_extensions import override, TypeAlias
+from torch._inductor import config
+# redis is optional: when it cannot be imported the name is bound to None and
+# RedisRemoteCacheBackend turns into a no-op.
+try:
+    import redis
+except ImportError:
+    redis = None  # type: ignore[assignment]
+# `Sample` is the real scuba sample type in fbcode and a placeholder alias elsewhere.
+if config.is_fbcode():
+    from rfe.scubadata.scubadata_py3 import (  # type: ignore[import-not-found]
+        Sample as Sample_,
+    )
+    Sample: TypeAlias = Sample_
+else:
+    Sample: TypeAlias = Type[object]  # type: ignore[misc,no-redef]
+# Generic parameters used by the backend / serde / cache classes below.
+_T = TypeVar("_T")
+_U = TypeVar("_U")
+class RemoteCacheBackend(Generic[_T]):
+    """
+    A backend implementation for accessing a remote/distributed cache.  Only
+    works with bytes in/out.  For structured data use a RemoteCache.
+    """
+    @abstractmethod
+    def get(self, key: str) -> Optional[_T]:
+        pass
+    @abstractmethod
+    def put(self, key: str, data: _T) -> None:
+        pass
+# Serde that encodes from _T to _U and decodes from _U to _T.
+class RemoteCacheSerde(Generic[_T, _U]):
+    @abstractmethod
+    def encode(self, data: _T) -> _U:
+        pass
+    @abstractmethod
+    def decode(self, data: _U) -> _T:
+        pass
+# Recursive alias for the values handled by RemoteCacheJsonSerde: None, scalars,
+# strings, and string-keyed dicts / lists of the same.
+JsonDataTy = Optional[
+    Union[int, float, str, bool, Dict[str, "JsonDataTy"], List["JsonDataTy"]]
+]
+class RemoteCacheJsonSerde(RemoteCacheSerde[JsonDataTy, bytes]):
+    def encode(self, data: JsonDataTy) -> bytes:
+        return bytes(json.dumps(data), "ascii")
+    def decode(self, data: bytes) -> JsonDataTy:
+        return json.loads(data)
+class RemoteCachePassthroughSerde(RemoteCacheSerde[_T, _T]):
+    def encode(self, data: _T) -> _T:
+        return data
+    def decode(self, data: _T) -> _T:
+        return data
+class RemoteCache(Generic[_T]):
+    backend_override_cls: Optional[Callable[[], RemoteCacheBackend[Any]]] = None
+    def __init__(
+        self, backend: RemoteCacheBackend[_U], serde: RemoteCacheSerde[_T, _U]
+    ) -> None:
+        # Support for testing.
+        if (override_cls := self.__class__.backend_override_cls) is not None:
+            self.backend = override_cls()
+        else:
+            self.backend = backend
+        self.serde = serde
+    def get(self, key: str) -> Optional[_T]:
+        sample = self._create_sample()
+        result = self._get(key, sample)
+        self._log_sample(sample)
+        return result
+    def put(self, key: str, value: _T) -> None:
+        sample = self._create_sample()
+        self._put(key, value, sample)
+        self._log_sample(sample)
+    def _decode(self, data: _U, sample: Optional[Sample]) -> _T:
+        return self.serde.decode(data)
+    def _encode(self, value: _T, sample: Optional[Sample]) -> Any:  # returns _U
+        return self.serde.encode(value)
+    def _get(self, key: str, sample: Optional[Sample]) -> Optional[_T]:
+        if data := self.backend.get(key):
+            return self._decode(data, sample)
+        return None
+    def _put(self, key: str, value: _T, sample: Optional[Sample]) -> None:
+        data = self._encode(value, sample)
+        self.backend.put(key, data)
+    def _create_sample(self) -> Optional[Sample]:
+        return None
+    def _log_sample(self, sample: Optional[Sample]) -> None:
+        pass
+class RedisRemoteCacheBackend(RemoteCacheBackend[bytes]):
+    """
+    A Redis implementation of a remote/distributed cache.
+    Keys are namespaced as ``pt2:<cache_id>:<key>``.  When the redis package is
+    missing, or after a connection error, the backend silently behaves as an
+    always-miss cache.
+    """
+    # Format string for the namespaced key; only set when redis is importable.
+    _key_fmt: str
+    # Client handle; None means "redis unavailable" and disables get/put.
+    _redis: Optional[redis.Redis] = None
+    def __init__(self, cache_id: str) -> None:
+        if not redis:
+            # We had trouble importing redis - just skip init.
+            return
+        self._key_fmt = f"pt2:{cache_id}:{{key}}"
+        # Host/port come from the environment, defaulting to localhost:6379.
+        self._redis = redis.Redis(
+            host=os.environ.get("TORCHINDUCTOR_REDIS_HOST", "localhost"),
+            port=int(os.environ.get("TORCHINDUCTOR_REDIS_PORT", 6379)),
+        )
+    def __get_key(self, key: str) -> str:
+        # Expand ``key`` into the namespaced redis key.
+        return self._key_fmt.format(key=key)
+    @override
+    def get(self, key: str) -> Optional[bytes]:
+        if not self._redis:
+            # Either redis wasn't found or we already had some trouble...
+            return None
+        try:
+            value = self._redis.get(self.__get_key(key))
+        except redis.exceptions.ConnectionError:
+            # Redis is lazy and doesn't actually attempt to connect until the
+            # first use. Mark it as unavailable now.
+            self._redis = None
+            return None
+        # In theory redis.get() can return an Awaitable as well...
+        assert value is None or isinstance(value, bytes)
+        return value
+    @override
+    def put(self, key: str, data: bytes) -> None:
+        if not self._redis:
+            # Either redis wasn't found or we already had some trouble...
+            return
+        try:
+            self._redis.set(self.__get_key(key), data)
+        except redis.exceptions.ConnectionError:
+            # Redis is lazy and doesn't actually attempt to connect until the
+            # first use. Mark it as unavailable now.
+            self._redis = None
+class RedisRemoteCache(RemoteCache[JsonDataTy]):
+    def __init__(self, key: str) -> None:
+        # Special test handling: If we're just going to override the backend
+        # anyway don't require redis
+        if self.__class__.backend_override_cls:
+            # This is totally bogus but it works for now...
+            backend = typing.cast(RemoteCacheBackend[bytes], None)
+        else:
+            backend = RedisRemoteCacheBackend(key)
+        serde = RemoteCacheJsonSerde()
+        super().__init__(backend, serde)
+class RemoteAutotuneCache(RedisRemoteCache):
+    pass
+class RemoteFxGraphCache(RedisRemoteCache):
+    pass

.venv/lib/python3.11/site-packages/torch/_inductor/select_algorithm.py ADDED Viewed

	@@ -0,0 +1,1743 @@

+# mypy: allow-untyped-defs
+import builtins
+import contextlib
+import functools
+import inspect
+import itertools
+import json
+import logging
+import math
+import operator
+import os
+import sys
+import textwrap
+import time
+from collections import namedtuple
+from concurrent.futures import as_completed, ThreadPoolExecutor
+from io import StringIO
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from unittest.mock import patch
+import sympy
+from filelock import FileLock
+import torch
+import torch._inductor.async_compile  # noqa: F401 required to warm up AsyncCompile pools
+from torch._dynamo.testing import rand_strided
+from torch._dynamo.utils import counters, identity, preserve_rng_state
+from . import config, ir
+from .autotune_process import TensorMeta, TritonBenchmarkRequest
+from .codecache import code_hash, PersistentCache, PyCodeCache
+from .codegen.common import IndentedBuffer, KernelTemplate
+from .codegen.triton import (
+    gen_common_triton_imports,
+    texpr,
+    TritonKernel,
+    TritonPrinter,
+    TritonScheduling,
+)
+from .codegen.triton_utils import config_of, signature_to_meta
+from .exc import CUDACompileError
+from .ir import ChoiceCaller, PrimitiveInfoType
+from .runtime.benchmarking import benchmarker
+from .runtime.hints import DeviceProperties
+from .utils import (
+    FakeIndentedBuffer,
+    get_dtype_size,
+    Placeholder,
+    restore_stdout_stderr,
+    sympy_dot,
+    sympy_index_symbol,
+    sympy_product,
+    unique,
+)
+from .virtualized import V
+# Module-level logger, named after this module.
+log = logging.getLogger(__name__)
+# correctness checks struggle with fp16/tf32
+# NOTE(review): empty by default; the consumer of VERIFY is outside this excerpt.
+VERIFY: Dict[str, Any] = {}
+# NOTE(review): presumably toggles printing of autotune results -- confirm at the use site.
+PRINT_AUTOTUNE = True
+# NOTE(review): presumably enables extra debug output -- confirm at the use site.
+DEBUG = False
+class KernelNamespace:
+    pass
+# these objects are imported from the generated wrapper code
+extern_kernels = KernelNamespace()
+class PartialRender:
+    """
+    Some parts of a template need to be generated at the end, but
+    inserted into the template at the start.  This allows doing a bunch
+    of replacements after the initial render.
+    """
+    def __init__(self, code, replacement_hooks) -> None:
+        super().__init__()
+        self.code = code
+        self.replacement_hooks = replacement_hooks
+    def finalize_hook(self, hook_key: str, strict=True) -> None:
+        if hook_key not in self.replacement_hooks:
+            if strict:
+                raise RuntimeError(
+                    f"{hook_key} not registered in self.replacement_hooks"
+                )
+            else:
+                return
+        assert (
+            self.replacement_hooks[hook_key] is not None
+        ), "hook_key can only be called once"
+        self.code = self.code.replace(hook_key, self.replacement_hooks[hook_key]())
+        self.replacement_hooks[hook_key] = None
+    def finalize_all(self) -> str:
+        for key, fn in self.replacement_hooks.items():
+            self.code = self.code.replace(key, fn())
+        return self.code
+# This is used to store info needed for lowering each subgraph in triton
+# templates
+SubgraphInfo = namedtuple(
+    "SubgraphInfo",
+    [
+        "body",
+        "template_mask",
+        "template_out",
+    ],
+)
+class TritonTemplateKernel(TritonKernel):
+    def __init__(
+        self,
+        kernel_name,
+        input_nodes,
+        output_node,
+        defines,
+        num_stages,
+        num_warps,
+        grid_fn,
+        meta,
+        call_sizes,
+        use_jit=False,
+        prefix_args=0,
+        suffix_args=0,
+        epilogue_fn=identity,
+        subgraphs: Optional[List[ir.ComputedBuffer]] = None,
+        *,
+        index_dtype,
+    ) -> None:
+        super().__init__(
+            sympy_product(output_node.get_size()),
+            sympy.Integer(1),
+            index_dtype=index_dtype,
+        )
+        self.input_nodes = input_nodes
+        self.output_node = output_node
+        self.named_input_nodes = {}  # type: ignore[var-annotated]
+        self.defines = defines
+        self.kernel_name = kernel_name
+        self.use_jit = use_jit
+        self.num_stages = num_stages
+        self.num_warps = num_warps
+        self.grid_fn = grid_fn
+        self.meta = meta
+        self.call_sizes = call_sizes
+        # for templates with fixed epilogues
+        self.prefix_args = prefix_args
+        self.suffix_args = suffix_args
+        self.epilogue_fn = epilogue_fn
+        self.render_hooks = {}  # type: ignore[var-annotated]
+        self.triton_meta: Optional[Dict[str, object]] = None
+        # For Templated Attention this can be a list of ir.Subgraph
+        self.subgraphs: Optional[List[ir.ComputedBuffer]] = subgraphs
+        # The following attributes (body, template_mask, output_val) are all
+        # used for triton kernel codegen.
+        # They are swapped onto the TritonTemplateKernel object by
+        # `set_subgraph_body`
+        self.subgraph_bodies: Dict[str, SubgraphInfo] = {}
+        self.body: IndentedBuffer = FakeIndentedBuffer()
+        self.template_mask: Optional[str] = None
+        self.template_out: Optional[str] = None
+    @contextlib.contextmanager
+    def set_subgraph_body(self, body_name: str):
+        old_body, old_mask, old_out = self.body, self.template_mask, self.template_out
+        assert body_name in self.subgraph_bodies, body_name
+        self.body, self.template_mask, self.template_out = self.subgraph_bodies[
+            body_name
+        ]
+        yield
+        self.subgraph_bodies[body_name] = SubgraphInfo(
+            self.body, self.template_mask, self.template_out
+        )
+        self.body, self.template_mask, self.template_out = old_body, old_mask, old_out
+    @contextlib.contextmanager
+    def create_subgraph_body(self, body_name: str):
+        assert body_name not in self.subgraph_bodies
+        self.subgraph_bodies[body_name] = SubgraphInfo(IndentedBuffer(), None, None)
+        with self.set_subgraph_body(body_name):
+            yield
+    def need_numel_args(self):
+        return False
+    def estimate_kernel_num_bytes(self):
+        """
+        Estimate the total number of bytes this kernel takes.
+        For in/out nodes, sizes are counted twice: once for reading and
+        once for writing.
+        """
+        ninplace_args = len(unique(self.args.inplace_buffers.values()))
+        num_bytes = []
+        for i, inp in enumerate(itertools.chain(self.input_nodes, (self.output_node,))):
+            size = V.graph.sizevars.size_hints(inp.get_size())
+            numel = functools.reduce(operator.mul, size, 1)
+            dtype_size = get_dtype_size(inp.get_dtype())
+            num_bytes.append(numel * dtype_size * (1 + int(i < ninplace_args)))
+        return sum(num_bytes)
+    def jit_lines(self):
+        """Return the decorator text placed above the generated kernel ``def``.
+        With ``use_jit`` set this is just ``"@triton.jit"``.  Otherwise the
+        result is a ``@triton_heuristics.template(...)`` decorator followed by
+        ``@triton.jit``, with the reprs of the triton and inductor metadata
+        embedded in it.  As a side effect ``self.triton_meta`` is set to the
+        metadata dict built here.
+        """
+        if self.use_jit:
+            return "@triton.jit"
+        argdefs, _, signature, _ = self.args.python_argdefs()
+        triton_meta = {
+            "signature": signature_to_meta(signature, size_dtype=self.index_dtype),
+            "device": DeviceProperties.create(self.output_node.get_device()),
+            "constants": {},
+        }
+        triton_meta["configs"] = [config_of(signature)]
+        # Every argument the config reports as equal to 1 is recorded as the constant 1.
+        for arg_num in triton_meta["configs"][0].equal_to_1:  # type: ignore[index]
+            triton_meta["constants"][arg_num] = 1  # type: ignore[index]
+        # Only forwarded when the template's meta supplies a non-zero value.
+        matrix_instr_nonkdim = self.meta.get("matrix_instr_nonkdim", 0)
+        if matrix_instr_nonkdim != 0:
+            triton_meta["matrix_instr_nonkdim"] = matrix_instr_nonkdim
+        # Kept on the kernel so call_kernel can pass it to the wrapper.
+        self.triton_meta = triton_meta
+        inductor_meta = {
+            "kernel_name": str(Placeholder.DESCRIPTIVE_NAME),
+            **TritonKernel.inductor_meta_common(),
+        }
+        if config.profile_bandwidth or config.benchmark_kernel:
+            num_gb = self.estimate_kernel_num_bytes() / 1e9
+            inductor_meta["kernel_num_gb"] = num_gb
+        return f"""
+            @triton_heuristics.template(
+                num_stages={self.num_stages},
+                num_warps={self.num_warps},
+                triton_meta={triton_meta!r},
+                inductor_meta={inductor_meta!r},
+            )
+            @triton.jit
+        """
+    def gen_argdefs(self):
+        def hook():
+            # python_argdefs() cannot be run until after the rest of the template lazily adds more args
+            arg_defs, *_ = self.args.python_argdefs()
+            return f"{', '.join(arg_defs)}"
+        self.render_hooks["<ARGDEFS>"] = hook
+        return "<ARGDEFS>"
+    def gen_defines(self):
+        return self.defines
+    def def_kernel(self, *argnames):
+        """
+        Hook called from template code to generate function def and
+        needed args.
+        Args:
+            *argnames: template-visible names, one for each input node that
+                is neither a prefix nor a suffix arg, in input order.
+        Returns:
+            The ``"<DEF_KERNEL>"`` placeholder.  The render hook registered
+            here produces the common imports, the decorator lines, the
+            ``def`` line, the defines and the renames.
+        """
+        assert all(isinstance(x, str) for x in argnames)
+        renames = IndentedBuffer(initial_indent=1)
+        # The inputs between the prefix and suffix args are the ones the template names.
+        named_args = self.input_nodes[
+            self.prefix_args : len(self.input_nodes) - self.suffix_args
+        ]
+        assert len(argnames) == len(named_args), (
+            len(argnames),
+            len(named_args),
+            self.prefix_args,
+            len(self.input_nodes),
+        )
+        for input_node in self.input_nodes[: self.prefix_args]:
+            # get args in correct order
+            self.args.input(input_node.get_name())
+        # Register each named input under the fixed argument name ``arg_<name>``.
+        for name, input_node in zip(argnames, named_args):
+            arg_name = f"arg_{name}"
+            self.named_input_nodes[name] = input_node
+            self.args.input_buffers[input_node.get_name()] = arg_name
+        # The args may be duplicated, so renaming must be after args are de-duplicated.
+        for name in argnames:
+            input_node = self.named_input_nodes[name]
+            arg_name = self.args.input_buffers[input_node.get_name()]
+            # A non-zero layout offset is added to the argument in the rename line.
+            if input_node.get_layout().offset == 0:
+                renames.writeline(f"{name} = {arg_name}")
+            else:
+                offset = texpr(self.rename_indexing(input_node.get_layout().offset))
+                renames.writeline(f"{name} = {arg_name} + {offset}")
+        for input_node in self.input_nodes[len(self.input_nodes) - self.suffix_args :]:
+            # get args in correct order
+            self.args.input(input_node.get_name())
+        def hook():
+            # python_argdefs() cannot be run until after the rest of the template lazily adds more args
+            arg_defs, *_ = self.args.python_argdefs()
+            code = IndentedBuffer()
+            code.splice(gen_common_triton_imports())
+            code.splice(self.jit_lines())
+            code.writeline(f"def {self.kernel_name}({', '.join(arg_defs)}):")
+            with code.indent():
+                code.splice(self.defines)
+                code.splice(renames.getvalue())
+            return code.getvalue()
+        # def_kernel may only be used once per render.
+        assert "<DEF_KERNEL>" not in self.render_hooks
+        self.render_hooks["<DEF_KERNEL>"] = hook
+        return "<DEF_KERNEL>"
+    def size(self, name: str, index: int):
+        """
+        Hook called from template code to get the size of an arg.
+        Will add needed args to pass it in if it is dynamic.
+        """
+        assert isinstance(index, int)
+        if name is None:
+            val = self.output_node.get_size()[index]
+        else:
+            assert isinstance(name, str)
+            val = self.named_input_nodes[name].get_size()[index]
+        return texpr(self.rename_indexing(val))
+    def stride(self, name, index=None):
+        """
+        Hook called from template code to get the stride of an arg.
+        Will add needed args to pass it in if it is dynamic.
+        """
+        if name is None:
+            val = self.output_node.get_stride()
+        else:
+            assert isinstance(name, str)
+            val = self.named_input_nodes[name].get_stride()
+        if isinstance(index, int):
+            return texpr(self.rename_indexing(val[index]))
+        else:
+            return ", ".join([texpr(self.rename_indexing(i)) for i in val])
+    def modification(
+        self, subgraph_number: int, output_name: str, **fixed_inputs
+    ) -> str:
+        """This creates a modification function for a subgraph.
+        To use this inside a template, the first argument should specify which subgraph to codegen for
+        Args:
+            subgraph_number (int): The index of the subgraph in self.subgraphs
+            output_name (str): Name the subgraph's result is assigned to in the
+                generated code.
+            **fixed_inputs: Maps a load name to the expression substituted for
+                it; loads of any other name become ``tl.load`` of a kernel input.
+        Returns:
+            The generated code for this modification.
+        """
+        # Pick the first unused body name of the form mod_<subgraph_number>_<num>.
+        num = 0
+        while f"mod_{subgraph_number}_{num}" in self.subgraph_bodies:
+            num += 1
+        with self.create_subgraph_body(f"mod_{subgraph_number}_{num}"):
+            assert isinstance(subgraph_number, int)
+            assert isinstance(self.subgraphs, list)
+            assert (
+                self.body.getvalue() == ""
+            ), "Body should be clear before adding a modification"
+            assert subgraph_number < len(
+                self.subgraphs
+            ), f"Invalid subgraph number provided to create_modification, {subgraph_number} must be < {len(self.subgraphs)}"
+            subgraph = self.subgraphs[subgraph_number]
+            def add_input(name):
+                return self.args.input(name)
+            name = f"PlaceholderSubstitution_{subgraph_number}"
+            class PlaceholderSubstitution(V.WrapperHandler):  # type: ignore[name-defined]
+                # NOTE(review): in a class body ``self`` resolves to the enclosing
+                # method's ``self``, so this assigns ``name`` on the kernel object,
+                # not on the handler class -- confirm this is intended.
+                self.name = name
+                def load(self, name: str, index: sympy.Expr):
+                    if name not in fixed_inputs:
+                        # If it's not a fixed input, it's a load from a captured
+                        # tensor
+                        var = add_input(name)
+                        return f"tl.load({var} + {index})"
+                    return f"({fixed_inputs[name]})"
+                def indirect_indexing(self, index_var, size, check, wrap_neg=True):
+                    return sympy_index_symbol(str(index_var))
+            # Evaluate the subgraph with loads redirected through the handler above.
+            with V.set_ops_handler(PlaceholderSubstitution(V.ops)):
+                assert isinstance(
+                    subgraph, ir.ComputedBuffer
+                ), f"Expected the subgraph to be a ComputedBuffer, got {type(subgraph)}"
+                if isinstance(subgraph.data, ir.InputBuffer):
+                    out = subgraph.data.make_loader()(())
+                else:
+                    out = subgraph.data.inner_fn(())
+            self.codegen_body()
+            self.body.writeline(f"{output_name} = {out.value}")
+            body_val = self.body.getvalue()
+            self.cse.invalidate(set())  # type: ignore[arg-type]
+            return body_val
+    def store_output(
+        self,
+        indices: Union[List[Any], Tuple[Any]],
+        val: str,
+        mask: Optional[str] = None,
+        indent_width: int = 4,
+    ):
+        """Stores the final output and appends any epilogue fusions if the buffer hasn't been optimized away.
+        Args:
+            indices (Union[List, Tuple]): The index for each dimension of the output. The dot product of
+                these indices and output strides must match `val`.
+            val (str): The value to store.
+            mask (Optional[str]): An optional mask to use for the store operation. If provided, this mask
+                will be applied to the store.
+            indent_width (int): The number of spaces to use for indentation. This is used when the call to
+                store_output is indented in the kernel definition.
+        Returns:
+            The ``"<STORE_OUTPUT>"`` placeholder; the registered render hook
+            returns the indented, stripped text of the active body.
+        """
+        with self.create_subgraph_body("<STORE_OUTPUT>"):
+            assert isinstance(indices, (list, tuple))
+            assert isinstance(val, str)
+            assert isinstance(mask, (str, type(None)))
+            assert self.template_mask is None
+            indices = list(map(TritonPrinter.paren, indices))
+            index_symbols = [sympy.Symbol(x, integer=True) for x in indices]
+            lengths = [
+                V.graph.sizevars.simplify(s) for s in self.output_node.get_size()
+            ]
+            assert len(indices) == len(lengths)
+            # glue to make generated code use same indexing from template
+            for name, range_tree_entry in zip(
+                indices, self.range_trees[0].construct_entries(lengths)
+            ):
+                range_tree_entry.set_name(name)
+            # Flat index of the element assuming contiguous strides; emitted as ``xindex``.
+            contiguous_index = sympy_dot(
+                ir.FlexibleLayout.contiguous_strides(lengths), index_symbols
+            )
+            contiguous_index = self.rename_indexing(contiguous_index)
+            self.body.writeline("xindex = " + texpr(contiguous_index))
+            self.range_trees[0].lookup(
+                sympy.Integer(1), sympy_product(lengths)
+            ).set_name("xindex")
+            self.template_mask = mask
+            self.template_out = val
+            self.template_indices = indices
+            output_index = self.output_node.get_layout().make_indexer()(index_symbols)
+            output_index = self.rename_indexing(output_index)
+            # If the output layout's index equals the contiguous one, reuse ``xindex``.
+            if output_index == contiguous_index:
+                output_index = sympy.Symbol("xindex", integer=True)
+            # The epilogue receives ``val`` followed by loads of the prefix and suffix inputs.
+            epilogue_args = [val]
+            for input_node in itertools.chain(
+                self.input_nodes[: self.prefix_args],
+                self.input_nodes[len(self.input_nodes) - self.suffix_args :],
+            ):
+                input_node.freeze_layout()
+                epilogue_args.append(input_node.make_loader()(index_symbols))
+            V.ops.store(
+                self.output_node.get_name(),
+                output_index,
+                self.epilogue_fn(*epilogue_args),
+            )
+            self.codegen_body()
+        def hook():
+            # more stuff might have been added since the codegen_body above
+            self.codegen_body()
+            return textwrap.indent(self.body.getvalue(), " " * indent_width).strip()
+        # store_output may only be used once per render.
+        assert "<STORE_OUTPUT>" not in self.render_hooks
+        self.render_hooks["<STORE_OUTPUT>"] = hook
+        return "<STORE_OUTPUT>"
+    def render(self, template, kwargs):
+        return PartialRender(
+            template.render(**self.template_env(), **kwargs),
+            self.render_hooks,
+        )
+    def make_load(self, name, indices, mask):
+        """
+        Optional helper called from template code to generate the code
+        needed to load from an tensor.
+        """
+        assert isinstance(indices, (list, tuple))
+        assert isinstance(name, str)
+        assert isinstance(mask, str)
+        stride = self.named_input_nodes[name].get_stride()
+        indices = list(map(TritonPrinter.paren, indices))
+        assert len(indices) == len(stride)
+        index = " + ".join(
+            f"{texpr(self.rename_indexing(s))} * {i}" for s, i in zip(stride, indices)
+        )
+        return f"tl.load({name} + ({index}), {mask}, other=0.0)"
+    def template_env(self):
+        """
+        Generate the namespace visible in the template.
+        """
+        return {
+            fn.__name__: fn
+            for fn in [
+                self.def_kernel,
+                self.size,
+                self.stride,
+                self.store_output,
+                self.make_load,
+                self.modification,
+                self.gen_argdefs,
+                self.gen_defines,
+            ]
+        }
+    def indexing(
+        self,
+        index: sympy.Expr,
+        *,
+        dense_indexing=False,
+        copy_shape=None,
+        override_mask=None,
+        block_ptr=False,
+    ):
+        """
+        Override the default indexing to use our custom mask and force
+        dense indexing.
+        """
+        return super().indexing(
+            index,
+            dense_indexing=False,
+            # We pass template_out as the shape to broadcast the indexing to as
+            # the mask might be broadcast to the output shape
+            copy_shape=self.template_out,
+            override_mask=self.template_mask,
+            block_ptr=block_ptr,
+        )
+    def codegen_range_tree(self):
+        pass  # ignore default codegen
+    def call_kernel(self, name: str, node: Optional[ir.IRNode] = None):
+        wrapper = V.graph.wrapper_code
+        _, call_args, _, arg_types = self.args.python_argdefs()
+        if V.graph.cpp_wrapper:
+            # In the cpp_wrapper case, we have to compute CUDA launch grid at runtime
+            # if any dynamic dimension is involved. We rely on the Python version
+            # of the grid function to generate those grid configs, which may contain
+            # symbolic values. The wrapper will use cexpr to print out C++ code
+            # appropriately for the grid configs.
+            grid = self.call_sizes + [self.meta]
+            wrapper.generate_kernel_call(
+                name,
+                call_args,
+                grid=self.grid_fn(*grid),
+                arg_types=arg_types,
+                triton_meta=self.triton_meta,
+            )
+        else:
+            wrapper.add_import_once(f"import {self.grid_fn.__module__}")
+            meta = wrapper.add_meta_once(self.meta)
+            grid = self.call_sizes + [meta]
+            wrapper.generate_kernel_call(
+                name,
+                call_args,
+                grid=grid,
+                grid_fn=f"{self.grid_fn.__module__}.{self.grid_fn.__name__}",
+                arg_types=arg_types,
+                triton_meta=self.triton_meta,
+            )
+@functools.lru_cache(None)
+def _jinja2_env():
+    try:
+        import jinja2
+        return jinja2.Environment(
+            undefined=jinja2.StrictUndefined,
+        )
+    except ImportError:
+        return None
+class TritonTemplate(KernelTemplate):
+    index_counter = itertools.count()
+    all_templates: Dict[str, "TritonTemplate"] = {}
+    def __init__(self, name: str, grid: Any, source: str, debug=False) -> None:
+        super().__init__(name)
+        self.grid = grid
+        self.template = self._template_from_string(source)
+        assert name not in self.all_templates, "duplicate template name"
+        self.all_templates[name] = self
+        self.debug = debug
+    def generate(  # type: ignore[override]
+        self,
+        input_nodes,
+        layout,
+        num_stages,
+        num_warps,
+        prefix_args=0,
+        suffix_args=0,
+        epilogue_fn=identity,
+        subgraphs=None,
+        mutated_inputs=None,
+        call_sizes=None,
+        **kwargs,
+    ):
+        """This function generates a TritonTemplateCaller
+        Args:
+            input_nodes: List of input nodes
+            layout: Output layout
+            num_stages: Number of stages for triton launch
+            num_warps: Number of warps for triton launch
+            prefix_args: Number of input nodes to be passed as arguments
+            suffix_args: Number of input nodes to be passed as arguments
+            epilogue_fn: Optional epilogue function to be called on the output
+            subgraphs: Optional subgraphs to be passed as arguments, these will be inlined
+                into the triton template string
+            mutated_inputs: Optional list of input nodes that are mutated by the kernel, this is helpful
+                if you need to return multiple outputs. You can pass them as inputs and mark them as
+                being mutated by the kernel.
+            call_sizes: Sizes passed to the grid function; defaults to ``layout.size``.
+            **kwargs: Each entry is emitted as a ``tl.constexpr`` define and the
+                whole mapping is passed to the kernel as its ``meta``.
+        Returns:
+            A ``TritonTemplateCaller``, or ``None`` if rendering raised
+            ``ZeroDivisionError``.
+        Raises:
+            NotImplementedError: 32-bit indexing cannot be used for the buffers.
+        """
+        assert self.template, "requires jinja2"
+        # One ``NAME : tl.constexpr = value`` line per keyword argument.
+        defines = StringIO()
+        for name, val in kwargs.items():
+            defines.write(f"{name} : tl.constexpr = {val}\n")
+        defines = defines.getvalue()
+        # Stand-in output buffer used only for this trial render.
+        fake_out = ir.Buffer("buf_out", layout)
+        kernel_name = f"triton_{self.name}"
+        numel = sympy_product(layout.size)
+        buffers = itertools.chain(input_nodes, (fake_out,))
+        if not TritonScheduling.can_use_32bit_indexing(numel, buffers):
+            raise NotImplementedError(
+                "64-bit indexing is not yet implemented for triton templates"
+            )
+        if call_sizes is None:
+            call_sizes = layout.size
+        # Shared by the trial kernel below and the kernels built in make_kernel_render.
+        kernel_options = dict(
+            input_nodes=input_nodes,
+            defines=defines,
+            num_stages=num_stages,
+            num_warps=num_warps,
+            grid_fn=self.grid,
+            meta=kwargs,
+            call_sizes=call_sizes,
+            prefix_args=prefix_args,
+            suffix_args=suffix_args,
+            epilogue_fn=epilogue_fn,
+            index_dtype="tl.int32",
+            subgraphs=subgraphs,
+        )
+        with patch.object(
+            V.graph, "get_dtype", self._fake_get_dtype(fake_out)
+        ), TritonTemplateKernel(
+            kernel_name=kernel_name,
+            output_node=fake_out,
+            use_jit=False,
+            **kernel_options,
+        ) as kernel:
+            try:
+                template = kernel.render(self.template, kwargs)
+                with kernel.set_subgraph_body("<STORE_OUTPUT>"):
+                    code = template.finalize_all()
+            except ZeroDivisionError:
+                # TODO(nmacchioni): fix sympy division by zero
+                return None
+            if self.debug:
+                print("Generated Code:\n", code)
+            # Sorted kwargs plus the launch parameters, joined with "-"; passed
+            # to PyCodeCache.load and reused below as debug text.
+            extra = (
+                "-".join(
+                    [
+                        *[
+                            f"{kwarg}={repr(kwargs[kwarg])}"
+                            for kwarg in sorted(kwargs.keys())
+                        ],
+                        f"num_stages={num_stages}",
+                        f"num_warps={num_warps}",
+                    ]
+                )
+                + "-"
+            )
+            mod = PyCodeCache.load(code, extra)
+        input_call_args = tuple(kernel.args.input_buffers.keys())
+        output_call_args = tuple(kernel.args.output_buffers.keys())
+        # We expect the input_buffer order to be [*input_nodes, *captured_buffers]
+        expected_input_args = tuple(unique(x.get_name() for x in input_nodes))
+        expected_output_args = (fake_out.get_name(),)
+        assert input_call_args[: len(expected_input_args)] == expected_input_args, (
+            input_call_args,
+            expected_input_args,
+        )
+        assert output_call_args == expected_output_args, (
+            output_call_args,
+            expected_output_args,
+        )
+        full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args])
+        extra_args = V.graph.sizevars.size_hints(
+            map(sympy.expand, tuple(kernel.args.sizevars.keys())),
+            fallback=config.unbacked_symint_fallback,
+        )
+        # The class-level counter gives every generated caller a distinct name.
+        kernel_hash_name = f"triton_{self.name}_{next(self.index_counter)}"
+        def make_kernel_render(out_node):
+            kernel = TritonTemplateKernel(
+                kernel_name=str(Placeholder.KERNEL_NAME),
+                output_node=out_node,
+                use_jit=False,
+                **kernel_options,
+            )
+            render = functools.partial(
+                kernel.render,
+                self.template,
+                kwargs,
+            )
+            return kernel, render
+        # create the BenchmarkRequest
+        assert mod.__file__ is not None
+        grid = self.grid(
+            *V.graph.sizevars.size_hints(
+                call_sizes,
+                fallback=config.unbacked_symint_fallback,
+            ),
+            kwargs,
+        )
+        bmreq = TritonBenchmarkRequest(
+            module_path=mod.__file__,
+            module_cache_key=mod.key,
+            kernel_name=kernel_name,
+            grid=grid,
+            extra_args=extra_args,
+            num_stages=num_stages,
+            num_warps=num_warps,
+            matrix_instr_nonkdim=kwargs.get("matrix_instr_nonkdim", 0),
+            input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes),  # type: ignore[arg-type]
+            output_tensor_meta=TensorMeta.from_irnodes(layout),
+        )
+        return TritonTemplateCaller(
+            kernel_hash_name,
+            full_input_nodes,
+            layout,
+            make_kernel_render,
+            extra.strip("-").replace("-", ", "),
+            bmreq,
+            log_info={
+                # Stringified (BLOCK_M, BLOCK_K, BLOCK_N); -1 marks a missing entry.
+                "tile_shape": str(
+                    (
+                        kwargs.get("BLOCK_M", -1),
+                        kwargs.get("BLOCK_K", -1),
+                        kwargs.get("BLOCK_N", -1),
+                    )
+                ),
+                "num_stages": num_stages,
+                "num_warps": num_warps,
+                "allow_tf32": str(kwargs.get("ALLOW_TF32", None)),
+                "acc_type": str(kwargs.get("ACC_TYPE", None)),
+            },
+            mutated_inputs=mutated_inputs,
+        )
+class ExternKernelChoice:
+    def __init__(
+        self,
+        kernel,
+        cpp_kernel=None,
+        *,
+        name=None,
+        has_out_variant=True,
+        op_overload=None,
+        use_fallback_kernel=False,
+        kernel_creator=None,
+    ) -> None:
+        super().__init__()
+        name = name or kernel.__name__
+        assert callable(kernel)
+        assert not hasattr(extern_kernels, name), f"duplicate extern kernel: {name}"
+        self.name = name
+        self.cpp_kernel_name = cpp_kernel
+        self.has_out_variant = has_out_variant
+        setattr(extern_kernels, name, kernel)
+        self.op_overload = op_overload
+        self.use_fallback_kernel = use_fallback_kernel
+        self.kernel_creator = kernel_creator
+    def to_callable(self):
+        return getattr(extern_kernels, self.name)
+    def call_name(self):
+        return f"extern_kernels.{self.name}"
+    @functools.lru_cache(None)  # noqa: B019
+    def hash_key(self):
+        fn = self.to_callable()
+        parts = [
+            self.name,
+            getattr(fn, "__name__", ""),
+            getattr(fn, "__module__", ""),
+        ]
+        try:
+            parts.append(inspect.getsource(fn))
+        except Exception:
+            pass
+        return code_hash("-".join(parts))
+    def bind(
+        self,
+        input_nodes,
+        layout,
+        ordered_kwargs_for_cpp_kernel=(),
+        **kwargs,
+    ):
+        self.ordered_kwargs_for_cpp_kernel = ordered_kwargs_for_cpp_kernel
+        return ExternKernelCaller(
+            self, input_nodes, layout, kwargs, has_out_variant=self.has_out_variant
+        )
+class TritonTemplateCaller(ir.TritonTemplateCallerBase):
+    def __init__(
+        self,
+        name,
+        input_nodes,
+        layout,
+        make_kernel_render,
+        debug_extra,
+        bmreq,
+        log_info: Optional[
+            Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]
+        ] = None,
+        mutated_inputs=None,
+    ) -> None:
+        super().__init__(name, input_nodes, layout)
+        self.make_kernel_render = make_kernel_render
+        self.debug_extra = debug_extra
+        self.bmreq: TritonBenchmarkRequest = bmreq
+        if log_info is None:
+            log_info = {}
+        self.log_info: Dict[str, Any] = log_info
+        self.log_info.update(
+            {
+                "backend": "Triton",
+                "grid": str(self.bmreq.grid),
+                "num_stages": self.bmreq.num_stages,
+                "num_warps": self.bmreq.num_warps,
+            }
+        )
+        self.mutated_inputs = mutated_inputs
+    def benchmark(self, *args, out):
+        assert self.bmreq is not None
+        return self.bmreq.benchmark(*args, output_tensor=out)
+    def precompile(self):
+        assert self.bmreq is not None
+        self.bmreq.precompile()
+    def __str__(self) -> str:
+        return f"TritonTemplateCaller({self.bmreq.module_path}, {self.debug_extra})"
+    def call_name(self):
+        return f"template_kernels.{self.name}"
+    def hash_key(self):
+        return "-".join(
+            [
+                self.name.rsplit("_", 1)[0],
+                self.bmreq.module_cache_key,
+            ]
+        )
+    def output_node(self):
+        return ir.TensorBox.create(
+            ir.TritonTemplateBuffer(
+                layout=self.layout,
+                inputs=self.input_nodes,
+                make_kernel_render=self.make_kernel_render,
+                debug_extra=self.debug_extra,
+                mutated_inputs=self.mutated_inputs,
+            )
+        )
+    def info_dict(self) -> Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]:
+        """Information returned here is logged to the autotune log file when that is enabled."""
+        return self.log_info
+    def get_make_kernel_render(self):
+        return self.make_kernel_render
+    def autoheuristic_id(self):
+        type_name = "triton"
+        info = self.info_dict()
+        # TODO(AlnisM): Does tile_shape always exist?
+        tile = info["tile_shape"]
+        tile_vals = eval(tile)  # type: ignore[arg-type]
+        BLOCK_M = tile_vals[0]
+        BLOCK_K = tile_vals[1]
+        BLOCK_N = tile_vals[2]
+        num_stages = info["num_stages"]
+        num_warps = info["num_warps"]
+        return f"type={type_name}_BLOCK-M={BLOCK_M}_BLOCK-K={BLOCK_K}_BLOCK-N={BLOCK_N}_numstages={num_stages}_numwarps={num_warps}"
+class ExternKernelCaller(ChoiceCaller):
+    def __init__(
+        self,
+        choice: ExternKernelChoice,
+        input_nodes,
+        layout,
+        kwargs=None,
+        *,
+        has_out_variant=True,
+    ) -> None:
+        super().__init__(choice.name, input_nodes, layout)
+        self.choice = choice
+        self.kwargs = kwargs or {}
+        self.has_out_variant = has_out_variant
+    def __str__(self) -> str:
+        return f"ExternKernelCaller({self.choice.call_name()})"
+    def benchmark(self, *args, out):
+        if out.numel() == 0:
+            # no need to run the kerrnel of do benchmarking
+            return 0.0
+        if self.has_out_variant:
+            return super().benchmark(*args, out=out)
+        else:
+            algo = self.to_callable()
+            out_new = algo(*args)
+            torch._C._dynamo.guards.assert_size_stride(
+                out_new, tuple(out.size()), tuple(out.stride())
+            )
+            out.copy_(out_new)  # for correctness checking
+            return benchmarker.benchmark(algo, args, {})
+    def to_callable(self):
+        fn = self.choice.to_callable()
+        if self.kwargs:
+            return functools.partial(fn, **self.kwargs)
+        else:
+            return fn
+    def hash_key(self):
+        return "-".join(
+            [
+                self.choice.name,
+                *[
+                    f"{kwarg}={repr(self.kwargs[kwarg])}"
+                    for kwarg in sorted(self.kwargs.keys())
+                ],
+                self.choice.hash_key(),
+            ]
+        )
+    def output_node(self):
+        if config.abi_compatible and self.choice.use_fallback_kernel:
+            assert (
+                self.choice.op_overload is not None
+            ), "Please provide an op_overload to use ir.FallbackKernel"
+            inner = ir.FallbackKernel.create(
+                self.choice.op_overload, *self.input_nodes, **self.kwargs
+            )
+        elif self.choice.kernel_creator is not None:
+            inner = self.choice.kernel_creator(*self.input_nodes, **self.kwargs)
+        else:
+            cls = ir.ExternKernelOut if self.has_out_variant else ir.ExternKernelAlloc
+            inner = cls(
+                layout=self.layout,
+                inputs=self.input_nodes,
+                python_kernel_name=self.choice.call_name(),
+                cpp_kernel_name=self.choice.cpp_kernel_name,
+                ordered_kwargs_for_cpp_kernel=self.choice.ordered_kwargs_for_cpp_kernel,
+                op_overload=self.choice.op_overload,
+                kwargs=self.kwargs,
+            )
+        return ir.TensorBox.create(inner)
+    def info_dict(self) -> Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]:
+        """Information returned here is logged to the autotune log file when that is enabled."""
+        return {
+            "backend": "extern",
+            "kernel_call_name": self.choice.call_name(),
+        }
+    def autoheuristic_id(self):
+        return f"extern_{self.choice.name}"
+@functools.lru_cache(None)
+def get_mm_log_filename() -> Optional[str]:
+    mm_file_name = os.environ.get("TORCHINDUCTOR_MM_LOGGING_FILE", None)
+    if not mm_file_name:
+        return None
+    if "json" not in mm_file_name:
+        mm_file_name = f"{mm_file_name}.json"
+    return mm_file_name
+def append_to_log(filename, data):
+    lock_file = filename.replace(".json", ".lock")
+    lock = FileLock(lock_file)
+    with lock:
+        try:
+            with open(filename) as f:
+                log_data = json.load(f)
+        except (FileNotFoundError, json.JSONDecodeError):
+            log_data = []
+        log_data.append(data)
+        with open(filename, "w") as f:
+            json.dump(log_data, f, indent=4)
+class DataProcessorChoiceCallerWrapper:
+    def __init__(self, wrapped, preprocessor, postprocessor) -> None:
+        self._wrapped = wrapped
+        if preprocessor is not None:
+            self._preprocessor = preprocessor
+        else:
+            self._preprocessor = lambda x, y: (x, y)
+        if postprocessor is not None:
+            self._postprocessor = postprocessor
+        else:
+            self._postprocessor = lambda x: x
+    def __getattr__(self, name):
+        return getattr(self._wrapped, name)
+    def benchmark(self, *args, out) -> float:
+        new_args, new_out = self._preprocessor(args, out)
+        result = self._wrapped.benchmark(*new_args, out=new_out)
+        new_out = self._postprocessor(new_out)
+        if out is not new_out:
+            out.copy_(new_out)
+        return result
+    def output_node(self) -> ir.TensorBox:
+        result = self._wrapped.output_node()
+        return self._postprocessor(result)
+    def __repr__(self) -> str:
+        return f"DataProcessorChoiceCallerWrapper({self._wrapped})"
+class DataProcessorTemplateWrapper:
+    """
+    A wrapper class for a kernel template.
+    This class together with `DataProcessorChoiceCallerWrapper` provides a convenient way to
+    preprocess and postprocess data before and after using the wrapped template. A typical
+    usage is to reorder or filter the input nodes in order to match the expected input of other
+    kernel choices like a ATen kernel. A more complicated usage is to prepack the weights.
+    See the example from :mod:`cpp_gemm_template` for more details.
+    """
+    def __init__(
+        self,
+        wrapped_template_cls,
+        preprocessor,
+        postprocessor,
+        **kwargs,
+    ) -> None:
+        if preprocessor is not None:
+            self._preprocessor = preprocessor
+        else:
+            self._preprocessor = lambda x, y: (x, y)
+        if postprocessor is not None:
+            self._postprocessor = postprocessor
+        else:
+            self._postprocessor = lambda x: x
+        assert "input_nodes" in kwargs
+        assert "layout" in kwargs
+        kwargs["input_nodes"], kwargs["layout"] = preprocessor(
+            kwargs["input_nodes"], kwargs["layout"]
+        )
+        self._wrapped = wrapped_template_cls(**kwargs)
+    def __getattr__(self, name):
+        return getattr(self._wrapped, name)
+    def maybe_append_choice(self, choices, **kwargs):
+        return type(self._wrapped).maybe_append_choice(self, choices, **kwargs)
+    def generate(self, **kwargs):
+        choice_caller = self._wrapped.generate(**kwargs)
+        return DataProcessorChoiceCallerWrapper(
+            choice_caller, self._preprocessor, self._postprocessor
+        )
+    def __repr__(self) -> str:
+        return f"DataProcessorTemplateWrapper({self._wrapped})"
+class ErrorFromChoice(RuntimeError):
+    def __init__(self, msg, choice: ChoiceCaller, inputs_str) -> None:
+        msg += f"\nFrom choice {choice}\n{inputs_str}"
+        super().__init__(msg)
+        self.choice = choice
+class NoValidChoicesError(RuntimeError):
+    pass
+@functools.lru_cache(None)
+def get_env_num_workers() -> Optional[int]:
+    if "TORCHINDUCTOR_COMPILE_THREADS" in os.environ:
+        return int(os.environ["TORCHINDUCTOR_COMPILE_THREADS"])
+    return None
+def create_inputs_key(input_nodes) -> str:
+    return repr([AlgorithmSelectorCache.key_of(x) for x in input_nodes])
+def create_precompile_key(
+    name: str, inputs_key: str, choices: List[ChoiceCaller]
+) -> str:
+    return ":".join(
+        [
+            name,
+            inputs_key,
+            torch.get_float32_matmul_precision(),
+        ]
+        + [choice.hash_key() for choice in choices]
+    )
+class AlgorithmSelectorCache(PersistentCache):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        # the autotuning will get occur in the scheduler, so there is
+        # no guarantee that the first lowering for a given key will also be the
+        # first to benchmark it. share a single precompilation function for all lowerings
+        # of a particular key
+        self.precompile_cache: Dict[str, Callable[[], None]] = {}
+        # list of callbacks that are called after benchmarking
+        self.feedback_saver_fns: List[
+            Callable[
+                [Dict[ChoiceCaller, float], str, List[Any], List[ChoiceCaller]], None
+            ]
+        ] = []
+    def __call__(
+        self,
+        name,
+        choices: List[ChoiceCaller],
+        input_nodes,
+        layout,
+        # optional dict mapping arg indices to the functions
+        # generating a torch.Tensor for that input from the
+        # corresponding ir.Buffer. if passed for a given
+        # arg, the function will be called instead of
+        # generating a random torch.Tensor for benchmarking.
+        input_gen_fns: Optional[Dict[int, Callable[[ir.Buffer], torch.Tensor]]] = None,
+        precompilation_timeout_seconds: int = 60 * 60,
+        return_multi_template=False,
+    ):
+        from .codegen.cuda.cuda_kernel import CUDATemplateCaller
+        # Templates selected with input_gen_fns require specific input data to avoid IMA
+        # Passing custom input gen fns to benchmark_fusion NYI, so skip deferred template selection
+        # TODO(jgong5): support multi-template on CPU
+        if input_gen_fns is not None or layout.device.type == "cpu":
+            return_multi_template = False
+        # TODO - assert that we have not mutating kernels here
+        # TODO(nmacchioni): remove once CI tests are fixed
+        choices = [choice for choice in choices if choice is not None]
+        if mm_file_name := get_mm_log_filename():
+            M, K = input_nodes[-2].get_size()[:2]
+            N = input_nodes[-1].get_size()[-1]
+            append_to_log(mm_file_name, {"invoke": str((M, K, N))})
+        if len(choices) == 0:
+            backend_config = (
+                "max_autotune_gemm_backends"
+                if name != "convolution"
+                else "max_autotune_conv_backends"
+            )
+            raise NoValidChoicesError(
+                f"No choices to select, please consider adding ATEN into {backend_config} "
+                "config (defined in torch/_inductor/config.py) to allow at least one choice. "
+            )
+        log.debug("Max autotune selects from %s choices.", str(len(choices)))
+        if len(choices) == 1:
+            if not isinstance(choices[0], CUDATemplateCaller):
+                # CUDATemplateCaller still needs to go through autotuning process to retrieve workspace size.
+                return choices[0].output_node()
+        @functools.lru_cache(None)
+        def make_benchmark_fn():
+            return self.make_benchmark_fn(choices, input_nodes, layout, input_gen_fns)
+        inputs_key = create_inputs_key(input_nodes)
+        def precompile(choices) -> Callable[[], None]:
+            def no_op(*args, **kwargs):
+                return
+            if (
+                precompilation_timeout_seconds is None
+                or precompilation_timeout_seconds <= 0
+            ):
+                return no_op
+            env_workers = get_env_num_workers()
+            num_workers = env_workers if env_workers is not None else (len(choices))
+            if num_workers <= 0:
+                return no_op
+            # https://github.com/python/cpython/issues/106905
+            if (
+                sys.version_info.major == 3
+                and sys.version_info.minor == 11
+                and sys.version_info.micro <= 8
+            ):
+                return no_op
+            # check local and global cache before precompiling
+            timings = self.lookup(
+                choices,
+                name,
+                inputs_key,
+                benchmark=None,
+            )
+            if timings:
+                return no_op
+            precompile_key = create_precompile_key(name, inputs_key, choices)
+            if precompile_func := self.precompile_cache.get(precompile_key):
+                return precompile_func
+            log.info(
+                "Multithreaded precompilation for %d choices using %d worker threads",
+                len(choices),
+                num_workers,
+            )
+            # In rare circumstances, because python threads inherit global state,
+            # thread pool executor can race and leave stdout/stderr in a state
+            # different than the original values. we explicitly restore the state
+            # here to avoid this issue.
+            initial_stdout = sys.stdout
+            initial_stderr = sys.stderr
+            def precompile_with_captured_stdout(choice):
+                with restore_stdout_stderr(initial_stdout, initial_stderr):
+                    return choice.precompile()
+            executor = ThreadPoolExecutor(max_workers=num_workers)
+            futures = {}
+            for c in choices:
+                if hasattr(c, "precompile"):
+                    future = executor.submit(precompile_with_captured_stdout, c)
+                    futures[future] = c
+            @functools.lru_cache(None)
+            @restore_stdout_stderr(initial_stdout, initial_stderr)
+            def wait_on_futures():
+                counters["inductor"]["select_algorithm_precompile"] += 1
+                for future in as_completed(
+                    futures,
+                    timeout=precompilation_timeout_seconds,
+                ):
+                    if e := future.exception():
+                        log.error(
+                            "Exception %s for benchmark choice %s", e, futures[future]
+                        )
+                executor.shutdown(wait=True)
+            self.precompile_cache[precompile_key] = wait_on_futures
+            return wait_on_futures
+        def autotune(choices):
+            return make_benchmark_fn()(choices)
+        if config.autotune_in_subproc:
+            from .autotune_process import tuning_pool
+            # do the optional warmup
+            tuning_pool.initialize()
+        def do_autotuning(precompile_fn):
+            precompile_start_ts = time.time()
+            precompile_fn()
+            precompile_elapse = time.time() - precompile_start_ts
+            autotune_start_ts = time.time()
+            timings = self.lookup(
+                choices,
+                name,
+                inputs_key,
+                autotune,
+            )
+            autotune_elapse = time.time() - autotune_start_ts
+            if timings and all(
+                not math.isfinite(timing) for timing in timings.values()
+            ):
+                raise NoValidChoicesError
+            if make_benchmark_fn.cache_info().currsize:
+                counters["inductor"]["select_algorithm_autotune"] += 1
+            if (
+                make_benchmark_fn.cache_info().currsize
+                or log.getEffectiveLevel() == logging.DEBUG
+                or config.trace.log_autotuning_results
+            ):
+                self.log_results(
+                    name, input_nodes, timings, autotune_elapse, precompile_elapse
+                )
+            for feedback_fn in self.feedback_saver_fns:
+                feedback_fn(timings, name, input_nodes, choices)
+            return timings
+        precompile_fn = precompile(choices)
+        if return_multi_template and (config.max_autotune or config.max_autotune_gemm):
+            def get_timings():
+                timings = do_autotuning(precompile_fn)
+                min_extern_choice = float("inf")
+                for choice, timing in timings.items():
+                    if isinstance(choice, ExternKernelCaller):
+                        min_extern_choice = min(min_extern_choice, timing)
+                timings = {
+                    choice: time
+                    for choice, time in timings.items()
+                    if (
+                        time <= min_extern_choice
+                        or not isinstance(choice, ExternKernelCaller)
+                    )
+                }
+                return timings
+            return torch._inductor.ir.TensorBox.create(
+                torch._inductor.ir.MultiTemplateBuffer(
+                    layout,
+                    input_nodes,
+                    get_timings,
+                )
+            )
+        # TODO - dont want to precompile if we have a cache hit
+        timings = do_autotuning(precompile_fn)
+        if timings == {} or choices[0] not in timings:
+            return choices[0].output_node()
+        selected_key = builtins.min(timings, key=timings.__getitem__)
+        selected_time = timings[selected_key]
+        selected_choice = selected_key.output_node()
+        log.debug("selected choice: %s", str(selected_choice))
+        return selected_choice
+    @classmethod
+    def make_benchmark_fn(
+        cls,
+        choices,
+        input_nodes,
+        layout,
+        input_gen_fns=None,
+    ):
+        """Build the function that benchmarks a list of choices.
+        The returned callable takes a list of choices and returns a dict mapping
+        each choice to its timing. It is ``benchmark_in_sub_process`` when
+        ``config.autotune_in_subproc`` is set and ``benchmark_in_current_process``
+        otherwise. ``input_gen_fns`` optionally maps an input index to a function
+        that builds the example tensor for that input; inputs without an entry
+        use ``cls.benchmark_example_value``.
+        """
+        if input_gen_fns is None:
+            input_gen_fns = {}
+        def get_inputs():
+            # de-duplicate args
+            # (keyed by node name, so nodes sharing a name share one example tensor)
+            unique_example_inputs = {
+                x.get_name(): input_gen_fns.get(i, cls.benchmark_example_value)(x)
+                for i, x in enumerate(input_nodes)
+            }
+            example_inputs = list(unique_example_inputs.values())
+            # Per-node views for extern kernels: mkldnn tensors are passed as-is,
+            # everything else is re-strided to the node's size/stride/offset hints.
+            example_inputs_extern = [
+                unique_example_inputs[input_node.get_name()]
+                if unique_example_inputs[input_node.get_name()].is_mkldnn
+                else torch.as_strided(
+                    unique_example_inputs[input_node.get_name()],
+                    V.graph.sizevars.size_hints(
+                        input_node.get_size(),
+                        fallback=config.unbacked_symint_fallback,
+                    ),
+                    V.graph.sizevars.size_hints(
+                        input_node.get_stride(),
+                        fallback=config.unbacked_symint_fallback,
+                    ),
+                    V.graph.sizevars.size_hint(
+                        input_node.get_layout().offset,
+                        fallback=config.unbacked_symint_fallback,
+                    ),
+                )
+                for input_node in input_nodes
+            ]
+            out = cls.benchmark_example_value(layout)
+            out_extern = torch.as_strided(
+                out, out.size(), out.stride(), V.graph.sizevars.size_hint(layout.offset)
+            )
+            expected = None
+            if VERIFY:
+                # The first choice provides the reference output for verification.
+                choices[0].benchmark(*example_inputs_extern, out=out_extern)
+                expected = out_extern.clone()
+            return example_inputs, example_inputs_extern, out, out_extern, expected
+        if DEBUG:
+            print(f"{len(choices)} tuning requests:")
+        # NOTE(review): debug_str is not referenced anywhere else in this method.
+        def debug_str(example_inputs, out):
+            def tensor_repr(x):
+                return (
+                    f"torch.empty_strided({tuple(x.size())!r}, {tuple(x.stride())!r}, "
+                    f"dtype={x.dtype!r}, device={x.device.type!r})"
+                )
+            lines = [
+                "inputs = [",
+            ]
+            for x in example_inputs:
+                lines.append(f"    {tensor_repr(x)},")
+            lines += ["]", f"out = {tensor_repr(out)}", ""]
+            return "\n".join(lines)
+        def benchmark_choice_in_current_process(
+            choice, example_inputs, example_inputs_extern, out, out_extern, expected
+        ):
+            # Clear the output so results from a previous choice cannot leak into this one.
+            out.zero_()
+            if isinstance(choice, ExternKernelCaller):
+                # aten kernels want the offset baked in for sliced tensors
+                result = choice.benchmark(*example_inputs_extern, out=out_extern)
+            else:
+                # triton templates want the base pointer for sliced tensors
+                result = choice.benchmark(*example_inputs, out=out)
+            if VERIFY and expected is not None:
+                torch.testing.assert_close(out_extern, expected, **VERIFY)
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()  # shake out any CUDA errors
+            return result
+        def benchmark_in_current_process(choices):
+            inputs = get_inputs()
+            # NOTE(review): example_inputs and out are unpacked here but not used below.
+            example_inputs, _, out, _, _ = inputs
+            timings = {}
+            for choice in choices:
+                # Failures that only disqualify this choice are logged and recorded
+                # as an infinite timing; anything else propagates.
+                try:
+                    timing = benchmark_choice_in_current_process(choice, *inputs)
+                except CUDACompileError as e:
+                    log.error(
+                        "CUDA compilation error during autotuning: \n%s. \nIgnoring this choice.",
+                        str(e),
+                    )
+                    timing = float("inf")
+                except NotImplementedError as e:
+                    log.warning("Not yet implemented: %s", e)
+                    timing = float("inf")
+                except RuntimeError as e:
+                    msg = str(e)
+                    if "invalid argument" in msg:
+                        msg += "\n\nThis may mean this GPU is too small for max_autotune mode.\n\n"
+                    else:
+                        if "illegal memory access" in msg:
+                            msg += "\n\nEither error in template or triton bug.\n"
+                    log.error(
+                        "Runtime error during autotuning: \n%s. \nIgnoring this choice.",
+                        msg,
+                    )
+                    timing = float("inf")
+                except AssertionError as e:
+                    # Re-raised with the offending choice named in the message.
+                    raise AssertionError(  # noqa: B904
+                        f"Incorrect result from choice {choice}\n\n{e}"
+                    )
+                except Exception as e:
+                    # Triton's OutOfResources is tolerated; it is imported lazily
+                    # so that a missing triton simply re-raises the original error.
+                    try:
+                        from triton.runtime.autotuner import OutOfResources
+                        if isinstance(e, OutOfResources):
+                            log.warning(e)
+                            timing = float("inf")
+                        else:
+                            raise e
+                    except ImportError:
+                        raise e from None
+                timings[choice] = timing
+            return timings
+        def benchmark_in_sub_process(choices):
+            from . import autotune_process
+            # only benchmark triton kernel in sub process for now.
+            # ATen/Extern kernel are still benchmarked in the current process.
+            extern = [c for c in choices if isinstance(c, ExternKernelCaller)]
+            triton = [c for c in choices if not isinstance(c, ExternKernelCaller)]
+            timings = benchmark_in_current_process(extern)
+            timings.update(autotune_process.benchmark_in_sub_process(triton))
+            return timings
+        benchmark = (
+            benchmark_in_sub_process
+            if config.autotune_in_subproc
+            else benchmark_in_current_process
+        )
+        return benchmark
+    @staticmethod
+    def log_results(
+        name: str,
+        input_nodes: List[ir.IRNode],
+        timings: Dict[ChoiceCaller, float],
+        elapse: float,
+        precompile_elapse: float,
+    ):
+        V.debug.log_autotuning_results(
+            name, input_nodes, timings, elapse, precompile_elapse
+        )
+        if not (config.max_autotune or config.max_autotune_gemm) or not PRINT_AUTOTUNE:
+            return
+        sizes = ", ".join(
+            [
+                "x".join(
+                    map(
+                        str,
+                        V.graph.sizevars.size_hints(
+                            n.get_size(), fallback=config.unbacked_symint_fallback
+                        ),
+                    )
+                )
+                for n in input_nodes
+            ]
+        )
+        n = None if log.getEffectiveLevel() == logging.DEBUG else 10
+        top_k = sorted(timings, key=timings.__getitem__)[:n]
+        best = top_k[0]
+        def get_choice_info(choice):
+            if isinstance(choice, torch._inductor.select_algorithm.ExternKernelCaller):
+                return {"type": "cublas", "time": timings[choice]}
+            assert isinstance(
+                choice, torch._inductor.select_algorithm.TritonTemplateCaller
+            )
+            info = choice.info_dict()
+            tile = info["tile_shape"]
+            tile_vals = eval(tile)  # type: ignore[arg-type]
+            BLOCK_M = tile_vals[0]
+            BLOCK_K = tile_vals[1]
+            BLOCK_N = tile_vals[2]
+            return {
+                "type": "triton",
+                "time": timings[choice],
+                "BLOCK_M": BLOCK_M,
+                "BLOCK_K": BLOCK_K,
+                "BLOCK_N": BLOCK_N,
+                "num_stages": info["num_stages"],
+                "num_warps": info["num_warps"],
+            }
+        mm_filename = get_mm_log_filename()
+        if mm_filename and "mm" in name:
+            M, K = input_nodes[-2].get_size()[:2]
+            N = input_nodes[-1].get_size()[-1]
+            out_dict = {
+                str((M, K, N)): [get_choice_info(choice) for choice in timings.keys()]
+            }
+            append_to_log(mm_filename, out_dict)
+        best_time = timings[best]
+        sys.stderr.write(f"AUTOTUNE {name}({sizes})\n")
+        for choice in top_k:
+            result = timings[choice]
+            if result:
+                kernel_info = (
+                    choice.debug_extra if hasattr(choice, "debug_extra") else ""
+                )
+                sys.stderr.write(
+                    f"  {choice.name} {result:.4f} ms {best_time / result:.1%} {kernel_info}\n"
+                )
+            else:
+                sys.stderr.write(
+                    f"  {choice.name} {result:.4f} ms <DIVIDED BY ZERO ERROR>\n"
+                )
+        autotune_type_str = (
+            "SubProcess" if config.autotune_in_subproc else "SingleProcess"
+        )
+        sys.stderr.write(
+            f"{autotune_type_str} AUTOTUNE benchmarking takes {elapse:.4f} seconds and {precompile_elapse:.4f}"
+            " seconds precompiling\n"
+        )
+    @staticmethod
+    def benchmark_example_value(node):
+        """
+        Convert an ir.Buffer into a concrete torch.Tensor we can use for
+        benchmarking.
+        """
+        if isinstance(node, ir.Layout):
+            node = ir.Buffer("fake", node)
+        # triton templates want the base tensor.
+        if isinstance(node, ir.BaseView):
+            node = node.unwrap_view()
+        return AlgorithmSelectorCache.generate_example_value(
+            V.graph.sizevars.size_hints(
+                node.get_size(),
+                fallback=config.unbacked_symint_fallback,
+            ),
+            V.graph.sizevars.size_hints(
+                node.get_stride(),
+                fallback=config.unbacked_symint_fallback,
+            ),
+            node.get_device(),
+            node.get_dtype(),
+            node.layout.offset,
+        )
+    @staticmethod
+    def generate_example_value(size, stride, device, dtype, extra_size):
+        # preserve rng states to avoid the rand_strided call below changes
+        # the rng states for the real model code.
+        with preserve_rng_state():
+            return rand_strided(
+                size,
+                stride,
+                device=device,
+                dtype=dtype,
+                extra_size=extra_size,
+            )
+    @staticmethod
+    def key_of(node):
+        """
+        Extract the pieces of an ir.Buffer that we should invalidate cached
+        autotuning results on.
+        """
+        sizevars = V.graph.sizevars
+        return (
+            node.get_device().type,
+            str(node.get_dtype()),
+            *sizevars.size_hints(
+                node.get_size(),
+                fallback=config.unbacked_symint_fallback,
+            ),
+            *sizevars.size_hints(
+                node.get_stride(),
+                fallback=config.unbacked_symint_fallback,
+            ),
+            sizevars.size_hint(
+                node.get_layout().offset,
+                fallback=config.unbacked_symint_fallback,
+            ),
+        )
+    def add_feedback_saver(
+        self,
+        fn: Callable[
+            [Dict[ChoiceCaller, float], str, List[Any], List[ChoiceCaller]], None
+        ],
+    ):
+        self.feedback_saver_fns.append(fn)
+# Module-level AlgorithmSelectorCache instance; starts as None and is created
+# lazily by autotune_select_algorithm() / add_feedback_saver() on first use.
+_ALGORITHM_SELECTOR_CACHE: Optional[AlgorithmSelectorCache] = None
+def autotune_select_algorithm(*args, **kwargs):
+    global _ALGORITHM_SELECTOR_CACHE
+    if _ALGORITHM_SELECTOR_CACHE is None:
+        _ALGORITHM_SELECTOR_CACHE = AlgorithmSelectorCache()
+    if "return_multi_template" not in kwargs:
+        kwargs[
+            "return_multi_template"
+        ] = torch._inductor.config.benchmark_epilogue_fusion
+    return _ALGORITHM_SELECTOR_CACHE(*args, **kwargs)
+def add_feedback_saver(
+    fn: Callable[[Dict[ChoiceCaller, float], str, List[Any], List[ChoiceCaller]], None]
+):
+    global _ALGORITHM_SELECTOR_CACHE
+    if _ALGORITHM_SELECTOR_CACHE is None:
+        _ALGORITHM_SELECTOR_CACHE = AlgorithmSelectorCache()
+    _ALGORITHM_SELECTOR_CACHE.add_feedback_saver(fn)
+def realize_inputs(*args):
+    if len(args) == 1:
+        return ir.ExternKernel.require_stride1(ir.ExternKernel.realize_input(args[0]))
+    return [realize_inputs(x) for x in args]
+# ensure lowering is imported so that `extern_kernels.*` is populated
+from . import lowering  # noqa: F401