ZAIDX11 commited on
Commit
28780eb
·
verified ·
1 Parent(s): aafbbaf

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. archive/.venv/Lib/site-packages/torch/_C/_cudnn.pyi +14 -0
  2. archive/.venv/Lib/site-packages/torch/_C/_cusparselt.pyi +1 -0
  3. archive/.venv/Lib/site-packages/torch/_C/_distributed_autograd.pyi +26 -0
  4. archive/.venv/Lib/site-packages/torch/_higher_order_ops/executorch_call_delegate.py +175 -0
  5. archive/.venv/Lib/site-packages/torch/_higher_order_ops/flat_apply.py +125 -0
  6. archive/.venv/Lib/site-packages/torch/_higher_order_ops/flex_attention.py +1268 -0
  7. archive/.venv/Lib/site-packages/torch/_higher_order_ops/foreach_map.py +23 -0
  8. archive/.venv/Lib/site-packages/torch/_higher_order_ops/hints_wrap.py +142 -0
  9. archive/.venv/Lib/site-packages/torch/_higher_order_ops/invoke_subgraph.py +658 -0
  10. archive/.venv/Lib/site-packages/torch/_higher_order_ops/map.py +291 -0
  11. archive/.venv/Lib/site-packages/torch/_higher_order_ops/out_dtype.py +163 -0
  12. archive/.venv/Lib/site-packages/torch/_higher_order_ops/run_const_graph.py +60 -0
  13. archive/.venv/Lib/site-packages/torch/_higher_order_ops/scan.py +929 -0
  14. archive/.venv/Lib/site-packages/torch/_higher_order_ops/schema.py +306 -0
  15. archive/.venv/Lib/site-packages/torch/_higher_order_ops/strict_mode.py +108 -0
  16. archive/.venv/Lib/site-packages/torch/_higher_order_ops/torchbind.py +164 -0
  17. archive/.venv/Lib/site-packages/torch/_higher_order_ops/triton_kernel_wrap.py +2051 -0
  18. archive/.venv/Lib/site-packages/torch/_higher_order_ops/utils.py +1134 -0
  19. archive/.venv/Lib/site-packages/torch/_higher_order_ops/while_loop.py +420 -0
  20. archive/.venv/Lib/site-packages/torch/_higher_order_ops/wrap.py +286 -0
archive/.venv/Lib/site-packages/torch/_C/_cudnn.pyi ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import IntEnum
2
+
3
+ # Defined in torch/csrc/cuda/shared/cudnn.cpp
4
+ is_cuda: bool
5
+
6
+ def getRuntimeVersion() -> tuple[int, int, int]: ...
7
+ def getCompileVersion() -> tuple[int, int, int]: ...
8
+ def getVersionInt() -> int: ...
9
+
10
+ class RNNMode(IntEnum):
11
+ rnn_relu = ...
12
+ rnn_tanh = ...
13
+ lstm = ...
14
+ gru = ...
archive/.venv/Lib/site-packages/torch/_C/_cusparselt.pyi ADDED
@@ -0,0 +1 @@
 
 
1
+ def getVersionInt() -> int: ...
archive/.venv/Lib/site-packages/torch/_C/_distributed_autograd.pyi ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ import torch
4
+
5
+ # This module is defined in torch/csrc/distributed/autograd/init.cpp
6
+
7
+ class DistAutogradContext:
8
+ def _context_id(self) -> int: ...
9
+ def _recv_functions(self) -> dict[int, Any]: ...
10
+ def _send_functions(self) -> dict[int, Any]: ...
11
+ def _known_worker_ids(self) -> set[int]: ...
12
+
13
+ def _new_context() -> DistAutogradContext: ...
14
+ def _release_context(context_id: int) -> None: ...
15
+ def _get_max_id() -> int: ...
16
+ def _is_valid_context(worker_id: int) -> bool: ...
17
+ def _retrieve_context(context_id: int) -> DistAutogradContext: ...
18
+ def _current_context() -> DistAutogradContext: ...
19
+ def _init(worker_id: int) -> None: ...
20
+ def _get_debug_info() -> dict[str, str]: ...
21
+ def backward(
22
+ context_id: int,
23
+ roots: list[torch.Tensor],
24
+ retain_graph: bool = False,
25
+ ) -> None: ...
26
+ def get_gradients(context_id: int) -> dict[torch.Tensor, torch.Tensor]: ...
archive/.venv/Lib/site-packages/torch/_higher_order_ops/executorch_call_delegate.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+
3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the BSD-style license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+
9
+ # pyre-strict
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Any, cast
14
+
15
+ import torch
16
+ import torch.utils._pytree as pytree
17
+ from torch._ops import HigherOrderOperator
18
+ from torch._subclasses.fake_tensor import FakeTensorMode
19
+ from torch.fx.experimental.proxy_tensor import (
20
+ disable_proxy_modes_tracing,
21
+ get_proxy_slot,
22
+ ProxyTorchDispatchMode,
23
+ track_tensor_tree,
24
+ )
25
+ from torch.utils._pytree import tree_flatten
26
+
27
+
28
+ class ExecutorchCallDelegate(HigherOrderOperator):
29
+ def __init__(self):
30
+ super().__init__("executorch_call_delegate")
31
+
32
+ def __call__(self, lowered_module, *args):
33
+ return super().__call__(lowered_module, *args)
34
+
35
+
36
+ executorch_call_delegate = ExecutorchCallDelegate()
37
+ executorch_call_delegate.fallthrough(torch._C.DispatchKey.PythonDispatcher)
38
+ executorch_call_delegate.fallthrough(torch._C.DispatchKey.PythonTLSSnapshot)
39
+ executorch_call_delegate.fallthrough(torch._C.DispatchKey.ADInplaceOrView)
40
+ executorch_call_delegate.fallthrough(torch._C.DispatchKey.AutocastCPU)
41
+
42
+ LOWERED_BACKEND_MODULE_TYPE = "LoweredBackendModule"
43
+
44
+
45
+ # pyre-ignore
46
+ def trace_call_delegate(proxy_mode, func_overload, lowered_module, *args):
47
+ # pyre-ignore
48
+ def _unwrap_proxy(e):
49
+ if not isinstance(e, (torch.Tensor, torch.SymInt, torch.SymFloat)):
50
+ return e
51
+ return get_proxy_slot(
52
+ cast(torch.Tensor, e), proxy_mode.tracer, e, lambda e: e.proxy # type: ignore[attr-defined]
53
+ )
54
+
55
+ if not is_lowered_module(lowered_module):
56
+ raise ValueError(
57
+ "executorch_call_delegate()'s first argument must be a LoweredBackendModule"
58
+ )
59
+
60
+ with disable_proxy_modes_tracing():
61
+ out = call_delegate_cpu(lowered_module, *args)
62
+
63
+ get_lowered_module_name(proxy_mode.tracer.root, lowered_module)
64
+
65
+ node_args = (lowered_module, *args)
66
+ proxy_args = pytree.tree_map(_unwrap_proxy, node_args)
67
+ out_proxy = proxy_mode.tracer.create_proxy(
68
+ "call_function", func_overload, proxy_args, {}, name="executorch_call_delegate"
69
+ )
70
+ return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer)
71
+
72
+
73
+ @executorch_call_delegate.py_impl(torch._C.DispatchKey.CompositeExplicitAutograd)
74
+ # pyre-ignore
75
+ def call_delegate_cpu(lowered_module, *args):
76
+ # FX creates this immutable_dict/list concept. Get rid of this.
77
+ map_types: dict[type, type] = {
78
+ torch.fx.immutable_collections.immutable_dict: dict,
79
+ torch.fx.immutable_collections.immutable_list: list,
80
+ }
81
+ new_args = pytree.tree_map_only(
82
+ tuple(map_types.keys()),
83
+ lambda a: map_types[type(a)](a),
84
+ args,
85
+ lambda a: isinstance(a, tuple(map_types.keys())),
86
+ )
87
+ return lowered_module.original_module.module()(*new_args)
88
+
89
+
90
+ @executorch_call_delegate.py_autograd_impl
91
+ # pyre-ignore
92
+ def call_delegate_autograd(lowered_module, *args):
93
+ # TODO: support autograd
94
+ flat_operands, _ = tree_flatten([lowered_module, *args])
95
+ requires_grad = any(
96
+ f.requires_grad for f in flat_operands if isinstance(f, torch.Tensor)
97
+ )
98
+
99
+ with torch._C._ExcludeDispatchKeyGuard(
100
+ torch._C.DispatchKeySet(torch._C.DispatchKey.AutogradCPU)
101
+ ):
102
+ res = executorch_call_delegate(lowered_module, *args)
103
+
104
+ if requires_grad:
105
+ # Create aliases of the output that has requires_grad=True. We need
106
+ # at least one of the inputs to err_fn to require grad so that the
107
+ # output will have a grad_fn.
108
+
109
+ # pyre-ignore
110
+ def fake_requires_grad(var):
111
+ if var is not None:
112
+ var = var.detach()
113
+ if torch.is_floating_point(var) or torch.is_complex(var):
114
+ var.requires_grad = True
115
+ return var
116
+
117
+ return pytree.tree_map_only(torch.Tensor, fake_requires_grad, res)
118
+
119
+ return res
120
+
121
+
122
+ @executorch_call_delegate.py_impl(ProxyTorchDispatchMode)
123
+ # pyre-ignore
124
+ def call_delegate_proxy_torch_dispatch_mode(mode, lowered_module, *args):
125
+ res = trace_call_delegate(mode, executorch_call_delegate, lowered_module, *args)
126
+ return res
127
+
128
+
129
+ @executorch_call_delegate.py_impl(FakeTensorMode)
130
+ # pyre-ignore
131
+ def call_delegate_fake_tensor_mode(mode, lowered_module, *args):
132
+ with mode:
133
+ return call_delegate_cpu(lowered_module, *args)
134
+
135
+
136
+ @executorch_call_delegate.py_functionalize_impl
137
+ # pyre-ignore
138
+ def call_delegate_functionalize(ctx, lowered_module, *args):
139
+ unwrapped_args = tuple(ctx.unwrap_tensors(arg) for arg in args)
140
+ with ctx.redispatch_to_next():
141
+ res = executorch_call_delegate(lowered_module, *unwrapped_args)
142
+ return ctx.wrap_tensors(res)
143
+
144
+
145
+ # pyre-ignore: Missing parameter annotation [2]: Parameter `obj` must have a type other than `Any`.Pyre
146
+ def is_lowered_module(obj: Any) -> bool:
147
+ """
148
+ This function is added to avoid using isinstance(obj,
149
+ LoweredBackendModule) as it will import LoweredBackendModule, which may
150
+ cause a circular import.
151
+ """
152
+ return type(obj).__name__ == LOWERED_BACKEND_MODULE_TYPE
153
+
154
+
155
+ def get_lowered_module_name(
156
+ root: torch.nn.Module,
157
+ # pyre-ignore: Undefined or invalid type [11]: Annotation `LoweredBackendModule` is not defined as a type.
158
+ lowered_module: LOWERED_BACKEND_MODULE_TYPE, # type: ignore[valid-type]
159
+ ) -> str:
160
+ """
161
+ Adds the given lowered_module into the given root module and returns the
162
+ name of the module added.
163
+ """
164
+ # Find a qualifying name for the lowered submodule
165
+ qualname = None
166
+ i = 0
167
+ while True:
168
+ qualname = f"lowered_module_{i}"
169
+ if not hasattr(root, qualname):
170
+ break
171
+ i += 1
172
+ assert qualname is not None
173
+
174
+ root.add_module(qualname, lowered_module)
175
+ return qualname
archive/.venv/Lib/site-packages/torch/_higher_order_ops/flat_apply.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ from dataclasses import dataclass
3
+ from typing import Callable
4
+
5
+ import torch
6
+ import torch.fx.node
7
+ import torch.utils._pytree as pytree
8
+ from torch._ops import HigherOrderOperator
9
+
10
+
11
+ def is_graphable(val) -> bool:
12
+ """Definition: a graphable type is a type that that is an acceptable input/output type to a FX node."""
13
+ return isinstance(val, torch.fx.node.base_types)
14
+
15
+
16
+ def is_graphable_type(typ) -> bool:
17
+ """Return whether the given type is graphable"""
18
+ return issubclass(typ, torch.fx.node.base_types)
19
+
20
+
21
+ def to_graphable(stuff):
22
+ """Flattens stuff into a flat list of graphable types."""
23
+ # We can consider preserving things like List[int] to improve
24
+ # perf and readability (right now that is all flattened out)
25
+ flat_args, spec = pytree.tree_flatten(stuff)
26
+ for arg in flat_args:
27
+ if not is_graphable(arg):
28
+ raise RuntimeError(
29
+ f"Expected all pytree.tree_leaves of (args, kwargs) to be graphable types, but found "
30
+ f"non-fx-graphable type {type(arg)}. If this type is meant to be constant, mark it as "
31
+ f"via pytree.register_constant; otherwise, register it as a pytree."
32
+ )
33
+ return flat_args, spec
34
+
35
+
36
+ def from_graphable(flat_args, spec):
37
+ """The inverse of to_graphable."""
38
+ stuff = pytree.tree_unflatten(flat_args, spec)
39
+ return stuff
40
+
41
+
42
+ def func_to_graphable(func):
43
+ """
44
+ Pack and flatten a function type into graphable types.
45
+ This is useful for legalizing the function argument of `flat_apply`.
46
+ """
47
+ return pytree.tree_flatten(_ConstantFunction(func))
48
+
49
+
50
+ @dataclass(frozen=True)
51
+ class _ConstantFunction:
52
+ func: Callable
53
+
54
+ def __call__(self, *args, **kwargs):
55
+ return self.func(*args, **kwargs)
56
+
57
+
58
+ pytree.register_constant(_ConstantFunction)
59
+
60
+ _op_types = (
61
+ torch._ops.OpOverload,
62
+ torch._ops.OpOverloadPacket,
63
+ torch._ops.HigherOrderOperator,
64
+ )
65
+
66
+
67
+ class FlatApply(HigherOrderOperator):
68
+ def __init__(self) -> None:
69
+ super().__init__("flat_apply")
70
+
71
+ def __call__(self, func, in_spec, *flat_args, **_unused):
72
+ """
73
+ Functions that take in non-graphable types cannot directly be put into FX graph.
74
+
75
+ Given func(*args, **kwargs), if all of the non-graphable types are pytrees,
76
+ then we're able to store a call to flat_apply(func, in_spec, *flat_args) in the FX graph.
77
+
78
+ The semantics of flat_apply(func, in_spec, *flat_args) are roughly equivalent to:
79
+
80
+ >>> def flat_apply_impl(func, in_spec, *flat_args):
81
+ >>> args, kwargs = pytree.tree_unflatten(flat_args, in_spec)
82
+ >>> output = func(*args, **kwargs)
83
+ >>> return output
84
+
85
+ flat_apply supports the following two cases:
86
+ - an input type is a container type (e.g. of tensors) registered as a pytree.
87
+ We'll tree_flatten the input type and store the spec.
88
+ - an input type is a constant type (i.e. torch.compile will specialize on it)
89
+ registered with pytree.register_constant. The constant type goes directly
90
+ into the spec.
91
+
92
+ """
93
+ assert isinstance(func, _op_types) or pytree._is_constant_holder(func)
94
+ assert len(_unused) == 0
95
+ return impl(func, in_spec, *flat_args)
96
+
97
+
98
+ def impl(func, in_spec, *flat_args):
99
+ if not isinstance(func, _op_types):
100
+ # assume _ConstantFunction
101
+ func = pytree._retrieve_constant(func)
102
+ assert isinstance(func, _ConstantFunction)
103
+
104
+ args, kwargs = from_graphable(flat_args, in_spec)
105
+ out = func(*args, **kwargs)
106
+
107
+ # Right now, all outputs must either be graphable or lists/tuples of graphables.
108
+ #
109
+ # TODO: The following can be updated to support non-graphable outputs and pytrees.
110
+ # For non-graphable constant outputs: the assumption would be that they are constant
111
+ # (everytime the function runs those MUST be the same)
112
+ # For pytree outputs:
113
+ # I'm not sure if we need to return (flat_output, spec) or just (flat_output,):
114
+ # in the latter case the tracers need to carry out the output specs
115
+ # (they need to know how to reconstruct the object from just the flat_output).
116
+ def is_valid_output(x):
117
+ if isinstance(x, (tuple, list)):
118
+ return all(map(is_valid_output, x))
119
+ return is_graphable(x)
120
+
121
+ assert is_valid_output(out)
122
+ return out
123
+
124
+
125
+ flat_apply = FlatApply()
archive/.venv/Lib/site-packages/torch/_higher_order_ops/flex_attention.py ADDED
@@ -0,0 +1,1268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from collections.abc import Sequence
3
+ from typing import Any, Callable, Optional, Union
4
+
5
+ import torch
6
+ import torch.utils._pytree as pytree
7
+ from torch import Tensor
8
+ from torch._C import DispatchKey
9
+ from torch._higher_order_ops.utils import (
10
+ _has_potential_branch_input_mutation,
11
+ _maybe_reenter_make_fx,
12
+ autograd_not_implemented,
13
+ has_user_subclass,
14
+ redirect_to_mode,
15
+ reenter_make_fx,
16
+ register_fake,
17
+ save_tensors_and_symints_for_backward,
18
+ saved_tensors_and_symints,
19
+ UnsupportedAliasMutationException,
20
+ validate_subgraph_args_types,
21
+ )
22
+ from torch._ops import HigherOrderOperator
23
+ from torch._subclasses import FakeTensor
24
+ from torch._subclasses.functional_tensor import FunctionalTensor
25
+ from torch.fx.experimental.proxy_tensor import (
26
+ make_fx,
27
+ ProxyTorchDispatchMode,
28
+ track_tensor_tree,
29
+ )
30
+ from torch.fx.graph_module import GraphModule
31
+ from torch.utils.checkpoint import _CachedTorchDispatchMode, _CachingTorchDispatchMode
32
+
33
+
34
+ # Duplicate of _inductor/kernel/flex_attention.py to avoid circular import
35
+ def _construct_strides(
36
+ sizes: Sequence[int],
37
+ fill_order: Sequence[int],
38
+ ) -> Sequence[int]:
39
+ """From a list of sizes and a fill order, construct the strides of the permuted tensor."""
40
+ # Initialize strides
41
+ assert len(sizes) == len(
42
+ fill_order
43
+ ), "Length of sizes must match the length of the fill order"
44
+ strides = [0] * len(sizes)
45
+
46
+ # Start with stride 1 for the innermost dimension
47
+ current_stride = 1
48
+
49
+ # Iterate through the fill order populating strides
50
+ for dim in fill_order:
51
+ strides[dim] = current_stride
52
+ current_stride *= sizes[dim]
53
+
54
+ return strides
55
+
56
+
57
+ def _permute_strides(out: torch.Tensor, query_strides: tuple[int, ...]) -> torch.Tensor:
58
+ """
59
+ Create a new tensor with the same data and shape as the input,
60
+ but with strides permuted based on the input tensor's stride order.
61
+
62
+ Args:
63
+ out (torch.Tensor): The output tensor of attention.
64
+ query_strides (List[int]): The stride order of the input query tensor
65
+
66
+ Returns:
67
+ torch.Tensor: A new tensor with same shape and data as the input,
68
+ but with strides permuted based on the query tensor's stride order.
69
+ """
70
+ from torch._inductor.ir import get_fill_order
71
+
72
+ fill_order = get_fill_order(query_strides)
73
+ assert out.storage_offset() == 0, "Only support storage_offset == 0"
74
+ out_strides = _construct_strides(out.shape, fill_order)
75
+ new_out = out.new_empty(out.shape).as_strided(out.shape, out_strides)
76
+ new_out.copy_(out)
77
+ return new_out
78
+
79
+
80
+ class FlexAttentionHOP(HigherOrderOperator):
81
+ def __init__(self) -> None:
82
+ super().__init__("flex_attention", cacheable=True)
83
+
84
+ def __call__(
85
+ self,
86
+ query: torch.Tensor,
87
+ key: torch.Tensor,
88
+ value: torch.Tensor,
89
+ score_mod: Callable,
90
+ block_mask: tuple,
91
+ scale: float,
92
+ kernel_options: dict[str, Any],
93
+ score_mod_other_buffers: tuple = (),
94
+ mask_mod_other_buffers: tuple = (),
95
+ ) -> tuple[torch.Tensor, torch.Tensor]:
96
+ validate_subgraph_args_types(score_mod_other_buffers + mask_mod_other_buffers)
97
+ return super().__call__(
98
+ query,
99
+ key,
100
+ value,
101
+ score_mod,
102
+ block_mask,
103
+ scale,
104
+ kernel_options,
105
+ score_mod_other_buffers,
106
+ mask_mod_other_buffers,
107
+ )
108
+
109
+
110
+ flex_attention = FlexAttentionHOP()
111
+
112
+
113
+ class FlexAttentionBackwardHOP(HigherOrderOperator):
114
+ def __init__(self) -> None:
115
+ super().__init__("flex_attention_backward")
116
+
117
+ def __call__(
118
+ self,
119
+ query: torch.Tensor,
120
+ key: torch.Tensor,
121
+ value: torch.Tensor,
122
+ out: torch.Tensor,
123
+ logsumexp: torch.Tensor,
124
+ grad_out: torch.Tensor,
125
+ grad_logsumexp: torch.Tensor,
126
+ fw_graph: Union[Callable, GraphModule],
127
+ joint_graph: GraphModule,
128
+ block_mask: tuple,
129
+ scale: float,
130
+ kernel_options: dict[str, Any],
131
+ score_mod_other_buffers: tuple = (),
132
+ mask_mod_other_buffers: tuple = (),
133
+ ) -> tuple[
134
+ torch.Tensor, torch.Tensor, torch.Tensor, tuple[Optional[torch.Tensor], ...]
135
+ ]:
136
+ validate_subgraph_args_types(score_mod_other_buffers + mask_mod_other_buffers)
137
+ return super().__call__(
138
+ query,
139
+ key,
140
+ value,
141
+ out,
142
+ logsumexp,
143
+ grad_out,
144
+ grad_logsumexp,
145
+ fw_graph,
146
+ joint_graph,
147
+ block_mask,
148
+ scale,
149
+ kernel_options,
150
+ score_mod_other_buffers,
151
+ mask_mod_other_buffers,
152
+ )
153
+
154
+
155
+ flex_attention_backward = FlexAttentionBackwardHOP()
156
+
157
+
158
+ def _math_attention_inner(
159
+ query: torch.Tensor,
160
+ key: torch.Tensor,
161
+ value: torch.Tensor,
162
+ score_mod: Callable,
163
+ block_mask: tuple,
164
+ scale: float,
165
+ kernel_options: dict[str, Any],
166
+ score_mod_other_buffers: tuple = (),
167
+ mask_mod_other_buffers: tuple = (),
168
+ ) -> tuple[torch.Tensor, torch.Tensor]:
169
+ from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
170
+
171
+ working_precision = torch.float64 if query.dtype == torch.float64 else torch.float32
172
+
173
+ scores = (query @ key.transpose(-2, -1)).to(dtype=working_precision)
174
+
175
+ b = torch.arange(0, scores.size(0), device=scores.device)
176
+ h = torch.arange(0, scores.size(1), device=scores.device)
177
+ m = torch.arange(0, scores.size(2), device=scores.device)
178
+ n = torch.arange(0, scores.size(3), device=scores.device)
179
+
180
+ captured_buffers_in_dim = (None,) * len(score_mod_other_buffers)
181
+ from torch.nn.attention.flex_attention import _vmap_for_bhqkv
182
+
183
+ # first input is score
184
+ score_mod = _vmap_for_bhqkv(score_mod, prefix=(0,), suffix=captured_buffers_in_dim)
185
+
186
+ mask_mod = block_mask[-1]
187
+ mask_mod_in_dim_buffers = (None,) * len(mask_mod_other_buffers)
188
+ mask_mod = _vmap_for_bhqkv(mask_mod, prefix=(), suffix=mask_mod_in_dim_buffers)
189
+
190
+ with TransformGetItemToIndex():
191
+ scores = (scores * scale).to(working_precision)
192
+ post_mod_scores = torch.where(
193
+ mask_mod(b, h, m, n, *mask_mod_other_buffers),
194
+ score_mod(scores, b, h, m, n, *score_mod_other_buffers),
195
+ torch.tensor(-float("inf"), dtype=working_precision, device=scores.device),
196
+ )
197
+
198
+ return scores, post_mod_scores
199
+
200
+
201
+ def math_attention(
202
+ query: torch.Tensor,
203
+ key: torch.Tensor,
204
+ value: torch.Tensor,
205
+ score_mod: Callable,
206
+ block_mask: tuple,
207
+ scale: float,
208
+ kernel_options: dict[str, Any],
209
+ score_mod_other_buffers: tuple = (),
210
+ mask_mod_other_buffers: tuple = (),
211
+ ) -> tuple[torch.Tensor, torch.Tensor]:
212
+ """Eager implementation
213
+
214
+ This implementation uses vmap to vectorize the score_mod function over the batch, head, m, and n dimensions.
215
+ We then apply the vectorized score_mod function to the scores matrix. Each wrap of vmap applies one of the
216
+ batch, head, m, or n dimensions. We need to apply vmap 4 times to vectorized over all 4 dimensions.
217
+
218
+ Args:
219
+ query: The query tensor
220
+ key: The key tensor
221
+ value: The value tensor
222
+ score_mod: The score_mod function
223
+ other_buffers: Other buffers that are passed to the score_mod function
224
+ """
225
+ # broadcast query & key along head dim for GQA
226
+ G = query.size(1) // key.size(1)
227
+ value = torch.repeat_interleave(value, G, dim=1)
228
+ key = torch.repeat_interleave(key, G, dim=1)
229
+
230
+ Bq, Bkv = query.size(0), key.size(0)
231
+ if not ((Bq == Bkv) or (Bq > 1 and Bkv == 1)):
232
+ raise RuntimeError(f"Bq and Bkv must broadcast. Got Bq={Bq} and Bkv={Bkv}")
233
+
234
+ key = key.expand((Bq, *key.size()[1:]))
235
+ value = value.expand((Bq, *value.size()[1:]))
236
+
237
+ _, post_mod_scores = _math_attention_inner(
238
+ query,
239
+ key,
240
+ value,
241
+ score_mod,
242
+ block_mask,
243
+ scale,
244
+ kernel_options,
245
+ score_mod_other_buffers,
246
+ mask_mod_other_buffers,
247
+ )
248
+
249
+ # Set fully masked rows' sumexp to 0.0
250
+ logsumexp = post_mod_scores.logsumexp(dim=-1)
251
+ masked_rows = torch.all(post_mod_scores == -float("inf"), dim=-1)
252
+ logsumexp = torch.where(masked_rows, -float("inf"), logsumexp)
253
+
254
+ post_mod_scores = torch._safe_softmax(post_mod_scores, dim=-1)
255
+
256
+ return post_mod_scores.to(query.dtype) @ value, logsumexp / math.log(2)
257
+
258
+
259
+ @flex_attention.py_impl(DispatchKey.CompositeExplicitAutograd)
260
+ def sdpa_dense(
261
+ query: torch.Tensor,
262
+ key: torch.Tensor,
263
+ value: torch.Tensor,
264
+ score_mod: Callable,
265
+ block_mask: tuple,
266
+ scale: float,
267
+ kernel_options: dict[str, Any],
268
+ score_mod_other_buffers: tuple = (),
269
+ mask_mod_other_buffers: tuple = (),
270
+ ) -> tuple[torch.Tensor, torch.Tensor]:
271
+ out, lse = math_attention(
272
+ query,
273
+ key,
274
+ value,
275
+ score_mod,
276
+ block_mask,
277
+ scale,
278
+ kernel_options,
279
+ score_mod_other_buffers,
280
+ mask_mod_other_buffers,
281
+ )
282
+ out = _permute_strides(out, query.stride())
283
+ return out, lse
284
+
285
+
286
+ def trace_flex_attention(
287
+ proxy_mode: ProxyTorchDispatchMode,
288
+ query: torch.Tensor,
289
+ key: torch.Tensor,
290
+ value: torch.Tensor,
291
+ score_mod: Callable,
292
+ block_mask: tuple,
293
+ scale: float,
294
+ kernel_options: dict[str, Any],
295
+ score_mod_other_buffers: tuple = (),
296
+ mask_mod_other_buffers: tuple = (),
297
+ ) -> tuple[torch.Tensor, torch.Tensor]:
298
+ """Traces the flex_attention operator with the given score_mod function and other_buffers.
299
+
300
+ Trace SDPA will call make_fx with "fake" example vals and then trace the score_mod function
301
+ This will produce a GraphModule that will be stored on the root tracer as "sdpa_score". We
302
+ access this graph module in inductor to inline the score_mod function to the triton template.
303
+ """
304
+ from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
305
+
306
+ example_out = flex_attention(
307
+ query,
308
+ key,
309
+ value,
310
+ score_mod,
311
+ block_mask,
312
+ scale,
313
+ kernel_options,
314
+ score_mod_other_buffers,
315
+ mask_mod_other_buffers,
316
+ )
317
+ example_vals = [query.new_zeros((), requires_grad=query.requires_grad)] + [
318
+ query.new_zeros((), dtype=torch.int) for _ in range(4)
319
+ ]
320
+ mask_example_vals = [query.new_zeros((), dtype=torch.int) for _ in range(4)]
321
+ mask_mod = block_mask[-1]
322
+ with TransformGetItemToIndex():
323
+ score_graph = reenter_make_fx(score_mod)(
324
+ *example_vals, *score_mod_other_buffers
325
+ )
326
+ mask_graph = reenter_make_fx(mask_mod)(
327
+ *mask_example_vals, *mask_mod_other_buffers
328
+ )
329
+ assert isinstance(proxy_mode.tracer, torch.fx.Tracer)
330
+ block_mask = block_mask[:-1] + (mask_graph,)
331
+ qualname = proxy_mode.tracer.get_fresh_qualname("sdpa_score")
332
+ proxy_mode.tracer.root.register_module(qualname, score_graph)
333
+ mask_qualname = proxy_mode.tracer.get_fresh_qualname("sdpa_mask")
334
+ proxy_mode.tracer.root.register_module(mask_qualname, mask_graph)
335
+ node_args = (
336
+ query,
337
+ key,
338
+ value,
339
+ score_graph,
340
+ block_mask,
341
+ scale,
342
+ kernel_options,
343
+ score_mod_other_buffers,
344
+ mask_mod_other_buffers,
345
+ )
346
+ proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, node_args)
347
+ out_proxy = proxy_mode.tracer.create_proxy(
348
+ "call_function", flex_attention, proxy_args, {}
349
+ )
350
+ return track_tensor_tree(
351
+ example_out, out_proxy, constant=None, tracer=proxy_mode.tracer
352
+ )
353
+
354
+
355
+ @flex_attention.py_impl(ProxyTorchDispatchMode)
356
+ def flex_attention_proxy_torch_dispatch_mode(
357
+ mode: ProxyTorchDispatchMode,
358
+ query: torch.Tensor,
359
+ key: torch.Tensor,
360
+ value: torch.Tensor,
361
+ score_mod: Callable,
362
+ block_mask: tuple,
363
+ scale: float,
364
+ kernel_options: dict[str, Any],
365
+ score_mod_other_buffers: tuple = (),
366
+ mask_mod_other_buffers: tuple = (),
367
+ ) -> tuple[torch.Tensor, torch.Tensor]:
368
+ assert mode is not None, "Mode should always be enabled for python fallback key"
369
+ return trace_flex_attention(
370
+ mode,
371
+ query,
372
+ key,
373
+ value,
374
+ score_mod,
375
+ block_mask,
376
+ scale,
377
+ kernel_options,
378
+ score_mod_other_buffers,
379
+ mask_mod_other_buffers,
380
+ )
381
+
382
+
383
+ @flex_attention.py_functionalize_impl
384
+ def flex_attention_functionalize(
385
+ ctx: torch._subclasses.functional_tensor.BaseFunctionalizeAPI,
386
+ query: torch.Tensor,
387
+ key: torch.Tensor,
388
+ value: torch.Tensor,
389
+ score_mod: Callable,
390
+ block_mask: tuple,
391
+ scale: float,
392
+ kernel_options: dict[str, Any],
393
+ score_mod_other_buffers: tuple = (),
394
+ mask_mod_other_buffers: tuple = (),
395
+ ) -> tuple[torch.Tensor, torch.Tensor]:
396
+ """Defines the functionalization rules for the flex_attention operator.
397
+
398
+ Write now we are unwrapping each tensor and then redispatching to the next, however we want to
399
+ guard against any mutations in the score_mod function, to the other_buffers since those
400
+ are free variables.
401
+ """
402
+ from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
403
+
404
+ if has_user_subclass(
405
+ (
406
+ query,
407
+ key,
408
+ value,
409
+ score_mod,
410
+ block_mask,
411
+ scale,
412
+ kernel_options,
413
+ score_mod_other_buffers,
414
+ mask_mod_other_buffers,
415
+ ),
416
+ allowed_subclasses=(FakeTensor, FunctionalTensor),
417
+ ):
418
+ return NotImplemented
419
+
420
+ query_unwrapped = ctx.unwrap_tensors(query)
421
+ key_unwrapped = ctx.unwrap_tensors(key)
422
+ value_unwrapped = ctx.unwrap_tensors(value)
423
+ block_mask_unwrapped = ctx.unwrap_tensors(block_mask)
424
+ score_mod_other_buffers_unwrapped = ctx.unwrap_tensors(score_mod_other_buffers)
425
+ mask_mod_other_buffers_unwrapped = ctx.unwrap_tensors(mask_mod_other_buffers)
426
+
427
+ # Appease the mypy overlords
428
+ assert isinstance(query_unwrapped, torch.Tensor)
429
+ assert isinstance(key_unwrapped, torch.Tensor)
430
+ assert isinstance(value_unwrapped, torch.Tensor)
431
+ assert isinstance(block_mask_unwrapped, tuple)
432
+ assert isinstance(score_mod_other_buffers_unwrapped, tuple)
433
+ assert isinstance(mask_mod_other_buffers_unwrapped, tuple)
434
+
435
+ example_vals = (
436
+ [query_unwrapped.new_zeros(())]
437
+ + [query_unwrapped.new_zeros((), dtype=torch.int) for _ in range(4)]
438
+ + list(score_mod_other_buffers_unwrapped)
439
+ )
440
+ with ctx.redispatch_to_next():
441
+ functional_score_mod = ctx.functionalize(score_mod)
442
+ pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch
443
+ with TransformGetItemToIndex():
444
+ # TODO: So far only the input mutations are checked
445
+ # In the other HOPs, also aliases are checked which is
446
+ # omitted here
447
+ mutates = _has_potential_branch_input_mutation(
448
+ score_mod, example_vals, pre_dispatch
449
+ )
450
+ # The only care about mutations of existing buffers since we can't replay these.
451
+ # However, we can just error if anything is detected
452
+ if mutates:
453
+ raise UnsupportedAliasMutationException("Mutations detected in score_mod")
454
+
455
+ out = flex_attention(
456
+ query_unwrapped,
457
+ key_unwrapped,
458
+ value_unwrapped,
459
+ functional_score_mod,
460
+ block_mask_unwrapped,
461
+ scale,
462
+ kernel_options,
463
+ score_mod_other_buffers_unwrapped,
464
+ mask_mod_other_buffers_unwrapped,
465
+ )
466
+ return ctx.wrap_tensors(out) # type: ignore[return-value, arg-type]
467
+
468
+
469
+ @register_fake(flex_attention)
470
+ def flex_attention_fake_impl(
471
+ query: torch.Tensor,
472
+ key: torch.Tensor,
473
+ value: torch.Tensor,
474
+ score_mod: Callable,
475
+ block_mask: tuple,
476
+ scale: float,
477
+ kernel_options: dict[str, Any],
478
+ score_mod_other_buffers: tuple = (),
479
+ mask_mod_other_buffers: tuple = (),
480
+ ) -> tuple[torch.Tensor, torch.Tensor]:
481
+ if has_user_subclass(
482
+ (
483
+ query,
484
+ key,
485
+ value,
486
+ score_mod,
487
+ block_mask,
488
+ scale,
489
+ kernel_options,
490
+ score_mod_other_buffers,
491
+ mask_mod_other_buffers,
492
+ ),
493
+ allowed_subclasses=(FakeTensor,),
494
+ ):
495
+ return NotImplemented
496
+
497
+ # TODO: Figure out a better way to handle this for NJT than using sum()
498
+ if query.is_nested:
499
+ out = torch.empty_like(query, memory_format=torch.contiguous_format)
500
+ logsumexp = query.sum(dim=-1)
501
+ return out, logsumexp
502
+
503
+ v_head_dim = value.size(-1)
504
+ batch_size, num_heads, seq_len_q, _q_head_dim = query.shape
505
+ logsumexp = query.new_empty(batch_size, num_heads, seq_len_q, dtype=torch.float32)
506
+ out_shape = (batch_size, num_heads, seq_len_q, v_head_dim)
507
+ out = query.new_empty(out_shape)
508
+ out = _permute_strides(out, query.stride())
509
+ return out, logsumexp
510
+
511
+
512
+ # Registers dispatches for SAC
513
+ redirect_to_mode(flex_attention, _CachingTorchDispatchMode)
514
+ redirect_to_mode(flex_attention, _CachedTorchDispatchMode)
515
+
516
+
517
+ # ---------------------------- Autograd Implementation ----------------------------
518
+ def create_fw_bw_graph(
519
+ score_mod: Callable,
520
+ index_values: tuple[Tensor, Tensor, Tensor, Tensor, Tensor],
521
+ other_buffers: tuple[Tensor, ...],
522
+ ) -> tuple[Callable, Callable]:
523
+ # See Note:[HOP create fw_bw graph]
524
+
525
+ # All of these imports need to be here in order to avoid circular dependencies
526
+ from torch._dispatch.python import suspend_functionalization
527
+ from torch._functorch.aot_autograd import AOTConfig, create_joint
528
+ from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
529
+ from torch._subclasses.functional_tensor import disable_functional_mode
530
+ from torch.fx.experimental.proxy_tensor import disable_proxy_modes_tracing
531
+
532
+ dummy_aot_config = AOTConfig(
533
+ fw_compiler=None, # type: ignore[arg-type]
534
+ bw_compiler=None, # type: ignore[arg-type]
535
+ partition_fn=None, # type: ignore[arg-type]
536
+ decompositions={},
537
+ num_params_buffers=0,
538
+ aot_id=0,
539
+ keep_inference_input_mutations=False,
540
+ )
541
+
542
+ with suspend_functionalization(), disable_functional_mode():
543
+ with disable_proxy_modes_tracing():
544
+
545
+ def _from_fun(
546
+ t: Union[Tensor, torch.SymInt, int],
547
+ ) -> Union[Tensor, torch.SymInt, int]:
548
+ if isinstance(t, torch.Tensor):
549
+ return torch.empty_strided(
550
+ t.size(),
551
+ t.stride(),
552
+ device=t.device,
553
+ dtype=t.dtype,
554
+ requires_grad=t.requires_grad,
555
+ )
556
+ return t
557
+
558
+ # If someone runs this hop under the default compiler backend ("eager")
559
+ # Then this path will be run with the actual user inputs. We convert them
560
+ # to fake tensors in order to not perform any actual compute.
561
+ from torch._guards import detect_fake_mode
562
+
563
+ fake_mode = detect_fake_mode(index_values)
564
+ if fake_mode is None:
565
+ fake_mode = FakeTensorMode(allow_non_fake_inputs=True)
566
+
567
+ with fake_mode:
568
+ unwrapped_score_mod_indexes = pytree.tree_map(_from_fun, index_values)
569
+ unwrapped_other_buffers = pytree.tree_map(_from_fun, other_buffers)
570
+
571
+ assert all(
572
+ isinstance(t, (FakeTensor, int, torch.SymInt))
573
+ for t in unwrapped_score_mod_indexes + unwrapped_other_buffers
574
+ )
575
+
576
+ example_flat_out = pytree.tree_map(
577
+ _from_fun,
578
+ score_mod(*unwrapped_score_mod_indexes, *unwrapped_other_buffers),
579
+ )
580
+ if not isinstance(example_flat_out, torch.Tensor):
581
+ raise RuntimeError(
582
+ "Expected output of score_mod to be a tensor."
583
+ f"Got type {type(example_flat_out)}."
584
+ )
585
+ example_grad = _from_fun(example_flat_out)
586
+
587
+ def joint_f(
588
+ score: Tensor,
589
+ b: Tensor,
590
+ h: Tensor,
591
+ m: Tensor,
592
+ n: Tensor,
593
+ example_grad: Tensor,
594
+ *other_buffers: tuple[Tensor, ...],
595
+ ) -> tuple[Tensor, ...]:
596
+ def fw_with_masks(
597
+ *args: tuple[Tensor, ...]
598
+ ) -> tuple[tuple[Tensor], tuple[bool]]:
599
+ fw_out = score_mod(*args)
600
+ out_requires_grad = fw_out.requires_grad
601
+ return ((fw_out,), (out_requires_grad,))
602
+
603
+ joint = create_joint(fw_with_masks, aot_config=dummy_aot_config)
604
+ args = [score, b, h, m, n] + list(other_buffers)
605
+ optional_grad = [example_grad] if example_grad.requires_grad else []
606
+ _, grads = joint(args, optional_grad)
607
+
608
+ return grads
609
+
610
+ joint_graph = make_fx(joint_f)(
611
+ *unwrapped_score_mod_indexes, example_grad, *unwrapped_other_buffers
612
+ )
613
+ return score_mod, joint_graph
614
+
615
+
616
+ class FlexAttentionAutogradOp(torch.autograd.Function):
617
+ @staticmethod
618
+ def forward(
619
+ ctx: Any,
620
+ query: Tensor,
621
+ key: Tensor,
622
+ value: Tensor,
623
+ fw_graph: Callable,
624
+ joint_graph: Callable,
625
+ block_mask: tuple[Any, ...],
626
+ scale: float,
627
+ kernel_options: dict[str, Any],
628
+ mask_mod_other_buffers: tuple[Any, ...],
629
+ *score_mod_other_buffers: tuple[Any, ...],
630
+ ) -> tuple[torch.Tensor, torch.Tensor]:
631
+ any_buffer_requires_grad = any(
632
+ buffer.requires_grad
633
+ for buffer in mask_mod_other_buffers
634
+ if isinstance(buffer, torch.Tensor)
635
+ )
636
+ assert (
637
+ not any_buffer_requires_grad
638
+ ), "Captured buffers from mask mod that require grad are not supported."
639
+ ctx._fw_graph = fw_graph
640
+ ctx._joint_graph = joint_graph
641
+ ctx._mask_graph = block_mask[-1]
642
+ ctx.scale = scale
643
+ ctx.kernel_options = kernel_options
644
+ ctx._score_mod_other_buffers_len = len(score_mod_other_buffers)
645
+ with torch._C._AutoDispatchBelowAutograd():
646
+ out, logsumexp = flex_attention(
647
+ query,
648
+ key,
649
+ value,
650
+ fw_graph,
651
+ block_mask,
652
+ scale,
653
+ kernel_options,
654
+ score_mod_other_buffers,
655
+ mask_mod_other_buffers,
656
+ )
657
+
658
+ save_tensors_and_symints_for_backward(
659
+ ctx,
660
+ (
661
+ query,
662
+ key,
663
+ value,
664
+ out,
665
+ logsumexp,
666
+ *block_mask[:-1],
667
+ *score_mod_other_buffers,
668
+ *mask_mod_other_buffers,
669
+ ),
670
+ )
671
+ return out, logsumexp
672
+
673
+ @staticmethod
674
+ def backward(ctx: Any, grad_out: Tensor, grad_logsumexp: Tensor) -> tuple[Optional[Tensor], ...]: # type: ignore[override]
675
+ fw_args = saved_tensors_and_symints(ctx)
676
+ (
677
+ query,
678
+ key,
679
+ value,
680
+ out,
681
+ logsumexp,
682
+ query_lengths,
683
+ kv_lengths,
684
+ kv_num_blocks,
685
+ kv_indices,
686
+ full_kv_num_blocks,
687
+ full_kv_indices,
688
+ q_num_blocks,
689
+ q_indices,
690
+ full_q_num_blocks,
691
+ full_q_indices,
692
+ Q_BLOCK_SIZE,
693
+ KV_BLOCK_SIZE,
694
+ *other_buffers,
695
+ ) = fw_args
696
+ fw_graph = ctx._fw_graph
697
+ joint_graph = ctx._joint_graph
698
+ mask_graph = ctx._mask_graph
699
+ scale = ctx.scale
700
+ kernel_options = ctx.kernel_options
701
+ score_mod_other_buffers = tuple(
702
+ other_buffers[: ctx._score_mod_other_buffers_len]
703
+ )
704
+ mask_mod_other_buffers = tuple(
705
+ other_buffers[ctx._score_mod_other_buffers_len :]
706
+ )
707
+ # We have asserted that mask_mod_other_buffers do not require grad,
708
+ # but score_mod_other_buffers can require grad.
709
+ none_grads = [None] * 6
710
+ (
711
+ grad_query,
712
+ grad_key,
713
+ grad_value,
714
+ grad_score_mod_captured,
715
+ ) = flex_attention_backward(
716
+ query,
717
+ key,
718
+ value,
719
+ out,
720
+ logsumexp,
721
+ grad_out,
722
+ grad_logsumexp,
723
+ fw_graph,
724
+ joint_graph,
725
+ (
726
+ query_lengths,
727
+ kv_lengths,
728
+ kv_num_blocks,
729
+ kv_indices,
730
+ full_kv_num_blocks,
731
+ full_kv_indices,
732
+ q_num_blocks,
733
+ q_indices,
734
+ full_q_num_blocks,
735
+ full_q_indices,
736
+ Q_BLOCK_SIZE,
737
+ KV_BLOCK_SIZE,
738
+ mask_graph,
739
+ ),
740
+ scale,
741
+ kernel_options,
742
+ score_mod_other_buffers,
743
+ mask_mod_other_buffers,
744
+ )
745
+ return grad_query, grad_key, grad_value, *none_grads, *grad_score_mod_captured
746
+
747
+
748
+ # TODO: Rework DispatchKey.Autograd to py_autograd_impl
749
+ @flex_attention.py_impl(DispatchKey.Autograd)
750
+ def flex_attention_autograd(
751
+ query: torch.Tensor,
752
+ key: torch.Tensor,
753
+ value: torch.Tensor,
754
+ score_mod: Callable,
755
+ block_mask: tuple,
756
+ scale: float,
757
+ kernel_options: dict[str, Any],
758
+ score_mod_other_buffers: tuple[Tensor, ...] = (),
759
+ mask_mod_other_buffers: tuple[Tensor, ...] = (),
760
+ ) -> tuple[torch.Tensor, torch.Tensor]:
761
+ from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
762
+
763
+ with TransformGetItemToIndex():
764
+ input_requires_grad = any(
765
+ isinstance(t, torch.Tensor) and t.requires_grad
766
+ for t in (query, key, value, *score_mod_other_buffers)
767
+ )
768
+ if torch.is_grad_enabled() and input_requires_grad:
769
+ example_vals = (
770
+ query.new_zeros((), requires_grad=input_requires_grad),
771
+ query.new_zeros((), dtype=torch.int),
772
+ query.new_zeros((), dtype=torch.int),
773
+ query.new_zeros((), dtype=torch.int),
774
+ query.new_zeros((), dtype=torch.int),
775
+ )
776
+ fw_graph, bw_graph = create_fw_bw_graph(
777
+ score_mod, example_vals, score_mod_other_buffers
778
+ )
779
+ else:
780
+ fw_graph, bw_graph = score_mod, None
781
+ out, logsumexp = FlexAttentionAutogradOp.apply(
782
+ query,
783
+ key,
784
+ value,
785
+ fw_graph,
786
+ bw_graph,
787
+ block_mask,
788
+ scale,
789
+ kernel_options,
790
+ mask_mod_other_buffers,
791
+ *score_mod_other_buffers,
792
+ )
793
+ return out, logsumexp
794
+
795
+
796
+ # ---------------------------- Backward HOP Implementation ----------------------------
797
+
798
+
799
+ @flex_attention_backward.py_impl(DispatchKey.CompositeExplicitAutograd)
800
+ def sdpa_dense_backward(
801
+ query: torch.Tensor,
802
+ key: torch.Tensor,
803
+ value: torch.Tensor,
804
+ out: torch.Tensor,
805
+ logsumexp: torch.Tensor,
806
+ grad_out: torch.Tensor,
807
+ grad_logsumexp: torch.Tensor,
808
+ fw_graph: Callable, # GraphModule type hint?
809
+ joint_graph: Callable,
810
+ block_mask: tuple,
811
+ scale: float,
812
+ kernel_options: dict[str, Any],
813
+ score_mod_other_buffers: tuple,
814
+ mask_mod_other_buffers: tuple,
815
+ ) -> tuple[
816
+ torch.Tensor, torch.Tensor, torch.Tensor, tuple[Optional[torch.Tensor], ...]
817
+ ]:
818
+ from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
819
+
820
+ Bq, Hq, seq_len_q, qk_head_dim = query.shape
821
+ Bkv, Hkv, seq_len_kv, v_head_dim = value.shape
822
+
823
+ # Get outputs before calling repeat interleave and permute to input stride orders
824
+ actual_grad_query = query.new_empty((Bq, Hq, seq_len_q, qk_head_dim))
825
+ actual_grad_query = _permute_strides(actual_grad_query, query.stride())
826
+
827
+ actual_grad_key = key.new_empty((Bq, Hkv, seq_len_kv, qk_head_dim))
828
+ actual_grad_key = _permute_strides(actual_grad_key, key.stride())
829
+
830
+ actual_grad_value = value.new_empty((Bq, Hkv, seq_len_kv, v_head_dim))
831
+ actual_grad_value = _permute_strides(actual_grad_value, value.stride())
832
+
833
+ def _maybe_new_buffer(
834
+ buffer: Union[torch.Tensor, torch.SymInt, int],
835
+ ) -> Optional[Union[torch.Tensor, torch.SymInt, int]]:
836
+ if isinstance(buffer, torch.Tensor):
837
+ return (
838
+ torch.empty_like(buffer, memory_format=torch.contiguous_format)
839
+ if buffer.requires_grad
840
+ else None
841
+ )
842
+ return buffer
843
+
844
+ actual_grad_score_mod_captured = [
845
+ _maybe_new_buffer(buffer) for buffer in score_mod_other_buffers
846
+ ]
847
+
848
+ Bq, Bkv = query.size(0), key.size(0)
849
+ if not ((Bq == Bkv) or (Bq > 1 and Bkv == 1)):
850
+ raise RuntimeError(f"Bq and Bkv must broadcast. Got Bq={Bq} and Bkv={Bkv}")
851
+
852
+ key = key.expand((Bq, *key.size()[1:]))
853
+ value = value.expand((Bq, *value.size()[1:]))
854
+
855
+ G = query.size(1) // key.size(1)
856
+ key = torch.repeat_interleave(key, G, dim=1)
857
+ value = torch.repeat_interleave(value, G, dim=1)
858
+
859
+ # We're undoing the log -> log2 change of base in the forwards
860
+ logsumexp = logsumexp * math.log(2)
861
+ # The backwards formula for the log -> log2 change of base in the forwards
862
+ grad_logsumexp = grad_logsumexp / math.log(2)
863
+ scores, post_mod_scores = _math_attention_inner(
864
+ query,
865
+ key,
866
+ value,
867
+ fw_graph,
868
+ block_mask,
869
+ scale,
870
+ kernel_options,
871
+ score_mod_other_buffers,
872
+ mask_mod_other_buffers,
873
+ )
874
+ masked_out_rows = logsumexp == -float("inf")
875
+ softmax_scores = torch.exp(post_mod_scores - logsumexp.unsqueeze(-1))
876
+ softmax_scores = torch.where(masked_out_rows.unsqueeze(-1), 0, softmax_scores)
877
+
878
+ grad_value = softmax_scores.to(query.dtype).transpose(-2, -1) @ grad_out
879
+
880
+ grad_softmax_scores = grad_out @ value.transpose(-2, -1)
881
+
882
+ sum_scores = torch.sum(out * grad_out, -1, keepdim=True)
883
+ grad_score_mod = softmax_scores * (
884
+ grad_softmax_scores - sum_scores + grad_logsumexp.unsqueeze(-1)
885
+ )
886
+
887
+ b = torch.arange(0, scores.size(0), device=scores.device)
888
+ h = torch.arange(0, scores.size(1), device=scores.device)
889
+ m = torch.arange(0, scores.size(2), device=scores.device)
890
+ n = torch.arange(0, scores.size(3), device=scores.device)
891
+
892
+ mask_graph = block_mask[-1]
893
+ # Gradient of the inline score_mod function, with respect to the scores
894
+ captured_buffers_in_dim = (None,) * len(score_mod_other_buffers)
895
+ out_dims = [0, None, None, None, None] + [None] * len(score_mod_other_buffers)
896
+ from torch.nn.attention.flex_attention import _vmap_for_bhqkv
897
+
898
+ # inputs are [score, b, h, q_idx, kv_idx, gradOut, ...]
899
+ # score and gradOut are "fully" batched
900
+ joint_score_mod = _vmap_for_bhqkv(
901
+ joint_graph,
902
+ prefix=(0,),
903
+ suffix=(0,) + captured_buffers_in_dim,
904
+ out_dims=out_dims,
905
+ )
906
+ with TransformGetItemToIndex():
907
+ grad_scores, _, _, _, _, *grad_score_mod_captured = joint_score_mod(
908
+ scores, b, h, m, n, grad_score_mod, *score_mod_other_buffers
909
+ )
910
+ grad_scores = grad_scores * scale
911
+ grad_scores = grad_scores.to(query.dtype)
912
+
913
+ mask_mod = _vmap_for_bhqkv(
914
+ mask_graph, prefix=(), suffix=(None,) * len(mask_mod_other_buffers)
915
+ )
916
+ with TransformGetItemToIndex():
917
+ mask_scores = mask_mod(b, h, m, n, *mask_mod_other_buffers)
918
+ grad_scores = torch.where(
919
+ mask_scores, grad_scores, torch.tensor(0, dtype=query.dtype)
920
+ )
921
+
922
+ grad_query = grad_scores @ key
923
+ grad_key = grad_scores.transpose(-2, -1) @ query
924
+
925
+ # Reduce DK, DV along broadcasted heads.
926
+ grad_key = grad_key.view(
927
+ grad_key.size(0), -1, G, grad_key.size(-2), grad_key.size(-1)
928
+ )
929
+ grad_value = grad_value.view(
930
+ grad_value.size(0), -1, G, grad_value.size(-2), grad_value.size(-1)
931
+ )
932
+
933
+ grad_key = torch.sum(grad_key, 2, keepdim=False)
934
+ grad_value = torch.sum(grad_value, 2, keepdim=False)
935
+
936
+ # Fill to correctly strided outputs
937
+ actual_grad_query.copy_(grad_query)
938
+ actual_grad_key.copy_(grad_key)
939
+ actual_grad_value.copy_(grad_value)
940
+
941
+ if Bq != Bkv:
942
+ assert (
943
+ Bq > 1 and Bkv == 1
944
+ ), f"Bq and Bkv must broadcast. Got Bq={Bq} and Bkv={Bkv}"
945
+
946
+ actual_grad_key = torch.sum(actual_grad_key, 0, keepdim=True)
947
+ actual_grad_value = torch.sum(actual_grad_value, 0, keepdim=True)
948
+
949
+ score_mod_other_buffer_grads = [
950
+ actual_grad.copy_(grad) if isinstance(actual_grad, torch.Tensor) else None
951
+ for actual_grad, grad in zip(
952
+ actual_grad_score_mod_captured, grad_score_mod_captured
953
+ )
954
+ ]
955
+
956
+ return (
957
+ actual_grad_query,
958
+ actual_grad_key,
959
+ actual_grad_value,
960
+ tuple(score_mod_other_buffer_grads),
961
+ )
962
+
963
+
964
+ def trace_flex_attention_backward(
965
+ proxy_mode: ProxyTorchDispatchMode,
966
+ query: torch.Tensor,
967
+ key: torch.Tensor,
968
+ value: torch.Tensor,
969
+ out: torch.Tensor,
970
+ logsumexp: torch.Tensor,
971
+ grad_out: torch.Tensor,
972
+ grad_logsumexp: torch.Tensor,
973
+ fw_graph: Union[Callable, GraphModule],
974
+ joint_graph: GraphModule,
975
+ block_mask: tuple,
976
+ scale: float,
977
+ kernel_options: dict[str, Any],
978
+ score_mod_other_buffers: tuple = (),
979
+ mask_mod_other_buffers: tuple = (),
980
+ ) -> tuple[
981
+ torch.Tensor, torch.Tensor, torch.Tensor, tuple[Optional[torch.Tensor], ...]
982
+ ]:
983
+ """We already have the forward graph and joint graph from the forward pass, so we create a proxy attach both graphs"""
984
+ from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
985
+
986
+ example_out = flex_attention_backward(
987
+ query,
988
+ key,
989
+ value,
990
+ out,
991
+ logsumexp,
992
+ grad_out,
993
+ grad_logsumexp,
994
+ fw_graph,
995
+ joint_graph,
996
+ block_mask,
997
+ scale,
998
+ kernel_options,
999
+ score_mod_other_buffers,
1000
+ mask_mod_other_buffers,
1001
+ )
1002
+
1003
+ requires_grad = any(pytree.tree_map(lambda x: x.requires_grad, (query, key)))
1004
+ fw_example_vals = [query.new_zeros((), requires_grad=requires_grad)] + [
1005
+ query.new_zeros((), dtype=torch.int) for _ in range(4)
1006
+ ]
1007
+ bw_example_vals = fw_example_vals + [query.new_zeros(())]
1008
+ mask_example_vals = [query.new_zeros((), dtype=torch.int) for _ in range(4)]
1009
+ mask_graph = block_mask[-1]
1010
+ with TransformGetItemToIndex():
1011
+ # There's no active make_fx during the compiled autograd graph's initial capture
1012
+ fw_graph = _maybe_reenter_make_fx(fw_graph)(
1013
+ *fw_example_vals, *score_mod_other_buffers
1014
+ )
1015
+ joint_graph = _maybe_reenter_make_fx(joint_graph)(
1016
+ *bw_example_vals, *score_mod_other_buffers
1017
+ )
1018
+ mask_graph = _maybe_reenter_make_fx(mask_graph)(
1019
+ *mask_example_vals, *mask_mod_other_buffers
1020
+ )
1021
+ assert isinstance(proxy_mode.tracer, torch.fx.Tracer)
1022
+ block_mask = block_mask[:-1] + (mask_graph,)
1023
+
1024
+ qualname = proxy_mode.tracer.get_fresh_qualname("fw_graph")
1025
+ proxy_mode.tracer.root.register_module(qualname, fw_graph) # type: ignore[arg-type]
1026
+ qualname = proxy_mode.tracer.get_fresh_qualname("joint_graph")
1027
+ proxy_mode.tracer.root.register_module(qualname, joint_graph)
1028
+ qualname = proxy_mode.tracer.get_fresh_qualname("mask_graph")
1029
+ proxy_mode.tracer.root.register_module(qualname, mask_graph)
1030
+
1031
+ node_args = (
1032
+ query,
1033
+ key,
1034
+ value,
1035
+ out,
1036
+ logsumexp,
1037
+ grad_out,
1038
+ grad_logsumexp,
1039
+ fw_graph,
1040
+ joint_graph,
1041
+ block_mask,
1042
+ scale,
1043
+ kernel_options,
1044
+ score_mod_other_buffers,
1045
+ mask_mod_other_buffers,
1046
+ )
1047
+ proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, node_args)
1048
+ out_proxy = proxy_mode.tracer.create_proxy(
1049
+ "call_function",
1050
+ flex_attention_backward,
1051
+ proxy_args,
1052
+ {},
1053
+ name="flex_attention_backward",
1054
+ )
1055
+ return track_tensor_tree(
1056
+ example_out, out_proxy, constant=None, tracer=proxy_mode.tracer
1057
+ )
1058
+
1059
+
1060
+ @flex_attention_backward.py_impl(ProxyTorchDispatchMode)
1061
+ def flex_attention_backward_proxy_torch_dispatch_mode(
1062
+ mode: ProxyTorchDispatchMode,
1063
+ query: torch.Tensor,
1064
+ key: torch.Tensor,
1065
+ value: torch.Tensor,
1066
+ out: torch.Tensor,
1067
+ logsumexp: torch.Tensor,
1068
+ grad_out: torch.Tensor,
1069
+ grad_logsumexp: torch.Tensor,
1070
+ fw_graph: Union[Callable, GraphModule],
1071
+ joint_graph: GraphModule,
1072
+ block_mask: tuple,
1073
+ scale: float,
1074
+ kernel_options: dict[str, Any],
1075
+ score_mod_other_buffers: tuple = (),
1076
+ mask_mod_other_buffers: tuple = (),
1077
+ ) -> tuple[
1078
+ torch.Tensor, torch.Tensor, torch.Tensor, tuple[Optional[torch.Tensor], ...]
1079
+ ]:
1080
+ assert mode is not None, "Mode should always be enabled for python fallback key"
1081
+ return trace_flex_attention_backward(
1082
+ mode,
1083
+ query,
1084
+ key,
1085
+ value,
1086
+ out,
1087
+ logsumexp,
1088
+ grad_out,
1089
+ grad_logsumexp,
1090
+ fw_graph,
1091
+ joint_graph,
1092
+ block_mask,
1093
+ scale,
1094
+ kernel_options,
1095
+ score_mod_other_buffers,
1096
+ mask_mod_other_buffers,
1097
+ )
1098
+
1099
+
1100
+ @flex_attention_backward.py_functionalize_impl
1101
+ def flex_attention_backward_functionalize(
1102
+ ctx: torch._subclasses.functional_tensor.BaseFunctionalizeAPI,
1103
+ query: torch.Tensor,
1104
+ key: torch.Tensor,
1105
+ value: torch.Tensor,
1106
+ out: torch.Tensor,
1107
+ logsumexp: torch.Tensor,
1108
+ grad_out: torch.Tensor,
1109
+ grad_logsumexp: torch.Tensor,
1110
+ fw_graph: Union[Callable, GraphModule],
1111
+ joint_graph: GraphModule,
1112
+ block_mask: tuple,
1113
+ scale: float,
1114
+ kernel_options: dict[str, Any],
1115
+ score_mod_other_buffers: tuple = (),
1116
+ mask_mod_other_buffers: tuple = (),
1117
+ ) -> tuple[
1118
+ torch.Tensor, torch.Tensor, torch.Tensor, tuple[Optional[torch.Tensor], ...]
1119
+ ]:
1120
+ """Defines the functionalization rules for the flex_attention operator.
1121
+
1122
+ Write now we are unwrapping each tensor and then redispatching to the next,
1123
+ since we know that the forward score mod function is assured to be free of mutations
1124
+ to the other_buffers, we skip that mutate check and go straight to redispatching.
1125
+ """
1126
+
1127
+ if has_user_subclass(
1128
+ (
1129
+ query,
1130
+ key,
1131
+ value,
1132
+ out,
1133
+ logsumexp,
1134
+ grad_out,
1135
+ grad_logsumexp,
1136
+ block_mask,
1137
+ scale,
1138
+ kernel_options,
1139
+ score_mod_other_buffers,
1140
+ mask_mod_other_buffers,
1141
+ ),
1142
+ allowed_subclasses=(FakeTensor, FunctionalTensor),
1143
+ ):
1144
+ return NotImplemented
1145
+ query_unwrapped = ctx.unwrap_tensors(query)
1146
+ key_unwrapped = ctx.unwrap_tensors(key)
1147
+ value_unwrapped = ctx.unwrap_tensors(value)
1148
+ out_unwrapped = ctx.unwrap_tensors(out)
1149
+ logsumexp_unwrapped = ctx.unwrap_tensors(logsumexp)
1150
+ grad_out_unwrapped = ctx.unwrap_tensors(grad_out)
1151
+ grad_logsumexp_unwrapped = ctx.unwrap_tensors(grad_logsumexp)
1152
+ block_mask_unwrapped = ctx.unwrap_tensors(block_mask)
1153
+ score_mod_other_buffers_unwrapped = ctx.unwrap_tensors(score_mod_other_buffers)
1154
+ mask_mod_other_buffers_unwrapped = ctx.unwrap_tensors(mask_mod_other_buffers)
1155
+
1156
+ # Appease the mypy overlords
1157
+ assert isinstance(query_unwrapped, torch.Tensor)
1158
+ assert isinstance(key_unwrapped, torch.Tensor)
1159
+ assert isinstance(value_unwrapped, torch.Tensor)
1160
+ assert isinstance(out_unwrapped, torch.Tensor)
1161
+ assert isinstance(logsumexp_unwrapped, torch.Tensor)
1162
+ assert isinstance(grad_out_unwrapped, torch.Tensor)
1163
+ assert isinstance(grad_logsumexp_unwrapped, torch.Tensor)
1164
+ assert isinstance(block_mask_unwrapped, tuple)
1165
+ assert isinstance(score_mod_other_buffers_unwrapped, tuple)
1166
+ assert isinstance(mask_mod_other_buffers_unwrapped, tuple)
1167
+
1168
+ with ctx.redispatch_to_next():
1169
+ functional_fw_graph = ctx.functionalize(fw_graph)
1170
+ functional_joint_graph = ctx.functionalize(joint_graph)
1171
+
1172
+ (
1173
+ grad_query,
1174
+ grad_key,
1175
+ grad_value,
1176
+ grad_score_mod_captured,
1177
+ ) = flex_attention_backward(
1178
+ query_unwrapped,
1179
+ key_unwrapped,
1180
+ value_unwrapped,
1181
+ out_unwrapped,
1182
+ logsumexp_unwrapped,
1183
+ grad_out_unwrapped,
1184
+ grad_logsumexp_unwrapped,
1185
+ functional_fw_graph, # type: ignore[arg-type]
1186
+ functional_joint_graph, # type: ignore[arg-type]
1187
+ block_mask_unwrapped,
1188
+ scale,
1189
+ kernel_options,
1190
+ score_mod_other_buffers_unwrapped,
1191
+ mask_mod_other_buffers_unwrapped,
1192
+ )
1193
+
1194
+ return ctx.wrap_tensors((grad_query, grad_key, grad_value, grad_score_mod_captured)) # type: ignore[return-value,arg-type]
1195
+
1196
+
1197
+ @register_fake(flex_attention_backward)
1198
+ def flex_attention_backward_fake_tensor_mode(
1199
+ query: torch.Tensor,
1200
+ key: torch.Tensor,
1201
+ value: torch.Tensor,
1202
+ out: torch.Tensor,
1203
+ logsumexp: torch.Tensor,
1204
+ grad_out: torch.Tensor,
1205
+ grad_logsumexp: torch.Tensor,
1206
+ fw_graph: Union[Callable, GraphModule],
1207
+ joint_graph: GraphModule,
1208
+ block_mask: tuple,
1209
+ scale: float,
1210
+ kernel_options: dict[str, Any],
1211
+ score_mod_other_buffers: tuple = (),
1212
+ mask_mod_other_buffers: tuple = (),
1213
+ ) -> tuple[
1214
+ torch.Tensor, torch.Tensor, torch.Tensor, tuple[Optional[torch.Tensor], ...]
1215
+ ]:
1216
+ if has_user_subclass(
1217
+ (
1218
+ query,
1219
+ key,
1220
+ value,
1221
+ out,
1222
+ logsumexp,
1223
+ grad_out,
1224
+ grad_logsumexp,
1225
+ block_mask,
1226
+ scale,
1227
+ kernel_options,
1228
+ score_mod_other_buffers,
1229
+ mask_mod_other_buffers,
1230
+ ),
1231
+ allowed_subclasses=(FakeTensor,),
1232
+ ):
1233
+ return NotImplemented
1234
+ Bq, _, _, qk_head_dim = query.shape
1235
+ Bkv, Hkv, seq_len_kv, v_head_dim = value.shape
1236
+
1237
+ grad_query = torch.empty_like(query)
1238
+ # zeros_and_scatter creates a contiguous zeros tensor -> contiguous_format
1239
+ grad_score_mod_captured = tuple(
1240
+ [
1241
+ (
1242
+ torch.empty_like(buffer, memory_format=torch.contiguous_format)
1243
+ if isinstance(buffer, torch.Tensor) and buffer.requires_grad
1244
+ else None
1245
+ )
1246
+ for buffer in score_mod_other_buffers
1247
+ ]
1248
+ )
1249
+
1250
+ broadcasted_grad_key = key.new_empty((Bq, Hkv, seq_len_kv, qk_head_dim))
1251
+ broadcasted_grad_key = _permute_strides(broadcasted_grad_key, key.stride())
1252
+
1253
+ broadcasted_grad_value = value.new_empty((Bq, Hkv, seq_len_kv, v_head_dim))
1254
+ broadcasted_grad_value = _permute_strides(broadcasted_grad_value, value.stride())
1255
+
1256
+ if Bq > 1 and Bkv == 1:
1257
+ grad_key = torch.sum(broadcasted_grad_key, dim=0, keepdim=True)
1258
+ grad_value = torch.sum(broadcasted_grad_value, dim=0, keepdim=True)
1259
+ else:
1260
+ grad_key = broadcasted_grad_key
1261
+ grad_value = broadcasted_grad_value
1262
+
1263
+ return grad_query, grad_key, grad_value, grad_score_mod_captured
1264
+
1265
+
1266
+ flex_attention_backward.py_autograd_impl(
1267
+ autograd_not_implemented(flex_attention_backward, deferred_error=True)
1268
+ )
archive/.venv/Lib/site-packages/torch/_higher_order_ops/foreach_map.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-decorators
2
+ # mypy: allow-untyped-defs
3
+ from typing import Any, Callable
4
+
5
+ from torch._higher_order_ops.base_hop import BaseHOP, FunctionWithNoFreeVars
6
+
7
+
8
+ class ForeachMap(BaseHOP):
9
+ def __init__(self):
10
+ super().__init__("foreach_map")
11
+
12
+ def __call__(self, fn, *operands, **kwargs): # type: ignore[override]
13
+ fn = FunctionWithNoFreeVars(fn)
14
+ return super().__call__(fn, *operands, **kwargs)
15
+
16
+
17
+ _foreach_map = ForeachMap()
18
+
19
+
20
+ def foreach_map(op: Callable, *operands: Any, **kwargs: dict[str, Any]):
21
+ from torch._dynamo.polyfills import foreach_map_fn
22
+
23
+ return _foreach_map(foreach_map_fn, op, *operands, **kwargs)
archive/.venv/Lib/site-packages/torch/_higher_order_ops/hints_wrap.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import torch
3
+ import torch.utils._pytree as pytree
4
+ from torch._C import DispatchKey
5
+ from torch._higher_order_ops.utils import (
6
+ autograd_not_implemented,
7
+ reenter_make_fx,
8
+ unique_graph_id,
9
+ )
10
+ from torch._ops import HigherOrderOperator
11
+ from torch._subclasses.fake_tensor import FakeTensorMode
12
+ from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree
13
+
14
+
15
+ # used for wrapping a function/op with context hints
16
+ class HintsWrapper(HigherOrderOperator):
17
+ def __init__(self):
18
+ super().__init__("hints_wrapper")
19
+
20
+ def __call__(self, body_fn, args, kwargs, hints):
21
+ r"""
22
+ Call implementation of hints_wrapper
23
+
24
+ Args:
25
+ body_fn (Callable): A callable function that is within the scope
26
+ that is being traced.
27
+
28
+ args (Tuple of torch.Tensor/int/float/bool): A tuple of inputs to
29
+ body_fn.
30
+
31
+ kwargs (dict): Keyword argument to the body_fn.
32
+
33
+ hints (dict): A dict of context hints which could be passed to
34
+ backend compiler.
35
+ """
36
+ if not isinstance(args, tuple):
37
+ raise RuntimeError(f"args must be a tuple, got {type(args)}")
38
+
39
+ if not all(isinstance(t, (torch.Tensor, int, float, bool)) for t in args):
40
+ raise RuntimeError(
41
+ "args must be a tuple of tensors, ints, floats, or bools, got "
42
+ f"{args}"
43
+ )
44
+
45
+ if not isinstance(kwargs, dict):
46
+ raise RuntimeError(f"kwargs must be a dict, got {type(kwargs)}")
47
+
48
+ if len(kwargs) > 0:
49
+ raise RuntimeError(
50
+ f"kwargs except for hints are not supported, got {kwargs}"
51
+ )
52
+
53
+ if not isinstance(hints, dict):
54
+ raise RuntimeError(f"hints must be a dict, got {type(hints)}")
55
+
56
+ for k, v in hints.items():
57
+ if not isinstance(k, str):
58
+ raise RuntimeError(f"hints key must be a str, got {k}.")
59
+
60
+ if not isinstance(v, (int, float, bool, str)):
61
+ raise RuntimeError(
62
+ "hints must be a dict containing int, float, bool or str "
63
+ f"value, got value {v} for key {k}."
64
+ )
65
+
66
+ return super().__call__(body_fn, args, kwargs, hints)
67
+
68
+
69
+ hints_wrapper = HintsWrapper()
70
+
71
+
72
+ @hints_wrapper.py_impl(DispatchKey.CompositeExplicitAutograd)
73
+ def hints_wrapper_dense(body_fn, args, kwargs, hints):
74
+ return body_fn(*args, **kwargs)
75
+
76
+
77
+ hints_wrapper.py_autograd_impl(
78
+ autograd_not_implemented(hints_wrapper, deferred_error=True)
79
+ )
80
+
81
+
82
+ @hints_wrapper.py_impl(FakeTensorMode)
83
+ def hints_wrapper_fake_tensor_mode(mode, body_func, args, kwargs, hints):
84
+ flat_args = pytree.tree_leaves(args)
85
+ with mode:
86
+ return body_func(*flat_args, **kwargs)
87
+
88
+
89
+ @hints_wrapper.py_functionalize_impl
90
+ def hints_wrapper_functionalize(ctx, body_fn, args, kwargs, hints):
91
+ from torch._higher_order_ops.utils import _check_alias_and_mutation
92
+
93
+ unwrapped_args = ctx.unwrap_tensors(args)
94
+ unwrapped_kwargs = ctx.unwrap_tensors(kwargs)
95
+ unwrapped_hints = ctx.unwrap_tensors(hints)
96
+ with ctx.redispatch_to_next():
97
+ functional_body_fn = ctx.functionalize(body_fn)
98
+ pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch
99
+ _check_alias_and_mutation(
100
+ body_fn, unwrapped_args, "hints_wrapper", pre_dispatch
101
+ )
102
+
103
+ outputs = hints_wrapper(
104
+ functional_body_fn,
105
+ unwrapped_args,
106
+ unwrapped_kwargs,
107
+ unwrapped_hints,
108
+ )
109
+ return ctx.wrap_tensors(outputs)
110
+
111
+
112
+ def trace_hints_wrapper(proxy_mode, hints_wrapper, body_fn, args, kwargs, hints):
113
+ flat_args = tuple(pytree.tree_leaves(args))
114
+ body_graph = reenter_make_fx(body_fn)(*flat_args, **kwargs)
115
+
116
+ _, body_graph_name = unique_graph_id(proxy_mode, prefix="hints_wrapper_body_graph")
117
+ proxy_mode.tracer.root.register_module(body_graph_name, body_graph)
118
+
119
+ new_args: tuple = (body_graph, flat_args, {})
120
+ # merge hints into kwargs
121
+ new_kwargs = {}
122
+ new_kwargs["hints"] = hints
123
+
124
+ proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, new_args)
125
+ proxy_kwargs = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, new_kwargs)
126
+
127
+ out_proxy = proxy_mode.tracer.create_proxy(
128
+ "call_function", hints_wrapper, proxy_args, proxy_kwargs, name="hints_wrapper"
129
+ )
130
+
131
+ out = body_fn(*flat_args, **kwargs)
132
+ return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer)
133
+
134
+
135
+ @hints_wrapper.py_impl(ProxyTorchDispatchMode)
136
+ def inner(proxy_mode, body_fn, args, kwargs, hints):
137
+ if proxy_mode.enable_tracing:
138
+ return trace_hints_wrapper(
139
+ proxy_mode, hints_wrapper, body_fn, args, kwargs, hints
140
+ )
141
+ else:
142
+ return hints_wrapper(body_fn, args, kwargs, hints)
archive/.venv/Lib/site-packages/torch/_higher_order_ops/invoke_subgraph.py ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+
3
+
4
+ import contextlib
5
+ from contextlib import nullcontext
6
+ from dataclasses import dataclass, field
7
+ from typing import Optional, Union
8
+
9
+ import torch
10
+ import torch.utils._pytree as pytree
11
+ from torch._C import DispatchKey
12
+ from torch._dispatch.python import suspend_functionalization
13
+ from torch._higher_order_ops.utils import (
14
+ _from_fun,
15
+ _maybe_reenter_make_fx,
16
+ _set_compilation_env,
17
+ clone_outputs_aliasing_inputs,
18
+ FunctionalizeCtxWrapper,
19
+ get_dummy_aot_autograd_config,
20
+ HopInstance,
21
+ prepare_fw_with_masks,
22
+ reenter_make_fx,
23
+ register_fake,
24
+ save_tensors_and_symints_for_backward,
25
+ saved_tensors_and_symints,
26
+ )
27
+ from torch._ops import HigherOrderOperator
28
+ from torch._subclasses.functional_tensor import disable_functional_mode
29
+ from torch.fx.experimental.proxy_tensor import (
30
+ _temp_remove_metadata_torch_function_mode,
31
+ _temp_remove_pre_dispatch_torch_function_mode,
32
+ disable_proxy_modes_tracing,
33
+ ProxyTorchDispatchMode,
34
+ track_tensor_tree,
35
+ )
36
+ from torch.fx.graph_module import GraphModule
37
+ from torch.fx.passes.runtime_assert import insert_deferred_runtime_asserts
38
+
39
+
40
+ invoke_subgraph_counter = 0
41
+
42
+
43
# During the tracing of the joint graph, we construct this information. This is
# used to filter out grad_outs/tangents in the `backward` method of
# InvokeSubgraphAutogradOp.
@dataclass
class OutputMetadata:
    # Total number of forward outputs; None until populated.
    num_fw_outs: Optional[int] = None
    # Output positions that are literally None in the forward result.
    indexes_with_none: set[int] = field(default_factory=set)
    # Output positions whose tensors do not require grad.
    indexes_with_no_grad: set[int] = field(default_factory=set)
51
+
52
+
53
class InvokeSubgraphHOP(HigherOrderOperator):
    """Higher-order operator that calls a subgraph as an opaque unit,
    keyed by an identifier so identical regions can be cached and reused."""

    def __init__(self) -> None:
        # Invoke subgraph does not have any state, it is just a wrapper over a
        # subgraph, so we can safely cache the HOP.
        super().__init__("invoke_subgraph", cacheable=True)
        # This is used by the fake tensor cache key validator to extract the
        # subgraph and iterate over the nodes to find if all nodes are fake
        # tensor cacheable.
        self.subgraph_indexes = [
            0,
        ]

    # identifier is setup by upper part of the stack. This helps us in
    # identifying two invoke_subgraph calls have same subgraph.
    def __call__(
        self,
        subgraph: Union[GraphModule, FunctionalizeCtxWrapper],
        identifier: Optional[str],
        *operands,
    ):
        """Validate inputs and dispatch the HOP call."""
        assert identifier is None or isinstance(
            identifier, str
        ), "identifier must be a None or a string"

        assert all(
            isinstance(o, (torch.Tensor, int, torch.SymInt)) for o in operands
        ), f"invoke_subgraph operands must be a list of tensors/ints/SymInts {operands}"

        return super().__call__(subgraph, identifier, *operands)

    def gen_schema(self, subgraph, identifier, *operands):
        """Build an op schema for this call, marking mutated inputs.

        Materializes ``subgraph`` as a GraphModule if needed, then analyzes
        it to discover input mutation and the output list.
        """
        from torch._higher_order_ops.schema import HopSchemaGenerator
        from torch._higher_order_ops.utils import (
            check_input_alias_and_mutation_return_outputs,
            materialize_as_graph,
        )

        gm: torch.fx.GraphModule = (
            subgraph
            if isinstance(subgraph, torch.fx.GraphModule)
            else materialize_as_graph(subgraph, operands)
        )

        schema_gen = HopSchemaGenerator(self)
        schema_gen.add_arg("subgraph", gm)
        schema_gen.add_arg("identifier", identifier)
        (
            _,
            _,
            _,
            mutated_inputs,
            outputs,
        ) = check_input_alias_and_mutation_return_outputs(gm, operands)
        for idx, arg in enumerate(operands):
            schema_gen.add_arg(f"arg{idx}", arg, is_mutated=idx in mutated_inputs)
        for out in outputs:
            schema_gen.add_output(out)

        return schema_gen.gen_schema()
112
+
113
+
114
+ invoke_subgraph = InvokeSubgraphHOP()
115
+
116
+
117
def invoke_subgraph_placeholder(func, *args, **kwargs):
    """Entry point that Dynamo rewrites into an ``invoke_subgraph`` HOP.

    In eager mode this simply calls ``func``. While compiling (non-strict
    export), it routes through ``torch.compile`` with an eager backend so
    Dynamo can perform the rewrite. Calling it directly under Dynamo is an
    error.
    """
    if torch.compiler.is_dynamo_compiling():
        # This is just a placeholder for Dynamo to replace with invoke_subgraph
        raise RuntimeError("invoke_subgraph should not be called directly in Dynamo")

    if torch.compiler.is_compiling():
        # For non-strict export tracing, we still want to go through Dynamo
        from torch._dynamo.backends.debugging import (
            make_eager_backend_with_torch_function_mode,
        )

        def _invoke_subgraph_placeholder_wrapper(func, args):
            return invoke_subgraph_placeholder(func, *args)

        # NOTE(review): kwargs are not forwarded on this compile path —
        # presumably callers only pass positional args here; confirm.
        with _set_compilation_env(), torch._dynamo.utils.disable_cache_limit(), _temp_remove_pre_dispatch_torch_function_mode():
            with _temp_remove_metadata_torch_function_mode() as metadata_mode:
                if metadata_mode:
                    backend = make_eager_backend_with_torch_function_mode(metadata_mode)
                else:
                    backend = "eager"

                return torch.compile(
                    _invoke_subgraph_placeholder_wrapper,
                    backend=backend,
                    fullgraph=True,
                )(func, args)

    return func(*args, **kwargs)
145
+
146
+
147
def mark_compile_region(fn=None):
    """
    Instruct torch.compile to compile the wrapped region once and reuse the
    compiled artifact, instead of aggressively inlining the function as it
    normally would.

    Under the hood this tells TorchDynamo to use the InvokeSubgraph HOP for
    the region. In PyTorch eager, the wrapper is a no-op.

    Usable both as ``@mark_compile_region`` and ``@mark_compile_region()``.
    """

    def wrap(func):
        def inner(*args, **kwargs):
            # Unwrap any previously marked function so that nested markings
            # collapse to the innermost original callable.
            target = func
            while hasattr(target, "__marked_compile_region_fn__"):
                target = target.__marked_compile_region_fn__
            return invoke_subgraph_placeholder(target, *args, **kwargs)

        inner.__marked_compile_region_fn__ = func  # type: ignore[attr-defined]
        return inner

    return wrap(fn) if fn else wrap
173
+
174
+
175
+ def get_invoke_subgraph_cache():
176
+ cache = None
177
+ if tracing_ctx := torch._guards.TracingContext.try_get():
178
+ cache = tracing_ctx.hop_dispatch_set_cache.get_cache(invoke_subgraph)
179
+ return cache
180
+
181
+
182
+ # TODO (@anijain2305) - Delete this function when base_hop uses invoke_subgraph infra
183
def trace_joint_graph(fn, fw_inputs, fw_outputs):
    """
    Naively trace out a joint graph. This simplifies the reconstruction of joint
    graph in the min-cut partitioner later on.

    Returns an fx graph for ``joint_fn`` with input signature
    ``(*primals, *tangents)`` and output signature ``(*grads, *fw_outs)``.
    """
    from torch._functorch.aot_autograd import create_joint

    dummy_aot_config = get_dummy_aot_autograd_config()

    # This joint_fn is inserted as the backward graph as is. This simplifies the
    # min-cut partitioner work later on.
    # Input signature - (*primals, *tangents)
    # Output signature - (*grads, *fw_outs)
    # The output signature is deliberately kept grads first and fw_outs second.
    # Having grads first makes the min-cut partitioner HOP graph stitching
    # easier.
    def joint_fn(*primals_and_tangents):
        primals = primals_and_tangents[: len(fw_inputs)]
        tangents = primals_and_tangents[len(fw_inputs) :]

        fw_outs, grads = create_joint(
            prepare_fw_with_masks(fn), aot_config=dummy_aot_config
        )(primals, tangents)

        maybe_clone = clone_outputs_aliasing_inputs(primals_and_tangents)

        # return signature is deliberately kept (*grads, *fw_outs). This
        # simplifies partitioning work later on.
        return pytree.tree_map(maybe_clone, tuple(grads + list(fw_outs)))

    primals = list(fw_inputs)
    # This assumes that the tangent strides match fw_outputs strides. Check the
    # InvokeSubgraphAutogradOp backward op for the contiguous call.
    tangents = [_from_fun(out) for out in fw_outputs]

    joint_operands = primals + tangents

    return _maybe_reenter_make_fx(joint_fn)(*joint_operands)
221
+
222
+
223
+ # TODO (@anijain2305) - Delete this function when base_hop uses invoke_subgraph infra
224
def create_fw_bw_graph(subgraph, operands, grad_outputs=None):
    """Trace separate forward and joint (backward) graphs for ``subgraph``.

    Returns ``(fw_graph, bw_graph, output_metadata)`` where
    ``output_metadata`` records which forward outputs are None or do not
    require grad. Tracing happens with functionalization and proxy modes
    suspended so fresh example tensors can be created.
    """
    with suspend_functionalization(), disable_functional_mode():
        with disable_proxy_modes_tracing():
            # args are functional tensors, generate some example tensors
            fw_inputs = pytree.tree_map(_from_fun, operands)

            from torch._guards import detect_fake_mode

            fake_mode = detect_fake_mode(fw_inputs)
            # Suppress fresh unbacked symbols created while running the
            # subgraph for metadata purposes only.
            context = (
                nullcontext()
                if fake_mode is None or fake_mode.shape_env is None
                else fake_mode.shape_env.ignore_fresh_unbacked_symbols()
            )

            with context:
                fw_outs = pytree.tree_map(_from_fun, subgraph(*fw_inputs))

            num_fw_outs = len(fw_outs)

            # Collect the indexes of none in the output to check that the grad
            # is None at the corresponding index in the backward. This check is
            # performed in the autograd.Function - InvokeSubgraphAutogradOp.
            # Also collect the indexes of no_grad in the output to filter out
            # the grad_outs in the `backward` method.
            output_metadata = OutputMetadata()

            output_metadata.num_fw_outs = num_fw_outs
            for idx, fw_out in enumerate(fw_outs):
                if fw_out is None:
                    output_metadata.indexes_with_none.add(idx)
                elif not fw_out.requires_grad:
                    output_metadata.indexes_with_no_grad.add(idx)

            if grad_outputs is None:
                # Infer grad_outputs to be the same properties as the fw_outputs
                # if they're not passed in
                # Although fw_outs are equivalent to grad_outputs for tracing
                # purposes, we have to carefully handle the None and fw_out that do
                # not have require_grad. At those indexes, we will have None in the
                # backward graph.
                grad_outputs = fw_outs
                grad_outputs = [grad for grad in grad_outputs if grad is not None]
                grad_outputs = [grad for grad in grad_outputs if grad.requires_grad]

                # Force grad_out to be contiguous. This is because at runtime,
                # grad_out could have different strides than fw_outs. So, we
                # force the grad_outs to be contiguous for both tracing and
                # runtime.
                grad_outputs = [grad.contiguous() for grad in grad_outputs]

            if any(
                not isinstance(out, torch.Tensor)
                for out in grad_outputs
                if out is not None
            ):
                raise RuntimeError(
                    "Expect outputs of invoke_subgraph to only contains tensors or None. "
                    f"Got types {[type(out) for out in grad_outputs]}."
                )

            # Trace the forward subgraph
            fw_graph = _maybe_reenter_make_fx(subgraph)(*fw_inputs)

            # Trace the joint graph and assign it to the bwd graph
            bw_graph = trace_joint_graph(
                subgraph,
                fw_inputs,
                grad_outputs,
            )
            return fw_graph, bw_graph, output_metadata
295
+
296
+
297
def get_output_metadata(subgraph, *operands):
    """Run ``subgraph`` on example inputs and collect OutputMetadata.

    Records the total number of forward outputs, the positions that are
    None, and the positions whose tensors do not require grad. No graphs
    are traced here — only metadata is gathered.
    """
    with suspend_functionalization(), disable_functional_mode():
        with disable_proxy_modes_tracing():
            # args are functional tensors, generate some example tensors
            fw_inputs = pytree.tree_map(_from_fun, operands)

            from torch._guards import detect_fake_mode

            fake_mode = detect_fake_mode(fw_inputs)
            # Suppress fresh unbacked symbols created during this
            # metadata-only run of the subgraph.
            context = (
                nullcontext()
                if fake_mode is None or fake_mode.shape_env is None
                else fake_mode.shape_env.ignore_fresh_unbacked_symbols()
            )

            with context:
                fw_outs = pytree.tree_map(_from_fun, subgraph(*fw_inputs))

            num_fw_outs = len(fw_outs)

            # Collect the indexes of none in the output to check that the grad
            # is None at the corresponding index in the backward. This check is
            # performed in the autograd.Function - InvokeSubgraphAutogradOp.
            # Also collect the indexes of no_grad in the output to filter out
            # the grad_outs in the `backward` method.
            output_metadata = OutputMetadata()

            output_metadata.num_fw_outs = num_fw_outs
            for idx, fw_out in enumerate(fw_outs):
                if fw_out is None:
                    output_metadata.indexes_with_none.add(idx)
                elif not fw_out.requires_grad:
                    output_metadata.indexes_with_no_grad.add(idx)
            return output_metadata
331
+
332
+
333
def trace_joint_graph_as_bwd(
    subgraph, num_primals, joint_operands, include_key_set, exclude_key_set
):
    """
    Naively trace out a joint graph. This simplifies the reconstruction of joint
    graph in the min-cut partitioner later on.

    ``include_key_set``/``exclude_key_set`` are the dispatch key sets
    snapshotted during forward; they are reinstated here so the joint trace
    sees the same dispatch state the forward ran under.
    """
    from torch._functorch.aot_autograd import create_joint

    dummy_aot_config = get_dummy_aot_autograd_config()

    if isinstance(subgraph, torch.fx.GraphModule):

        def graph_with_interpreter(*args):
            # Running graph with interpreter is needed for propagating the stack_trace
            with torch.fx.traceback.preserve_node_meta():
                return torch.fx.Interpreter(subgraph).run(*args)

        fn = graph_with_interpreter
    else:
        fn = subgraph

    # This joint_fn is inserted as the backward graph as is. This simplifies the
    # min-cut partitioner work later on.
    # Input signature - (*primals, *tangents)
    # Output signature - (*grads, *fw_outs)
    # The output signature is deliberately kept grads first and fw_outs second.
    # Having grads first makes the min-cut partitioner HOP graph stitching
    # easier.
    def joint_fn(*primals_and_tangents):
        primals = primals_and_tangents[:num_primals]
        tangents = primals_and_tangents[num_primals:]

        fw_outs, grads = create_joint(
            prepare_fw_with_masks(fn), aot_config=dummy_aot_config
        )(primals, tangents)

        maybe_clone = clone_outputs_aliasing_inputs(primals_and_tangents)

        # return signature is deliberately kept (*grads, *fw_outs). This
        # simplifies partitioning work later on.
        return pytree.tree_map(maybe_clone, tuple(grads + list(fw_outs)))

    with suspend_functionalization(), disable_functional_mode():
        with disable_proxy_modes_tracing():
            joint_operands = [_from_fun(arg) for arg in joint_operands]
            with contextlib.ExitStack() as stack:
                stack.enter_context(
                    torch._C._ForceDispatchKeyGuard(include_key_set, exclude_key_set),
                )
                # grad mode must be on for create_joint to compute grads
                with torch.enable_grad():
                    return _maybe_reenter_make_fx(joint_fn)(*joint_operands)
385
+
386
+
387
class InvokeSubgraphAutogradOp(torch.autograd.Function):
    """
    Saves the subgraph, i.e. original callable, in the forward method. And then
    traces out a joint graph in the backward. This delaying of tracing in
    backward, also called as lazy backward, ensures that the assumptions about
    the grad_out strides and tensor-subclass-ness are already accounted for.
    """

    @staticmethod
    def forward(
        ctx,
        subgraph,
        identifier,
        output_metadata,
        *operands,
    ):
        # We want to delay the backward graph construction until the backward.
        # So in forward, we just run the fw callable as is. And save all the
        # information necessary to construct the backward graph in the ctx.
        ctx._subgraph = subgraph
        ctx._identifier = identifier
        ctx._output_metadata = output_metadata
        # We snapshot the dispatch keys in forward for materializing the
        # bw_graph in backward.
        ctx._fw_include_key_set = torch._C._dispatch_tls_local_include_set()
        ctx._fw_exclude_key_set = torch._C._dispatch_tls_local_exclude_set()

        save_tensors_and_symints_for_backward(ctx, operands)

        with torch._C._AutoDispatchBelowAutograd():
            out = invoke_subgraph(
                subgraph,
                f"fw_{identifier}",
                *operands,
            )

        # Check that None is at expected indexes.
        for idx, o in enumerate(out):
            if o is None:
                assert idx in output_metadata.indexes_with_none

        return out

    @staticmethod
    def backward(
        ctx,
        *grad_outs,
    ):
        from torch._dynamo.utils import dynamo_timed

        subgraph = ctx._subgraph
        identifier = ctx._identifier
        output_metadata = ctx._output_metadata
        primals = saved_tensors_and_symints(ctx)

        # Filter out grads that are None or do not require_grad. This was
        # the assumption we made during the tracing of joint_graph.
        filtered_grad_outs = []
        for idx, o in enumerate(grad_outs):
            if o is None:
                assert idx in output_metadata.indexes_with_none
            elif idx in output_metadata.indexes_with_no_grad:
                # Deliberately skip over the grad_outs which we know should be
                # None because the corresponding fwd_out does not require_grad.
                pass
            else:
                filtered_grad_outs.append(o)
        filtered_grad_outs = tuple(filtered_grad_outs)

        # Important note - Even though the forward graph can be same for
        # different invoke_subgraphs, the backward graph can be different
        # because the tangent strides can be different. So, here we cache on
        # tangent_metadata in addition to identifier
        from torch._guards import detect_fake_mode
        from torch._subclasses._fake_tensor_utils import _CacheKeyState
        from torch._subclasses.fake_tensor import extract_tensor_metadata

        fake_mode = detect_fake_mode(primals + filtered_grad_outs)
        state = _CacheKeyState(fake_mode.shape_env)

        # Build a hashable metadata key from the tangents' tensor metadata.
        tangent_metadata: list[object] = []
        for tangent in filtered_grad_outs:
            metadata = extract_tensor_metadata(tangent)
            metadata._flatten_into(tangent_metadata, fake_mode, state)
        tangent_metadata = tuple(tangent_metadata)

        # bw_graph is a joint graph with signature (*primals_and_tangents) and
        # returns (*grads_and_fw_outs). To get the grads, we use the num_fw_outs
        # to extract the grads.
        primals_and_tangents = primals + filtered_grad_outs

        # Check if we have already traced the bwd subgraph.
        bw_graph = None
        suffix = None
        invoke_subgraph_cache = get_invoke_subgraph_cache()
        cache_hit = False
        if invoke_subgraph_cache:
            bw_graph, suffix = invoke_subgraph_cache.get_lazy_bwd_entry(
                identifier, tangent_metadata
            )
            cache_hit = bw_graph is not None

        if bw_graph is None:
            assert suffix is None
            with dynamo_timed(
                "invoke_subgraph_trace_joint_graph", log_pt2_compile_event=True
            ):
                bw_graph = trace_joint_graph_as_bwd(
                    subgraph,
                    len(primals),
                    primals_and_tangents,
                    ctx._fw_include_key_set,
                    ctx._fw_exclude_key_set,
                )

        if invoke_subgraph_cache and not cache_hit:
            suffix = invoke_subgraph_cache.add_lazy_bwd_entry(
                identifier, tangent_metadata, bw_graph
            )

        # Joint graph returns (*grads, *fw_outs); slice off the fw_outs.
        grads = invoke_subgraph(
            bw_graph, f"bw_{identifier}_{suffix}", *primals_and_tangents
        )[: -output_metadata.num_fw_outs]
        return None, None, None, *grads
511
+
512
+
513
@invoke_subgraph.py_autograd_impl
def _(subgraph, identifier, *operands):
    """Autograd key implementation: wrap the call in
    InvokeSubgraphAutogradOp, caching the callable per identifier."""
    # Check if we have already traced the subgraph.
    invoke_subgraph_cache = get_invoke_subgraph_cache()
    if invoke_subgraph_cache:
        if saved_autograd_fn := invoke_subgraph_cache.get_autograd_key_entry(
            identifier
        ):
            return saved_autograd_fn(*operands)

    output_metadata = get_output_metadata(subgraph, *operands)

    def autograd_fn_callable(*args):
        return InvokeSubgraphAutogradOp.apply(
            subgraph, identifier, output_metadata, *args
        )

    # Save the autograd_fn_callable in the dispatch set cache.
    if invoke_subgraph_cache:
        invoke_subgraph_cache.add_autograd_key_entry(identifier, autograd_fn_callable)

    return autograd_fn_callable(*operands)
535
+
536
+
537
@invoke_subgraph.py_impl(DispatchKey.CompositeExplicitAutograd)
def _(subgraph, identifier, *operands):
    """Eager (CPU/CUDA) implementation: just run the subgraph directly."""
    from torch.utils._python_dispatch import _get_current_dispatch_mode

    mode = _get_current_dispatch_mode()
    assert mode is None, "Mode should never be enabled for CPU/CUDA key"
    return subgraph(*operands)
544
+
545
+
546
@invoke_subgraph.py_functionalize_impl
def _(ctx, subgraph, identifier, *operands):
    """Functionalization implementation: auto-functionalize on input
    mutation, otherwise functionalize the subgraph and re-dispatch."""
    from torch._higher_order_ops.auto_functionalize import (
        can_auto_functionalize,
        do_auto_functionalize_v2,
    )

    unwrapped_operands = ctx.unwrap_tensors(operands)
    hop_instance = HopInstance.create(invoke_subgraph, subgraph, identifier, *operands)
    if can_auto_functionalize(hop_instance):
        # NOTE: [auto_functionalize x invoke_subgraph caching]
        # We call auto_functionalized_v2 to support input mutation of invoke_subgraph.
        # See NOTE [Support input mutation of hops] for the overall design.
        #
        # invoke_subgraph is special because of its identifier based caching mechanism.
        # In invoke_subgraph's functionalization key implementation, we create a new
        # identifier because the subgraph is replaced by FunctionWithNoFreeVars in a
        # functional + epilogue form.
        assert isinstance(identifier, str), identifier
        return do_auto_functionalize_v2(
            ctx.mode,
            hop_instance,
            (subgraph, "auto_functionalized_" + identifier, *operands),
            {},
        )

    with ctx.redispatch_to_next():
        # NB: There is an assumption that subgraph does not mutate inputs and
        # there is no aliasing. Its Dynamo responsibility to prevent formation
        # of invoke_subgraph ops if input aliasing/mutation is detected.
        functionalized_subgraph = FunctionalizeCtxWrapper(ctx, subgraph)
        out = invoke_subgraph(functionalized_subgraph, identifier, *unwrapped_operands)
        return ctx.wrap_tensors(out)
579
+
580
+
581
# Register the hop fake fn. This will be called in the fake_tensor _dispatch_impl.
@register_fake(invoke_subgraph)
def _(subgraph, identifier, *operands):
    """Fake-tensor implementation: run the subgraph under fake tensors to
    compute output metadata; timed for compile-time profiling."""
    from torch._dynamo.utils import dynamo_timed

    with dynamo_timed("invoke_subgraph_fake_tensor", log_pt2_compile_event=True):
        return subgraph(*operands)
588
+
589
+
590
@invoke_subgraph.py_impl(ProxyTorchDispatchMode)
def _(proxy_mode: ProxyTorchDispatchMode, subgraph, identifier, *operands):
    """Proxy-mode implementation: trace (or reuse a cached trace of) the
    subgraph and emit an invoke_subgraph node into the proxy graph."""
    # Check if we have already traced the subgraph.
    graph = None
    invoke_subgraph_cache = get_invoke_subgraph_cache()
    if invoke_subgraph_cache:
        graph = invoke_subgraph_cache.get_proxy_dispatch_entry(identifier)

    if graph is None:
        from torch._dynamo.utils import dynamo_timed

        with dynamo_timed("invoke_subgraph_proxy_tensor", log_pt2_compile_event=True):
            graph = reenter_make_fx(subgraph)(*operands)

            from torch._guards import detect_fake_mode

            # Bake shape-env runtime assertions into the traced subgraph.
            fake_mode = detect_fake_mode(operands)
            insert_deferred_runtime_asserts(
                graph,
                fake_mode.shape_env,
                "invoke_subgraph_proxy_torch_dispatch_mode",
                export=True,
            )
            graph.recompile()

        assert isinstance(proxy_mode.tracer, torch.fx.Tracer)
        if invoke_subgraph_cache:
            invoke_subgraph_cache.add_proxy_dispatch_entry(identifier, graph)

    node_args = (graph, identifier, *operands)

    def _unwrap_proxy(arg):
        if isinstance(arg, torch.fx.GraphModule):
            # NOTE: [invoke_subgraph proxy_mode x auto_functionalize]
            # Previously, we assumed that `invoke_subgraph` would always be traced with the same tracer.
            # This allowed us to cache modules by their identifiers, assuming they were already registered.
            #
            # However, this assumption no longer holds when we auto-functionalize `invoke_subgraph`.
            # auto_functionalize functionalizes the subgraph and wrap it with `FunctionWithNoFreeVars`.
            # In the proxy mode implementation of `auto_functionalized_v2`, we need to materialize `FunctionWithNoFreeVars`
            # input as a graph module. To do this, we re-trace the `invoke_subgraph` hop, which starts a new sub-tracer
            # (see NOTE [materialize callable inputs as graph]). # When the new sub-tracer traces the `invoke_subgraph`
            # with a previously cached identifier, the corresponding graph module might not
            # exist as a submodule in the new tracer's root. Therefore, we register it as a submodule below.
            #
            # The alternative is to give a new identifier when we re-trace the invoke_subgraph but this will increase
            # the compilation time, which defeats the purpose of caching.
            registered_before = False
            for (
                _,
                submod,
            ) in proxy_mode.tracer.root.named_modules():  # type: ignore[union-attr]
                if arg is submod:
                    registered_before = True

            if not registered_before:
                qualname = proxy_mode.tracer.get_fresh_qualname("repeated_subgraph")  # type: ignore[union-attr]
                proxy_mode.tracer.root.register_module(qualname, arg)  # type: ignore[union-attr]
        return proxy_mode.tracer.unwrap_proxy(arg)  # type: ignore[union-attr]

    proxy_args = pytree.tree_map(_unwrap_proxy, node_args)  # type: ignore[union-attr]
    out_proxy = proxy_mode.tracer.create_proxy(
        "call_function", invoke_subgraph, proxy_args, {}
    )

    # Run the HOP on the traced graph to get example outputs for tracking.
    example_out = invoke_subgraph(graph, identifier, *operands)
    return track_tensor_tree(
        example_out, out_proxy, constant=None, tracer=proxy_mode.tracer
    )
+ )
archive/.venv/Lib/site-packages/torch/_higher_order_ops/map.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import functools
3
+ from typing import Callable, Union
4
+ from typing_extensions import TypeVarTuple
5
+
6
+ import torch
7
+ import torch.utils._pytree as pytree
8
+ from torch._C import DispatchKey
9
+ from torch._dispatch.python import suspend_functionalization
10
+ from torch._higher_order_ops.utils import _maybe_run_with_interpreter, reenter_make_fx
11
+ from torch._ops import HigherOrderOperator
12
+ from torch._subclasses.fake_tensor import FakeTensorMode
13
+ from torch._subclasses.functional_tensor import disable_functional_mode
14
+ from torch.fx.experimental.proxy_tensor import (
15
+ disable_proxy_modes_tracing,
16
+ make_fx,
17
+ ProxyTorchDispatchMode,
18
+ track_tensor_tree,
19
+ )
20
+
21
+ from .utils import (
22
+ _from_fun,
23
+ _stack_pytree,
24
+ _unstack_pytree,
25
+ clone_outputs_aliasing_inputs,
26
+ prepare_fw_with_masks,
27
+ save_tensors_and_symints_for_backward,
28
+ saved_tensors_and_symints,
29
+ )
30
+
31
+
32
class MapImpl(HigherOrderOperator):
    """Higher-order operator backing `map`: applies a function over the
    leading dimension of the mapped operands."""

    def __init__(self):
        super().__init__("map_impl")

    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs)
38
+
39
+
40
+ map_impl = MapImpl()
41
+
42
+
43
def create_fw_bw_graph(f, num_mapped_args, *args):
    """Trace forward and joint (backward) graphs for a mapped function.

    ``args`` is ``(*mapped_xs, *pos_args)`` where the first
    ``num_mapped_args`` entries are the tensors being mapped over.
    Returns ``(fw_graph, joint_graph)``.
    """
    mapped_xs = args[:num_mapped_args]
    pos_args = args[num_mapped_args:]

    # See Note [HOP create fw_bw graph] in create_fw_bw_graph in utils.py

    with suspend_functionalization(), disable_functional_mode():
        with disable_proxy_modes_tracing():
            unwrapped_mapped_xs = pytree.tree_map(_from_fun, mapped_xs)
            # A single slice along the leading dim serves as the example input.
            example_xs = _unstack_pytree(unwrapped_mapped_xs)[0]

            example_pos_args = [
                _from_fun(arg) if isinstance(arg, torch.Tensor) else arg
                for arg in pos_args
            ]
            example_flat_out = pytree.tree_map(
                _from_fun, f(*example_xs, *example_pos_args)
            )
            if any(
                not isinstance(out, torch.Tensor)
                for out in example_flat_out
                if out is not None
            ):
                raise RuntimeError(
                    "Expect outputs of map only contains tensors or None. "
                    f"Got types {[type(out) for out in example_flat_out]}."
                )
            example_grad = [_from_fun(out) for out in example_flat_out]

            fw_graph = make_fx(f)(*example_xs, *example_pos_args)

        from torch._functorch.aot_autograd import AOTConfig, create_joint

        # Minimal AOTConfig just to drive create_joint; compilers unused.
        dummy_aot_config = AOTConfig(
            fw_compiler=None,  # type: ignore[arg-type]
            bw_compiler=None,  # type: ignore[arg-type]
            partition_fn=None,  # type: ignore[arg-type]
            decompositions={},
            num_params_buffers=0,
            aot_id=0,
            keep_inference_input_mutations=False,
        )

        def joint_f(*example_args):
            joint_mapped_args = example_args[:joint_num_mapped]
            args = example_args[joint_num_mapped:]

            mapped_input = joint_mapped_args[:num_mapped_args]
            mapped_grads = joint_mapped_args[num_mapped_args:]

            joint = create_joint(prepare_fw_with_masks(f), aot_config=dummy_aot_config)
            _, grads = joint(
                list(mapped_input) + list(args),
                [
                    grad
                    for grad in mapped_grads
                    if grad is not None and grad.requires_grad
                ],
            )

            # In order to keep map functional for backward graph,
            # we clone outputs that are aliasing inputs
            maybe_clone = clone_outputs_aliasing_inputs(example_args)

            return pytree.tree_map(maybe_clone, grads)

        joint_num_mapped = len(example_grad) + len(example_xs)
        joint_graph = make_fx(joint_f)(*example_xs, *example_grad, *example_pos_args)
        return fw_graph, joint_graph
112
+
113
+
114
def map(
    f: Callable[[pytree.PyTree, tuple[pytree.PyTree, ...]], pytree.PyTree],
    xs: Union[pytree.PyTree, torch.Tensor],
    *args: TypeVarTuple,
):
    r"""
    Performs a map of f with xs. Intuitively, you can think of the semantic being:

    out = []
    for idx in len(xs.size(0)):
        xs_sliced = xs.select(0, idx)
        out.append(f(xs_sliced, *args))
    torch.stack(out)

    .. warning::
        `torch._higher_order_ops.map` is a prototype feature in PyTorch. It currently
        does not support autograd and you may run into miscompiles.
        Read more about feature classification at:
        https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype


    Args:
        f (Callable): a callable that takes an input x, that could either be a single Tensor
            or a nested dict, list of tensors and some additional inputs
        xs: the inputs that're to be mapped over. We'll iterate over the first dim of each x
            and perform f on each slice.

        *args: additional arguments provided to each step of f. They could also be omitted and
            map is able to automatically figure out the read dependency.

    Return:
        the stacked output for each step of f

    Raises:
        RuntimeError: if any mapped leaf is not a tensor, the leading
            dimension is 0, or the leading dimensions are inconsistent.

    Example:

        def f(xs):
            return xs[0] + xs[1] + const1 + const2

        xs = [torch.randn(2, 3), torch.randn(2, 3)]
        const1 = torch.randn(2, 3)
        const2 = torch.randn(2, 3)
        # returns a tensor of shape [2, 2, 3]
        torch._higher_order_ops.map(f, xs)

    """
    flat_xs, xs_spec = pytree.tree_flatten(xs)
    flat_args, args_spec = pytree.tree_flatten(args)
    if not all(isinstance(t, torch.Tensor) for t in flat_xs):
        raise RuntimeError(f"Mapped xs can only consist of tensors. Got xs {flat_xs}.")

    shapes = [xs.shape for xs in flat_xs]
    leading_dim_size = shapes[0][0]
    if leading_dim_size == 0:
        raise RuntimeError("Leading dimensions of mapped xs cannot be 0.")

    if any(cur_shape[0] != leading_dim_size for cur_shape in shapes):
        raise RuntimeError(
            f"Leading dimensions of mapped xs must be consistent. Got shapes {shapes}."
        )

    def run_flattened_map(f, flat_xs, flat_args):
        # Re-assemble the user's pytrees inside the flat-signature wrapper
        # before calling the original f.
        def wrapped_fn(*flat_args, f, xs_tree_spec, args_tree_spec, num_xs):
            xs = pytree.tree_unflatten(flat_args[:num_xs], xs_tree_spec)
            args = pytree.tree_unflatten(flat_args[num_xs:], args_tree_spec)
            return f(xs, *args)

        inner_f = functools.partial(
            wrapped_fn,
            f=f,
            xs_tree_spec=xs_spec,
            args_tree_spec=args_spec,
            num_xs=len(flat_xs),
        )
        return map_impl(inner_f, flat_xs, flat_args)

    from torch._higher_order_ops.utils import _maybe_compile_and_run_fn

    return _maybe_compile_and_run_fn(run_flattened_map, f, flat_xs, flat_args)
192
+
193
+
194
+ class MapAutogradOp(torch.autograd.Function):
195
+ @staticmethod
196
+ def forward(ctx, fw_graph, joint_graph, num_mapped_args, *flat_args):
197
+ save_tensors_and_symints_for_backward(ctx, flat_args)
198
+ ctx._joint_graph = joint_graph
199
+ ctx._num_mapped_args = num_mapped_args
200
+ with torch._C._AutoDispatchBelowAutograd():
201
+ return (
202
+ *map_impl(
203
+ fw_graph, flat_args[:num_mapped_args], flat_args[num_mapped_args:]
204
+ ),
205
+ )
206
+
207
+ @staticmethod
208
+ def backward(ctx, *flat_grads):
209
+ fw_args = saved_tensors_and_symints(ctx)
210
+ fw_mapped_args = fw_args[: ctx._num_mapped_args]
211
+ pos_args = fw_args[ctx._num_mapped_args :]
212
+
213
+ grads = map_impl(
214
+ ctx._joint_graph,
215
+ fw_mapped_args + flat_grads,
216
+ pos_args,
217
+ )
218
+ return None, None, None, *grads
219
+
220
+
221
+ def trace_map(proxy_mode, func_overload, f, xs, pos_args):
222
+ example_input = _unstack_pytree(xs)[0]
223
+ body_graph = f
224
+
225
+ body_graph = reenter_make_fx(body_graph)(*example_input, *pos_args)
226
+
227
+ next_name = proxy_mode.tracer.get_fresh_qualname("body_graph_")
228
+
229
+ proxy_mode.tracer.root.register_module(next_name, body_graph)
230
+
231
+ fake_outs = map_impl(body_graph, xs, pos_args)
232
+
233
+ node_args = (body_graph, list(xs), list(pos_args))
234
+ proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, node_args)
235
+ out_proxy = proxy_mode.tracer.create_proxy(
236
+ "call_function", func_overload, proxy_args, {}, name="map_impl"
237
+ )
238
+ return track_tensor_tree(
239
+ fake_outs, out_proxy, constant=None, tracer=proxy_mode.tracer
240
+ )
241
+
242
+
243
+ @map_impl.py_impl(DispatchKey.CompositeExplicitAutograd)
244
+ def map_dense(f, xs, pos_args):
245
+ pytrees = [f(*inp, *pos_args) for inp in _unstack_pytree(xs)]
246
+ return _stack_pytree(pytrees)
247
+
248
+
249
+ @map_impl.py_autograd_impl
250
+ def map_autograd(f, xs, pos_args):
251
+ num_mapped_args = len(xs)
252
+ fw_graph, bw_graph = create_fw_bw_graph(f, num_mapped_args, *xs, *pos_args)
253
+ flat_out = MapAutogradOp.apply(fw_graph, bw_graph, num_mapped_args, *xs, *pos_args)
254
+ return flat_out
255
+
256
+
257
+ @map_impl.py_impl(ProxyTorchDispatchMode)
258
+ def map_proxy_torch_dispatch_mode(mode, f, xs, args):
259
+ return trace_map(mode, map_impl, f, xs, args)
260
+
261
+
262
+ @map_impl.py_impl(FakeTensorMode)
263
+ def map_fake_tensor_mode(mode, f, xs, args):
264
+ with mode:
265
+ return map_dense(f, xs, args)
266
+
267
+
268
+ @map_impl.py_functionalize_impl
269
+ def map_functionalize(ctx, f, xs, pos_args):
270
+ from torch._higher_order_ops.utils import _check_alias_and_mutation
271
+
272
+ unwrapped_xs = ctx.unwrap_tensors(xs)
273
+ unwrapped_args = ctx.unwrap_tensors(pos_args)
274
+ wrapped_fn = ctx.functionalize(_maybe_run_with_interpreter(f))
275
+
276
+ with ctx.redispatch_to_next():
277
+ example_inputs = (*_unstack_pytree(unwrapped_xs)[0], *unwrapped_args)
278
+ pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch
279
+ _check_alias_and_mutation(f, example_inputs, "map", pre_dispatch)
280
+ map_return = map_impl(wrapped_fn, unwrapped_xs, unwrapped_args)
281
+ return ctx.wrap_tensors(map_return)
282
+
283
+
284
+ def _fake_map(f, x, *args):
285
+ from functorch.experimental.control_flow import _stack_pytree, _unstack_pytree
286
+
287
+ x_pytrees = _unstack_pytree(x)
288
+ zs = []
289
+ for xp in x_pytrees:
290
+ zs.append(f(xp, *args))
291
+ return _stack_pytree(zs)
archive/.venv/Lib/site-packages/torch/_higher_order_ops/out_dtype.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+
3
+ import torch
4
+ import torch.utils._pytree as pytree
5
+ from torch._C import DispatchKey
6
+ from torch._higher_order_ops.utils import autograd_not_implemented
7
+ from torch._ops import HigherOrderOperator
8
+ from torch._prims_common import elementwise_dtypes, ELEMENTWISE_TYPE_PROMOTION_KIND
9
+ from torch._subclasses.fake_tensor import FakeTensorMode
10
+ from torch.fx.experimental.proxy_tensor import (
11
+ disable_proxy_modes_tracing,
12
+ maybe_handle_decomp,
13
+ ProxyTorchDispatchMode,
14
+ track_tensor_tree,
15
+ )
16
+
17
+
18
+ # TODO to figure out a more generic approach
19
+ ALLOWABLE_OPS = [
20
+ torch.ops.aten.linear.default,
21
+ torch.ops.aten.mm.default,
22
+ torch.ops.aten.conv2d.default,
23
+ torch.ops.aten.convolution.default,
24
+ torch.ops.aten.mul.Tensor,
25
+ torch.ops.aten.mul.Scalar,
26
+ torch.ops.aten.div.Tensor,
27
+ torch.ops.aten.div.Scalar,
28
+ ]
29
+
30
+
31
+ class OutDtypeOperator(HigherOrderOperator):
32
+ """
33
+ The out_dtype operator takes an existing ATen functional operator, an
34
+ `out_dtype` argument, and arguments to the original operator, and executes
35
+ the original operator and returns a Tensor with the `out_dtype` precision.
36
+ This operator does not mandate a compute precision so it allows the
37
+ representation to not be opinionated about the exact implementation.
38
+
39
+ The general implementation for all operators will be the following:
40
+ 1. Promote inputs dtypes based on default PyTorch dtype promotion rules,
41
+ using the dtypes of all input Tensors/Scalars and the `out_dtype`
42
+ arugument.
43
+ 2. Execute the operator
44
+ 3. Cast the output to `out_dtype`
45
+ """
46
+
47
+ def __init__(self) -> None:
48
+ super().__init__("out_dtype")
49
+
50
+ def __call__(self, op, output_dtype, *args):
51
+ if not isinstance(op, torch._ops.OpOverload):
52
+ raise ValueError("out_dtype's first argument must be an OpOverload")
53
+ if op._schema.is_mutable:
54
+ raise ValueError(
55
+ "out_dtype's first argument needs to be a functional operator"
56
+ )
57
+ if not (
58
+ len(op._schema.returns) == 1
59
+ and isinstance(op._schema.returns[0].type, torch.TensorType)
60
+ ):
61
+ raise ValueError(
62
+ "out_dtype's can only apply to ops that return a single tensor"
63
+ f"Instead got {[r.type for r in op._schema.returns]}"
64
+ )
65
+
66
+ if op not in ALLOWABLE_OPS:
67
+ raise ValueError(
68
+ f"out_dtype only allows the following operators: {ALLOWABLE_OPS}."
69
+ )
70
+
71
+ res = super().__call__(op, output_dtype, *args)
72
+
73
+ return res
74
+
75
+
76
+ out_dtype = OutDtypeOperator()
77
+
78
+
79
+ def trace_out_dtype(proxy_mode, func_overload, op, output_dtype, *args):
80
+ # NB: Long-term we should put the decomposition logic into
81
+ # ProxyTorchDispatchMode so that people do not need to call maybe_handle_decomp
82
+ # in all HigherOrderOp proxy implementations.
83
+ r = maybe_handle_decomp(proxy_mode, func_overload, (op, output_dtype, *args), {})
84
+ if r is not NotImplemented:
85
+ return r
86
+
87
+ with disable_proxy_modes_tracing():
88
+ # This is a simplified implementation of this operator just for tracing.
89
+ # Actual implementation may also first promote the arguments
90
+ out = op(*args).to(dtype=output_dtype)
91
+
92
+ node_args = (op, output_dtype, *args)
93
+ proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, node_args)
94
+ out_proxy = proxy_mode.tracer.create_proxy(
95
+ "call_function", func_overload, proxy_args, {}, name="out_dtype"
96
+ )
97
+ return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer)
98
+
99
+
100
+ @out_dtype.py_impl(DispatchKey.CompositeExplicitAutograd)
101
+ def out_dtype_dense(op: torch._ops.OpOverload, output_dtype: torch.dtype, *args):
102
+ if is_int_mm(op, output_dtype, args):
103
+ return torch._int_mm(*args)
104
+ return out_dtype_fallback(op, output_dtype, *args)
105
+
106
+
107
+ def is_int_mm(op, output_dtype, args):
108
+ return (
109
+ op == torch.ops.aten.mm.default
110
+ and output_dtype == torch.int32
111
+ and len(args) == 2
112
+ and args[0].dtype == torch.int8
113
+ and args[1].dtype == torch.int8
114
+ and args[0].is_cuda
115
+ and args[1].is_cuda
116
+ )
117
+
118
+
119
+ def out_dtype_fallback(op, output_dtype, *args):
120
+ flat_inputs = pytree.arg_tree_leaves(*args) + [torch.ones(1, dtype=output_dtype)]
121
+ promote_dtype: torch.dtype = elementwise_dtypes(
122
+ *flat_inputs,
123
+ type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
124
+ )[0]
125
+
126
+ casted_args = pytree.tree_map_only(
127
+ torch.Tensor, lambda arg: arg.to(dtype=promote_dtype), args
128
+ )
129
+ res = op(*casted_args).to(dtype=output_dtype)
130
+ return res
131
+
132
+
133
+ out_dtype.py_autograd_impl(autograd_not_implemented(out_dtype, deferred_error=True))
134
+
135
+
136
+ @out_dtype.py_impl(ProxyTorchDispatchMode)
137
+ def out_dtype_proxy(
138
+ mode: ProxyTorchDispatchMode,
139
+ op: torch._ops.OpOverload,
140
+ output_dtype: torch.dtype,
141
+ *args,
142
+ ):
143
+ return trace_out_dtype(mode, out_dtype, op, output_dtype, *args)
144
+
145
+
146
+ @out_dtype.py_impl(FakeTensorMode)
147
+ def out_dtype_fake_tensor_mode(
148
+ mode: FakeTensorMode,
149
+ op: torch._ops.OpOverload,
150
+ output_dtype: torch.dtype,
151
+ *args,
152
+ ):
153
+ with mode:
154
+ return out_dtype_dense(op, output_dtype, *args)
155
+
156
+
157
+ @out_dtype.py_functionalize_impl
158
+ def out_dtype_func(ctx, op, output_dtype, *args):
159
+ unwrapped_args = tuple(ctx.unwrap_tensors(arg) for arg in args)
160
+
161
+ with ctx.redispatch_to_next():
162
+ res = out_dtype(op, output_dtype, *unwrapped_args)
163
+ return ctx.wrap_tensors(res)
archive/.venv/Lib/site-packages/torch/_higher_order_ops/run_const_graph.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import torch
3
+ from torch._C import DispatchKey
4
+ from torch._higher_order_ops.utils import autograd_not_implemented
5
+ from torch._ops import HigherOrderOperator
6
+ from torch._subclasses.fake_tensor import FakeTensorMode
7
+ from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree
8
+ from torch.utils import _pytree as pytree
9
+
10
+
11
+ class RunConstGraph(HigherOrderOperator):
12
+ def __init__(self):
13
+ super().__init__("run_const_graph")
14
+
15
+ def __call__(self, graph, args):
16
+ return super().__call__(graph, args)
17
+
18
+
19
+ run_const_graph = RunConstGraph()
20
+
21
+
22
+ @run_const_graph.py_impl(ProxyTorchDispatchMode)
23
+ def run_const_graph_dispatch_mode(mode, graph, args):
24
+ const_gm, weights = graph, args
25
+ p_args = pytree.tree_map(mode.tracer.unwrap_proxy, (graph, args))
26
+ assert isinstance(const_gm, torch.fx.GraphModule)
27
+ assert not hasattr(mode.tracer.root, "_const_graph")
28
+ mode.tracer.root.register_module("_const_graph", const_gm)
29
+
30
+ proxy = mode.tracer.create_proxy("call_function", run_const_graph, p_args, {})
31
+
32
+ out = const_gm(*weights)
33
+ return track_tensor_tree(out, proxy, constant=None, tracer=mode.tracer)
34
+
35
+
36
+ @run_const_graph.py_functionalize_impl
37
+ def run_const_graph_functional(ctx, graph, args):
38
+ unwrapped_args = ctx.unwrap_tensors(args)
39
+
40
+ with ctx.redispatch_to_next():
41
+ out = run_const_graph(*unwrapped_args)
42
+ return ctx.wrap_tensors(out)
43
+
44
+
45
+ run_const_graph.py_autograd_impl(
46
+ autograd_not_implemented(run_const_graph, deferred_error=True)
47
+ )
48
+
49
+
50
+ @run_const_graph.py_impl(FakeTensorMode)
51
+ def run_const_graph_fake_tensor_mode(mode, graph, args):
52
+ assert isinstance(graph, torch.fx.GraphModule)
53
+ with mode:
54
+ return graph(*args)
55
+
56
+
57
+ @run_const_graph.py_impl(DispatchKey.CPU)
58
+ def run_const_graph_cpu(graph, args):
59
+ assert isinstance(graph, torch.fx.GraphModule)
60
+ return graph(*args)
archive/.venv/Lib/site-packages/torch/_higher_order_ops/scan.py ADDED
@@ -0,0 +1,929 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import functools
3
+ import itertools
4
+ from collections.abc import Sequence
5
+ from typing import Any, Callable, Optional
6
+
7
+ import torch
8
+ import torch._prims_common as utils
9
+ import torch.utils._pytree as pytree
10
+ from torch._C import DispatchKey
11
+ from torch._higher_order_ops.cond import create_bw_fn
12
+ from torch._higher_order_ops.utils import (
13
+ _maybe_compile_and_run_fn,
14
+ check_meta_consistency,
15
+ first_slice_copy,
16
+ materialize_as_graph,
17
+ reenter_make_fx,
18
+ save_tensors_and_symints_for_backward,
19
+ saved_tensors_and_symints,
20
+ unique_graph_id,
21
+ validate_subgraph_args_types,
22
+ )
23
+ from torch._ops import HigherOrderOperator
24
+ from torch._subclasses.fake_tensor import FakeTensorMode
25
+ from torch.fx.experimental.proxy_tensor import (
26
+ disable_proxy_modes_tracing,
27
+ ProxyTorchDispatchMode,
28
+ track_tensor_tree,
29
+ )
30
+ from torch.utils._python_dispatch import _get_current_dispatch_mode
31
+
32
+
33
+ aten = torch._ops.ops.aten
34
+
35
+
36
+ def wrap_combine_fn_flat(
37
+ *args, combine_fn, spec_init, spec_xs, num_init_leaves, num_inp_leaves
38
+ ):
39
+ assert len(args) == (
40
+ num_init_leaves + num_inp_leaves
41
+ ), f"Combin_fn received wrong number of arguments, expected {num_init_leaves + num_inp_leaves}, but got {len(args)}"
42
+ carry = pytree.tree_unflatten(args[:num_init_leaves], spec_init)
43
+ xs = pytree.tree_unflatten(args[num_init_leaves:], spec_xs)
44
+ return combine_fn(carry, xs)
45
+
46
+
47
+ def _extract_carry_and_out(flat_out: list[Any], num_carry: int):
48
+ return split_into_chunks(flat_out, [num_carry, len(flat_out) - num_carry])
49
+
50
+
51
+ # We also do a clone with contiguous_format. This is to be consistent with
52
+ # eager semantic of scan, which stacks the outputs. The result is contiguous
53
+ # as a result of the stack operation.
54
+ def stack_y(y: torch.Tensor, scan_length: int) -> torch.Tensor:
55
+ return (
56
+ y.unsqueeze(0)
57
+ .repeat(*([scan_length] + [1] * y.ndim))
58
+ .clone(memory_format=torch.contiguous_format)
59
+ )
60
+
61
+
62
+ # NOTE: These functions can be reused in associative_scan and eventually moved to
63
+ # torch._higher_order_ops.utils
64
+ def get_tensor_mask(tensor_list: list[Any]) -> list[bool]:
65
+ # Returns a mask whether a list element is a tensor or not
66
+ return [True if isinstance(v, torch.Tensor) else False for v in tensor_list]
67
+
68
+
69
+ def mask_list(
70
+ mask: list[bool], inp: list[Any], other: Optional[list[Any]] = None
71
+ ) -> list[Any]:
72
+ # Masks elements on an `inp` list.
73
+ # If other is None, then the elements of the `inp` list where the mask is False are removed
74
+ # If other is not None, then the elements of the `inp` list where the mask is False are
75
+ # replaced with the elements of the `other` list
76
+ assert len(mask) == len(
77
+ inp
78
+ ), "The length of the mask needs to be identical to the length of the input"
79
+ if other is not None:
80
+ assert len(inp) == len(
81
+ other
82
+ ), "If an input and an other list is provided, they need to have the same length"
83
+ return [i if m else o for m, i, o in zip(mask, inp, other)]
84
+ else:
85
+ return [i for m, i in zip(mask, inp) if m]
86
+
87
+
88
+ def first_slice_copy_with_grad(li: list[Any]) -> list[Any]:
89
+ # First_slice_copy does not keep the original requires_grad flag,
90
+ # but we need it for materialize_as_graph
91
+ # in order to compute the correct gradients
92
+ # The reason why first_slice_copy doesn't keep requires_grad flag is
93
+ # because it's called in torch.autograd.Function.backward/forward.
94
+ slc = [first_slice_copy(x).requires_grad_(x.requires_grad) for x in li]
95
+ return slc
96
+
97
+
98
+ def split_into_chunks(iterable: Sequence[Any], chunk_sizes: list[int]) -> list[Any]:
99
+ it = iter(iterable)
100
+ assert sum(chunk_sizes) == len(
101
+ iterable
102
+ ), "the sum of all chunks needs to match the length of the iterable."
103
+ return [list(itertools.islice(it, size)) for size in chunk_sizes]
104
+
105
+
106
+ def call_operator(operator, *args):
107
+ return pytree.tree_leaves(operator(*args))
108
+
109
+
110
+ def scan(
111
+ combine_fn: Callable[
112
+ [pytree.PyTree, pytree.PyTree], tuple[pytree.PyTree, pytree.PyTree]
113
+ ],
114
+ init: pytree.PyTree,
115
+ xs: pytree.PyTree,
116
+ *,
117
+ dim: int = 0,
118
+ reverse: bool = False,
119
+ ) -> tuple[pytree.PyTree, pytree.PyTree]:
120
+ r"""
121
+ Performs an inclusive scan with a combine function.
122
+
123
+ .. warning::
124
+ `torch.scan` is a prototype feature in PyTorch. It currently
125
+ does not support autograd and you may run into miscompiles.
126
+ Read more about feature classification at:
127
+ https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype
128
+
129
+ Args:
130
+ combine_fn (Callable): A binary callable with type ``(Tensor, Tensor) -> (Tensor, Tensor)``,
131
+ or if xs is a pytree ``(pytree, pytree) -> (pytree, pytree)``.
132
+ The first input to ``combine_fn`` is the previous or initial scan carry
133
+ and the second input element to ``combine_fn`` is a slice of the input along dim.
134
+ The first output element of ``combine_fn`` is the next scan carry
135
+ and the second output of ``combine_fn`` represents a slice of the output.
136
+ This function must be pure, i.e., no lifted arguments are supported at the moment
137
+ and may not have any side effects.
138
+ init (torch.Tensor or pytree with tensor leaves): The inital scan carry, a tensor, or nested pytree of tensors.
139
+ The ``init`` is expected to have the same pytree structure as the first output element (i.e. carry)
140
+ of ``combine_fn``.
141
+ xs (torch.Tensor or pytree with tensor leaves): The input tensor, or nested pytree of tensors.
142
+
143
+ Kwargs:
144
+ dim (int): the dimension to scan over, default 0.
145
+ reverse (bool): A boolean stating if the scan should be reversed with respect to ``dim``, default ``False``.
146
+
147
+ Returns:
148
+ final_carry (torch.Tensor or pytree with tensor leaves),
149
+ the final carry of the scan operation with same pytree structure as init.
150
+ out (torch.Tensor or pytree with tensor leaves),
151
+ each tensor leaf is a stacked output along first dim, where each slice is the output of a scan iteration.
152
+
153
+ Restrictions:
154
+ - The combine_fn shouldn't have any aliasing between input-input, input-output, and output-output. E.g. return a view
155
+ or the same tensor as input is not supported. As a workaround, can clone the output to avoid aliasing.
156
+
157
+ - The combine_fn shoudn't mutate any inputs. We'll remove the mutation restriction for inference soon. Please file an issue
158
+ if you input mutation support for training is needed.
159
+
160
+ - The combine_fn's init carry should match the next_carry in pytree structure and in tensor metadata.
161
+
162
+ Example::
163
+
164
+ def add(x: torch.Tensor, y: torch.Tensor):
165
+ next_carry = y = x + y
166
+ # clone the output to avoid output-output aliasing
167
+ return next_carry, y.clone()
168
+
169
+ i0 = torch.zeros(1)
170
+ xs = torch.arange(5)
171
+ # returns torch.tensor([10.]), torch.tensor([[0], [1.], [3.], [6.], [10.]])
172
+ last_carry, cumsum = scan(add, init=i0, xs=xs)
173
+
174
+
175
+ """
176
+ # The reason we flatten init and xs before calling into dynamo is that
177
+ # we want to create a consistent input ordering for combine_fn
178
+ # and we also want to the input ordering matches the output ordering.
179
+ leaves_init, spec_init = pytree.tree_flatten(init)
180
+ leaves_xs_orig, spec_xs = pytree.tree_flatten(xs)
181
+
182
+ # Shortcut if no xs is provided
183
+ if len(leaves_xs_orig) == 0:
184
+ return init, []
185
+
186
+ def _validate_input(cfn, lxs, linit, d, r):
187
+ # Basic arguments check
188
+ if not callable(cfn):
189
+ raise RuntimeError("Combine_fn must be a callable, but got {cfn}")
190
+ if not isinstance(d, int):
191
+ raise RuntimeError("Dim must be an int, but got " + str(type(d)))
192
+ if not isinstance(r, bool):
193
+ raise RuntimeError("Reverse must be a bool, but got " + str(type(r)))
194
+
195
+ # Checks for init
196
+ if len(linit) == 0:
197
+ raise RuntimeError("scan() operator requires init leaves.")
198
+ for x in linit:
199
+ if not isinstance(x, torch.Tensor):
200
+ raise RuntimeError(f"All init leaves must be a Tensor but got {x}")
201
+
202
+ # Checks for xs
203
+ for x in lxs:
204
+ if not isinstance(x, torch.Tensor):
205
+ raise RuntimeError(f"All xs leaves must be a Tensor but got {x}")
206
+ if any(x.ndim <= d for x in lxs):
207
+ raise RuntimeError(
208
+ "All xs leaves must at least have 'dim' number of dimensions and scan dimension > 0"
209
+ )
210
+ if any(x.shape[d] == 0 for x in lxs):
211
+ raise RuntimeError(
212
+ "All xs leaves must at least have 'dim' number of dimensions and scan dimension > 0"
213
+ )
214
+
215
+ ndim = leaves_xs_orig[0].ndim
216
+ dim = utils.canonicalize_dim(ndim, dim)
217
+
218
+ _validate_input(combine_fn, leaves_xs_orig, leaves_init, dim, reverse)
219
+
220
+ # Move scan dim to 0 and always perform scan on dim 0
221
+ leaves_xs = []
222
+ for elem in leaves_xs_orig:
223
+ leaves_xs.append(torch.movedim(elem, dim, 0))
224
+
225
+ if reverse:
226
+ leaves_xs = [torch.flip(elem, [0]) for elem in leaves_xs]
227
+
228
+ # TODO: Support _inductor lowering
229
+ # TODO: Unify handling of pytrees for control flow ops, such as cond, while_loop, etc.
230
+
231
+ combine_fn = functools.partial(
232
+ wrap_combine_fn_flat,
233
+ combine_fn=combine_fn,
234
+ spec_init=spec_init,
235
+ spec_xs=spec_xs,
236
+ num_init_leaves=len(leaves_init),
237
+ num_inp_leaves=len(leaves_xs),
238
+ )
239
+
240
+ def run_flattened_scan(combine_fn, leaves_init, leaves_xs):
241
+ return scan_op(combine_fn, leaves_init, leaves_xs, additional_inputs=())
242
+
243
+ carry, out = _maybe_compile_and_run_fn(
244
+ run_flattened_scan,
245
+ combine_fn,
246
+ leaves_init,
247
+ leaves_xs,
248
+ )
249
+
250
+ if reverse:
251
+ out = pytree.tree_map(lambda elem: elem.flip([0]), out)
252
+
253
+ return carry, out
254
+
255
+
256
+ class ScanOp(HigherOrderOperator):
257
+ def __init__(self):
258
+ super().__init__("scan")
259
+
260
+ def __call__(self, combine_fn, init, xs, additional_inputs):
261
+ # There is currently an issue that the ScanOp is sometimes called with
262
+ # the additional_inputs being a list. See https://github.com/pytorch/pytorch/issues/145785
263
+ # Once this issue is resolved, the assertion should only allow tuples
264
+ # and the tuple cast should be removed
265
+ assert isinstance(
266
+ additional_inputs, (tuple, list)
267
+ ), "additional_inputs must be a tuple."
268
+ additional_inputs = (
269
+ tuple(additional_inputs)
270
+ if isinstance(additional_inputs, list)
271
+ else additional_inputs
272
+ )
273
+ validate_subgraph_args_types(additional_inputs)
274
+ return super().__call__(combine_fn, init, xs, additional_inputs)
275
+
276
+
277
+ scan_op = ScanOp()
278
+
279
+
280
+ def generic_scan(operator, init, xs, dim=0, additional_inputs=()):
281
+ def _scan(init, xs):
282
+ """Perform scan on `elems` using `elems_init."""
283
+ carry = init
284
+ if len(xs) == 0:
285
+ return carry, []
286
+
287
+ num_elems = xs[0].shape[dim]
288
+ ind = 0
289
+
290
+ # Compute dummy shapes for the pre-allocation
291
+ num_init_leaves = len(init)
292
+ dummy_carry, dummy_out = _extract_carry_and_out(
293
+ call_operator(
294
+ operator,
295
+ *carry,
296
+ *[first_slice_copy(elem, dim) for elem in xs],
297
+ *additional_inputs,
298
+ ),
299
+ num_init_leaves,
300
+ )
301
+
302
+ out_tensor_mask = get_tensor_mask(dummy_out)
303
+ dummy_out_masked = mask_list(out_tensor_mask, dummy_out)
304
+
305
+ # Pre-alocate
306
+ # outs -> Output matrix
307
+ # idxs -> Index matrix for scatter_
308
+ # out: (num_elems, M, N, ...)
309
+ # idx: (1, M, N)
310
+ outs = [
311
+ torch.zeros(
312
+ [num_elems] + list(e.size()),
313
+ dtype=e.dtype,
314
+ device=e.device,
315
+ )
316
+ for i, e in enumerate(dummy_out_masked)
317
+ ]
318
+ idxs = [
319
+ torch.ones_like(e, dtype=torch.int64).unsqueeze(0)
320
+ for i, e in enumerate(dummy_out_masked)
321
+ ]
322
+
323
+ def store_out_in_outs(out, ind):
324
+ # Store the intermediate out in the outs matrix
325
+ for o, x, idx in zip(outs, out, idxs):
326
+ # o: (num_elems, M, N ...)
327
+ # x: (M, N, ...) -> (1, M, N)
328
+ # ind * idx: (1, M, N,) with values to be ind
329
+ # essentially: o[ind][n][k] = x[0][n][k]
330
+ o.scatter_(0, ind * idx, x.unsqueeze(0))
331
+
332
+ for i in range(num_elems):
333
+ ind = i
334
+ carry, out = _extract_carry_and_out(
335
+ call_operator(
336
+ operator,
337
+ *carry,
338
+ *[elem.select(dim, ind) for elem in xs],
339
+ *additional_inputs,
340
+ ),
341
+ num_init_leaves,
342
+ )
343
+
344
+ # Store the inits in the outs matrix.
345
+ store_out_in_outs(mask_list(out_tensor_mask, out), ind)
346
+
347
+ # Expand outs with None depending on the tensor mask of the output
348
+ outs_expanded = [outs.pop(0) if out_m else None for out_m in out_tensor_mask]
349
+
350
+ return [*carry, *outs_expanded]
351
+
352
+ scans = _scan(init, xs)
353
+ return scans
354
+
355
+
356
+ def trace_scan(
357
+ proxy_mode,
358
+ func_overload,
359
+ combine_fn: Callable,
360
+ init: list[torch.Tensor],
361
+ xs: list[torch.Tensor],
362
+ additional_inputs: tuple[torch.Tensor],
363
+ ):
364
+ from torch._dynamo.utils import clone_input
365
+
366
+ with disable_proxy_modes_tracing():
367
+ sample_inits = [clone_input(x_init) for x_init in init]
368
+ sample_inputs = [first_slice_copy(x) for x in xs]
369
+ sample_additional_inputs = [
370
+ clone_input(x) if isinstance(x, torch.Tensor) else x
371
+ for x in additional_inputs
372
+ ]
373
+ combine_graph = reenter_make_fx(combine_fn)(
374
+ *sample_inits, *sample_inputs, *sample_additional_inputs
375
+ )
376
+
377
+ outputs = None
378
+ for node in combine_graph.graph.nodes:
379
+ if node.op == "output":
380
+ assert outputs is None
381
+ assert len(node.args) == 1
382
+ outputs = node.args[0]
383
+
384
+ assert outputs is not None
385
+
386
+ carry, output = _extract_carry_and_out(outputs, len(init))
387
+ init_fake_tensors: list[torch.Tensor | torch.SymInt | int] = [
388
+ i.clone() for i in init
389
+ ]
390
+ carry_fake_tensors: list[torch.Tensor | torch.SymInt | int] = [
391
+ c.meta["val"] for c in carry
392
+ ]
393
+ check_meta_consistency(
394
+ init_fake_tensors, carry_fake_tensors, "init", "carry", include_contiguity=False
395
+ )
396
+
397
+ _, combine_graph_name = unique_graph_id(proxy_mode, prefix="scan_combine_graph")
398
+
399
+ proxy_mode.tracer.root.register_module(combine_graph_name, combine_graph)
400
+
401
+ args = (combine_graph, init, xs, additional_inputs)
402
+ proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, args)
403
+ out_proxy = proxy_mode.tracer.create_proxy(
404
+ "call_function", func_overload, proxy_args, {}, name="scan"
405
+ )
406
+
407
+ with disable_proxy_modes_tracing():
408
+ scan_length = xs[0].shape[0]
409
+ fake_carry, fake_outputs = _extract_carry_and_out(
410
+ [o.meta["val"] for o in outputs], len(init)
411
+ )
412
+ out = (
413
+ *fake_carry,
414
+ *(stack_y(t, scan_length) for t in fake_outputs),
415
+ )
416
+
417
+ return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer)
418
+
419
+
420
+ @scan_op.py_impl(DispatchKey.CompositeExplicitAutograd)
421
+ def scan_op_dense(combine_fn, init, xs, additional_inputs):
422
+ mode = _get_current_dispatch_mode()
423
+ assert mode is None, "Mode should never be enabled for CPU/CUDA key"
424
+ return generic_scan(combine_fn, init, xs, additional_inputs=additional_inputs)
425
+
426
+
427
+ class ScanAutogradOp(torch.autograd.Function):
428
+ """
429
+ Example ::
430
+
431
+ def combine_fn(x: torch.Tensor, y: torch.Tensor):
432
+ next_carry = y = x * y
433
+ return next_carry, y
434
+
435
+ The ``combine_fn_bw``, computing the gradients for x and y of ``combine_fn`` is computed as:
436
+ def combine_fn_bw(x: torch.Tensor, y: torch.Tensor, g_carry: torch.Tensor, g_y: torch.Tensor):
437
+ return g_y * y + g_carry * y, g_y * x + g_carry * x
438
+
439
+ Note: In a real usecase of scan, there may be additional_inputs that participate in the
440
+ forward as well as in the backward of the scan operator. For the sake of readability those inputs
441
+ have been omitted in the following example, but are included in the subsequent detailed description below
442
+
443
+ The forward output of scan is computed as:
444
+ carry, ys = scan(combine_fn, init, xs).
445
+
446
+ This computation can be unpacked as
447
+ c_0, ys_0 = combine_fn(init, xs_0)
448
+ c_1, ys_1 = combine_fn(carry_0, xs_1)
449
+ c_2, ys_2 = combine_fn(carry_1, xs_2)
450
+ ...
451
+ c_T, ys_T = combine_fn(carry_(T-1), xs_T)
452
+
453
+ We collect c_0, c_1, ..., c_T into a vector of carries that we save for the backward,
454
+ but we only output (c_T, ys),
455
+ where ys is the vector of all intermediate outputs [y_0, y_1, ..., y_T].
456
+
457
+ Given the carries and the ys, the gradients for xs and for init can be computed as follows:
458
+ We receive the upstream gradients in torch.autograd.Function, i.e., we get g_c_T and g_ys,
459
+ where g_ys is the vector of all intermediate gradients of the outputs [g_ys_0, g_ys_1, ..., g_ys_T]
460
+
461
+ We then proceed to compute the gradients for the init (g_init) and the xs (g_xs) by running a
462
+ scan operation reverse over time. For example,
463
+
464
+ g_c_(T-1), g_xs_T = combine_fn_bw(c_(T-1), xs_T, g_c_T, g_ys_T)
465
+ g_c_(T-2), g_xs_(T-1) = combine_fn_bw(c_(T-2), xs_(T-1), g_c_(T-1), g_ys_(T-1))
466
+ g_c_(T-3), g_xs_(T-2) = combine_fn_bw(c_(T-3), xs_(T-2), g_c_(T-2), g_ys_(T-2))
467
+ ...
468
+ g_init, g_xs_1 = combine_fn_bw(c_0, xs_1, g_c_0, g_ys_1)
469
+ 0 , g_xs_0 = combine_fn_bw(init, xs_0, g_init, g_ys_0),
470
+
471
+ where combine_fn_bw takes the forward inputs of step t (i.e. c_(t-1), xs_t),
472
+ the gradients of the carry of step t (i.e. g_c_t) and
473
+ the upstream gradient of the output of step t (i.e. g_ys_T)
474
+ and returns the gradient of xs_t -> g_xs_t, as well as the gradient for the carry of step t-1 -> g_c_(t-1).
475
+
476
+ Through this procedure we end up with the
477
+ gradients for the init -> g_init,
478
+ the gradients for the xs -> g_xs.
479
+
480
+
481
+ NOTE: [scan autograd implementation]
482
+
483
+ The forward of scan can be computed as:
484
+ 1.) Prepare the forward graph wrapper ``combine_fn_with_carry_checkpoint``:
485
+ To use a scan operation for the backward path as well, we need access to the carries from all steps.
486
+ Thus, the function ``combine_fn`` is wrapped such that it returns all carries and not only the last carry.
487
+ In particular, we define ``combine_fn_with_carry_checkpoint``:
488
+ def combine_fn_with_carry_checkpoint(x: torch.Tensor, y: torch.Tensor):
489
+ carry, y = combine_fn(x, y)
490
+ return carry, (carry, y)
491
+
492
+ The scan operator will stack all outputs along the scan dimension.
493
+ Thus, by putting next_carry also into outputs of ``combine_fn_with_carry_checkpoint``,
494
+ the carries from all steps will be stacked and hence gives us chekpointed_carries
495
+
496
+ 2.) Compute all carries, the last carry and all outputs using ``combine_fn_with_carry_checkpoint``:
497
+ c_T, (carries, ys) = scan_op(combine_fn_with_carry_checkpoint, init, xs, additional_inputs),
498
+ Where c_T (last carry) and ys (all outputs) are the original results of scan with the ``combine_fn``.
499
+ However, carries are checkpointed carries from all steps.
500
+ As a result of the forward, only the last carry c_T and the ys are returned,
501
+ while all carries are saved for the backward.
502
+
503
+ The backward of scan can be computed as:
504
+
505
+ 3.) Prepare the backward graph:
506
+ We prepare the backward graph to be used in the backward function.
507
+ We utilize ``create_bw_fn`` to generate the joint function, i.e.,
508
+ ctx._combine_fn_bw = create_bw_fn(ctx._combine_fn, fw_operands), where fw_operands = [init, xs_0, additional_inputs]
509
+
510
+ The ctx._combine_fn_bw requires the primals (operands)
511
+ followed by the tangents (upstream gradients) from a single step
512
+ and produces the gradients of that step, i.e.,
513
+ g_c_(T-1), g_xs_T, g_additional_input_T = ctx._combine_fn_bw(c_(T-1), xs_T, additional_inputs, g_c_T, g_ys_T).
514
+
515
+ 4.) Create a wrapper of the ``combine_fn_bw``, i.e., ``combine_fn_bw_grad_accumulation``:
516
+ In the forward, there may be additional inputs that participate in every forward step.
517
+ The gradients for those additional inputs are also computed at every step and need to be accumulated over all steps,
518
+ which is taken care of in this wrapper. For example:
519
+ def combine_fn_bw_grad_accumulation(*args):
520
+ carried_g_additional_input = args[:num_additional_inputs]
521
+ inputs_bw_fn = args[num_additional_inputs:]
522
+ g_c_(t-1), g_xs_t, g_additional_input_t = ctx._combine_fn_bw(*inputs_bw_fn)
523
+ new_g_additional_inputs = carried_g_additional_input + g_additional_input_t
524
+ # The ``new_g_additional_inputs`` and the ``g_c_t`` are encoded in the carry of the backward scan operator
525
+ # The ``g_xs_t`` is encoded as the output of the backward scan operator
526
+ return [*new_g_additional_inputs, *g_c_t, *g_xs_t]
527
+
528
+ 5.) Perform the backward scan as
529
+ g_additional_inputs, g_init, g_xs = scan_op(combine_fn_bw_grad_accumulation, bw_init, bw_xs), where
530
+ bw_init consists of the initial gradient carry for the additional_inputs (initialized with 0s):
531
+ initial_g_additional_inputs, and the gradient of the last carry: g_c_T. Thus:
532
+ bwd_init = [*initial_g_additional_inputs, *g_c_T].
533
+
534
+ bw_xs consists of the combination of the upstream gradients g_ys,
535
+ the forward carries prepended with the fw_init, i.e., bw_carries = concat([fw_init, fw_carries[:-1]]) and
536
+ the fw_xs. In particular,
537
+ bwd_xs = [*g_ys, *bw_carries, *fw_xs].
538
+
539
+ Note: g_c_T and g_ys are provided through the torch.autograd.Function.backward's input
540
+
541
+ As demonstrated in the Example above, this backward scan then yields the gradient for the init -> g_init
542
+ and the gradient for the xs -> g_xs
543
+
544
+ NOTE: [scan partial grad handling]
545
+ If any element of init, of xs, of the outputs or of the additional_inputs does not require gradients,
546
+ i.e., requires_grad=False, there will be still gradients returned for those elements,
547
+ but those gradients will be a tensor filled with zeros of the same shape as the element itself.
548
+
549
+ A special case are additional_inputs that are not tensors. Such inputs can occur for example with symbolic tracing,
550
+ where the shape symbol (SymInt) becomes an additional_input.
551
+ For such cases, we compute a ``additional_inputs_tensor_mask``, which is True for elements of additional_inputs
552
+ that are tensors and False otherwise. Gradients of additional_inputs are only accumulated if this mask is True,
553
+ otherwise, the value of initial_g_additional_inputs is passed, which is None for non-Tensor values.
554
+ """
555
+
556
+ @staticmethod
557
+ def forward(
558
+ ctx,
559
+ combine_fn,
560
+ num_leaves_init,
561
+ num_leaves_xs,
562
+ num_additional_inputs,
563
+ *operands,
564
+ ):
565
+ ctx._num_leaves_init = num_leaves_init
566
+ ctx._num_leaves_xs = num_leaves_xs
567
+ ctx._num_additional_inputs = num_additional_inputs
568
+ ctx._combine_fn = combine_fn
569
+ init, xs, additional_inputs = split_into_chunks(
570
+ operands, [num_leaves_init, num_leaves_xs, num_additional_inputs]
571
+ )
572
+ additional_inputs_tensor_mask = get_tensor_mask(additional_inputs)
573
+ ctx._additional_inputs_tensor_mask = additional_inputs_tensor_mask
574
+
575
+ # We snapshot the dispatch keys in forward for materializing the
576
+ # the bw_graph in backward.
577
+ ctx._fw_include_key_set = torch._C._dispatch_tls_local_include_set()
578
+ ctx._fw_exclude_key_set = torch._C._dispatch_tls_local_exclude_set()
579
+
580
+ # 1.) Prepare the forward graph wrapper ``combine_fn_with_carry_checkpoint``
581
+ # The wrapper of the forward graph returns carries from all iterations,
582
+ # not just from the last iteration. These are required in the backward path
583
+ def combine_fn_with_carry_checkpoint(*args):
584
+ carry, y = _extract_carry_and_out(combine_fn(*args), num_leaves_init)
585
+ return [
586
+ *carry,
587
+ # We additionally checkpoint all the intemediate carry outputs for backward.
588
+ *[
589
+ n_c.clone().detach() if isinstance(n_c, torch.Tensor) else n_c
590
+ for n_c in carry
591
+ ],
592
+ *y,
593
+ ]
594
+
595
+ with torch._C._AutoDispatchBelowAutograd():
596
+ # 2.) Compute the all carries, the last carry and all outputs using ``combine_fn_with_carry_checkpoint``
597
+ c_T, carries_ys = _extract_carry_and_out(
598
+ scan_op(
599
+ combine_fn_with_carry_checkpoint,
600
+ init,
601
+ xs,
602
+ additional_inputs,
603
+ ),
604
+ num_leaves_init,
605
+ )
606
+
607
+ # Collect the carries for each time step from the outs
608
+ # and save them for the backward path
609
+ carries = list(carries_ys[:num_leaves_init])
610
+ ys = list(carries_ys[num_leaves_init:])
611
+ save_tensors_and_symints_for_backward(ctx, list(operands) + carries + ys)
612
+ ctx._num_leaves_ys = len(ys)
613
+
614
+ return (*c_T, *ys)
615
+
616
+ @staticmethod
617
+ def backward(ctx, *flat_grads):
618
+ r"""
619
+ This function computes the gradients of the scan operation.
620
+ It does so by using a scan operator using all carries and the upstream gradients (see description above)
621
+
622
+ Args:
623
+ flat_grads (torch.Tensor): The tensor of flattened upstream gradients.
624
+ """
625
+
626
+ # Collect the saved items from the forward
627
+ num_leaves_init = ctx._num_leaves_init
628
+ num_leaves_xs = ctx._num_leaves_xs
629
+ num_leaves_ys = ctx._num_leaves_ys
630
+ num_additional_inputs = ctx._num_additional_inputs
631
+ additional_inputs_tensor_mask = ctx._additional_inputs_tensor_mask
632
+
633
+ def prepend_init_to_carries(init, carries):
634
+ # Prepare the carries for the backward path.
635
+ # This requires to concatenate the init and the carries
636
+ return [
637
+ torch.cat([torch.unsqueeze(i, 0), c[:-1]], dim=0)
638
+ for i, c in zip(init, carries)
639
+ ]
640
+
641
+ def initialize_g_additional_inputs(
642
+ additional_inputs,
643
+ ):
644
+ # The initial gradients for the additional_inputs are all zeros
645
+ g_additional_inputs = [
646
+ torch.zeros_like(ai) if ai_tm else None
647
+ for ai_tm, ai in zip(additional_inputs_tensor_mask, additional_inputs)
648
+ ]
649
+ return g_additional_inputs
650
+
651
+ # Retrieve the forward inputs and the forward outputs and dissect them
652
+ flat_args = saved_tensors_and_symints(ctx)
653
+ fw_init, fw_xs, additional_inputs, fw_carries, fw_ys = split_into_chunks(
654
+ flat_args,
655
+ [
656
+ num_leaves_init,
657
+ num_leaves_xs,
658
+ num_additional_inputs,
659
+ num_leaves_init,
660
+ num_leaves_ys,
661
+ ],
662
+ )
663
+
664
+ # 3.) Prepare the backward graph
665
+ fw_operands = (
666
+ *fw_init,
667
+ *[first_slice_copy(xs) for xs in fw_xs],
668
+ *additional_inputs,
669
+ )
670
+ ctx._combine_fn_bw = create_bw_fn(ctx._combine_fn, fw_operands)
671
+
672
+ # 4.) Create the BW wrapper to accumulate the gradients for the additional_inputs
673
+ def combine_fn_bw_grad_accumulation(*args):
674
+ # Dissect args and re-order them for the ``ctx._combine_fn_bw``
675
+ # The content of ``combine_fn_bw_tangents`` is [*carries_g, *outs_g]
676
+ # The content of ``combine_fn_bw_primals`` is [*init, *xs, *additional_inputs]
677
+ (
678
+ carried_g_additional_input,
679
+ combine_fn_bw_tangents,
680
+ combine_fn_bw_primals,
681
+ ) = split_into_chunks(
682
+ args,
683
+ [
684
+ num_additional_inputs,
685
+ num_leaves_init + num_leaves_ys,
686
+ num_leaves_init + num_leaves_xs + num_additional_inputs,
687
+ ],
688
+ )
689
+ combine_fn_bw_args = (*combine_fn_bw_primals, *combine_fn_bw_tangents)
690
+
691
+ g_c_t, g_xs_t, g_additional_inputs_t = split_into_chunks(
692
+ ctx._combine_fn_bw(*combine_fn_bw_args),
693
+ [num_leaves_init, num_leaves_xs, num_additional_inputs],
694
+ )
695
+
696
+ new_g_additional_inputs = [
697
+ # If the additional inputs are ints or SymInts, those values are taken as is and no gradients are added
698
+ carr_g + curr_g if add_inp_tm else carr_g
699
+ for add_inp_tm, carr_g, curr_g in zip(
700
+ additional_inputs_tensor_mask,
701
+ carried_g_additional_input,
702
+ g_additional_inputs_t,
703
+ )
704
+ ]
705
+
706
+ # The ``new_g_additional_inputs`` and the ``g_c_t`` are encoded in the carry of the backward scan operator
707
+ # The ``g_xs_t`` is encoded as the output of the backward scan operator
708
+ return [*new_g_additional_inputs, *g_c_t, *g_xs_t]
709
+
710
+ # Materialize the ``combine_fn_bw_grad_accumulation``
711
+ def construct_args_single_step_bw():
712
+ # This function constructs the arguments for a single step of the backward scan.
713
+ # In other words, it creates the arguments for ``combine_fn_bw_grad_accumulation``
714
+ # The order of the arguments returned is identical to the order the backward scan
715
+ # operations provides
716
+
717
+ # The following arguments are used for the backward part of the joint graph
718
+ # The first argument relates to the gradient accumulation of the additional inputs.
719
+ # Because only tensor elements of additional inputs can have requires_grad=True,
720
+ # the values for non-tensor elements of additional inputs are None
721
+ masked_additional_inputs = [
722
+ a.clone() if add_inp_tm else None
723
+ for add_inp_tm, a in zip(
724
+ additional_inputs_tensor_mask, additional_inputs
725
+ )
726
+ ]
727
+
728
+ # The second argument relates to the gradients of the carries.
729
+ # Because the arguments are for a single step only,
730
+ # only the first slice of the carries is used.
731
+ sliced_carries = [first_slice_copy(c) for c in fw_carries]
732
+
733
+ # The third argument relates to the gradients of the ys.
734
+ # Because the arguments are for a single step only,
735
+ # only the first slice of the ys is used.
736
+ sliced_ys = [first_slice_copy(o) for o in fw_ys]
737
+
738
+ # The following arguments are used for the forward part of the joint graph
739
+ # The fourth argument relates to the init for the forward.
740
+ # I.e., fw_init
741
+
742
+ # The fifth argument relates to the xs for the forward.
743
+ # Because the arguments are for a single step only,
744
+ # only the first slice of the xs is used.
745
+ # Note: It is important to preserve the requires_grad flag of xs
746
+ # and thus we use the wrapper function ``first_slice_copy_with_grad``
747
+ fw_xs_slice = first_slice_copy_with_grad(fw_xs)
748
+
749
+ # The last argument relates to the additional inputs for the forward.
750
+ # I.e., additional_inputs
751
+
752
+ return (
753
+ *masked_additional_inputs,
754
+ *sliced_carries,
755
+ *sliced_ys,
756
+ *fw_init,
757
+ *fw_xs_slice,
758
+ *additional_inputs,
759
+ )
760
+
761
+ args_single_step_bw = construct_args_single_step_bw()
762
+
763
+ # TODO: we need to materialize the bw graphs because dynamo is unable to
764
+ # trace through the joint function when torch.compile torch.autograd.grad.
765
+ combine_fn_bw_grad_accumulation_gm = materialize_as_graph(
766
+ combine_fn_bw_grad_accumulation,
767
+ args_single_step_bw,
768
+ ctx._fw_include_key_set,
769
+ ctx._fw_exclude_key_set,
770
+ force_enable_grad=True,
771
+ )
772
+
773
+ # Decompose the flat_grads into g_c_T, g_ys
774
+ g_c_T, g_ys = split_into_chunks(flat_grads, [num_leaves_init, num_leaves_ys])
775
+
776
+ # Initialize the g_additional_inputs with zero-tensors.
777
+ # This step is necessary because the gradients of the additional inputs are accumulated in the
778
+ # ``wrapper_bwd_combine_fn`` and thus need a zero-initialized starting point
779
+ initial_g_additional_inputs = initialize_g_additional_inputs(additional_inputs)
780
+
781
+ # Prepend the inits to the carries.
782
+ # This is needed, because when computing the gradients, the last carry is not needed
783
+ # but the first carry, the init, is required.
784
+ bw_carries = prepend_init_to_carries(fw_init, fw_carries)
785
+
786
+ # Prepare the xs for the backward scan.
787
+ bwd_xs = [*g_ys, *bw_carries, *fw_xs]
788
+
789
+ # The flipping of the ``bwd_xs`` is necessary because the scan_op in the backward is always performed in reverse
790
+ bwd_xs = [torch.flip(elem, [0]) for elem in bwd_xs]
791
+
792
+ # Prepare the bwd_init
793
+ bwd_init = [*initial_g_additional_inputs, *g_c_T]
794
+
795
+ # 5.) Perform the backwrad scan:
796
+ # The ``combine_fn_bw_wrapped`` receives the
797
+ # initial_g_additional_inputs and the last carry as the ``bwd_init`` and the
798
+ # gradients of the outputs (g_ys), as well as the fw_carries and the fw_xs of the forward as the ``bwd_xs``
799
+ gradients = scan_op(
800
+ combine_fn_bw_grad_accumulation_gm,
801
+ bwd_init,
802
+ bwd_xs,
803
+ additional_inputs,
804
+ )
805
+
806
+ # Unpack the computed gradients
807
+ g_additional_inputs, g_init, g_xs = split_into_chunks(
808
+ gradients, [num_additional_inputs, num_leaves_init, num_leaves_xs]
809
+ )
810
+
811
+ # The flipping back along the scan dimension is required to get the gradients in the right order for ``xs``
812
+ g_xs = [torch.flip(elem, [0]) for elem in g_xs]
813
+
814
+ return *[None] * 4, *g_init, *g_xs, *g_additional_inputs
815
+
816
+
817
+ @scan_op.py_autograd_impl
818
+ def scan_autograd(combine_fn, init, xs, additional_inputs):
819
+ num_leaves_init = len(init)
820
+ num_leaves_xs = len(xs)
821
+ num_additional_inputs = len(additional_inputs)
822
+
823
+ flat_out = ScanAutogradOp.apply(
824
+ combine_fn,
825
+ num_leaves_init,
826
+ num_leaves_xs,
827
+ num_additional_inputs,
828
+ *(tuple(init) + tuple(xs) + additional_inputs),
829
+ )
830
+ return *flat_out[:num_leaves_init], *flat_out[num_leaves_init:]
831
+
832
+
833
+ @scan_op.py_impl(ProxyTorchDispatchMode)
834
+ def scan_proxy_mode(mode, combine_fn, init, xs, additional_inputs):
835
+ return trace_scan(mode, scan_op, combine_fn, init, xs, additional_inputs)
836
+
837
+
838
+ @scan_op.py_impl(FakeTensorMode)
839
+ def scan_fake_tensor_mode(mode, combine_fn, init, xs, additional_inputs):
840
+ with mode:
841
+ scan_length = xs[0].shape[0]
842
+ carry, outputs = _extract_carry_and_out(
843
+ combine_fn(
844
+ *init,
845
+ *[first_slice_copy(inp) for inp in xs],
846
+ *additional_inputs,
847
+ ),
848
+ len(init),
849
+ )
850
+ out = (
851
+ *carry,
852
+ *(stack_y(t, scan_length) for t in outputs),
853
+ )
854
+ return out
855
+
856
+
857
+ @scan_op.py_functionalize_impl
858
+ def scan_functionalize(ctx, combine_fn, init, xs, additional_inputs):
859
+ from torch._higher_order_ops.utils import (
860
+ _check_alias_and_mutation,
861
+ _maybe_run_with_interpreter,
862
+ )
863
+
864
+ unwrapped_xs = ctx.unwrap_tensors(xs)
865
+ unwrapped_init = ctx.unwrap_tensors(init)
866
+ unwrapped_additional_inputs = ctx.unwrap_tensors(additional_inputs)
867
+
868
+ with ctx.redispatch_to_next():
869
+ functional_combine_fn = ctx.functionalize(
870
+ _maybe_run_with_interpreter(combine_fn)
871
+ )
872
+ sample_unwrapped_xs_sliced = [first_slice_copy(inp) for inp in unwrapped_xs]
873
+ sample_inputs = list(
874
+ itertools.chain(
875
+ unwrapped_init,
876
+ sample_unwrapped_xs_sliced,
877
+ unwrapped_additional_inputs,
878
+ )
879
+ )
880
+ pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch
881
+ _check_alias_and_mutation(combine_fn, sample_inputs, "scan", pre_dispatch)
882
+ ret = scan_op(
883
+ functional_combine_fn,
884
+ unwrapped_init,
885
+ unwrapped_xs,
886
+ unwrapped_additional_inputs,
887
+ )
888
+ return ctx.wrap_tensors(ret)
889
+
890
+
891
+ # dense implementation for scan. Used for testing only.
892
+ def _fake_scan(combine_fn, init, xs=None, dim=0, reverse=False):
893
+ carry_leaves, carry_spec = pytree.tree_flatten(init)
894
+ inp_leaves, inp_spec = pytree.tree_flatten(xs)
895
+ if xs is None or len(inp_leaves) == 0:
896
+ return init, []
897
+ result_flat = []
898
+ carry = carry_leaves
899
+ op = reversed if reverse else lambda x: x
900
+
901
+ dummy_carry, dummy_out = combine_fn(
902
+ pytree.tree_unflatten(carry, carry_spec),
903
+ pytree.tree_unflatten(
904
+ [first_slice_copy(elem, dim) for elem in inp_leaves],
905
+ inp_spec,
906
+ ),
907
+ )
908
+ dummy_out_leaves, dummy_out_spec = pytree.tree_flatten(dummy_out)
909
+ num_leaves = len(dummy_out_leaves)
910
+
911
+ for ind in op(range(inp_leaves[0].size(dim))):
912
+ xs = [elem.select(dim, ind) for elem in inp_leaves]
913
+
914
+ carry, y = combine_fn(
915
+ pytree.tree_unflatten(carry, carry_spec),
916
+ pytree.tree_unflatten(xs, inp_spec),
917
+ )
918
+ carry, _ = pytree.tree_flatten(carry)
919
+ y, _ = pytree.tree_flatten(y)
920
+ result_flat.append(y)
921
+
922
+ results = [
923
+ torch.stack([e[leave_ind] for e in op(result_flat)])
924
+ for leave_ind in range(num_leaves)
925
+ ]
926
+ return (
927
+ pytree.tree_unflatten(carry, carry_spec),
928
+ pytree.tree_unflatten(results, dummy_out_spec),
929
+ )
archive/.venv/Lib/site-packages/torch/_higher_order_ops/schema.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from dataclasses import dataclass
3
+ from typing import Any, Optional
4
+
5
+ import torch
6
+ import torch.utils._pytree as pytree
7
+ from torch.fx.node import Target
8
+
9
+
10
+ # Below is an implementation of generating FunctionSchema from example values.
11
+ # This is helpful for generating FunctionSchema for HigherOrderOperator, where
12
+ # we don't have a function to inspect and each call of the higher order operator
13
+ # would have different schema.
14
+ @dataclass(frozen=True)
15
+ class HopArgumentInfo:
16
+ # Could give a name to the operand by default it's empty string.
17
+ name: str
18
+ example_value: Any
19
+ # Provide an default_value
20
+ default_value: Any
21
+ # Whether this arugment gets mutated in the hop subgraph.
22
+ # For output, this should always be False
23
+ is_mutated: bool
24
+ kw_only: bool
25
+
26
+
27
+ class HopArgumentInfoGen:
28
+ @staticmethod
29
+ def from_example(
30
+ example_value: Any,
31
+ *,
32
+ name: str = "",
33
+ default_value: Optional[Any] = None,
34
+ is_mutated: bool = False,
35
+ kw_only: bool = False,
36
+ ) -> HopArgumentInfo:
37
+ if default_value is not None:
38
+ assert type(example_value) == type(
39
+ default_value
40
+ ), f"example_value type {type(example_value)} doesn't match default_value type: {type(default_value)}"
41
+
42
+ return HopArgumentInfo(
43
+ name=name,
44
+ example_value=example_value,
45
+ default_value=default_value,
46
+ is_mutated=is_mutated,
47
+ kw_only=kw_only,
48
+ )
49
+
50
+
51
+ class CTypeGen:
52
+ convert_to_base_ty = {
53
+ int: torch._C.IntType.get(),
54
+ float: torch._C.FloatType.get(),
55
+ str: torch._C.StringType.get(),
56
+ bool: torch._C.BoolType.get(),
57
+ }
58
+
59
+ # should return torch._C.JitType but that annotation is busted
60
+ @staticmethod
61
+ def from_example(obj: Any) -> Any:
62
+ import torch
63
+
64
+ if isinstance(obj, torch.fx.GraphModule):
65
+ return torch._C.AnyType.get()
66
+ elif isinstance(obj, torch.SymInt):
67
+ return torch._C.SymIntType.get()
68
+ return torch._C._jit_try_infer_type(obj).type()
69
+
70
+
71
+ class CArgumentGen:
72
+ @staticmethod
73
+ def from_hop_argument_info(
74
+ arg_idx: int, arg_info: HopArgumentInfo, is_output: bool = False
75
+ ) -> Any:
76
+ typ = CTypeGen.from_example(arg_info.example_value)
77
+ if is_output:
78
+ return torch._C.Argument("", typ, None, None, False, None)
79
+
80
+ alias_set = set({f"alias::a{arg_idx}"}) if arg_info.is_mutated else set()
81
+ alias_info = torch._C._AliasInfo(arg_info.is_mutated, alias_set, alias_set) # type: ignore[attr-defined]
82
+ return torch._C.Argument(
83
+ arg_info.name,
84
+ typ,
85
+ None,
86
+ arg_info.default_value,
87
+ arg_info.kw_only,
88
+ alias_info,
89
+ )
90
+
91
+
92
+ class HopSchemaGenerator:
93
+ def __init__(self, hop: torch._ops.HigherOrderOperator):
94
+ self.arg_infos: list[HopArgumentInfo] = []
95
+ self.example_outputs: list[Any] = []
96
+ self.schema_tree_spec: Optional[pytree.TreeSpec] = None
97
+ self.hop = hop
98
+
99
+ def add_arg(
100
+ self,
101
+ name: str,
102
+ example_value: Any,
103
+ default_value: Optional[Any] = None,
104
+ is_mutated: bool = False,
105
+ kw_only: bool = False,
106
+ ) -> None:
107
+ if callable(example_value):
108
+ assert isinstance(
109
+ example_value, (torch.fx.GraphModule, torch._ops.OperatorBase)
110
+ ), (
111
+ "Expect callable to be a GraphModule or an. Please call materialize_as_graph first "
112
+ f"to turn callable arguments {example_value} into a GraphModule."
113
+ )
114
+ _, flat_spec = pytree.tree_flatten(example_value)
115
+ if not flat_spec.is_leaf():
116
+ raise RuntimeError(
117
+ f"example_value {example_value} is not a leaf node. "
118
+ "Please only add flattened inputs to the hop schema. "
119
+ "If you need some structure in the arguments, please"
120
+ "add_arg for flattened args one by one then "
121
+ "call add_schema_tree_spec to register the original pytree "
122
+ " spec of the args."
123
+ )
124
+
125
+ arg_info = HopArgumentInfoGen.from_example(
126
+ example_value=example_value,
127
+ name=name,
128
+ default_value=default_value,
129
+ is_mutated=is_mutated,
130
+ kw_only=kw_only,
131
+ )
132
+ self.arg_infos.append(arg_info)
133
+
134
+ def add_output(self, output: Any) -> None:
135
+ self.example_outputs.append(output)
136
+
137
+ def add_schema_tree_spec(self, *args: Any, **kwargs: Any) -> None:
138
+ """schema tree spec is the tree spec from flattening all inputs to the hop with pytree.tree_flatten
139
+ Since torch.FunctionSchema only have proper mutation/alias support for flattened inputs, we need
140
+ to store the tree spec in order to reconstruct the inputs to the hop.
141
+ """
142
+ self.schema_tree_spec = pytree.tree_flatten((args, kwargs))[1]
143
+
144
+ def gen_schema(self) -> torch._C.FunctionSchema:
145
+ for i, arg_info in enumerate(self.arg_infos):
146
+ arg_spec = pytree.tree_flatten(arg_info.example_value)[1]
147
+ if not arg_spec.is_leaf() and self.schema_tree_spec is None:
148
+ raise RuntimeError(
149
+ f"example_value of arg_infos[{i}] is {arg_info.example_value}, which is not a leaf node. "
150
+ "Please call add_schema_tree_spec to add a schema tree spec first. "
151
+ "Or consider changing the hop's signature to only take flattened arguments."
152
+ )
153
+
154
+ return CFunctionSchemaGen.from_hop_argument_info(
155
+ str(self.hop),
156
+ self.arg_infos,
157
+ HopArgumentInfoGen.from_example(tuple(self.example_outputs), name="out"),
158
+ self.schema_tree_spec,
159
+ )
160
+
161
+
162
+ class CFunctionSchemaGen:
163
+ """
164
+ Note: [HigherOrderOperator schema generation]
165
+ Each invocation of a HigherOrderOperator will have a different schema.
166
+ For example, the schema of torch.cond varies depending on the true_fn and
167
+ false_fn. So we need a way to generate the schema for each invocation of a HOP.
168
+
169
+ We want to enforce the following invariants for HOP's schema:
170
+ 1. Flattened inputs. There should be no pytree structure in it.
171
+ 2. Flattened outputs. Note even if the hop returns a single value, it should be wrapped as a tuple.
172
+ 3. No aliasing. This includes inp-inp aliasing, inp-out aliasing and out-out aliasing.
173
+
174
+ By enforcing these invariants, we could make HOP's schema meets the requirement of schema parser
175
+ and makes hop easier to handle downstream. For example, suppose we have an invoke_quant_test HOP:
176
+
177
+ class GraphModule(torch.nn.Module):
178
+ def forward(self, l_x_, l_y_):
179
+ subgraph_0 = self.subgraph_0
180
+ invoke_quant_test = torch.ops.higher_order.invoke_quant_test(subgraph_0, l_x_, l_y_, scheme = 'nf4');
181
+
182
+ class subgraph_0(torch.nn.Module):
183
+ def forward(self, l_x_, l_y_):
184
+ add_ = l_x_.add_(1)
185
+ matmul = l_x_ @ l_y_
186
+ sin = matmul.sin()
187
+ child = sin.cos()
188
+ child_1 = l_x_ + l_y_
189
+ child_2 = l_x_ - l_y_
190
+ child_3 = l_x_ @ l_y_
191
+ return (child, child_1, child_2, child_3)
192
+
193
+ By encoding the inputs of hop into a list of HopArgumentInfo and output as a single HopArgumentInfo,
194
+ we would get the following schema:
195
+ invoke_quant_test(Any arg0, Tensor(!) arg1, Tensor arg2, str scheme="\\"nf4\\"") -> (Tensor, Tensor, Tensor, Tensor)
196
+ """
197
+
198
+ @staticmethod
199
+ def from_hop_argument_info(
200
+ op_name: str,
201
+ inp_argument_info: list[HopArgumentInfo],
202
+ out_argument_info: HopArgumentInfo,
203
+ schema_tree_spec: Optional[pytree.TreeSpec],
204
+ ) -> Any:
205
+ args = []
206
+ for i, arg_info in enumerate(inp_argument_info):
207
+ args.append(CArgumentGen.from_hop_argument_info(i, arg_info))
208
+
209
+ # NOTE: we want the output to always be a single argument with torch._C.TupleType.
210
+ assert isinstance(
211
+ out_argument_info.example_value, tuple
212
+ ), f"expect out_argument_info's example_value to be a tuple but got {out_argument_info.example_value}"
213
+ assert (
214
+ not out_argument_info.is_mutated
215
+ ), "out_argument_info.is_mutated should always be set to False."
216
+ rets = None
217
+ if len(out_argument_info.example_value) == 1:
218
+ rets = [CArgumentGen.from_hop_argument_info(0, out_argument_info, True)]
219
+ else:
220
+ rets = [
221
+ CArgumentGen.from_hop_argument_info(
222
+ i,
223
+ HopArgumentInfoGen.from_example(
224
+ name=f"out{i}",
225
+ example_value=val,
226
+ default_value=None,
227
+ is_mutated=False,
228
+ ),
229
+ is_output=True,
230
+ )
231
+ for i, val in enumerate(out_argument_info.example_value)
232
+ ]
233
+
234
+ return HopSchema(
235
+ op_name,
236
+ "",
237
+ args,
238
+ rets,
239
+ False,
240
+ False,
241
+ schema_tree_spec,
242
+ )
243
+
244
+
245
+ class HopSchema(torch._C.FunctionSchema):
246
+ def __init__(
247
+ self,
248
+ name: str,
249
+ overload_name: str,
250
+ arguments: list[torch._C.Argument],
251
+ returns: list[torch._C.Argument],
252
+ is_vararg: bool,
253
+ is_varret: bool,
254
+ schema_tree_spec: Optional[pytree.TreeSpec],
255
+ ):
256
+ self.tree_spec = schema_tree_spec
257
+ self.is_vararg = is_vararg
258
+ self.is_varret = is_varret
259
+ super().__init__(
260
+ name,
261
+ overload_name,
262
+ arguments,
263
+ returns,
264
+ self.is_vararg,
265
+ self.is_varret,
266
+ )
267
+
268
+ def __deepcopy__(self, memo: Any) -> "HopSchema":
269
+ # Need to additionally copy the tree_spec since
270
+ # it's not a member of torch._C.FunctionSchema
271
+ return HopSchema(
272
+ self.name,
273
+ self.overload_name,
274
+ self.arguments,
275
+ self.returns,
276
+ self.is_vararg,
277
+ self.is_varret,
278
+ copy.deepcopy(self.tree_spec),
279
+ )
280
+
281
+
282
+ def find_hop_schema(
283
+ gm: torch.fx.GraphModule, target: Target
284
+ ) -> list[torch._C.FunctionSchema]:
285
+ schemas = []
286
+ for node in gm.graph.find_nodes(op="call_function", target=target):
287
+
288
+ def _get_example_value(node: torch.fx.Node) -> Any:
289
+ if node.op == "get_attr":
290
+ assert isinstance(node.target, str)
291
+ return getattr(gm, node.target)
292
+ else:
293
+ return (
294
+ node.meta["example_value"]
295
+ if "example_value" in node.meta
296
+ else node.meta["val"]
297
+ )
298
+
299
+ fake_args, fake_kwargs = pytree.tree_map_only(
300
+ torch.fx.Node,
301
+ _get_example_value,
302
+ (node.args, node.kwargs),
303
+ )
304
+ schema = node.target.gen_schema(*fake_args, **fake_kwargs)
305
+ schemas.append(schema)
306
+ return schemas
archive/.venv/Lib/site-packages/torch/_higher_order_ops/strict_mode.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import torch
3
+ import torch._subclasses.functional_tensor
4
+ import torch.utils._pytree as pytree
5
+ from torch._C import DispatchKey
6
+ from torch._functorch.utils import exposed_in
7
+ from torch._higher_order_ops.utils import _set_compilation_env, autograd_not_implemented
8
+ from torch._ops import HigherOrderOperator
9
+ from torch._subclasses.fake_tensor import FakeTensorMode
10
+ from torch.fx.experimental.proxy_tensor import (
11
+ _temp_remove_metadata_torch_function_mode,
12
+ _temp_remove_pre_dispatch_torch_function_mode,
13
+ disable_proxy_modes_tracing,
14
+ make_fx,
15
+ ProxyTorchDispatchMode,
16
+ track_tensor_tree,
17
+ )
18
+ from torch.utils._python_dispatch import _get_current_dispatch_mode
19
+
20
+
21
+ @exposed_in("torch")
22
+ def strict_mode(callable, operands):
23
+ from torch._dynamo.backends.debugging import (
24
+ make_eager_backend_with_torch_function_modes,
25
+ )
26
+
27
+ if torch.compiler.is_dynamo_compiling():
28
+ return strict_mode_op(callable, operands)
29
+
30
+ with _set_compilation_env():
31
+ with _temp_remove_metadata_torch_function_mode() as metadata_mode:
32
+ with _temp_remove_pre_dispatch_torch_function_mode() as predispatch_mode:
33
+ modes = [metadata_mode, predispatch_mode]
34
+ modes = [mode for mode in modes if mode is not None]
35
+ if modes:
36
+ backend = make_eager_backend_with_torch_function_modes(modes)
37
+ else:
38
+ backend = "eager"
39
+ with torch._dynamo.utils.disable_cache_limit():
40
+ return torch.compile(
41
+ strict_mode_op, backend=backend, fullgraph=True
42
+ )(callable, operands)
43
+
44
+
45
+ class StrictMode(HigherOrderOperator):
46
+ def __init__(self):
47
+ super().__init__("strict_mode")
48
+
49
+ def __call__(self, callable, operands):
50
+ return super().__call__(callable, operands)
51
+
52
+
53
+ strict_mode_op = StrictMode()
54
+
55
+
56
+ @strict_mode_op.py_impl(DispatchKey.CompositeExplicitAutograd)
57
+ def strict_mode_op_dense(callable, operands):
58
+ mode = _get_current_dispatch_mode()
59
+ assert mode is None, "Mode should never be enabled for CPU/CUDA key"
60
+ return callable(*operands)
61
+
62
+
63
+ strict_mode_op.py_autograd_impl(
64
+ autograd_not_implemented(strict_mode_op, deferred_error=True)
65
+ )
66
+
67
+
68
+ @strict_mode_op.py_impl(ProxyTorchDispatchMode)
69
+ def inner(mode, callable, operands):
70
+ return trace_strict_mode(mode, strict_mode_op, callable, operands)
71
+
72
+
73
+ def trace_strict_mode(mode, strict_mode_op, callable, operands):
74
+ pre_dispatch = getattr(mode, "pre_dispatch", False)
75
+
76
+ with disable_proxy_modes_tracing():
77
+ graph = make_fx(callable, pre_dispatch=pre_dispatch)(*operands)
78
+
79
+ graph_name = mode.tracer.get_fresh_qualname("strict_graph_")
80
+ mode.tracer.root.register_module(graph_name, graph)
81
+
82
+ args = (graph, operands)
83
+
84
+ proxy_args = pytree.tree_map(mode.tracer.unwrap_proxy, args)
85
+
86
+ out_proxy = mode.tracer.create_proxy(
87
+ "call_function", strict_mode_op, proxy_args, {}, name="strict_mode"
88
+ )
89
+
90
+ out = graph(*operands)
91
+ return track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer)
92
+
93
+
94
+ @strict_mode_op.py_impl(FakeTensorMode)
95
+ def strict_mode_fake_tensor_mode(mode, callable, operands):
96
+ with mode:
97
+ true_outs = callable(*operands)
98
+ return true_outs
99
+
100
+
101
+ @strict_mode_op.py_functionalize_impl
102
+ def strict_mode_func(ctx, callable, inputs):
103
+ unwrapped_inputs = ctx.unwrap_tensors(inputs)
104
+ with ctx.redispatch_to_next():
105
+ functional_callable = ctx.functionalize(callable)
106
+
107
+ cond_return = strict_mode_op(functional_callable, unwrapped_inputs)
108
+ return ctx.wrap_tensors(cond_return)
archive/.venv/Lib/site-packages/torch/_higher_order_ops/torchbind.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import logging
3
+ from contextlib import contextmanager
4
+
5
+ import torch
6
+ from torch._C import DispatchKey # @manual
7
+ from torch._functorch._aot_autograd.utils import KNOWN_TYPES
8
+ from torch._higher_order_ops.utils import autograd_not_implemented
9
+ from torch._library.fake_class_registry import (
10
+ _is_script_object,
11
+ _ns_and_class_name,
12
+ FakeScriptObject,
13
+ )
14
+ from torch._ops import HigherOrderOperator
15
+ from torch._subclasses.fake_tensor import FakeTensorMode
16
+ from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree
17
+ from torch.fx.node import has_side_effect
18
+ from torch.utils import _pytree as pytree
19
+
20
+
21
+ log = logging.getLogger(__name__)
22
+
23
+
24
+ # The call_torchbind operator represents a method invocation on a torchbind
25
+ # object. The calling convention is:
26
+ # call_torchbind(self: ScriptObject, method_name: str, *method_args, **method_kwargs)
27
+ # We do not expect users to write this operator directly. Instead it will be
28
+ # emitted by Dynamo when tracing encounters a torchbind object.
29
+ class CallTorchBind(HigherOrderOperator):
30
+ def __init__(self):
31
+ super().__init__("call_torchbind")
32
+
33
+ def __call__(self, obj, method, *args, **kwargs):
34
+ return super().__call__(obj, method, *args, **kwargs)
35
+
36
+ @staticmethod
37
+ def schema(obj, method) -> torch.FunctionSchema:
38
+ """
39
+ Returns the schema of ``CallTorchbind.__call__``.
40
+ """
41
+ assert isinstance(obj, torch._inductor.ir.TorchBindObject)
42
+ val = obj.get_real_obj()
43
+ schema = val._get_method(method).schema
44
+ schema_str = str(schema)
45
+ new_schema_str = f"call_torchbind({str(schema.arguments[0].real_type)} {schema.arguments[0].name},"
46
+ first_comma_index = schema_str.find(",")
47
+ if first_comma_index == -1:
48
+ # If no comma is found, find the last closing parenthesis
49
+ first_comma_index = schema_str.rfind(") ->")
50
+ new_schema_str = new_schema_str + " str method" + schema_str[first_comma_index:]
51
+ new_schema = torch._C.parse_schema(new_schema_str)
52
+ return new_schema
53
+
54
+
55
+ call_torchbind = CallTorchBind()
56
+
57
+ # Register this operator as side-effectful with FX.
58
+ # TODO: this is not really sufficient. While passes (hopefully) check
59
+ # Node.is_impure() and make good decisions, we also assume we can execute the
60
+ # graph as many times as we want without changing behavior, which is NOT true of
61
+ # ops that mutate torchbind object state.
62
+ has_side_effect(call_torchbind)
63
+
64
+ _orig_scriptmethod_call = torch.ScriptMethod.__call__
65
+
66
+
67
+ def torchbind_method_redispatch(self, *args, **kwargs):
68
+ if _is_script_object(self.raw_owner):
69
+ return call_torchbind(self.raw_owner, self.name, *args, **kwargs)
70
+ return _orig_scriptmethod_call(self, *args, **kwargs)
71
+
72
+
73
+ @contextmanager
74
+ def enable_torchbind_tracing():
75
+ """Context manager that acts as a feature flag to enable torchbind tracing
76
+ behavior. Once torchbind tracing has been stabilized, we can remove this and
77
+ turn it always on.
78
+ """
79
+ try:
80
+ KNOWN_TYPES.append(torch.ScriptObject)
81
+ torch.ScriptMethod.__call__ = torchbind_method_redispatch # type: ignore[method-assign]
82
+ yield
83
+ finally:
84
+ assert (
85
+ KNOWN_TYPES.pop() is torch.ScriptObject
86
+ ), "Someone else messed with KNOWN_TYPES during tracing, exploding."
87
+ torch.ScriptMethod.__call__ = _orig_scriptmethod_call # type: ignore[method-assign]
88
+
89
+
90
+ @call_torchbind.py_impl(DispatchKey.CompositeExplicitAutograd)
91
+ def call_torchbind_impl(obj, method, *args, **kwargs):
92
+ if isinstance(obj, torch.ScriptObject):
93
+ return _orig_scriptmethod_call(getattr(obj, method), *args, **kwargs)
94
+ elif isinstance(obj, FakeScriptObject):
95
+ return getattr(obj.wrapped_obj, method)(*args, **kwargs)
96
+ else:
97
+ raise RuntimeError(f"Unsupported first arg type {type(obj)} for call_torchbind")
98
+
99
+
100
+ @call_torchbind.py_impl(ProxyTorchDispatchMode)
101
+ def inner(mode, *args, **kwargs):
102
+ proxy_args = pytree.tree_map(mode.tracer.unwrap_proxy, args)
103
+ proxy_kwargs = pytree.tree_map(mode.tracer.unwrap_proxy, kwargs)
104
+
105
+ out_proxy = mode.tracer.create_proxy(
106
+ "call_function",
107
+ call_torchbind,
108
+ proxy_args,
109
+ proxy_kwargs,
110
+ )
111
+ out = call_torchbind(*args, **kwargs)
112
+
113
+ obj, method, *_rest_args = args
114
+ if isinstance(obj, torch.ScriptObject):
115
+ ns, class_name = _ns_and_class_name(
116
+ obj._type().qualified_name() # type: ignore[attr-defined]
117
+ )
118
+ log.warning(
119
+ "Tracing torchbind method %s.%s with real ScriptObject. This may"
120
+ " cause the original object being mutated. If this is not intended,"
121
+ ' You can register a fake class with torch._library.register_fake_class("%s::%s").',
122
+ class_name,
123
+ method,
124
+ ns,
125
+ class_name,
126
+ )
127
+
128
+ ret = track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer)
129
+ if "val" not in out_proxy.node.meta:
130
+ assert out is None or isinstance(
131
+ out, (int, float, bool)
132
+ ), "Currently, only these constant dtypes are supported to be returned from torchbind methods."
133
+ out_proxy.node.meta["val"] = out
134
+ return ret
135
+
136
+
137
+ # When tracing with fake script object, the call_torchbind op will return a fake tensor
138
+ # When tracing with real script object, the call_torchbind op may return a real tensor,
139
+ # we need to convert it to fake tensor mannually. Dynamic shape is surpported.
140
+ @call_torchbind.py_impl(FakeTensorMode)
141
+ def call_torchbind_fake(mode, *args, **kwargs):
142
+ with mode:
143
+ out = call_torchbind_impl(*args, **kwargs)
144
+ return pytree.tree_map_only(
145
+ torch.Tensor,
146
+ lambda x: mode.from_tensor(x, static_shapes=True)
147
+ if not isinstance(x, torch._subclasses.fake_tensor.FakeTensor)
148
+ else x,
149
+ out,
150
+ )
151
+
152
+
153
+ call_torchbind.py_autograd_impl(
154
+ autograd_not_implemented(call_torchbind, deferred_error=True)
155
+ )
156
+
157
+
158
+ @call_torchbind.py_functionalize_impl
159
+ def call_torchbind_func(ctx, *args, **kwargs):
160
+ from torch._higher_order_ops.effects import handle_effects
161
+
162
+ return handle_effects(
163
+ ctx.mode._allow_token_discovery, ctx.mode._tokens, call_torchbind, args, kwargs
164
+ )
archive/.venv/Lib/site-packages/torch/_higher_order_ops/triton_kernel_wrap.py ADDED
@@ -0,0 +1,2051 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import copy
3
+ import dataclasses
4
+ import functools
5
+ import inspect
6
+ import itertools
7
+ import logging
8
+ import operator
9
+ import threading
10
+ from collections import defaultdict
11
+ from collections.abc import Sequence
12
+ from typing import Any, Callable, Optional, TYPE_CHECKING, Union
13
+ from typing_extensions import Never
14
+
15
+ import sympy
16
+
17
+ import torch.fx as fx
18
+ import torch.utils._pytree as pytree
19
+ from torch import SymInt, Tensor
20
+ from torch._C import DispatchKey
21
+ from torch._ops import HigherOrderOperator
22
+ from torch._prims_common import clone_preserve_strides
23
+ from torch._subclasses.fake_tensor import FakeTensorMode
24
+ from torch.fx.experimental.proxy_tensor import (
25
+ disable_proxy_modes_tracing,
26
+ ProxyTorchDispatchMode,
27
+ track_tensor_tree,
28
+ )
29
+ from torch.fx.experimental.symbolic_shapes import guard_scalar
30
+ from torch.types import IntLikeType
31
+
32
+
33
+ if TYPE_CHECKING:
34
+ from triton._C.libtriton.ir import (
35
+ module as TritonIRModule,
36
+ operation as TritonIROperation,
37
+ )
38
+
39
+ from torch._dynamo.symbolic_convert import InstructionTranslator
40
+ from torch._dynamo.variables.constant import ConstantVariable
41
+ from torch._dynamo.variables.functions import TritonKernelVariable
42
+ from torch._subclasses.functional_tensor import BaseFunctionalizeAPI
43
+ from torch.fx.proxy import Proxy
44
+ from torch.utils._triton import has_triton
45
+
46
+ TritonMetaParamsType = dict[str, int]
47
+ TritonGridTupleType = tuple[Union[int, sympy.Expr, SymInt], ...]
48
+ TritonGridCallableType = Callable[[TritonMetaParamsType], tuple[int, ...]]
49
+ TritonGridType = Union[TritonGridTupleType, TritonGridCallableType]
50
+
51
+ if has_triton():
52
+ from triton.runtime.autotuner import Autotuner, Config as TritonConfig
53
+ from triton.runtime.jit import JITFunction
54
+ else:
55
+
56
+ class Autotuner: # type: ignore[no-redef]
57
+ pass
58
+
59
+ class JITFunction: # type: ignore[no-redef]
60
+ pass
61
+
62
+ TritonKernelType = Union[Autotuner, JITFunction]
63
+ # mypy specifically complains that TritonAutotunerType is not a valid type if Autotuner is not inside of a Union.
64
+ TritonAutotunerType = Union[Autotuner]
65
+
66
+ log = logging.getLogger("torch._dynamo")
67
+
68
+ # e.g. for a host-side Triton TMA API call ``create_2d_tma_descriptor(ptr, 50, 60, 32, 15, 4)``,
69
+ # the metadata will look like ``("experimental", ([50, 60], [32, 15], 4))``
70
+ TMAExperimentalMetadata = tuple[
71
+ str, # type of TMA (should be "experimental")
72
+ tuple[
73
+ list[IntLikeType], # dims
74
+ list[IntLikeType], # block_dims
75
+ IntLikeType, # element_size
76
+ ],
77
+ ]
78
+
79
+ # e.g. for host-side Triton TMA API call ``TensorDescriptor.from_tensor(ptr, [32, 64])``
80
+ # the metadata will look like ``("stable", ([32, 64],))``
81
+ TMAStableMetadata = tuple[
82
+ str, # type of TMA ("experimental" or "stable")
83
+ tuple[list[IntLikeType],], # block_shape
84
+ ]
85
+
86
+
87
+ def create_tma_experimental_metadata(
88
+ dims: list[IntLikeType],
89
+ block_dims: list[IntLikeType],
90
+ element_size: IntLikeType,
91
+ ) -> TMAExperimentalMetadata:
92
+ return ("experimental", (dims, block_dims, element_size))
93
+
94
+
95
+ def maybe_unpack_tma_experimental_metadata(
96
+ tma_meta: Union[TMAExperimentalMetadata, TMAStableMetadata]
97
+ ) -> Optional[tuple[list[IntLikeType], list[IntLikeType], IntLikeType]]:
98
+ if not tma_meta or len(tma_meta) != 2:
99
+ return None
100
+ if tma_meta[0] == "experimental":
101
+ return tma_meta[1] # type: ignore[return-value]
102
+ return None
103
+
104
+
105
+ def create_tma_stable_metadata(
106
+ block_shape: list[IntLikeType],
107
+ ) -> TMAStableMetadata:
108
+ return ("stable", (block_shape,))
109
+
110
+
111
+ def maybe_unpack_tma_stable_metadata(
112
+ tma_meta: Union[TMAExperimentalMetadata, TMAStableMetadata]
113
+ ) -> Optional[tuple[list[IntLikeType]]]:
114
+ if not tma_meta or len(tma_meta) != 2:
115
+ return None
116
+ if tma_meta[0] == "stable":
117
+ return tma_meta[1] # type: ignore[return-value]
118
+ return None
119
+
120
+
121
+ # TMADescriptorMetadata maps kernel parameter names to the metadata that allows
122
+ # reconstructing TMA descriptors from the underlying tensors (passed as kernel
123
+ # arguments in the fx graph, instead of the TMA descriptors).
124
+ #
125
+ # Since there are two TMA APIs (the old "experimental" API and the new "stable" API),
126
+ # each entry in the dict is a tuple that starts with a string, either "experimental"
127
+ # or "stable". The second entry in the tuple is another tuple, with data that depends
128
+ # on the API type (see TMAExperimentalMetadata and TMAStableMetadata above).
129
+ #
130
+ # These are stored as raw tuples (instead of classes) for ease of serialization.
131
+ TMADescriptorMetadata = dict[
132
+ str, # kernel parameter name
133
+ Union[TMAExperimentalMetadata, TMAStableMetadata],
134
+ ]
135
+
136
+
137
+ ###############################################################################
138
+ # Kernel Side Table
139
+
140
+
141
+ # We cannot put Triton Kernels into the FX graph as the graph nodes
142
+ # do not support arbitrary functions.
143
+ # Use a side table.
144
+ # We use two dicts so that fetching both the kernel and id are O(1)
145
+ class KernelSideTable:
146
+ id_to_kernel: dict[int, "TritonKernelType"] = {}
147
+ kernel_to_id: dict["TritonKernelType", int] = {}
148
+ constant_args: dict[int, dict[str, Any]] = {}
149
+ lock = threading.Lock()
150
+
151
+ # Returns index on the table
152
+ def add_kernel(self, kernel: "TritonKernelType") -> int:
153
+ with self.lock:
154
+ if kernel in self.kernel_to_id:
155
+ return self.kernel_to_id[kernel]
156
+
157
+ idx = len(self.id_to_kernel)
158
+ self.id_to_kernel[idx] = kernel
159
+ self.kernel_to_id[kernel] = idx
160
+ return idx
161
+
162
+ # Returns the triton kernel at the given index
163
+ def get_kernel(self, idx: int) -> "TritonKernelType":
164
+ # No need to lock here as fetching from dict is atomic
165
+ assert idx in self.id_to_kernel
166
+ return self.id_to_kernel[idx]
167
+
168
+ # Not every constant arg can be added to the graph. Use this side table
169
+ # for constant args.
170
+ def add_constant_args(self, args: dict[str, Any]) -> int:
171
+ with self.lock:
172
+ idx = len(self.constant_args)
173
+ self.constant_args[idx] = args
174
+ return idx
175
+
176
+ # Returns the constant args
177
+ def get_constant_args(self, idx: int) -> dict[str, Any]:
178
+ # No need to lock here as fetching from dict is atomic
179
+ assert idx in self.constant_args
180
+ return self.constant_args[idx]
181
+
182
+ # Resets the table (only meant to be used in unit tests)
183
+ # This is only safe assuming single threaded execution
184
+ def reset_table(self) -> None:
185
+ self.id_to_kernel = {}
186
+ self.kernel_to_id = {}
187
+ self.constant_args = {}
188
+
189
+
190
+ kernel_side_table = KernelSideTable()
191
+
192
+
193
+ ###############################################################################
194
+ # Mutation Tracker
195
+
196
+
197
+ @dataclasses.dataclass(frozen=True)
198
+ class Param:
199
+ idx: int
200
+
201
+
202
+ @dataclasses.dataclass(frozen=True)
203
+ class Intermediate:
204
+ idx: int
205
+
206
+ def fake(self) -> bool:
207
+ return self.idx < 0
208
+
209
+
210
+ @dataclasses.dataclass(frozen=True)
211
+ class Op:
212
+ name: str
213
+ fn_call_name: Optional[str]
214
+ args: list[Union[Param, Intermediate]]
215
+ ret: Intermediate = dataclasses.field(repr=False)
216
+ # used for scf.yield: see [Note: scf.yield fix-up]
217
+ sub_idx: Optional[int] = None
218
+ # used for tt.elementwise_inline_asm
219
+ # `is_pure = True` assumes the asm block has no side-effects
220
+ is_pure: bool = False
221
+
222
+ def __post_init__(self) -> None:
223
+ if self.name == "tt.call":
224
+ assert self.fn_call_name is not None
225
+ else:
226
+ assert self.fn_call_name is None
227
+
228
+
229
+ def generate_ttir(
230
+ kernel: "TritonKernelType",
231
+ kwargs: dict[str, Any],
232
+ tma_descriptor_metadata: TMADescriptorMetadata,
233
+ ) -> tuple["TritonIRModule", list[str]]:
234
+ """
235
+ Uses Triton's internal code generation to create TTIR
236
+ """
237
+ import sympy
238
+ import triton
239
+ import triton.runtime.jit
240
+ from triton.compiler.compiler import ASTSource
241
+ from triton.runtime.autotuner import Autotuner
242
+ from triton.runtime.jit import JITFunction
243
+
244
+ from torch._inductor.utils import (
245
+ get_triton_attrs_descriptor_version,
246
+ triton_version_uses_attrs_dict,
247
+ TritonAttrsDescriptorVersion,
248
+ )
249
+ from torch.utils._triton import has_triton_tensor_descriptor_host_tma
250
+
251
+ triton_version = get_triton_attrs_descriptor_version()
252
+
253
+ import torch._inductor.ir
254
+ from torch._subclasses.fake_tensor import FakeTensor
255
+
256
+ if isinstance(kernel, Autotuner):
257
+ if len(kernel.configs) > 0:
258
+ # If we are autotuning, then it doesn't matter which version gets
259
+ # picked for tracing purposes, so lets pick the first one
260
+ kwargs = {**kwargs, **kernel.configs[0].kwargs}
261
+ kernel = kernel.fn
262
+
263
+ assert isinstance(kernel, JITFunction)
264
+
265
+ context = triton._C.libtriton.ir.context()
266
+ target = triton.runtime.driver.active.get_current_target()
267
+ backend = triton.compiler.compiler.make_backend(target)
268
+ options = backend.parse_options({})
269
+
270
+ # ignore backend-specific kwargs same way as in the native Triton code
271
+ # https://github.com/triton-lang/triton/blob/a6bb57d6285e723c58e87dd7cba263db6efff789/python/triton/runtime/jit.py#L594-L596
272
+ # why this is important for user-defined Triton kernels on AMD: https://github.com/pytorch/pytorch/issues/140800
273
+ for name in list(kwargs):
274
+ if name not in kernel.arg_names and name in options.__dict__:
275
+ kwargs.pop(name)
276
+
277
+ if len(kwargs) != len(kernel.arg_names):
278
+ raise ValueError(
279
+ "Incorrect number of arguments passed to kernel: "
280
+ f"passed {list(kwargs.keys())}, expected {kernel.arg_names}."
281
+ )
282
+
283
+ # Replace all SymExprs with a regular value for TTIR generation
284
+ # Replace all FakeTensor/TensorBox with real tensors
285
+ # These replacements are needed for triton's type, key and config functions
286
+ ordered_args: dict[str, Any] = {}
287
+ for name in kernel.arg_names:
288
+ a = kwargs[name]
289
+ if isinstance(a, (torch.SymInt, torch.SymFloat, torch.SymBool, sympy.Expr)):
290
+ ordered_args[name] = 2
291
+ elif (
292
+ stable_meta := maybe_unpack_tma_stable_metadata(
293
+ tma_descriptor_metadata.get(name, None)
294
+ )
295
+ ) is not None:
296
+ from triton.tools.tensor_descriptor import TensorDescriptor
297
+
298
+ block_shape = stable_meta[0]
299
+ with torch._C._DisableTorchDispatch():
300
+ # need 16-byte aligned strides
301
+ elements_per_dim = max(1, 16 // a.dtype.itemsize)
302
+ base_tensor = torch.empty(
303
+ [elements_per_dim] * len(block_shape), dtype=a.dtype
304
+ )
305
+ ordered_args[name] = TensorDescriptor.from_tensor(base_tensor, block_shape)
306
+ elif isinstance(a, (FakeTensor, torch._inductor.ir.TensorBox)):
307
+ with torch._C._DisableTorchDispatch():
308
+ ordered_args[name] = torch.empty(2, dtype=a.dtype)
309
+ else:
310
+ ordered_args[name] = a
311
+
312
+ def is_stable_tensor_descriptor_arg(arg: Any) -> bool:
313
+ if has_triton_tensor_descriptor_host_tma():
314
+ from triton.tools.tensor_descriptor import TensorDescriptor
315
+
316
+ if isinstance(arg, TensorDescriptor):
317
+ return True
318
+ return False
319
+
320
+ def is_tensor_like_arg(arg: Any) -> bool:
321
+ if isinstance(arg, Tensor) or is_stable_tensor_descriptor_arg(arg):
322
+ return True
323
+ return False
324
+
325
+ # Note: one would expect that each input to the triton kernel maps to
326
+ # one input parameter in the TTIR. This is _not_ true for TMA descriptors:
327
+ # one TMA descriptor gets converted into:
328
+ # * one TMA descriptor input
329
+ # * N strides, for a rank-N tensor
330
+ # * N sizes, for a rank-N tensor
331
+ # To account for this, we inject some fake arg names as placeholders for
332
+ # the stride and size parameters.
333
+ def get_tensor_names(name: str, arg: Any) -> list[str]:
334
+ if isinstance(arg, Tensor):
335
+ return [name]
336
+ if is_stable_tensor_descriptor_arg(arg):
337
+ stable_meta = maybe_unpack_tma_stable_metadata(
338
+ tma_descriptor_metadata[name]
339
+ )
340
+ assert stable_meta is not None
341
+ block_shape = stable_meta[0]
342
+ tensor_rank = len(block_shape)
343
+ names = [name]
344
+ names.extend(name + f" STRIDE PLACEHOLDER {i}" for i in range(tensor_rank))
345
+ names.extend(name + f" SIZE PLACEHOLDER {i}" for i in range(tensor_rank))
346
+ return names
347
+ return []
348
+
349
+ ordered_tensor_names = list(
350
+ itertools.chain.from_iterable(
351
+ get_tensor_names(name, arg) for name, arg in ordered_args.items()
352
+ )
353
+ )
354
+
355
+ def _get_specialization(args): # type: ignore[no-untyped-def]
356
+ # Support multiple triton versions.
357
+ # This code basically copies JITFunction.run() logic to get the attrs to construct an ASTSource.
358
+ if triton_version == TritonAttrsDescriptorVersion.V1_COMPILER:
359
+ return kernel._get_config(*args)
360
+ elif triton_version in {
361
+ TritonAttrsDescriptorVersion.V2_BACKENDS,
362
+ TritonAttrsDescriptorVersion.V3_BACKENDS_TUPLE,
363
+ }:
364
+ from triton.backends.compiler import AttrsDescriptor # noqa: F401
365
+
366
+ target = triton.runtime.driver.active.get_current_target()
367
+ backend_ = triton.compiler.compiler.make_backend(target)
368
+ return backend_.get_attrs_descriptor(args, kernel.params)
369
+ else:
370
+ assert (
371
+ get_triton_attrs_descriptor_version()
372
+ == TritonAttrsDescriptorVersion.V4_DICT
373
+ )
374
+ # specialize_impl switched to create_specialize_impl in https://github.com/triton-lang/triton/pull/6099
375
+ if hasattr(triton.runtime.jit, "create_specialize_impl"):
376
+ try:
377
+ # Latest versions of Triton take specialize_extra as an arg to create_specialize_impl
378
+ specialize_impl = triton.runtime.jit.create_specialize_impl(
379
+ specialize_extra=backend.get_arg_specialization
380
+ )
381
+ except TypeError: # Unknown arg `specialize_extra`
382
+ # Older versions of Triton take specialize_extra as an arg to specialize_impl
383
+ specialize_impl = functools.partial(
384
+ triton.runtime.jit.create_specialize_impl(),
385
+ specialize_extra=backend.get_arg_specialization,
386
+ )
387
+ else:
388
+ from triton.runtime.jit import specialize_impl as specialize_impl_orig
389
+
390
+ specialize_impl = functools.partial(
391
+ specialize_impl_orig,
392
+ specialize_extra=backend.get_arg_specialization,
393
+ )
394
+
395
+ from triton._utils import find_paths_if, get_iterable_path
396
+
397
+ # logic is copied from: binder = create_function_from_signature(self.signature, self.params, backend)
398
+ attrvals = []
399
+ for arg, kp in zip(args, kernel.params):
400
+ if kp.is_constexpr:
401
+ attrvals.append(arg)
402
+ else:
403
+ spec = specialize_impl(
404
+ arg,
405
+ is_const=kp.is_const,
406
+ specialize_value=not kp.do_not_specialize,
407
+ align=not kp.do_not_specialize_on_alignment,
408
+ )
409
+ attrvals.append(spec[1])
410
+
411
+ attrs = find_paths_if(attrvals, lambda _, x: isinstance(x, str))
412
+ attrs = {
413
+ k: backend.parse_attr(get_iterable_path(attrvals, k)) for k in attrs
414
+ }
415
+ return attrs
416
+
417
+ specialization = _get_specialization(ordered_args.values())
418
+ constants = {
419
+ name: arg for name, arg in ordered_args.items() if not is_tensor_like_arg(arg)
420
+ }
421
+
422
+ if (mangle_type := getattr(triton.runtime.jit, "mangle_type", None)) is not None:
423
+
424
+ def get_signature_value(idx: int, arg: Any) -> str:
425
+ if kernel.params[idx].is_constexpr:
426
+ return "constexpr"
427
+ return mangle_type(arg)
428
+
429
+ else:
430
+
431
+ def get_signature_value(idx: int, arg: Any) -> str:
432
+ return kernel._type_of(kernel.key_of(arg))
433
+
434
+ if triton_version_uses_attrs_dict():
435
+ # In newer versions of Triton, the signature includes constexpr args
436
+ signature = {
437
+ name: get_signature_value(i, arg)
438
+ for i, (name, arg) in enumerate(ordered_args.items())
439
+ }
440
+ else:
441
+ # In older versions of Triton, the signature does not include constexpr args
442
+ signature = {
443
+ name: get_signature_value(i, arg)
444
+ for i, (name, arg) in enumerate(ordered_args.items())
445
+ if i not in kernel.constexprs
446
+ }
447
+
448
+ triton._C.libtriton.ir.load_dialects(context)
449
+ backend.load_dialects(context)
450
+
451
+ src = ASTSource(kernel, signature, constants, specialization)
452
+
453
+ # Triton changes ASTSource.make_ir to take 3/4 arguments. Handle
454
+ # backward compatibility here.
455
+ make_ir_sig_params = len(inspect.signature(src.make_ir).parameters)
456
+ get_codegen_implementation_sig_params = len(
457
+ inspect.signature(backend.get_codegen_implementation).parameters
458
+ )
459
+ if make_ir_sig_params == 2:
460
+ ttir_module = src.make_ir(options, context)
461
+ elif make_ir_sig_params == 3:
462
+ codegen_fns = backend.get_codegen_implementation()
463
+ ttir_module = src.make_ir(options, codegen_fns, context)
464
+ else:
465
+ codegen_args = [options] if get_codegen_implementation_sig_params == 1 else []
466
+ codegen_fns = backend.get_codegen_implementation(*codegen_args)
467
+ module_map = backend.get_module_map()
468
+ ttir_module = src.make_ir(options, codegen_fns, module_map, context)
469
+ if not ttir_module.verify():
470
+ raise RuntimeError("Verification for TTIR module has failed")
471
+
472
+ return ttir_module, ordered_tensor_names
473
+
474
+
475
def ttir_to_functions(
    ttir_module: "TritonIRModule",
) -> dict[str, dict[Intermediate, list[Op]]]:
    """
    Walk the `ttir_module` bottom up to mine the `functions` from
    the structured MLIR entities representing the Triton kernel
    (mlir::Operation, mlir::Block, mlir::Region).

    Returns a mapping: function name -> (op result -> ops producing it).
    Block arguments are rewired to Params (for tt.func) or to the
    Intermediates of the enclosing op's operands (for scf/tt.reduce/tt.scan),
    so each returned function is a flat, self-contained dataflow graph.
    """
    functions: dict[str, dict[Intermediate, list[Op]]] = {}

    # block id --> op result (Intermediate) --> one or more ops
    op_stack: dict[int, dict[Intermediate, list[Op]]] = defaultdict(
        lambda: defaultdict(list)
    )
    region_id_to_block_ids: dict[int, list[int]] = defaultdict(list)
    block_id_to_block_arg_ids: dict[int, list[int]] = {}
    replacements: dict[int, Union[Intermediate, Param]] = {}
    reindex_map: dict[int, int] = {}
    # negative indices denote synthesized ("fake") Intermediates that do not
    # correspond to any real MLIR value
    next_fake_intermediate = 0

    def reindex(idx: int) -> int:
        # Compact raw MLIR value ids into small consecutive integers.
        if idx not in reindex_map:
            reindex_map[idx] = len(reindex_map)
        return reindex_map[idx]

    def mlir_to_functions(op: "TritonIROperation") -> None:
        # Visitor invoked (bottom-up) for every operation in the module.
        name: str = op.get_name()
        if name == "builtin.module":
            # this wraps all tt.func ops
            return

        operand_ids: list[int] = [
            reindex(op.get_operand(i).id()) for i in range(op.get_num_operands())
        ]
        result_ids: list[int] = [
            reindex(op.get_result(i).id()) for i in range(op.get_num_results())
        ]

        child_block_ids: list[int] = []
        for i in [op.get_region(i).id() for i in range(op.get_num_regions())]:
            # as the walk is bottom-up, the region_id_to_block_ids[i]
            # must be populated by the time we process the enclosing op
            child_block_ids.extend(region_id_to_block_ids[i])

        parent_block_id = -1
        parent_block = op.get_block()
        if parent_block is not None:
            parent_block_id = parent_block.id()
            if parent_block_id not in block_id_to_block_arg_ids:
                block_id_to_block_arg_ids[parent_block_id] = []
                for i in range(parent_block.get_num_arguments()):
                    block_id_to_block_arg_ids[parent_block_id].append(
                        reindex(parent_block.get_argument(i).id()),
                    )
                # the region info is collected via ops' parent blocks to be
                # used later when the region's enclosing op is traversed
                parent_region = parent_block.get_parent()
                if parent_region is not None:
                    region_id_to_block_ids[parent_region.id()].append(parent_block_id)

        nonlocal next_fake_intermediate

        if name == "tt.func":
            # for function ops: gather and inline
            # the ops from all child blocks
            fn_ops = defaultdict(list)
            for child_block_id in child_block_ids:
                for result, block_fn_ops in op_stack.pop(child_block_id).items():
                    for block_fn_op in block_fn_ops:
                        fn_ops[result].append(block_fn_op)

            # replace the corresponding Intermediates in the
            # child op args with the function args (Params)
            for i, idx in enumerate(block_id_to_block_arg_ids[child_block_ids[0]]):
                replacements[idx] = Param(i)

            for fn_op_list in fn_ops.values():
                for fn_op in fn_op_list:
                    for i in range(len(fn_op.args)):
                        arg = fn_op.args[i]
                        seen = set()  # to break cycles
                        # there can be transitive replacements, but likely
                        # no cycles (we keep the `seen` set just in case)
                        while (
                            isinstance(arg, Intermediate)
                            and arg.idx in replacements
                            and arg.idx not in seen
                        ):
                            seen.add(arg.idx)
                            arg = fn_op.args[i] = replacements[arg.idx]

            # next function capture starts
            # with empty replacements
            replacements.clear()

            fn_name = op.get_str_attr("sym_name")
            functions[fn_name] = fn_ops
        elif child_block_ids:
            if name in {"scf.if", "scf.for", "scf.while", "tt.reduce", "tt.scan"}:
                # for blocked ops: inline the enclosed ops into
                # the parent block + rewire the last op in each
                # child block to return the block result
                return_ops = []
                for block_id in child_block_ids:
                    if name == "scf.for":
                        # example:
                        # %result = scf.for %iv = %lb to %ub step %step iter_args(%arg = %init) -> (i32) ...
                        # block args: 2 (%iv, %arg)
                        # op operands: 4 (%lb, %ub, %step, %init)
                        # `%arg` is mapping to `%init`
                        for i, idx in enumerate(block_id_to_block_arg_ids[block_id]):
                            if i == 0:
                                # the induction variable has no producing op;
                                # give it a fake Intermediate
                                next_fake_intermediate -= 1
                                replacements[idx] = Intermediate(next_fake_intermediate)
                            else:
                                replacements[idx] = Intermediate(operand_ids[i + 2])
                    elif name == "scf.while":
                        # example:
                        # %3:3 = scf.while (%arg2 = %1, %arg3 = %2, %arg4 = %c0_i32_8) ...
                        # block args: 3 (%arg2, %arg3, %arg4)
                        # op operands: 3 (%1, %2, %c0_i32_8)
                        # `%arg2` is mapping to `%1`, `%arg3` is mapping to `%2`, ...
                        for i, idx in enumerate(block_id_to_block_arg_ids[block_id]):
                            replacements[idx] = Intermediate(operand_ids[i])
                    elif name == "scf.if":
                        # the scf block args are ignored by the pass. but, as they
                        # may be used as operands of the ops inside the block
                        # (and nested blocks inlined in the current block by now),
                        # they are replaced by new fake Intermediates to avoid "this
                        # operand is not returned by any other op in the fn" error
                        # in the downstream analysis
                        for idx in block_id_to_block_arg_ids[block_id]:
                            next_fake_intermediate -= 1
                            replacements[idx] = Intermediate(next_fake_intermediate)
                    else:
                        assert name in ("tt.reduce", "tt.scan")
                        # wire the block arguments to the op arguments
                        num_operands = len(operand_ids)
                        block_arg_ids = block_id_to_block_arg_ids[block_id]
                        assert len(block_arg_ids) == 2 * num_operands, (
                            f"{name} is expected to have twice as "
                            "many block arguments as op arguments: "
                            f"{operand_ids=}, {block_arg_ids=}."
                        )
                        for i, idx in enumerate(block_arg_ids):
                            # for a tt.reduce/tt.scan op with N arguments, the block
                            # arguments comprise N reduced values followed by
                            # N current values corresponding to the N op args
                            replacements[idx] = Intermediate(
                                operand_ids[i % num_operands]
                            )

                    if block_id in op_stack:
                        block_ops = op_stack.pop(block_id)
                        if not block_ops:
                            continue
                        last_ret, last_ops = block_ops.popitem()
                        if all(
                            op.name
                            in ("scf.yield", "tt.reduce.return", "tt.scan.return")
                            for op in last_ops
                        ):
                            # if last_ops are all return ops, treat them separately
                            return_ops.extend(last_ops)
                        else:
                            # otherwise, return last_ops to the block
                            block_ops[last_ret] = last_ops
                        for op_result, child_ops in block_ops.items():
                            op_stack[parent_block_id][op_result].extend(child_ops)

                scf_results = [Intermediate(idx) for idx in result_ids]

                if return_ops and all(
                    (op.name == "scf.yield" and len(result_ids) == len(op.args))
                    for op in return_ops
                ):
                    # [Note: scf.yield fix-up]
                    #
                    # TL;DR: if our scf.yield takes N args, then we'll create N scf.yield ops to handle each of the
                    # args.
                    #
                    # **Context**:
                    # During mutation analysis, the analysis pass will identify mutating ops (e.g. tt.store)
                    # and then DFS upwards towards the parameters of the function. Specifically, the analysis pass
                    # looks at the mutated arg in tt.store; then looks for its source ops; and then recurses on the
                    # arguments to each of the source ops.
                    #
                    # In the case of scf.if/scf.for, we may have multiple return ops, each passed as an arg
                    # to scf.yield:
                    #
                    #   %18:2 = scf.if %... -> (!tt.ptr<f32>, !tt.ptr<f32>) {
                    #     ...
                    #     scf.yield %1, %2
                    #   } else {
                    #     scf.yield %3, %4
                    #   }
                    #
                    # And for each of the returns of the scf.if, we'd naively assign the source op of each of the
                    # return values to be the scf.yields. But the scf.yields take _all_ the returns as arguments.
                    # Therefore, if _any_ of the return values of the scf.if are mutated, then the analysis pass
                    # would mark _all_ of the yield args as mutated.
                    #
                    # **Solution**:
                    # For the purposes of this analysis pass, we create N yield ops - one for each
                    # return-val/yield-arg. In the example above, we'll have two scf.yield's for each branch of the
                    # scf.if.

                    for return_op in return_ops:
                        for i, (scf_result, yield_arg) in enumerate(
                            zip(scf_results, return_op.args)
                        ):
                            sub_yield_op = Op(
                                return_op.name,
                                return_op.fn_call_name,
                                [yield_arg],
                                return_op.ret,
                                sub_idx=i,
                            )
                            op_stack[parent_block_id][scf_result].append(sub_yield_op)

                else:
                    # fallback: every scf result is attributed to every return op
                    for scf_result in scf_results:
                        for return_op in return_ops:
                            op_stack[parent_block_id][scf_result].append(return_op)
            else:
                raise RuntimeError(
                    f"Unknown blocked function: {name}. Can't capture the TTIR."
                )
        else:
            # plain (region-free) op: record it under each of its results
            callee = None
            if name == "tt.call":
                callee = op.get_flat_symbol_ref_attr("callee")
            args: list[Union[Param, Intermediate]] = [
                Intermediate(operand) for operand in operand_ids
            ]
            block_ops = op_stack[parent_block_id]

            is_pure = False
            # Handle the case for tt.elementwise_inline_asm to set `is_pure` for mutation analysis
            if name == "tt.elementwise_inline_asm":
                is_pure = op.get_bool_attr("pure")

            if result_ids:
                for result_id in result_ids:
                    res = Intermediate(result_id)
                    block_ops[res].append(Op(name, callee, args, res, is_pure=is_pure))
            else:
                # ops with no results (e.g. tt.store) still need a slot in the
                # dataflow map; synthesize a fake result for them
                next_fake_intermediate -= 1
                fake_res = Intermediate(next_fake_intermediate)
                block_ops[fake_res].append(
                    Op(name, callee, args, fake_res, is_pure=is_pure)
                )

    ttir_module.walk(mlir_to_functions)

    return functions
731
+
732
+
733
class MemoizeWithCycleCheck:
    """Memoizing wrapper keyed on ``(fn_name, *args)`` that also rejects
    recursion: the cache slot is pre-seeded with ``None`` before the wrapped
    function runs, so a re-entrant call on the same key observes the
    placeholder and raises ``RuntimeError``.
    """

    fn: Callable[..., Any]
    cache: dict[tuple[Any], Any]

    def __init__(self, fn: Callable[..., Any]) -> None:
        self.fn = fn
        self.reset()

    def __call__(
        self,
        functions: dict[str, dict[Intermediate, list[Op]]],
        fn_name: str,
        *args: Any,
    ) -> list[bool]:
        cache_key: tuple[Any, ...] = (fn_name, *args)
        if cache_key not in self.cache:
            # Placeholder marks this key as "in progress"; a recursive call
            # with the same key will see it and bail out below.
            self.cache[cache_key] = None
            self.cache[cache_key] = self.fn(functions, fn_name, *args)
        value = self.cache[cache_key]
        if value is None:
            raise RuntimeError("Recursion is not supported")
        return value

    def reset(self) -> None:
        """Drop all memoized results (and any in-progress markers)."""
        self.cache = {}
757
+
758
+
759
@MemoizeWithCycleCheck
def get_tma_stores(
    functions: dict[str, dict[Intermediate, list[Op]]], fn_name: str
) -> set[Union[Intermediate, Param]]:
    """
    Identifies all intermediates and parameters that are written to by a
    `tt.experimental_descriptor_store`. It tracks only the specific values
    written to via experimental_descriptor_store and the input values to
    `tt.reinterpret_tensor_descriptor` used to construct the direct inputs
    to tt.experimental_descriptor_store - not any recursive values
    used to construct those values.

    For example: for
      tt.reinterpret_tensor_descriptor(Intermediate(idx=0), ...)
      Intermediate(idx=1) = tt.experimental_descriptor_store(Intermediate(idx=0), ...)
    this function will return [Intermediate(idx=0), Intermediate(idx=1)],

    However
      Intermediate(idx=4) = arith.addptr(Intermediate(idx=2), Intermediate(idx=3))
      Intermediate(idx=5) = tt.experimental_descriptor_store(Intermediate(idx=4), ...)
      tt.experimental_descriptor_store(Intermediate(idx=5), ...)
    this function will mark only idx=4 and idx=5 (but not idx=2 or idx=3)

    If an intermediate/parameter is passed into a function and is written to
    via experimental_descriptor_store within that function, the argument to the
    function will also be marked.
    """

    result: set[Union[Intermediate, Param]] = set()

    ops = functions[fn_name]
    for op_list in ops.values():
        for op in op_list:
            if op.name == "tt.call":
                # propagate through calls: if the callee TMA-stores through
                # parameter i, mark the corresponding argument here as well
                assert op.fn_call_name in functions
                tma_stores = get_tma_stores(functions, op.fn_call_name)
                for i, inp in enumerate(op.args):
                    if Param(idx=i) in tma_stores:
                        result.add(inp)
            elif op.name == "tt.experimental_descriptor_store":
                assert len(op.args) >= 1
                result.add(op.args[0])

    # one extra (non-recursive) hop: also mark the inputs of any
    # tt.reinterpret_tensor_descriptor that produced a stored-to value;
    # iterate over a snapshot since we mutate `result` inside the loop
    for val in list(result):
        if val in ops:
            if not isinstance(val, Intermediate):
                continue
            for op in ops[val]:
                if op.name == "tt.reinterpret_tensor_descriptor":
                    assert len(op.args) >= 1
                    result.add(op.args[0])

    return result
812
+
813
+
814
@MemoizeWithCycleCheck
def analyze_kernel_mutations(
    functions: dict[str, dict[Intermediate, list[Op]]], fn_name: str, num_args: int
) -> list[bool]:
    """
    Analyzes the graph to detect all sinks from a predefined list of sinks
    by using triton's MemWrite trait list. NOTE: What if triton exposed this?
    From each sink, it traverses the CFG backwards to identify all the input
    pointers that are mutated.

    Returns a list of length `num_args`; entry i is True iff the i-th kernel
    parameter is (potentially) mutated.
    """
    # Name of mutation op to mutated parameter indices
    # List from Triton Github include/triton/Dialect/Triton/IR/TritonOps.td
    # All the OPs that have MemWrite trait.
    # What if Triton exposed this?
    MUTATION_OPS = {
        "tt.store": [0],
        "tt.atomic_cas": [0],
        "tt.atomic_rmw": [0],
        "tt.experimental_descriptor_store": [0],
        "tt.experimental_tensormap_create": [0],
        "tt.descriptor_store": [0],
    }
    # Ops that we want to bail out on
    UNKNOWN_OPS = {"tt.elementwise_inline_asm"}

    stack: list[Union[Param, Intermediate]] = []
    visited = set()
    ops = functions[fn_name]
    tma_stores = get_tma_stores(functions, fn_name)

    # Phase 1: seed the stack with every value directly written by a
    # known-mutating op (or by a mutated callee parameter).
    for op_list in ops.values():
        for op in op_list:
            # If we encounter an operation with effects that cannot be reliably analyzed
            # (e.g. `tt.elementwise_inline_asm`), we assume it does not mutate any input parameters.
            if op.name in UNKNOWN_OPS:
                if op.name == "tt.elementwise_inline_asm" and op.is_pure:
                    log.warning(
                        "TTIR mutation analysis: Skipping pure tt.elementwise_inline_asm op (is_pure=True)"
                    )
                    continue
                raise RuntimeError(
                    f"ttir analysis hit an op we do not know how to analyze: {op.name}"
                )

            if op.name == "tt.experimental_tensormap_create":
                # Note: this is how we implement experimental_descriptor_store mutation analysis.
                # for on-device TMA.
                # experimental_tensormap_store(a, b, ...) stores b to the location specified
                # by descriptor in the memory of a.
                # To track this, we first find all the intermediates/params to which we store via
                # experimental_tensormap_store (get_tma_stores, called above). Then, during this
                # analysis we wait to find the corresponding experimental_tensormap_create (if it
                # exists), at which point we will mark the global_ptr as mutated (as done below).
                assert len(op.args) >= 2
                if op.args[0] in tma_stores:
                    stack.append(op.args[1])

            if op.name == "tt.call":
                assert op.fn_call_name in functions
                mutations = analyze_kernel_mutations(
                    functions, op.fn_call_name, len(op.args)
                )
                stack.extend(arg for arg, mutated in zip(op.args, mutations) if mutated)
            else:
                stack.extend(op.args[idx] for idx in MUTATION_OPS.get(op.name, []))

    # Phase 2: walk the dataflow graph backwards from each seed.
    # The following is an iterative DFS algorithm
    mutated = [False] * num_args
    while stack:
        arg = stack.pop()
        if arg in visited:
            continue

        visited.add(arg)

        if isinstance(arg, Param):
            if arg.idx >= num_args:
                # This is an argument defined in the kernel, not passed in
                continue
            mutated[arg.idx] = True
        elif isinstance(arg, Intermediate) and not arg.fake():
            for op in ops[arg]:
                # Skip arguments to load
                if op.name != "tt.load":
                    stack.extend(op.args)
    return mutated
900
+
901
+
902
def identify_mutated_tensors(
    kernel: "TritonKernelType",
    kwargs: dict[str, Any],
    tma_descriptor_metadata: TMADescriptorMetadata,
) -> list[str]:
    """
    Given a triton kernel and the arguments for this kernel, this function
    1) Retrieves the TTIR converted version of the kernel from Triton's API.
    2) Parses the TTIR and creates a control flow graph
    3) Analyzes the graph to detect all input tensor mutations

    Returns the names of the (potentially) mutated kwargs. On any failure it
    falls back to the conservative answer: every tensor kwarg is mutated.
    """

    ttir_module = None
    functions = None
    try:
        ttir_module, ordered_tensor_names = generate_ttir(
            kernel, kwargs, tma_descriptor_metadata
        )

        # extract functions from TTIR using MLIR bindings exposed by Triton code
        functions = ttir_to_functions(ttir_module)

        assert functions is not None
        kernel_name = next(iter(functions.keys()))
        # Triton codegen modifies the name
        assert kernel.fn.__name__ in kernel_name
        # Reset the cache between top level invocations
        # The cache for analyze kernel mutations is mainly used for cycle
        # detection, so each top level invocation needs a clean cache
        analyze_kernel_mutations.reset()
        get_tma_stores.reset()
        mutations = analyze_kernel_mutations(
            functions, kernel_name, len(ordered_tensor_names)
        )

        return [
            ordered_tensor_names[i] for i, mutated in enumerate(mutations) if mutated
        ]
    except Exception:
        # Deliberate best-effort behavior: never fail compilation because the
        # analysis failed; assume everything is mutated instead.
        log.warning(
            "Encountered an exception in identify_mutated_tensors, assuming every input is mutated",
            exc_info=True,
        )
        if ttir_module is not None:
            log.debug("TTIR:\n%s", str(ttir_module))
        if functions is not None:
            log.debug("functions:")
            for name, fn in functions.items():
                log.debug("===\t%s\t===", name)
                for ret, ops in fn.items():
                    log.debug("%s\t=>\t%s", ret, ops)
        return [key for key, value in kwargs.items() if isinstance(value, Tensor)]
954
+
955
+
956
+ ###############################################################################
957
+ # Triton Kernel Wrappers
958
+
959
+
960
# Used for wrapping a Triton Kernel
class TritonKernelWrapperMutation(HigherOrderOperator):
    """HOP that runs a user-defined Triton kernel, mutating the tensors in
    `kwargs` in place; it produces no functional outputs."""

    def __init__(self) -> None:
        super().__init__("triton_kernel_wrapper_mutation", cacheable=True)

    def __call__(
        self,
        kernel_idx: int,
        constant_args_idx: int,
        grid: list["TritonGridType"],
        tma_descriptor_metadata: TMADescriptorMetadata,
        kwargs: dict[str, Any],
    ) -> Any:
        # Forward everything by keyword so downstream dispatch always sees
        # the same named argument set.
        hop_kwargs = dict(
            kernel_idx=kernel_idx,
            constant_args_idx=constant_args_idx,
            grid=grid,
            tma_descriptor_metadata=tma_descriptor_metadata,
            kwargs=kwargs,
        )
        return super().__call__(**hop_kwargs)


triton_kernel_wrapper_mutation = TritonKernelWrapperMutation()
983
+
984
+
985
# Used for wrapping a Triton Kernel in a functional manner
class TritonKernelWrapperFunctional(HigherOrderOperator):
    """Functional variant of the Triton-kernel HOP: instead of mutating its
    inputs it returns fresh tensors for the entries in `tensors_to_clone`."""

    def __init__(self) -> None:
        super().__init__("triton_kernel_wrapper_functional", cacheable=True)

    def __call__(
        self,
        kernel_idx: int,
        constant_args_idx: int,
        grid: list["TritonGridType"],
        tma_descriptor_metadata: TMADescriptorMetadata,
        kwargs: dict[str, Any],
        tensors_to_clone: list[str],
    ) -> dict[str, Any]:
        # Forward everything by keyword so downstream dispatch always sees
        # the same named argument set.
        hop_kwargs = dict(
            kernel_idx=kernel_idx,
            constant_args_idx=constant_args_idx,
            grid=grid,
            tma_descriptor_metadata=tma_descriptor_metadata,
            kwargs=kwargs,
            tensors_to_clone=tensors_to_clone,
        )
        return super().__call__(**hop_kwargs)


triton_kernel_wrapper_functional = TritonKernelWrapperFunctional()
1010
+
1011
+
1012
@triton_kernel_wrapper_mutation.py_impl(DispatchKey.CompositeExplicitAutograd)
def triton_kernel_wrapper_mutation_dense(
    *,
    kernel_idx: int,
    constant_args_idx: int,
    grid: list["TritonGridType"],
    tma_descriptor_metadata: TMADescriptorMetadata,
    kwargs: dict[str, Any],
) -> None:
    """Eager (dense) implementation: resolve the kernel and its constant args
    from the side table, build the grid function, materialize TMA descriptors
    if requested, and actually launch the Triton kernel."""
    from torch._inductor.codegen.wrapper import user_defined_kernel_grid_fn_code

    kernel = kernel_side_table.get_kernel(kernel_idx)
    constant_args = kernel_side_table.get_constant_args(constant_args_idx)

    if len(grid) == 1:
        grid_fn = grid[0]
    else:
        # multiple grids (one per autotuner config): generate and exec a
        # dispatching grid function
        fn_name, code = user_defined_kernel_grid_fn_code(
            kernel.fn.__name__, kernel.configs, grid
        )
        namespace: dict[str, Any] = {}
        exec(code, namespace)
        grid_fn = namespace[fn_name]

    if tma_descriptor_metadata:
        # as we need to launch the kernel here, we "unwrap" the
        # tma_descriptor_metadata, create the TMA descriptors
        # from it, and replace the tensors in the kwargs by the
        # corresponding TMA descriptors before launching
        kwargs = kwargs.copy()
        for k, v in tma_descriptor_metadata.items():
            tensor = kwargs[k]
            if (exp_meta := maybe_unpack_tma_experimental_metadata(v)) is not None:
                # older "experimental" TMA API
                from triton.tools.experimental_descriptor import (  # noqa: F401
                    create_1d_tma_descriptor,
                    create_2d_tma_descriptor,
                )

                dims, block_dims, element_size = exp_meta
                create_tma_descriptor = (
                    create_1d_tma_descriptor
                    if len(dims) == 1
                    else create_2d_tma_descriptor
                )
                kwargs[k] = create_tma_descriptor(
                    tensor.data_ptr(),
                    *dims,
                    *block_dims,
                    element_size,
                )
            else:
                # newer "stable" TMA API
                stable_meta = maybe_unpack_tma_stable_metadata(v)
                assert stable_meta is not None
                from triton.tools.tensor_descriptor import TensorDescriptor

                block_shape = stable_meta[0]
                kwargs[k] = TensorDescriptor.from_tensor(tensor, block_shape)

    # move as many positional arguments from dicts to args as we
    # can to circumvent the bug with the kwargs and pre_/post_hook:
    # https://github.com/triton-lang/triton/issues/5082
    # TODO: remove this when the Triton issue above is fixed
    args = []
    # copy kwargs and constant_args here to
    # avoid mutating the original inputs
    kwargs = kwargs.copy()
    constant_args = constant_args.copy()
    for name in kernel.arg_names:
        if name in kwargs:
            args.append(kwargs.pop(name))
        elif name in constant_args:
            args.append(constant_args.pop(name))
        else:
            # first argument not found positionally: stop converting, the
            # rest stay as keywords
            break

    kernel[grid_fn](*args, **kwargs, **constant_args)
1088
+
1089
+
1090
@triton_kernel_wrapper_mutation.py_impl(FakeTensorMode)
def triton_kernel_wrapper_mutation_fake_tensor_mode(
    mode: FakeTensorMode,
    *,
    kernel_idx: int,
    constant_args_idx: int,
    grid: list["TritonGridType"],
    tma_descriptor_metadata: TMADescriptorMetadata,
    kwargs: dict[str, Any],
) -> None:
    """Fake-tensor implementation: the mutation HOP has no outputs, so under
    FakeTensorMode there is nothing to compute."""
    with mode:
        pass
    return None
1102
+
1103
+
1104
@triton_kernel_wrapper_mutation.py_impl(DispatchKey.Meta)
def _(
    *,
    kernel_idx: int,
    constant_args_idx: int,
    grid: list["TritonGridType"],
    tma_descriptor_metadata: TMADescriptorMetadata,
    kwargs: dict[str, Any],
) -> None:
    # Meta implementation: the mutation HOP has no outputs and meta tensors
    # carry no data, so this is a no-op.
    return None
1114
+
1115
+
1116
def trace_triton_kernel_wrapper(
    proxy_mode: ProxyTorchDispatchMode,
    func_overload: Callable[..., Any],
    node_args: dict[str, Any],
) -> Optional[dict[str, Any]]:
    """Run `func_overload(**node_args)` for real (with proxy tracing
    disabled), record an equivalent `call_function` node into the proxy
    graph, and associate the concrete outputs with the new node's proxies."""
    tracer = proxy_mode.tracer

    # Execute eagerly so the HOP's actual effect happens exactly once.
    with disable_proxy_modes_tracing():
        concrete_out = func_overload(**node_args)

    # Mirror the call into the traced graph, mapping tensor args back to
    # their proxies.
    unwrapped_args = pytree.tree_map(tracer.unwrap_proxy, node_args)  # type: ignore[union-attr]
    graph_proxy = tracer.create_proxy(
        "call_function",
        func_overload,
        (),
        unwrapped_args,
        name=f"{func_overload.__name__}_proxy",
    )

    return track_tensor_tree(concrete_out, graph_proxy, constant=None, tracer=tracer)
1137
+
1138
+
1139
@triton_kernel_wrapper_mutation.py_impl(ProxyTorchDispatchMode)
def triton_kernel_wrapper_mutation_proxy_torch_dispatch_mode(
    mode: ProxyTorchDispatchMode,
    *,
    kernel_idx: int,
    constant_args_idx: int,
    grid: list["TritonGridType"],
    tma_descriptor_metadata: TMADescriptorMetadata,
    kwargs: dict[str, Any],
) -> None:
    """Proxy-mode implementation: record the mutation HOP into the traced
    graph; the HOP itself returns nothing."""
    node_args = dict(
        kernel_idx=kernel_idx,
        constant_args_idx=constant_args_idx,
        grid=grid,
        tma_descriptor_metadata=tma_descriptor_metadata,
        kwargs=kwargs,
    )
    trace_triton_kernel_wrapper(mode, triton_kernel_wrapper_mutation, node_args)
    return None
1162
+
1163
+
1164
def get_mutated_tensors(
    kernel_idx: int,
    constant_args_idx: int,
    kwargs: dict[str, Any],
    tma_descriptor_metadata: TMADescriptorMetadata,
) -> list[str]:
    """Resolve the kernel and its constant args from the side table, then run
    the TTIR mutation analysis over the combined argument set."""
    triton_kernel = kernel_side_table.get_kernel(kernel_idx)
    const_args = kernel_side_table.get_constant_args(constant_args_idx)
    combined_args = {**kwargs, **const_args}
    return identify_mutated_tensors(
        triton_kernel, combined_args, tma_descriptor_metadata
    )
1175
+
1176
+
1177
@triton_kernel_wrapper_mutation.py_functionalize_impl
def triton_kernel_wrapper_mutation_functionalize(
    ctx: "BaseFunctionalizeAPI",
    kernel_idx: int,
    constant_args_idx: int,
    grid: list["TritonGridType"],
    tma_descriptor_metadata: TMADescriptorMetadata,
    kwargs: dict[str, Any],
) -> None:
    """Functionalization: rewrite the mutating HOP into the functional HOP,
    then propagate the functional outputs back into the original (wrapped)
    inputs via the functionalization context."""
    unwrapped_kwargs = ctx.unwrap_tensors(kwargs)  # type: ignore[arg-type]
    # TODO(oulgen): Preexisting bug, if two kernel inputs are views of each
    # other, and one gets mutated in kernel, and later another gets mutated,
    # they are no longer equal. Fix this by graph breaking on this condition
    # earlier in dynamo.
    tensors_to_clone = get_mutated_tensors(
        kernel_idx, constant_args_idx, unwrapped_kwargs, tma_descriptor_metadata
    )
    with ctx.redispatch_to_next():
        unwrapped_outputs = triton_kernel_wrapper_functional(
            kernel_idx=kernel_idx,
            constant_args_idx=constant_args_idx,
            grid=grid,
            tma_descriptor_metadata=tma_descriptor_metadata,
            kwargs=unwrapped_kwargs,
            tensors_to_clone=tensors_to_clone,
        )

    # every functional output must correspond to one of the original kwargs
    assert set(unwrapped_outputs.keys()).issubset(set(kwargs.keys()))
    for key, output_arg in unwrapped_outputs.items():
        if not isinstance(output_arg, Tensor):
            continue
        input_arg = kwargs[key]
        assert isinstance(input_arg, Tensor)

        ctx.replace(input_arg, output_arg)
        # indicate that above replace is hidden from autograd
        ctx.mark_mutation_hidden_from_autograd(input_arg)
        ctx.commit_update(input_arg)
        ctx.sync(input_arg)
    return None
1217
+
1218
+
1219
@triton_kernel_wrapper_functional.py_impl(DispatchKey.CompositeExplicitAutograd)
def triton_kernel_wrapper_functional_dense(
    *,
    kernel_idx: int,
    constant_args_idx: int,
    grid: list["TritonGridType"],
    tma_descriptor_metadata: TMADescriptorMetadata,
    kwargs: dict[str, Any],
    tensors_to_clone: list[str],
) -> dict[str, Any]:
    """Eager implementation of the functional HOP: clone the to-be-mutated
    tensors, run the mutating HOP on the clones, and return the clones.

    TODO(oulgen): For performance reasons, we want to ensure that these
    `clone_preserve_strides` calls are never executed at runtime
    (inductor should always optimize them away).
    Requires https://github.com/pytorch/pytorch/issues/109240
    """
    staged_kwargs: dict[str, Any] = {}
    for name, value in kwargs.items():
        if name in tensors_to_clone:
            staged_kwargs[name] = clone_preserve_strides(value)
        else:
            staged_kwargs[name] = value

    triton_kernel_wrapper_mutation(
        kernel_idx=kernel_idx,
        constant_args_idx=constant_args_idx,
        grid=grid,
        tma_descriptor_metadata=tma_descriptor_metadata,
        kwargs=staged_kwargs,
    )

    # Only the cloned (mutated) entries are the functional outputs.
    return {
        name: value
        for name, value in staged_kwargs.items()
        if name in tensors_to_clone
    }
1245
+
1246
+
1247
@triton_kernel_wrapper_functional.py_impl(FakeTensorMode)
def triton_kernel_wrapper_functional_fake_tensor_mode(
    mode: FakeTensorMode,
    *,
    kernel_idx: int,
    constant_args_idx: int,
    grid: list["TritonGridType"],
    tma_descriptor_metadata: TMADescriptorMetadata,
    kwargs: dict[str, Any],
    tensors_to_clone: list[str],
) -> dict[str, Any]:
    """Fake-tensor implementation: produce fake clones for exactly the
    entries the kernel would mutate; no kernel is launched.

    TODO(oulgen): For performance reasons, we want to ensure that these
    `clone_preserve_strides` calls are never executed at runtime
    (inductor should always optimize them away).
    Requires https://github.com/pytorch/pytorch/issues/109240
    """
    with mode:
        fake_outputs: dict[str, Any] = {}
        for name, value in kwargs.items():
            if name in tensors_to_clone:
                fake_outputs[name] = clone_preserve_strides(value)
        return fake_outputs
1268
+
1269
+
1270
@triton_kernel_wrapper_functional.py_impl(ProxyTorchDispatchMode)
def triton_kernel_wrapper_functional_proxy_torch_dispatch_mode(
    mode: ProxyTorchDispatchMode,
    *,
    kernel_idx: int,
    constant_args_idx: int,
    grid: list["TritonGridType"],
    tma_descriptor_metadata: TMADescriptorMetadata,
    kwargs: dict[str, Any],
    tensors_to_clone: list[str],
) -> dict[str, Any]:
    """Proxy-mode implementation: record the functional HOP into the traced
    graph and return its tracked outputs."""
    node_args = dict(
        kernel_idx=kernel_idx,
        constant_args_idx=constant_args_idx,
        grid=grid,
        tma_descriptor_metadata=tma_descriptor_metadata,
        kwargs=kwargs,
        tensors_to_clone=tensors_to_clone,
    )
    traced = trace_triton_kernel_wrapper(
        mode, triton_kernel_wrapper_functional, node_args
    )
    assert traced is not None
    return traced
1295
+
1296
+
1297
@triton_kernel_wrapper_functional.py_functionalize_impl
def triton_kernel_wrapper_functional_functionalize(
    ctx: "BaseFunctionalizeAPI",
    kernel_idx: int,
    constant_args_idx: int,
    grid: list["TritonGridType"],
    tma_descriptor_metadata: TMADescriptorMetadata,
    kwargs: dict[str, Any],
    tensors_to_clone: list[str],
) -> dict[str, Any]:
    """Functionalization: unwrap the functional-tensor kwargs, redispatch the
    functional HOP underneath, and re-wrap its outputs."""
    inner_kwargs = ctx.unwrap_tensors(kwargs)  # type: ignore[arg-type]
    with ctx.redispatch_to_next():
        inner_outputs = triton_kernel_wrapper_functional(
            kernel_idx=kernel_idx,
            constant_args_idx=constant_args_idx,
            grid=grid,
            tma_descriptor_metadata=tma_descriptor_metadata,
            kwargs=inner_kwargs,
            tensors_to_clone=tensors_to_clone,
        )
    return ctx.wrap_tensors(inner_outputs)  # type: ignore[return-value,arg-type]
1318
+
1319
+
1320
# The two HOPs do not participate in these dispatch keys; register explicit
# fallthroughs so the dispatcher keeps resolving down to the real impls.
# Fix: the duplicate AutogradCUDA registration for the functional HOP has
# been removed (it was registered twice, which is at best redundant).
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.PythonDispatcher)  # type: ignore[attr-defined]
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.PythonTLSSnapshot)  # type: ignore[attr-defined]
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.ADInplaceOrView)
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.BackendSelect)
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutocastCPU)  # type: ignore[attr-defined]
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutocastCUDA)  # type: ignore[attr-defined]
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutogradCUDA)
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutogradCPU)

triton_kernel_wrapper_functional.fallthrough(DispatchKey.PythonDispatcher)  # type: ignore[attr-defined]
triton_kernel_wrapper_functional.fallthrough(DispatchKey.PythonTLSSnapshot)  # type: ignore[attr-defined]
triton_kernel_wrapper_functional.fallthrough(DispatchKey.ADInplaceOrView)
triton_kernel_wrapper_functional.fallthrough(DispatchKey.BackendSelect)
triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutocastCPU)  # type: ignore[attr-defined]
triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutocastCUDA)  # type: ignore[attr-defined]
triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutogradCUDA)
triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutogradCPU)
1338
+
1339
+
1340
+ ###############################################################################
1341
+ # The "TritonHOPifier": a class that transforms a call to a triton kernel into
1342
+ # a call to the triton_kernel_wrapper_mutation HOP.
1343
+
1344
+
1345
class TritonHOPifier:
    """Orchestrator for converting a user-defined triton kernel into a call
    to the triton_kernel_wrapper_mutation HOP.

    It has two main use cases.

    1. When Dynamo sees a triton kernel, it wraps it into a TritonKernelVariable
    and uses the TritonHOPifier to convert calls to the TritonKernelVariable
    into a call to the HOP.

    2. In order to capture a user-defined triton kernel while performing
    tracing (via make_fx or non-strict export), a user must annotate their
    triton kernel with the `wrap_triton` decorator. The decorator uses
    TritonHOPifier to convert calls to the triton kernel into a call
    to the HOP (which can then be traced).

    Because Dynamo has its own calling conventions for e.g. invoking a user-defined function
    TritonHOPifier is an abstract class that can be overridden by its subclasses.
    """

    # ---- Abstract interface: each subclass supplies its own representation
    # ---- of values (Dynamo VariableTrackers vs. plain Python objects).

    def raise_unsupported(self, msg: str) -> Never:
        raise NotImplementedError("abstract method")

    def is_callable(self, maybe_callable: Any) -> bool:
        raise NotImplementedError("abstract method")

    def get_value(self, val: Any) -> Any:
        raise NotImplementedError("abstract method")

    def call_grid(  # type: ignore[no-untyped-def]
        self,
        grid,
        meta,
        tx,
    ) -> Union[tuple[Union[int, sympy.Expr, SymInt], ...], tuple["Proxy", ...]]:
        raise NotImplementedError("abstract method")

    def wrap_user_defined_obj(
        self,
        user_obj: Any,
        tx: Optional["InstructionTranslator"],
        variable: Optional[
            Union["TritonKernelVariable", "TraceableTritonKernelWrapper"]
        ],
        name: str,
    ) -> Any:
        raise NotImplementedError("abstract method")

    def call_user_defined_fn(
        self,
        user_fn: Callable[..., Any],
        args: list,
        kwargs: dict,
        tx: Optional["InstructionTranslator"],
        variable: Optional[
            Union["TritonKernelVariable", "TraceableTritonKernelWrapper"]
        ],
    ) -> Any:
        raise NotImplementedError("abstract method")

    def maybe_unpack_configs(
        self, configs: list["TritonConfig"], tx: Optional["InstructionTranslator"]
    ) -> list["TritonConfig"]:
        raise NotImplementedError("abstract method")

    def maybe_unpack_heuristic_result(self, result: Any) -> Any:
        raise NotImplementedError("abstract method")

    @staticmethod
    def do_prune_configs(  # type: ignore[no-untyped-def]
        autotuner: "TritonAutotunerType",
        early_config_prune: Optional[Callable],
        perf_model: Optional[Callable],
        top_k: float,
        configs: list,
        named_args: dict,
        kwargs: dict,
    ) -> list["TritonConfig"]:
        """Prune autotuning configs the same way triton's Autotuner would,
        without calling Autotuner.prune_configs (the `autotuner` parameter is
        accepted for signature parity but not referenced here)."""
        # Reimplement autotuner.prune_configs(...) here
        # see: https://github.com/triton-lang/triton/blob/e57b46897191b3b3061c78d0d60e58e94be565b6/python/triton/runtime/autotuner.py  # noqa: E501,B950
        # We do this to avoid calling prune_configs, which in turn calls early_config_prune and perf_model
        # These are both user-defined functions which can contain side effects, so we want to sandbox them in Dynamo

        if early_config_prune:
            configs = early_config_prune(configs, named_args, **kwargs)

        if perf_model:
            # we assert top_k is a float before calling this
            if isinstance(top_k, float) and top_k <= 1.0:
                top_k = int(len(configs) * top_k)
            elif not isinstance(top_k, int):
                """
                Slice index must be an integer, SupportsIndex or None
                """
                raise TypeError(
                    "Error while pruning configs, top_k must be either 1) a float <= 1.0 or 2) an int"
                )
            if len(configs) > top_k:
                est_timing = [
                    (
                        config,
                        float(
                            perf_model(**named_args, **kwargs, **config.all_kwargs())
                        ),
                    )
                    for config in configs
                ]
                configs = [
                    config[0]
                    for config in sorted(est_timing, key=operator.itemgetter(1))[:top_k]
                ]
        return configs

    def call_HOP(  # type: ignore[no-untyped-def]
        self,
        variable,
        grids,
        combined_args: dict[str, Any],
        tx,
    ) -> Optional["ConstantVariable"]:
        raise NotImplementedError("abstract method")

    def check_grid(  # type: ignore[no-untyped-def]
        self, grid
    ) -> Union[tuple[Union[int, sympy.Expr, SymInt], ...], tuple["Proxy", ...]]:
        raise NotImplementedError("abstract method")

    def init_variable(
        self,
        variable: Union["TraceableTritonKernelWrapper", "TritonKernelVariable"],
        kernel: "TritonKernelType",
        kernel_idx: Optional[int],
        grid: Optional["TritonGridType"],
    ) -> None:
        """Attach `kernel`/`kernel_idx`/`grid` to `variable`, registering the
        kernel in the side table, and reject Autotuner features we can't trace."""
        from triton.runtime.autotuner import Autotuner

        assert kernel is not None

        variable.kernel = kernel
        variable.kernel_idx = kernel_side_table.add_kernel(kernel)

        assert kernel_idx is None or variable.kernel_idx == kernel_idx

        variable.grid = grid

        if isinstance(kernel, Autotuner):
            import torch
            import torch._dynamo

            # We only support configs, keys, and restore_value arguments
            # of triton.autotune. Make sure other arguments are defaulted.
            defaults = inspect.signature(Autotuner.__init__).parameters
            # Newer version of triton change attribute name from warmup to num_warmup and rep to num_rep.
            # The call to get_first_attr is to maintain backward-compatibility.

            def defaults_ok(
                attr: str, alternates: tuple[str, ...], values: tuple[Any, ...]
            ) -> bool:
                if attr not in defaults:
                    return True
                value = torch._dynamo.utils.get_first_attr(kernel, attr, *alternates)
                if value == defaults[attr].default:
                    return True
                return value in values

            if (
                not torch._inductor.config.unsafe_ignore_unsupported_triton_autotune_args
                and (
                    not defaults_ok("num_warmups", ("warmup",), (25, None))
                    or not defaults_ok("num_reps", ("rep",), (100, None))
                    or not defaults_ok("use_cuda_graph", (), (False,))
                )
            ):
                self.raise_unsupported(
                    "Only configs, keys, restore_value, and reset_to_zero are supported for triton.autotune"
                )
            if (
                not torch._inductor.config.unsafe_ignore_unsupported_triton_autotune_args
                and (
                    # pre_hook requires running arbitrary code at runtime, which we cannot handle at this time
                    # https://github.com/pytorch/pytorch/issues/139059
                    # we can't support pre_hook or post_hook in user defined triton kernels at the moment,
                    # as they require the ability to execute code at runtime (AOTI can't support this)
                    (
                        hasattr(kernel, "user_defined_pre_hook")
                        and kernel.user_defined_pre_hook is not False
                    )
                    or (
                        hasattr(kernel, "user_defined_post_hook")
                        and kernel.user_defined_post_hook is not False
                    )
                    or (
                        # Check Config passed to autotuner in configs
                        any(cfg.pre_hook is not None for cfg in kernel.configs)
                    )
                )
            ):
                self.raise_unsupported(
                    "pre_hook and post_hook are not supported in triton.Autotune or triton.Config"
                )

    def call_getitem(
        self,
        variable: Union["TritonKernelVariable", "TraceableTritonKernelWrapper"],
        args: Sequence[Any],
    ) -> Union["TritonKernelVariable", "TraceableTritonKernelWrapper"]:
        """Handle `kernel[grid]`: return a new variable carrying the grid."""
        # __getitem__ should only be called if we don't already have a grid
        # Only grid needs to be passed
        if variable.grid is not None or len(args) != 1:
            self.raise_unsupported(
                "Triton kernels should be called with only a single grid"
            )

        return type(variable)(
            kernel=variable.kernel,
            kernel_idx=variable.kernel_idx,
            grid=args[0],
        )

    def call_run(
        self,
        variable: Union["TritonKernelVariable", "TraceableTritonKernelWrapper"],
        args: Sequence[Any],
        kwargs: dict[str, Any],
        tx: Optional["InstructionTranslator"],
    ) -> Optional["ConstantVariable"]:
        """Handle `kernel.run(*args, grid=...)` by rewriting it into the
        `kernel[grid](*args)` form and dispatching to call_triton_kernel."""
        if "grid" not in kwargs:
            self.raise_unsupported("Triton kernel requires to be called with a grid")
        grid = kwargs.pop("grid")
        kwargs.pop("warmup", None)
        # rewrite kernel.run(*args, grid=grid) to kernel[grid](*args)
        return self.call_triton_kernel(
            type(variable)(
                kernel=variable.kernel, kernel_idx=variable.kernel_idx, grid=grid
            ),
            args,
            kwargs,
            tx,
        )

    def call_triton_kernel(
        self,
        variable: Union["TritonKernelVariable", "TraceableTritonKernelWrapper"],
        args: Sequence[Any],
        kwargs: dict[str, Any],
        tx: Optional["InstructionTranslator"],
    ) -> Optional["ConstantVariable"]:
        """Main entry: normalize the kernel (heuristics, special config kwargs,
        config pruning — each normalization recurses with a rewrapped kernel),
        compute the grid(s), specialize constexpr args, then emit the HOP call."""
        from triton import JITFunction
        from triton.runtime.autotuner import autotune, Autotuner, Config, Heuristics

        # Check if num_ctas is in kwargs
        if "num_ctas" in kwargs:
            self.raise_unsupported(
                "Passing num_ctas directly to the Triton kernel is not supported. "
                "Please use a Config in @triton.autotune instead."
            )

        # Make sure the kernel has a grid
        if variable.grid is None:
            self.raise_unsupported("Triton kernels should always be called with a grid")

        # raise an exception if there are multiple @triton.autotune decorators
        iter_kernel = variable.kernel
        autotuner_count = 0
        while not isinstance(iter_kernel, JITFunction):
            if isinstance(iter_kernel, Autotuner):
                autotuner_count += 1
            if autotuner_count > 1:
                self.raise_unsupported(
                    "Passing multiple @triton.autotune decorators is not supported. "
                    "Please use a single @triton.autotune decorator instead."
                )
            iter_kernel = iter_kernel.fn

        # Process the @triton.heuristics decorator:
        # - We know there is only 1 autotuner decorator here
        # - We can apply the heuristic to all triton.Configs in the order that the decorators appear
        # This way, when the config is selected, the heuristics have already been applied.
        # - Decorators that appear *before* the autotuner are already processed correctly
        if isinstance(variable.kernel, Autotuner) and isinstance(
            variable.kernel.fn, Heuristics
        ):
            # unwrap the heuristics decorator, we don't need it anymore
            # variable.kernel ==> Autotuner
            # variable.kernel.fn ==> Heuristics
            # ...
            # There can be arbitrarily many heuristics wrappers here!
            # ...
            # variable.kernel.fn ==> JITFunction

            # Copy the configs, we are going to be modifying them
            new_configs = copy.deepcopy(variable.kernel.configs)

            named_args = dict(zip(variable.kernel.arg_names, args))

            # Iterate through all of the heuristics wrappers that come after the autotune wrapper
            iter_kernel = variable.kernel.fn
            while isinstance(iter_kernel, Heuristics):
                # For each config, apply the heuristic fn(s)
                for config_idx in range(len(new_configs)):
                    for kwarg_key, heuristic_fn in iter_kernel.values.items():
                        # Run heuristics on the combined configs + kwargs
                        heuristic_result = self.call_user_defined_fn(
                            heuristic_fn,
                            [
                                {
                                    **named_args,
                                    **kwargs,
                                    **new_configs[config_idx].__dict__["kwargs"],
                                },
                            ],
                            {},
                            tx,
                            variable,
                        )

                        # Update the kwargs in each config
                        # maybe_unpack_heuristic_result raises unsupported if the value is non-constant
                        new_configs[config_idx].__dict__["kwargs"][
                            kwarg_key
                        ] = self.maybe_unpack_heuristic_result(heuristic_result)

                iter_kernel = iter_kernel.fn
            assert isinstance(iter_kernel, JITFunction)
            prune_configs_by = {
                "perf_model": variable.kernel.perf_model,
                "early_config_prune": variable.kernel.early_config_prune,
                "configs_top_k": variable.kernel.configs_top_k,
            }
            new_kernel = autotune(
                configs=new_configs, key=[], prune_configs_by=prune_configs_by
            )(iter_kernel)
            # create a new variable to contain the new (wrapped) kernel;
            # skip kernel_idx to get a new record in the kernel side table
            new_var = type(variable)(new_kernel, None, variable.grid)
            return self.call_triton_kernel(new_var, args, kwargs, tx)

        SPECIAL_CONFIG_NAMES = {
            "num_warps",
            "num_stages",
            "num_ctas",
            "num_consumer_groups",
            "num_buffers_warp_spec",
        }

        # move special config names to configs out of kwargs
        special_kwargs = {}
        for name in SPECIAL_CONFIG_NAMES:
            if name in kwargs:
                # remove special kwargs from `kwargs`
                val = kwargs.pop(name)
                special_kwargs[name] = self.get_value(val)

        if special_kwargs:
            if isinstance(variable.kernel, Autotuner):
                # if there is Autotuner already, set
                # special kwargs to each of its configs
                new_configs = copy.deepcopy(variable.kernel.configs)
                for config in new_configs:
                    config.__dict__.update(special_kwargs)
                prune_configs_by = {
                    "perf_model": variable.kernel.perf_model,
                    "early_config_prune": variable.kernel.early_config_prune,
                    "configs_top_k": variable.kernel.configs_top_k,
                }

                new_kernel = autotune(
                    configs=new_configs, key=[], prune_configs_by=prune_configs_by
                )(variable.kernel.fn)
            else:
                # if there is no Autotuner, wrap the kernel into a
                # new one with a single config with special kwargs
                new_config = Config(kwargs={}, **special_kwargs)

                new_kernel = autotune(configs=[new_config], key=[])(variable.kernel)

            # create a new variable to contain the new (wrapped) kernel;
            # skip kernel_idx to get a new record in the kernel side table
            new_var = type(variable)(new_kernel, None, variable.grid)
            return self.call_triton_kernel(new_var, args, kwargs, tx)

        if isinstance(variable.kernel, Autotuner):
            special_param_names = []
            for name in SPECIAL_CONFIG_NAMES:
                if name in variable.kernel.fn.arg_names:
                    special_param_names.append(name)

            if special_param_names:
                # If the Triton kernel has SPECIAL_CONFIG_NAMES in parameters, those should
                # be passed from the kernel configs: the behavior of Triton runtime is that
                # those values get folded into the kernel arguments iff there are parameters
                # with the same name. Normally the values of those parameters are defined
                # outside the `kwargs` part of the autotuning configs. Here we move them to
                # the `kwargs` part (if they're absent there) to facilitate passing them as
                # arguments to the kernel downstream.
                updated = False
                new_configs = copy.deepcopy(variable.kernel.configs)
                for config in new_configs:
                    for name in special_param_names:
                        if name not in config.__dict__["kwargs"]:
                            assert (
                                name in config.__dict__
                            ), f"{name} must be in autotuning configs to be used as a kernel parameter"
                            config.__dict__["kwargs"][name] = config.__dict__[name]
                            updated = True

                if updated:
                    prune_configs_by = {
                        "perf_model": variable.kernel.perf_model,
                        "early_config_prune": variable.kernel.early_config_prune,
                        "configs_top_k": variable.kernel.configs_top_k,
                    }

                    new_kernel = autotune(
                        configs=new_configs, prune_configs_by=prune_configs_by, key=[]
                    )(variable.kernel.fn)
                    new_var = type(variable)(new_kernel, None, variable.grid)
                    return self.call_triton_kernel(new_var, args, kwargs, tx)

        # These are the default values in upstream Triton
        # see: https://github.com/triton-lang/triton/blob/e57b46897191b3b3061c78d0d60e58e94be565b6/python/triton/runtime/autotuner.py  # noqa: E501,B950
        default_perf_model = None
        default_early_config_prune = None

        # run prune_configs_by
        if isinstance(variable.kernel, Autotuner) and (
            variable.kernel.perf_model != default_perf_model
            or variable.kernel.early_config_prune != default_early_config_prune
        ):
            # Prune the configs
            named_args = dict(zip(variable.kernel.arg_names, args))

            # The source information is important here so the guards are installed correctly

            wrapped_early_configs_prune = self.wrap_user_defined_obj(
                variable.kernel.early_config_prune,
                tx,
                variable,
                "early_config_prune",
            )

            wrapped_perf_model = self.wrap_user_defined_obj(
                variable.kernel.perf_model, tx, variable, "perf_model"
            )

            wrapped_configs_top_k = self.wrap_user_defined_obj(
                variable.kernel.configs_top_k, tx, variable, "configs_top_k"
            )

            wrapped_configs = self.wrap_user_defined_obj(
                variable.kernel.configs, tx, variable, "configs"
            )

            pruned_configs = self.call_user_defined_fn(
                self.do_prune_configs,
                [
                    variable,
                    wrapped_early_configs_prune,
                    wrapped_perf_model,
                    wrapped_configs_top_k,
                    wrapped_configs,
                    named_args,
                    kwargs,
                ],
                {},
                tx,
                variable,
            )

            pruned_configs = self.maybe_unpack_configs(pruned_configs, tx)

            # after pruning the configs, create a new autotuner object with
            # these configs and recurse.
            new_kernel = autotune(configs=pruned_configs, key=[])(variable.kernel.fn)
            # create a new variable to contain the new (wrapped) kernel;
            # skip kernel_idx to get a new record in the kernel side table
            new_var = type(variable)(new_kernel, None, variable.grid)
            return self.call_triton_kernel(new_var, args, kwargs, tx)

        # Both for grid's meta as well as for the kernel, we need combined
        # args and kwargs combined and normalized
        combined_args_raw = {**dict(zip(variable.kernel.arg_names, args)), **kwargs}

        # precompute the grid for the kernel
        configs = (
            [config.kwargs for config in variable.kernel.configs]
            if isinstance(variable.kernel, Autotuner)
            else [{}]
        )
        grids = []
        for config_args in configs:
            # If the grid is a function, then lets execute it and convert it to
            # a list
            grid = variable.grid
            assert grid is not None
            if self.is_callable(grid):
                # Populate the special "meta" argument to call the grid function
                meta = {**combined_args_raw, **config_args}
                grid = self.call_grid(grid, meta, tx)  # type: ignore[arg-type]
            grids.append(self.check_grid(grid))

        for i in range(len(grids)):
            if not isinstance(grids[i], tuple):
                self.raise_unsupported("Only tuple grids are supported")
            # inductor expects all grids to be 3-tuple so lets make it
            if len(grids[i]) == 1:
                grids[i] = (grids[i][0], 1, 1)
            elif len(grids[i]) == 2:
                grids[i] = (grids[i][0], grids[i][1], 1)
            elif len(grids[i]) > 3:
                self.raise_unsupported("Grid can have at most rank 3")

        assert len(grids) != 0
        if isinstance(variable.kernel, JITFunction):
            constexprs = variable.kernel.constexprs
        else:
            # If we are looking at an @triton.autotune decorator, the nested function should be a JITFunction
            # This is because we don't support @triton.heuristics or nested @triton.autotune decorators yet
            assert isinstance(variable.kernel, Autotuner)
            constexprs = variable.kernel.fn.constexprs

        for idx, arg_name in enumerate(variable.kernel.arg_names):
            if idx in constexprs:
                if arg_name in combined_args_raw:
                    # [Note: Specialize tl.constexpr args in user-defined triton kernels]
                    # This arg is marked as tl.constexpr. That means that triton will recompile every time
                    # this value changes.
                    # https://github.com/pytorch/pytorch/issues/136504
                    # One option is to correctly pass the symints in so that the symbolic expressions are defined
                    # when the triton code is being executed.
                    # But since triton will have to recompile either way, we instead just specialize on the value.
                    #
                    # Depending on the type of `variable` we might expect different types for the symbolic args:
                    # either SymNodeVariables (for TritonKernelVariables) or SymInts (TracingTritonKernelWrapper)
                    combined_args_raw[arg_name] = variable.specialize_symbolic(
                        combined_args_raw[arg_name]
                    )
        return self.call_HOP(variable, grids, combined_args_raw, tx)
1883
+
1884
+
1885
+ ###############################################################################
1886
+ # Helpers for wrap_triton API that makes a user-defined triton kernel traceable into
1887
+ # a graph via make_fx or non-strict export (coming soon)
1888
+
1889
+
1890
class TracingTritonHOPifier(TritonHOPifier):
    """Concrete TritonHOPifier used outside Dynamo (wrap_triton tracing):
    values are plain Python objects, so most hooks pass through, and `tx`
    (the Dynamo InstructionTranslator) is always None here."""

    def raise_unsupported(self, msg: str) -> Never:
        raise RuntimeError(msg)

    def is_callable(self, maybe_callable: Any) -> bool:
        return callable(maybe_callable)

    def get_value(self, val: Any) -> Any:
        return val

    def call_grid(
        self,
        grid: "TritonGridCallableType",
        meta: "TritonMetaParamsType",
        tx: None,
    ) -> tuple[Union[int, sympy.Expr, SymInt], ...]:
        # Outside Dynamo the grid fn is a plain callable; just invoke it.
        assert tx is None
        assert isinstance(meta, dict)
        assert callable(grid)
        return grid(meta)

    def wrap_user_defined_obj(
        self,
        user_obj: Any,
        tx: Optional["InstructionTranslator"],
        variable: Optional[
            Union["TritonKernelVariable", "TraceableTritonKernelWrapper"]
        ],
        name: str,
    ) -> Any:
        # No VariableTracker wrapping needed outside Dynamo.
        assert tx is None
        return user_obj

    def call_user_defined_fn(
        self,
        user_fn: Callable[..., Any],
        args: list,
        kwargs: dict,
        tx: Optional["InstructionTranslator"],
        variable: Optional[
            Union["TritonKernelVariable", "TraceableTritonKernelWrapper"]
        ],
    ) -> Any:
        assert isinstance(args, list)
        assert isinstance(kwargs, dict)
        assert callable(user_fn)
        return user_fn(*args, **kwargs)

    def maybe_unpack_configs(
        self, configs: list["TritonConfig"], tx: Optional["InstructionTranslator"]
    ) -> list["TritonConfig"]:
        assert isinstance(configs, list)
        return configs

    def maybe_unpack_heuristic_result(self, result: Any) -> Any:
        return result

    def check_grid(
        self,
        grid: "TritonGridType",
    ) -> tuple[Union[int, sympy.Expr, SymInt], ...]:
        if not isinstance(grid, collections.abc.Sequence):
            raise RuntimeError(
                "wrap_triton can only handle grids that resolve to Sequence[int]."
            )
        # normalize to tuple
        return tuple(grid)

    def store_non_graphable_args(
        self,
        combined_args: dict[str, Any],
    ) -> tuple[dict, int]:
        """
        Some args cannot be stored in the FX graph.
        Put them in the side table.
        """

        def is_graphable(val: Any) -> bool:
            return isinstance(val, (fx.node.base_types, fx.Node))

        non_graphable_args = {
            k: v for k, v in combined_args.items() if not is_graphable(v)
        }
        graphable_args = {k: v for k, v in combined_args.items() if is_graphable(v)}

        constant_args_idx = kernel_side_table.add_constant_args(non_graphable_args)

        return graphable_args, constant_args_idx

    def call_HOP(
        self,
        variable: "TraceableTritonKernelWrapper",
        grids: list["TritonGridTupleType"],
        combined_args: dict[str, Any],
        tx: None,
    ) -> None:
        """Emit the actual triton_kernel_wrapper_mutation HOP call."""
        assert tx is None
        assert isinstance(variable, TraceableTritonKernelWrapper)

        graphable_args, constant_args_idx = self.store_non_graphable_args(combined_args)

        assert isinstance(variable.kernel_idx, int)
        return triton_kernel_wrapper_mutation(
            kernel_idx=variable.kernel_idx,
            constant_args_idx=constant_args_idx,
            grid=grids,  # type: ignore[arg-type]
            # TMA descriptor capturing not yet
            # supported in non-dynamo tracing
            tma_descriptor_metadata={},
            kwargs=graphable_args,
        )
2001
+
2002
+
2003
# Module-level singleton used by TraceableTritonKernelWrapper below.
tracing_triton_hopifier_singleton = TracingTritonHOPifier()
2004
+
2005
+
2006
class TraceableTritonKernelWrapper:
    """Wrapper produced by `wrap_triton`: mimics a triton kernel's call
    surface (`kernel[grid](...)`, `kernel.run(...)`) and routes calls through
    tracing_triton_hopifier_singleton when wrap_triton is enabled."""

    kernel: "TritonKernelType"
    kernel_idx: Optional[int]
    grid: Optional["TritonGridType"]

    def __init__(
        self,
        kernel: "TritonKernelType",
        kernel_idx: Optional[int],
        grid: Optional["TritonGridType"],
    ) -> None:
        self.kernel = None
        self.grid = None
        # init_variable populates kernel, kernel_idx (via the side table), and grid.
        tracing_triton_hopifier_singleton.init_variable(self, kernel, kernel_idx, grid)
        assert self.kernel is not None

    def __getitem__(self, *args: Sequence[Any]) -> "TraceableTritonKernelWrapper":
        # Supports the `kernel[grid]` indexing idiom; returns a new wrapper.
        return tracing_triton_hopifier_singleton.call_getitem(self, args)  # type: ignore[return-value]

    def run(self, *args: Sequence[Any], **kwargs: dict[str, Any]) -> Any:
        from torch._library.triton import is_wrap_triton_enabled

        if is_wrap_triton_enabled():
            return tracing_triton_hopifier_singleton.call_run(self, args, kwargs, None)
        else:
            # wrap_triton disabled: fall back to the real triton kernel.
            assert self.kernel is not None
            return self.kernel.run(*args, **kwargs)

    def __call__(self, *args: Sequence[Any], **kwargs: dict[str, Any]) -> Any:
        from torch._library.triton import is_wrap_triton_enabled

        if is_wrap_triton_enabled():
            return tracing_triton_hopifier_singleton.call_triton_kernel(
                self, args, kwargs, None
            )
        else:
            # wrap_triton disabled: launch the underlying kernel directly.
            assert self.kernel is not None
            return self.kernel[self.grid](*args, **kwargs)

    def specialize_symbolic(self, arg: Sequence[Any]) -> Any:
        import torch

        # See [Note: Specialize tl.constexpr args in user-defined triton kernels]
        if isinstance(arg, (torch.SymInt, torch.SymBool, torch.SymFloat)):
            return guard_scalar(arg)
        return arg
archive/.venv/Lib/site-packages/torch/_higher_order_ops/utils.py ADDED
@@ -0,0 +1,1134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import contextlib
3
+ import functools
4
+ from contextlib import contextmanager, ExitStack, nullcontext
5
+ from dataclasses import dataclass
6
+ from typing import Any, Callable, Optional, overload, TypeVar, Union
7
+
8
+ import torch
9
+ import torch.fx.traceback as fx_traceback
10
+ import torch.utils._pytree as pytree
11
+ from torch._dispatch.python import suspend_functionalization
12
+ from torch._guards import detect_fake_mode
13
+ from torch._higher_order_ops.schema import HopSchema
14
+ from torch._ops import HigherOrderOperator, OperatorBase, OpOverload
15
+ from torch._subclasses.fake_tensor import FakeTensor
16
+ from torch._subclasses.functional_tensor import (
17
+ disable_functional_mode,
18
+ FunctionalTensor,
19
+ )
20
+ from torch.fx.experimental.proxy_tensor import (
21
+ _temp_remove_metadata_torch_function_mode,
22
+ disable_proxy_modes_tracing,
23
+ make_fx,
24
+ )
25
+ from torch.fx.passes.runtime_assert import insert_deferred_runtime_asserts
26
+ from torch.fx.passes.shape_prop import _extract_tensor_metadata, TensorMetadata
27
+ from torch.multiprocessing.reductions import StorageWeakRef
28
+
29
+
30
+ @dataclass
31
+ class UnsupportedAliasMutationException(RuntimeError):
32
+ reason: str
33
+
34
+
35
def autograd_not_implemented_inner(
    operator: OperatorBase, delayed_error: bool, *args: Any, **kwargs: Any
) -> Any:
    """If autograd is enabled and any of the arguments require grad this will either
    raise an error or return a DelayedError depending on the value of delayed.

    Args:
        operator: The Operator to call with the *args and **kwargs with
        op_name: The name of the Operator
        delayed_error: If True, return a DelayedError instead of raising an error
        args: The flattened operands to the Operator
        kwargs: The keyword arguments to the Operator

    Raises:
        RuntimeError: If autograd is enabled and any of the arguments to the Operator
    """
    # Run the op below the Autograd dispatch key so no autograd graph is recorded.
    with torch._C._AutoDispatchBelowAutograd():
        result = operator(*args, **kwargs)
        flat_operands = pytree.arg_tree_leaves(*args)
        if torch.is_grad_enabled() and any(
            f.requires_grad for f in flat_operands if isinstance(f, torch.Tensor)
        ):
            if delayed_error:
                # DelayedError only raises when its grad_fn is actually used in
                # backward, rather than failing eagerly here.
                err_fn = torch._C._functions.DelayedError(
                    f"Autograd not implemented for {str(operator)}",
                    1,
                )

                def fake_requires_grad(tensor):
                    # Only floating/complex tensors can require grad; detach so
                    # we don't flip requires_grad on the original output.
                    if torch.is_floating_point(tensor) or torch.is_complex(tensor):
                        tensor = tensor.detach()
                        tensor.requires_grad = True
                    return tensor

                return pytree.tree_map_only(
                    torch.Tensor, lambda x: err_fn(fake_requires_grad(x)), result
                )
            else:
                raise RuntimeError(f"Autograd not implemented for {str(operator)}")
        return result
75
+
76
+
77
def autograd_not_implemented(op: OperatorBase, deferred_error: bool) -> Callable:
    """Return a callable that runs *op* without autograd support.

    The callable forwards to autograd_not_implemented_inner, which either
    raises or produces a DelayedError (per *deferred_error*) when any tensor
    argument requires grad.
    """
    return functools.partial(autograd_not_implemented_inner, op, deferred_error)
82
+
83
+
84
def _maybe_run_with_interpreter(fn):
    """Return fn, or an fx.Interpreter-based wrapper of fn when node metadata
    (e.g. stack traces) must be propagated through a GraphModule."""
    maybe_interpreted_fn = fn
    if isinstance(fn, torch.fx.GraphModule) and fx_traceback.has_preserved_node_meta():
        # Running graph with interpreter is needed for propagating the stack_trace
        def graph_with_interpreter(*args):
            with fx_traceback.preserve_node_meta():
                return torch.fx.Interpreter(fn).run(*args)

        maybe_interpreted_fn = graph_with_interpreter
    return maybe_interpreted_fn
94
+
95
+
96
def _maybe_compile_and_run_fn(fn, *args):
    """Run fn(*args), first compiling it with dynamo (eager backend, fullgraph)
    unless we are already inside a dynamo compilation."""
    if not torch.compiler.is_dynamo_compiling():
        from torch._dynamo.backends.debugging import (
            make_eager_backend_with_torch_function_mode,
        )

        with _set_compilation_env(), torch._dynamo.utils.disable_cache_limit():
            # If a metadata torch-function mode is active, temporarily remove it
            # so tracing doesn't see it, and let the eager backend re-apply it.
            with _temp_remove_metadata_torch_function_mode() as metadata_mode:
                if metadata_mode:
                    backend = make_eager_backend_with_torch_function_mode(metadata_mode)
                else:
                    backend = "eager"
                return torch.compile(fn, backend=backend, fullgraph=True)(*args)
    else:
        return fn(*args)
111
+
112
+
113
def reenter_make_fx(fn):
    """Wrap fn so it is traced as a subgraph of the currently-active make_fx
    tracing session (asserts that such a session exists at call time)."""
    from torch.fx.experimental.proxy_tensor import _CURRENT_MAKE_FX_TRACER

    @functools.wraps(fn)
    def wrapped(*args):
        assert (
            _CURRENT_MAKE_FX_TRACER is not None
        ), "Cannot reenter make_fx when we're not under a make_fx tracing session"
        return _CURRENT_MAKE_FX_TRACER.trace_subgraph(
            _maybe_run_with_interpreter(fn), *args
        )

    return wrapped
126
+
127
+
128
def _maybe_reenter_make_fx(fn):
    """Trace fn with the active make_fx tracer if one exists; otherwise start a
    fresh make_fx trace (in fake mode unless the inputs are already fake)."""
    from torch.fx.experimental.proxy_tensor import _CURRENT_MAKE_FX_TRACER

    if _CURRENT_MAKE_FX_TRACER is not None:
        return reenter_make_fx(fn)
    else:

        def _maybe_make_fx_with_fake_mode(fn):
            @functools.wraps(fn)
            def wrapped(*args):
                from torch._guards import detect_fake_mode

                fake_mode = detect_fake_mode(args)
                if fake_mode is None:
                    # we create a fake_mode here to make sure we could
                    # trace the graph with data-dependent calls e.g. .item()
                    return make_fx(fn, tracing_mode="fake")(*args)
                # Tracing with "real" mode if all inputs have been fakified
                return make_fx(fn)(*args)

            return wrapped

        return _maybe_make_fx_with_fake_mode(fn)
151
+
152
+
153
def check_meta_consistency(
    lhs_list: list[Union[torch.Tensor, torch.SymInt, int]],
    rhs_list: list[Union[torch.Tensor, torch.SymInt, int]],
    lhs_name: str,
    rhs_name: str,
    include_contiguity: bool = True,
) -> None:
    """Validate that two lists of tensors/ints agree pairwise in metadata.

    Raises torch._dynamo.exc.UncapturedHigherOrderOpError when the lists
    differ in length, or when any pair differs in tensor metadata (shape,
    dtype, ...; requires_grad excluded), in tensor-vs-int kind, or in device.
    """

    def diff_meta_pairs(
        lhs_list: list[Union[torch.Tensor, torch.SymInt, int]],
        rhs_list: list[Union[torch.Tensor, torch.SymInt, int]],
    ) -> list[str]:
        def diff_meta(
            lhs: Union[torch.Tensor, torch.SymInt, int],
            rhs: Union[torch.Tensor, torch.SymInt, int],
        ) -> str:
            # Tensor/tensor pairs: compare extracted TensorMetadata fields.
            if isinstance(lhs, torch.Tensor) and isinstance(rhs, torch.Tensor):
                return ", ".join(
                    diff_tensor_meta(
                        _extract_tensor_metadata(
                            lhs, include_contiguity=include_contiguity
                        ),
                        _extract_tensor_metadata(
                            rhs, include_contiguity=include_contiguity
                        ),
                        check_grad=False,
                    )
                )
            else:

                def _both_int_types(lhs, rhs):
                    return isinstance(lhs, (int, torch.SymInt)) and isinstance(
                        rhs, (int, torch.SymInt)
                    )

                def _both_tensor(lhs, rhs):
                    return isinstance(lhs, torch.Tensor) and isinstance(
                        rhs, torch.Tensor
                    )

                # Mixed kinds (tensor vs int) are a mismatch by themselves.
                if not _both_int_types(lhs, rhs) and not _both_tensor(lhs, rhs):
                    return f"type: {lhs} vs {rhs}"

            return ""

        # Manually check the device of lhs and rhs as this field is currently not part of TensorMetadata
        def diff_device(
            lhs: Union[torch.Tensor, torch.SymInt, int],
            rhs: Union[torch.Tensor, torch.SymInt, int],
        ) -> str:
            if isinstance(lhs, torch.Tensor) and isinstance(rhs, torch.Tensor):
                if (
                    rhs.device.type == lhs.device.type
                    and rhs.device.index == lhs.device.index
                ):
                    return ""
                else:
                    return "device"
            return ""

        if len(lhs_list) != len(rhs_list):
            raise torch._dynamo.exc.UncapturedHigherOrderOpError(
                f"Expected {lhs_name} and {rhs_name} to have same number of outputs but got lhs:{lhs_list} and rhs:{rhs_list}"
            )
        all_diffs = []
        for i, (lhs, rhs) in enumerate(zip(lhs_list, rhs_list)):
            if diff := diff_meta(lhs, rhs):
                all_diffs.append(
                    f"pair[{i}] differ in {diff}, where lhs is {lhs} and rhs is {rhs}"
                )
            if diff := diff_device(lhs, rhs):
                all_diffs.append(
                    f"pair[{i}] differ in {diff}, where lhs is {lhs} and rhs is {rhs}"
                )
        return all_diffs

    if all_diffs := diff_meta_pairs(lhs_list, rhs_list):
        diff_str = "\n".join(all_diffs)
        raise torch._dynamo.exc.UncapturedHigherOrderOpError(
            f"Expected {lhs_name} and {rhs_name} to have same metadata but found:\n{diff_str}"
        )
233
+
234
+
235
@contextmanager
def _set_compilation_env():
    """Context manager preparing global flags for compiling a HOP body:
    disables the legacy fx tracing flag and lets dynamo keep empty graphs.
    Both flags are restored on exit."""
    _old_is_tracing = torch.fx._symbolic_trace._is_fx_tracing_flag
    _old_allow_empty_graphs = torch._dynamo.config.allow_empty_graphs
    # The issue is tracked in https://github.com/pytorch/pytorch/issues/144360: when dynamo finds
    # the top-level frame produces no graph, the default behavior is to fallback to eager.
    # Then when it encounters an inner function, it will try to trace that function again, which is unnecessary.
    # For while_loop, during inspecting the inner call, we trace into the python dispatcher
    # logic, which is not traceable as of today. So the proper fix can be either 1. allow dispatch
    # logic to be dynamo traceable or 2. fixing https://github.com/pytorch/pytorch/issues/144360.
    # but it exposes some bugs in existing tests so we have to have a temporary flag to control
    # the behavior, which allows dynamo to store an empty graph for a frame without falling back to eager
    try:
        # We need to turn off the is_fx_tracing_flag. Remove this flag check from dynamo
        # once we are confident fx tracing works with dynamo.
        torch.fx._symbolic_trace._is_fx_tracing_flag = False
        torch._dynamo.config.allow_empty_graphs = True
        yield
    finally:
        torch.fx._symbolic_trace._is_fx_tracing_flag = _old_is_tracing
        torch._dynamo.config.allow_empty_graphs = _old_allow_empty_graphs
256
+
257
+
258
# The invariant here is that we always trace the branch with fake tensor
def _maybe_fake_tracing(fn, inputs: list[Any], pre_dispatch):
    """make_fx-trace fn over inputs, guaranteeing fake-tensor tracing.

    If the inputs already carry a fake mode, enter it and trace in "real" mode
    (the inputs themselves are fake); otherwise let make_fx fabricate a fake
    mode via tracing_mode="fake". Deferred runtime asserts for symbolic shapes
    are inserted into the resulting graph when a shape_env is present.
    """
    fake_mode = detect_fake_mode(inputs)
    tracing_mode = "real"
    if fake_mode is None:
        fake_mode = nullcontext()
        tracing_mode = "fake"

    # Note: we need to turn off proxy tensor mode to avoid tracing infra
    # code that happens in make_fx e.g. we now call as_strided when wrapping tensor
    # as fake tensor.
    with fake_mode, disable_proxy_modes_tracing():
        gm = make_fx(
            fn,
            tracing_mode=tracing_mode,
            pre_dispatch=pre_dispatch,
            _error_on_data_dependent_ops=False,
        )(*inputs)
    if not isinstance(fake_mode, nullcontext) and fake_mode.shape_env is not None:
        insert_deferred_runtime_asserts(
            gm, fake_mode.shape_env, "hoo_maybe_fake_tracing", export=True
        )
    return gm
281
+
282
+
283
def potential_input_alias_or_mutation(gm, inputs, pre_dispatch=False):
    """Fake-trace gm over inputs and report aliasing/mutation information.

    Returns ((inp_inp_alias_map, inp_out_alias_map, out_out_alias_map),
    mutated_input_indices). If functionalizing a nested HOP raises
    UnsupportedAliasMutationException during tracing, returns True instead.

    NOTE(review): the `True` early-return has a different shape than the
    normal tuple return; callers that unpack the tuple would fail on it —
    confirm whether that path is actually reachable for them.
    """
    # The previous version also had `except Exception as e: raise e`, which is
    # a no-op re-raise; plain propagation is equivalent and clearer.
    try:
        gm = _maybe_fake_tracing(gm, inputs, pre_dispatch)
    except UnsupportedAliasMutationException:
        # this can happen when nested cond_op is functionalized
        return True

    example_inputs = [
        ph.meta.get("val", None) for ph in gm.graph.find_nodes(op="placeholder")
    ]
    (
        inp_inp_alias_map,
        inp_out_alias_map,
        out_out_alias_map,
        inp_mutation,
    ) = check_input_alias_and_mutation(gm, example_inputs)
    return (inp_inp_alias_map, inp_out_alias_map, out_out_alias_map), inp_mutation
303
+
304
+
305
def analyze_potential_input_alias_or_mutation(name, aliases, input_mutations):
    """Raise RuntimeError describing any detected input aliasing or mutation.

    Args:
        name: human-readable name of the HOP/subgraph, used in error messages.
        aliases: iterable of alias maps (index -> index); any non-empty map
            means some input is aliased.
        input_mutations: collection identifying inputs that are mutated.

    Raises:
        RuntimeError: if any alias map is non-empty or any mutation was found.
    """
    if any(len(alias_map) > 0 for alias_map in aliases):
        # TODO: Investigate here further which node is exactly aliasing
        # The old messages embedded line-continuation whitespace runs and were
        # missing a space before the final sentence; build them cleanly.
        aliased = {el for el_map in aliases for el in el_map.keys()}
        raise RuntimeError(
            f"{name} where aliases appear. "
            f"In particular, these inputs {aliased} get aliased. "
            "Please ensure that this doesn't happen."
        )
    if len(input_mutations) > 0:
        # TODO: Investigate here further which node is exactly mutating the inputs
        raise RuntimeError(
            f"{name} where the inputs are mutated. "
            f"In particular, these nodes are mutating the inputs {set(input_mutations)}. "
            "Please ensure that this doesn't happen."
        )
322
+
323
+
324
def _has_potential_branch_input_mutation(gm, inputs, pre_dispatch=False):
    """Return True iff fake-tracing gm over inputs reveals any input mutation."""
    _alias_maps, mutated = potential_input_alias_or_mutation(
        gm, inputs, pre_dispatch
    )
    return len(mutated) > 0
332
+
333
+
334
def has_potential_input_alias_or_mutation(gm, inputs, pre_dispatch=False):
    """Return (has_any_alias, has_input_mutation) for gm fake-traced on inputs."""
    alias_maps, mutated = potential_input_alias_or_mutation(
        gm, inputs, pre_dispatch
    )
    # Any non-empty inp-inp, inp-out or out-out alias map counts as aliasing.
    has_alias = any(len(alias_map) > 0 for alias_map in alias_maps)
    return has_alias, len(mutated) > 0
350
+
351
+
352
def _collect_fake_inputs(inputs):
    """Extract FakeTensor/SymInt/int example values from fx proxies/nodes.

    Each input is either an fx Proxy/Node carrying an "example_value" in its
    meta, or a plain int. Batched/functional tensor wrappers are unwrapped
    down to the underlying FakeTensor.
    """
    from torch._subclasses.fake_tensor import FakeTensor

    # Get the example values of the inputs.
    inputs_fake: list[Union[FakeTensor, torch.Tensor, int]] = []
    for inp in inputs:
        if isinstance(inp, (torch.fx.proxy.Proxy, torch.fx.node.Node)):
            inp = inp.node if isinstance(inp, torch.fx.proxy.Proxy) else inp
            if hasattr(inp, "meta"):
                val = inp.meta["example_value"]
                if isinstance(val, torch.Tensor):
                    if torch._C._functorch.is_batchedtensor(
                        val
                    ) or torch._C._functorch.is_functionaltensor(val):
                        # This case is for batched or functional tensors
                        # Unwrap the tensors layer by layer until a FakeTensor
                        while torch._C._functorch.is_batchedtensor(
                            val
                        ) or torch._C._functorch.is_functionaltensor(val):
                            val = torch._C._functorch.get_unwrapped(val)
                        assert isinstance(val, FakeTensor)
                        inputs_fake.append(val)
                    else:
                        # This is the standard case of a TensorVariable
                        assert isinstance(val, FakeTensor)
                        inputs_fake.append(val)
                else:
                    # This case is for SymInts and other non-Tensor elements
                    assert not isinstance(val, torch.Tensor)
                    inputs_fake.append(val)
        else:
            # This case is for ints
            assert isinstance(inp, int)
            inputs_fake.append(inp)

    return inputs_fake
388
+
389
+
390
def _check_alias_and_mutation(graph_module, inputs_fake, name, pre_dispatch):
    """Raise RuntimeError if graph_module aliases or mutates its fake inputs."""
    has_alias, has_mutation = has_potential_input_alias_or_mutation(
        graph_module, inputs_fake, pre_dispatch=pre_dispatch
    )
    if has_alias:
        raise RuntimeError(f"{name} might be aliasing the input or the output!")
    if has_mutation:
        raise RuntimeError(f"{name} might be modifying the input!")
400
+
401
+
402
def unique_graph_id(proxy_mode, prefix):
    """Returns a unique name and id for a graph to be added to a proxy_mode tracer.

    Delegates to unique_graph_name_with_root using the tracer's root module.
    """
    # There are probably better ways - I know that create_arg has some self incrementing name
    # magic to it, but since we explicitly have to get the name for register_module,
    # I was not sure how to do that. This kinda simulates it.
    return unique_graph_name_with_root(proxy_mode.tracer.root, prefix)
408
+
409
+
410
def unique_graph_name_with_root(
    root: torch.fx.GraphModule, prefix: str
) -> tuple[int, str]:
    """Return (i, f"{prefix}_{i}") for the smallest i not already an attribute
    of *root*, so the name can be registered without clobbering anything."""
    counter = 0
    while True:
        candidate = f"{prefix}_{counter}"
        if not hasattr(root, candidate):
            return counter, candidate
        counter += 1
422
+
423
+
424
def _from_fun(t):
    """Return a fresh, non-functional tensor mirroring t's metadata.

    Non-bool tensors get an uninitialized empty_strided stand-in (only
    metadata matters for tracing); bool tensors are cloned from an unwrapped,
    non-functional version so the result itself is not a functional tensor.
    Non-tensors are returned unchanged.
    """
    from torch._functorch.aot_autograd import from_fun

    if isinstance(t, torch.Tensor):
        if t.dtype != torch.bool:
            return torch.empty_strided(
                t.size(),
                t.stride(),
                dtype=t.dtype,
                requires_grad=t.requires_grad,
                device=t.device,
            )
        else:
            # clone of a functional tensor produces a functional tensor
            # but we want to avoid it so we clone a non-functional version
            maybe_unfunc_t = t
            if isinstance(t, FunctionalTensor):
                torch._sync(t)
                maybe_unfunc_t = from_fun(t)
            elif torch._is_functional_tensor(t):
                # need to handle both types of functionalization here:
                # these are the tensors that came from the user,
                # which could be either FunctionalTensorWrapper or FunctionalTensor
                torch._sync(t)
                maybe_unfunc_t = torch._from_functional_tensor(t)
            return maybe_unfunc_t.clone()
    return t
451
+
452
+
453
def clone_outputs_aliasing_inputs(args):
    """Return a callable that clones any tensor sharing storage with *args*.

    Used to keep traced backward graphs functional: outputs that alias an
    input are replaced with clones, everything else passes through untouched.
    """
    seen_storages = set()
    for arg in args:
        if isinstance(arg, torch.Tensor):
            seen_storages.add(StorageWeakRef(arg._typed_storage()))

    def maybe_clone(t):
        if isinstance(t, torch.Tensor):
            if StorageWeakRef(t._typed_storage()) in seen_storages:
                return t.clone()
        return t

    return maybe_clone
469
+
470
+
471
def prepare_fw_with_masks(fn):
    """Wrap fn so it returns (outputs, masks), where masks[i] is True iff
    outputs[i] is a tensor that requires grad."""

    def fw_with_masks(*args):
        outs = fn(*args)
        masks = [
            isinstance(out, torch.Tensor) and out.requires_grad for out in outs
        ]
        return outs, masks

    return fw_with_masks
480
+
481
+
482
def prepare_fw_with_masks_all_requires_grad(fn):
    """Wrap fn so it returns (outputs, requires_grad_masks), forcing every
    tensor output to require grad whenever any tensor input does.

    NOTE: requires_grad_(True) mutates the output tensors in place.
    """

    def fw_with_masks(*args):
        fw_out = fn(*args)
        # Note [force all outputs to be require grad]
        # Instead of using the original fn, we set the output of original
        # fn to all require grad. This is consistent with the behavior
        # of autograd.Function, where if any one of the inputs requires grad
        # all output will be require grad. This also makes the downstream
        # require_gradness reasoning much easier.
        if pytree.tree_any_only(torch.Tensor, lambda t: t.requires_grad, args):
            fw_out = pytree.tree_map_only(
                torch.Tensor, lambda x: x.requires_grad_(True), fw_out
            )
        return fw_out, pytree.tree_map_only(
            torch.Tensor, lambda x: x.requires_grad, fw_out
        )

    return fw_with_masks
500
+
501
+
502
# This function replaces None gradients with all-zero gradients.
# `None` gradients are problematic for CUDA graphs. Those gradients are
# replaced with an all-zero tensor for better optimization
def unmask_none_gradients(grads, operands):
    """Pair grads with operands, materializing zeros for missing tensor grads.

    int/SymInt operands (e.g. lifted shape arguments) legitimately have no
    gradient and keep None.
    """
    allowed_types = (torch.Tensor, int, torch.SymInt)
    assert all(
        isinstance(o, allowed_types) for o in operands
    ), f"operands can only be of {allowed_types} but got {[type(o) for o in operands]}"

    result = []
    for grad, operand in zip(grads, operands):
        if grad is None and isinstance(operand, torch.Tensor):
            # Replace a missing tensor gradient with explicit zeros.
            result.append(torch.zeros_like(operand))
        else:
            result.append(grad)

    return result
524
+
525
+
526
+ def _maybe_fake_prop_ignore_unbacked(fn, args):
527
+ with ExitStack() as ctx_stack:
528
+ if (fake_mode := detect_fake_mode(args)) is not None:
529
+ ctx_stack.enter_context(fake_mode)
530
+ if fake_mode.shape_env is not None:
531
+ ctx_stack.enter_context(
532
+ fake_mode.shape_env.ignore_fresh_unbacked_symbols()
533
+ )
534
+ return fn(*args)
535
+
536
+
537
def redirect_to_mode(hop: OperatorBase, mode):
    """Utility for redispatching HOP to underlying mode

    Args:
        hop: The HOP to redispatch
        mode: The mode to redispatch to

    Returns:
        A decorated function that implements the HOP for the given mode
    """

    @hop.py_impl(mode)
    def impl(mode, *args, **kwargs):
        # Hand the call straight to the mode's __torch_dispatch__ (empty types).
        return mode.__torch_dispatch__(hop, [], args, kwargs)

    return impl
553
+
554
+
555
# TODO: The parameter use_output_and_grad_bw is required because some operations
# that utilize this function, such as the while_loop, may require (grad, fwd_outputs)
def create_fw_bw_graph(fn, use_output_and_grad_bw, fw_inputs, fw_outputs):
    """Trace fn into a forward graph and a joint backward graph.

    fw_inputs/fw_outputs are example (non-functional) tensors used for
    tracing. Returns (fw_graph, joint_graph); joint_graph maps gradients
    (plus, when use_output_and_grad_bw, forward inputs+outputs) to input
    gradients, with None grads replaced by zeros and input-aliasing outputs
    cloned.
    """
    from torch._functorch.aot_autograd import AOTConfig, create_joint

    # Note:[HOP create fw_bw graph] We create "clean" environments for make_fx by suspending all dispatch keys
    # between Autograd and Python key. Currently, we only suspend functionalization but more can be
    # added when required. Will encounter two problems if we don't suspend functionalization:
    #
    # 1. make_fx fails to capture operations on input: the inputs are wrapped as _to_functional_tensor_wrapper,
    # but they will be unwrapped before entering ProxyTorchDispatchMode as part of the dispatching.
    # However, it's the outside wrapper that tracer creates proxies for. This casuses tracer fail to
    # fetch the proxy for the inputs and fail to capture any operations on them.
    #
    # 2. make_fx fails to capture output: the outputs after ProxyTorchDispatchMode are further
    # wrapped as FunctionalTensorWrapper in Functionalize key after return. However, the tracer
    # only associates the inner tensor with proxy in ProxyTorchDispatchMode. Therefore,
    # when creating the output node, it fails to associate the wrapped tensor with its proxy.
    # Instead, it will create _tensor_constant as output.

    dummy_aot_config = AOTConfig(
        fw_compiler=None,  # type: ignore[arg-type]
        bw_compiler=None,  # type: ignore[arg-type]
        partition_fn=None,  # type: ignore[arg-type]
        decompositions={},
        num_params_buffers=0,
        aot_id=0,
        keep_inference_input_mutations=False,
    )

    example_grad = [_from_fun(out) for out in fw_outputs]
    num_grads = len(example_grad)
    fw_graph = _maybe_reenter_make_fx(fn)(*fw_inputs)

    def joint_fn(*joint_operands_grads):
        # Two calling conventions: a single ((grads), (inputs+outputs)) pair,
        # or a flat (grads..., inputs...) argument list.
        if use_output_and_grad_bw:
            grads = joint_operands_grads[0]
            inputs = joint_operands_grads[1][-1:]
        else:
            grads = joint_operands_grads[:num_grads]
            inputs = joint_operands_grads[num_grads:]

        joint = create_joint(prepare_fw_with_masks(fn), aot_config=dummy_aot_config)
        _, grads = joint(
            list(inputs),
            [grad for grad in grads if grad is not None and grad.requires_grad],
        )

        # Unmask None gradients to all-zero gradients
        unmasked_grads = unmask_none_gradients(grads, inputs)

        # In order to keep map functional for backward graph,
        # we clone outputs that are aliasing inputs
        maybe_clone = clone_outputs_aliasing_inputs(joint_operands_grads)

        return pytree.tree_map(maybe_clone, unmasked_grads)

    if use_output_and_grad_bw:
        example_xs_out = list(fw_inputs) + list(fw_outputs)
        joint_graph = _maybe_reenter_make_fx(joint_fn)(
            (list(example_grad), list(example_xs_out))
        )
    else:
        example_xs_out = list(fw_inputs)
        joint_graph = _maybe_reenter_make_fx(joint_fn)(
            *(list(example_grad) + list(example_xs_out))
        )

    return fw_graph, joint_graph
624
+
625
+
626
+ def _unstack_pytree(xs):
627
+ flat_xs, inspec = pytree.tree_flatten(xs)
628
+ if not all(isinstance(xs, torch.Tensor) for xs in flat_xs):
629
+ raise RuntimeError(f"Leaves of xs must be Tensor {flat_xs}")
630
+
631
+ if not all(xs.shape[0] == flat_xs[0].shape[0] for xs in flat_xs):
632
+ raise RuntimeError(
633
+ f"Leaves of xs must have same leading dimension size {[xs.shape for xs in flat_xs]}"
634
+ )
635
+
636
+ a = zip(*flat_xs)
637
+
638
+ pytrees = [pytree.tree_unflatten(tuple, inspec) for tuple in a]
639
+ return pytrees
640
+
641
+
642
+ def _stack_pytree(pytrees):
643
+ flat_out = []
644
+ out_spec = None
645
+ for pt in pytrees:
646
+ flat_pt, out_spec = pytree.tree_flatten(pt)
647
+ flat_out.append(flat_pt)
648
+ assert out_spec is not None
649
+ b = zip(*flat_out)
650
+ stacked_out = []
651
+ for leaves in b:
652
+ if all(isinstance(leaf, torch.Tensor) for leaf in leaves):
653
+ stacked_out.append(torch.stack(leaves))
654
+ elif all(leaf is None for leaf in leaves):
655
+ # Backward graph can return None output when forward inputs doesn't require grad.
656
+ # When we eagerly execute backward graph, we need to call _stack_pytree on its output,
657
+ # therefore we need to deal with None output.
658
+ stacked_out.append(None) # type: ignore[arg-type]
659
+ else:
660
+ raise RuntimeError(f"Cannot stack {leaves}.")
661
+ return pytree.tree_unflatten(stacked_out, out_spec)
662
+
663
+
664
def save_tensors_and_symints_for_backward(ctx, args):
    """Save a mixed sequence of tensors and symints/ints on an autograd ctx.

    ctx.save_for_backward only accepts tensors, so args are partitioned:
    tensors go through ctx.save_for_backward, while SymInts/ints/Nones are
    stashed directly on ctx.sym_int_args. ctx.pos records, per original
    position, which partition (0 = tensor, 1 = other) the arg went into so
    saved_tensors_and_symints can reassemble args in their original order.

    For example, for args = (x, y, s0, z, s1) with tensors x, y, z:
        saved tensors   = (x, y, z)
        ctx.sym_int_args = [s0, s1]
        ctx.pos          = [0, 0, 1, 0, 1]
    """
    assert all(
        isinstance(arg, (torch.Tensor, torch.SymInt, int, type(None))) for arg in args
    ), args
    tensor_args: list[Any] = []
    other_args: list[Any] = []
    pos = []
    for arg in args:
        if isinstance(arg, torch.Tensor):
            tensor_args.append(arg)
            pos.append(0)
        else:
            other_args.append(arg)
            pos.append(1)

    # Guard against double-saving on the same ctx.
    assert not hasattr(ctx, "sym_int_args"), "ctx already has sym_int_args attribute."
    assert not hasattr(ctx, "pos"), "ctx already has pos attribute."
    ctx.save_for_backward(*tensor_args)
    ctx.sym_int_args = other_args
    ctx.pos = pos
694
+
695
+
696
def saved_tensors_and_symints(ctx):
    """Reassemble, in original order, the args stored by
    save_tensors_and_symints_for_backward: ctx.pos says whether each slot
    comes from ctx.saved_tensors (0) or ctx.sym_int_args (1)."""
    tensors = iter(ctx.saved_tensors)
    symints = iter(ctx.sym_int_args)
    return tuple(
        next(tensors) if which == 0 else next(symints) for which in ctx.pos
    )
710
+
711
+
712
def get_dummy_aot_autograd_config():
    """Return a minimal placeholder AOTConfig (no compilers/partitioner) for
    APIs that require a config object but don't use those fields."""
    from torch._functorch.aot_autograd import AOTConfig

    return AOTConfig(
        fw_compiler=None,  # type: ignore[arg-type]
        bw_compiler=None,  # type: ignore[arg-type]
        partition_fn=None,  # type: ignore[arg-type]
        decompositions={},
        num_params_buffers=0,
        aot_id=0,
        keep_inference_input_mutations=False,
    )
724
+
725
+
726
# Slices off the first element of a given dimension
def first_slice_copy(t: torch.Tensor, dim: int = 0) -> torch.Tensor:
    """Return a copy of t's first slice along ``dim`` (that dim is removed)."""
    return torch.select_copy(t, dim, 0)
729
+
730
+
731
+ # Reports the difference between meta of two tensors in a string
732
+ def diff_tensor_meta(
733
+ meta1: TensorMetadata, meta2: TensorMetadata, check_grad=True
734
+ ) -> list[str]:
735
+ from torch.fx.experimental.symbolic_shapes import GuardOnDataDependentSymNode
736
+
737
+ pair_diffs = []
738
+ for meta_name in TensorMetadata._fields:
739
+ if not check_grad and meta_name == "requires_grad":
740
+ continue
741
+ val1 = getattr(meta1, meta_name)
742
+ val2 = getattr(meta2, meta_name)
743
+ try:
744
+ if val1 != val2:
745
+ pair_diffs.append(f"'{meta_name}: {val1} vs {val2}'")
746
+ except GuardOnDataDependentSymNode as _:
747
+ pair_diffs.append(f"'{meta_name}: {val1} vs {val2}'")
748
+ continue
749
+ return pair_diffs
750
+
751
+
752
+ # Note [lifted arg types in hop]
753
+ # For dynamoed hops, we automatically lift the free symbols in tensors as arguments.
754
+ # This has implications for the types of lifted args for different dispatch keys:
755
+ # 1. functionalization, FakeTensorMode, ProxyTorchDispatchMode, Autograd need to support torch.Symint
756
+ # lifted args because it's on the path of torch.compile(dynamic=True).
757
+ # 2. functionalization, FakeTensorMode, ProxyTorchDispatchMode, Autograd, CompositeExplicitAutograd need
758
+ # to support int arguments. In the eager run case, we re-trace the subgraph in AutogradKey, so inner
759
+ # hops may receive int inputs from the shape of outer tensor inputs.
760
+ # However, CompositeExplicitAutograd won't receive SymInt inputs because it only accepts real tensor inputs.
761
def validate_subgraph_args_types(lifted_args: Union[tuple[Any, ...], list[Any]]):
    """Assert every lifted arg is a Tensor, int or SymInt.

    See Note [lifted arg types in hop] above for why exactly these types are
    allowed. Fix: the isinstance check now uses the local ``allowed_types``
    tuple instead of repeating it, so the message and the check can't drift.
    """
    allowed_types = (torch.Tensor, int, torch.SymInt)
    assert all(
        isinstance(arg, allowed_types) for arg in lifted_args
    ), f"{lifted_args} can only be of {allowed_types} but got {tuple(type(arg) for arg in lifted_args)}"
766
+
767
+
768
+ # TODO: Return a more detailed information as to which node
769
+ # causes a mutation or an alias. This may requires a per operator tensor version checking
770
def check_input_alias_and_mutation(
    gm: torch.fx.GraphModule,
    fake_args: list[FakeTensor],
) -> tuple[dict[int, int], dict[int, int], dict[int, int], list[int]]:
    """Like check_input_alias_and_mutation_return_outputs, but drops the final
    outputs element and returns only the alias maps and mutated input indices."""
    (
        inp_inp_alias_map,
        inp_out_alias_map,
        out_out_alias_map,
        mutated_inputs,
    ) = check_input_alias_and_mutation_return_outputs(gm, fake_args)[:-1]
    return inp_inp_alias_map, inp_out_alias_map, out_out_alias_map, mutated_inputs
781
+
782
+
783
def check_input_alias_and_mutation_return_outputs(
    gm: torch.fx.GraphModule,
    fake_args: Union[list[FakeTensor], tuple[FakeTensor, ...]],
) -> tuple[
    dict[int, int],
    dict[int, int],
    dict[int, int],
    list[int],
    Union[tuple[Any, ...], list[Any]],
]:
    """Run gm once on cloned fake args and report aliasing/mutation plus outputs.

    Returns (inp_inp_alias_map, inp_out_alias_map, out_out_alias_map,
    mutated_input_indices, outputs). Mutation is detected via tensor-version
    bumps; aliasing via shared storages (StorageWeakRef).
    """
    # This function can be called under autograd, functional, proxy and fake tensor mode.
    # We need to return either a fake tensor or a real tensor depending on the mode.
    # to detect the input mutation/aliasing.
    with disable_proxy_modes_tracing(), disable_functional_mode(), suspend_functionalization():

        def _from_functional_tensor(t: torch.Tensor) -> torch.Tensor:
            # Replace functional tensors with metadata-matching plain tensors.
            if isinstance(t, FunctionalTensor) or torch._is_functional_tensor(t):
                return torch.empty_strided(
                    t.size(),
                    t.stride(),
                    dtype=t.dtype,
                    requires_grad=t.requires_grad,
                    device=t.device,
                )
            return t

        fake_args = pytree.tree_map_only(
            torch.Tensor, _from_functional_tensor, fake_args
        )
    # We want to disable active functional, proxy and fake modes if any.
    # to create a encapsulated environment for fake tensor prop
    with torch.utils._python_dispatch._disable_current_modes():
        """This function returns mutated inputs, inp-inp alias, inp-out alias, out-out alias
        in the graph module gm. It checks whether input tensor versions have
        changed after run gm once to detect mutation and checks tensor storage
        to detect alias.
        """

        def _tensor_version(t) -> Optional[int]:
            if isinstance(t, torch.Tensor):
                if not isinstance(t, FakeTensor):
                    raise RuntimeError("Only fake tensor is allowed")
                return t._version
            return None

        def _tensor_storage(t) -> StorageWeakRef:
            return StorageWeakRef(t._typed_storage())

        def _get_shape_env(
            fake_args,
        ) -> Optional[torch.fx.experimental.symbolic_shapes.ShapeEnv]:
            # detect_fake_mode requires there could be only one active fake mode. This
            # restricts the usage of this function because the global TracingContext
            # has a persistent fake mode but fake tensors can be created
            # outside of the tracing context (e.g. in testing).
            # Instead, we just look at fake_args fake tensor mode
            if len(fake_args) == 0:
                return torch.fx.experimental.symbolic_shapes.ShapeEnv()

            for arg in fake_args:
                if isinstance(arg, FakeTensor):
                    return arg.fake_mode.shape_env
            return None

        # Clone the fake args to avoid mutating the original fake args
        with ExitStack() as ctx_stack:
            # We need to re-use prev_fake_mode's shape env to resolve
            # the runtime assertions for unbacked symbols.
            new_fake_mode = torch._subclasses.FakeTensorMode(
                shape_env=_get_shape_env(fake_args),
                allow_non_fake_inputs=False,
            )
            # We need to temporarily turn inference_mode off because
            # under inference mode, tensor version counter is not tracked.
            no_inference_mode_ctx = torch.inference_mode(False)
            ctx_stack.enter_context(new_fake_mode)
            ctx_stack.enter_context(no_inference_mode_ctx)
            if new_fake_mode.shape_env is not None:
                ctx_stack.enter_context(
                    new_fake_mode.shape_env.ignore_fresh_unbacked_symbols()
                )

            # create new fake tensors in new fake mode to avoid mutating original tensors
            cloned = [
                torch.empty_strided(
                    arg.size(),
                    arg.stride(),
                    dtype=arg.dtype,
                    device=arg.device,
                    requires_grad=arg.requires_grad,
                    layout=arg.layout,
                )
                if isinstance(arg, torch.Tensor)
                else arg
                for arg in fake_args
            ]
            before = [_tensor_version(arg) for arg in cloned]
            outputs = gm(*cloned)
            outputs = [outputs] if not isinstance(outputs, (list, tuple)) else outputs
            after = [_tensor_version(arg) for arg in cloned]
            # A version bump between before/after means the input was mutated.
            mutated_inputs = [
                i for i, (v1, v2) in enumerate(zip(before, after)) if v1 != v2
            ]
            # We need to analyze the original fake_args to detect
            # inp-inp alias.
            inp_storage_map = {
                _tensor_storage(inp): i
                for i, inp in enumerate(fake_args)
                if isinstance(inp, torch.Tensor)
            }
            inp_inp_alias_map = {
                i: inp_storage_map[_tensor_storage(inp)]
                for i, inp in enumerate(fake_args)
                if isinstance(inp, torch.Tensor)
                and inp_storage_map[_tensor_storage(inp)] != i
            }
            out_storage_map = {
                _tensor_storage(out): i
                for i, out in enumerate(outputs)
                if isinstance(out, torch.Tensor)
            }
            out_out_alias_map = {
                i: out_storage_map[_tensor_storage(out)]
                for i, out in enumerate(outputs)
                if isinstance(out, torch.Tensor)
                and out_storage_map[_tensor_storage(out)] != i
            }
            # inp-out aliasing is checked against the cloned inputs, since the
            # outputs were produced from them.
            inp_out_alias_map = {
                i: out_storage_map[_tensor_storage(inp)]
                for i, inp in enumerate(cloned)
                if isinstance(inp, torch.Tensor)
                and _tensor_storage(inp) in out_storage_map
            }
    return (
        inp_inp_alias_map,
        inp_out_alias_map,
        out_out_alias_map,
        mutated_inputs,
        outputs,
    )
922
+
923
+
# Registry of fake implementations for HOPs, populated by register_fake()
# below and consulted inside FakeTensorMode's _dispatch_impl.
# NOTE(review): the key annotation says OpOverload, but register_fake() below
# is documented for HOPs (HigherOrderOperator) — confirm which is intended.
registered_hop_fake_fns: dict[torch._ops.OpOverload, Callable] = {}


# Preserves the decorated function's type through the register_fake overloads.
F = TypeVar("F", bound=Callable)
928
+
929
+
@overload
def register_fake(hop, fn: None = None) -> Callable[[F], F]:
    ...


@overload
def register_fake(hop, fn: F) -> F:
    ...


def register_fake(hop, fn=None):
    """
    Register a fake function for a HOP. This is conceptually equivalent of the
    register_fake utility for the custom ops. The registered function is called
    inside the fake_tensor _dispatch_impl.

    Usable either as a decorator (``@register_fake(hop)``) or as a direct call
    (``register_fake(hop, fn)``).
    """
    # Each HOP may have at most one registered fake implementation.
    assert hop not in registered_hop_fake_fns

    def register(func: F) -> F:
        # Imported lazily to avoid import cycles with the fake tensor module.
        from torch._subclasses.fake_tensor import FakeTensorMode

        # Route the HOP's FakeTensorMode dispatch to the mode itself, which
        # then looks up `func` in registered_hop_fake_fns.
        redirect_to_mode(hop, FakeTensorMode)

        registered_hop_fake_fns[hop] = func
        return func

    if fn is None:
        return register
    return register(fn)
959
+
960
+
class FunctionalizeCtxWrapper:
    """
    This is a dummy wrapper to facilitate fake tensor caching.

    For AOT Dispatcher metadata collection pass, HOPs go from functionalization
    key to fake tensor key. The functionalization key wraps the subgraphs in a
    function, which changes from call to call even though the subgraph might
    still be same.

    To enable fake tensor caching, we just wrap the ctx and subgraph in this
    class and then use the subgraph as the hash.
    """

    # Prevents PYTORCH_TEST_WITH_DYNAMO=1 test failures
    @torch._disable_dynamo
    def __init__(self, ctx, subgraph):
        # ctx: functionalization context providing .functionalize();
        # subgraph: callable (possibly a torch.fx.GraphModule) to run.
        self.ctx = ctx
        self.subgraph = subgraph

    def __hash__(self):
        # Hash by subgraph identity only, so wrappers over the same subgraph
        # land in the same cache slot regardless of the ctx instance.
        # NOTE(review): __eq__ is not defined (default identity), so equal
        # hashes do not imply equal wrappers — confirm that is acceptable
        # for the fake tensor cache.
        return id(self.subgraph)

    def __repr__(self):
        return f"FunctionalizeCtxWrapper on subgraph {self.subgraph})"

    def __call__(self, *args, **kwargs):
        if isinstance(self.subgraph, torch.fx.GraphModule):
            # Running graph with interpreter is needed for propagating the stack_trace
            with fx_traceback.preserve_node_meta():
                return self.ctx.functionalize(torch.fx.Interpreter(self.subgraph).run)(
                    *args, **kwargs
                )
        return self.ctx.functionalize(self.subgraph)(*args, **kwargs)
994
+
995
+
996
+ # A wrapper over HigherOrderOperator that also carries its schema
997
+ class HopInstance:
998
+ def __init__(self, op: HigherOrderOperator, schema: HopSchema):
999
+ assert isinstance(op, HigherOrderOperator), op
1000
+ self._op = op
1001
+ # Using "_" to be consistent with how we access _schema of OpOverload
1002
+ self._schema = schema
1003
+
1004
+ def __call__(self, *args, **kwargs):
1005
+ return self._op(*args, **kwargs)
1006
+
1007
+ @staticmethod
1008
+ def create(hop: HigherOrderOperator, *args, **kwargs):
1009
+ return HopInstance(hop, hop.gen_schema(*args, **kwargs))
1010
+
1011
+
# This call_op can be used to call a HopInstance with
# flat args and kwargs. We need to make use of the hop's schema's tree_spec
# to unflatten the args and kwargs before calling the hop.
def call_op(op: Union[OpOverload, HopInstance], args, kwargs):
    """Invoke ``op`` with flat ``args``/``kwargs``, rebinding them per its schema."""
    # Plain OpOverloads need no schema-driven rebinding.
    if isinstance(op, OpOverload):
        return op(*args, **kwargs)

    assert isinstance(op, HopInstance), op
    schema = op._schema
    bound_args = list(args)
    bound_kwargs = {}
    # Positional args cover a prefix of the schema arguments; everything that
    # remains must be supplied by name through kwargs.
    for arg in schema.arguments[len(bound_args) :]:
        assert arg.name in kwargs, (arg.name, kwargs)
        val = kwargs[arg.name]
        if not arg.kwarg_only:
            bound_args.append(val)
        else:
            bound_kwargs[arg.name] = val

    if schema.tree_spec is not None:
        # The schema was generated from flattened pytree inputs: all arguments
        # are positional and must be unflattened back to the call structure.
        assert len(bound_args) == len(schema.arguments) and len(bound_kwargs) == 0
        args, kwargs = pytree.tree_unflatten(bound_args, schema.tree_spec)
        return op(*args, **kwargs)
    else:
        assert len(bound_args) + len(bound_kwargs) == len(schema.arguments)
        return op(*bound_args, **bound_kwargs)
1038
+
1039
+
def materialize_as_graph(
    fn: Callable,
    args: tuple[Any, ...],
    include_key_set: Optional[torch._C.DispatchKeySet] = None,
    exclude_key_set: Optional[torch._C.DispatchKeySet] = None,
    force_enable_grad=False,
) -> torch.fx.GraphModule:
    """
    Trace ``fn`` into a GraphModule under a controlled dispatch-key environment.

    Args:
        fn: callable to trace.
        args: example inputs; functional tensors are unwrapped via _from_fun.
        include_key_set / exclude_key_set: dispatch key sets to force while
            tracing; default to the current thread-local TLS sets.
        force_enable_grad: if True, trace with grad mode enabled.
    """
    if include_key_set is None:
        include_key_set = torch._C._dispatch_tls_local_include_set()
    if exclude_key_set is None:
        exclude_key_set = torch._C._dispatch_tls_local_exclude_set()

    @torch._dynamo.disable(recursive=True, reason=None)
    def _materialize_as_graph_inner():
        # Tracing happens with ambient functionalization suspended; proxy
        # tracing is disabled only while building the unwrapped example inputs.
        with suspend_functionalization(), disable_functional_mode():
            with disable_proxy_modes_tracing():
                unfunc_t = [_from_fun(arg) for arg in args]
            with contextlib.ExitStack() as stack:
                stack.enter_context(
                    torch._C._ForceDispatchKeyGuard(include_key_set, exclude_key_set),
                )
                if force_enable_grad:
                    stack.enter_context(torch.enable_grad())
                return _maybe_reenter_make_fx(fn)(*unfunc_t)

    gm = _materialize_as_graph_inner()
    assert gm is not None
    return gm
1068
+
1069
+
def materialize_callable_in_args(op: HopInstance, args, kwargs):
    """
    Replace callable leaves of ``(args, kwargs)`` with traced GraphModules.

    Traces one call to ``op`` and, for each schema argument whose proxy is a
    get_attr node pointing at a traced sub-GraphModule, substitutes that
    GraphModule for the original callable; all other leaves pass through.
    """
    schema = op._schema
    hop = op._op
    flat_args, flat_spec = pytree.tree_flatten((args, kwargs))

    def wrapped_fn(*flat_args):
        return call_op(op, args, kwargs)

    # We need to trace the higher order op in order to materialize the callable inputs that
    # are a callable (e.g. after functionalization key)
    gm = reenter_make_fx(wrapped_fn)(*flat_args)
    hop_node = gm.graph.find_nodes(op="call_function", target=hop)[0]
    arg_proxies = pytree.tree_leaves((hop_node.args, hop_node.kwargs))
    assert isinstance(schema, torch._C.FunctionSchema) and len(arg_proxies) == len(
        schema.arguments
    )

    # call_op preserves ordering of proxies via schema
    materialized_args = []
    for i, (proxy, arg) in enumerate(zip(arg_proxies, schema.arguments)):
        if (
            isinstance(proxy, torch.fx.Node)
            and proxy.op == "get_attr"
            and isinstance(getattr(gm, proxy.target), torch.fx.GraphModule)  # type: ignore[arg-type]
        ):
            # A get_attr of a GraphModule means the original leaf was a
            # callable that got traced into a subgraph.
            assert callable(flat_args[i]), (schema, args, kwargs)
            materialized_args.append(getattr(gm, proxy.target))  # type: ignore[arg-type]
        else:
            materialized_args.append(flat_args[i])

    return pytree.tree_unflatten(materialized_args, flat_spec)
1101
+
1102
+
1103
+ def has_user_subclass(args, allowed_subclasses):
1104
+ """Check if any tensor arguments are user subclasses.
1105
+
1106
+ This is used to determine if tensor subclasses should get a chance to run
1107
+ their own implementation first before falling back to the default implementation.
1108
+
1109
+ Args:
1110
+ args: Arguments to check (will be flattened with pytree)
1111
+ allowed_subclasses: Tuple of allowed subclass types
1112
+
1113
+ Returns:
1114
+ True if user tensor subclasses are found, False otherwise
1115
+ """
1116
+ flat_args, _ = pytree.tree_flatten(args)
1117
+
1118
+ val = any(
1119
+ isinstance(a, torch.Tensor)
1120
+ and type(a) is not torch.Tensor
1121
+ and not isinstance(a, allowed_subclasses)
1122
+ for a in flat_args
1123
+ )
1124
+ return val
1125
+
1126
+
1127
+ def _has_gen_schema(op: HigherOrderOperator):
1128
+ # There is an InvokeQuant argument we cannot gen_schema.
1129
+ if op is torch.ops.higher_order.invoke_quant_packed:
1130
+ return False
1131
+ method = "gen_schema"
1132
+ return hasattr(type(op), method) and getattr(type(op), method) is not getattr(
1133
+ HigherOrderOperator, method
1134
+ )
archive/.venv/Lib/site-packages/torch/_higher_order_ops/while_loop.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import contextlib
3
+ from typing import Callable, Union
4
+
5
+ import torch
6
+ import torch.utils._pytree as pytree
7
+ from torch._C import DispatchKey
8
+ from torch._higher_order_ops.utils import (
9
+ _maybe_run_with_interpreter,
10
+ _set_compilation_env,
11
+ autograd_not_implemented,
12
+ check_meta_consistency,
13
+ reenter_make_fx,
14
+ validate_subgraph_args_types,
15
+ )
16
+ from torch._ops import HigherOrderOperator
17
+ from torch._subclasses.fake_tensor import FakeTensorMode
18
+ from torch.fx.experimental.proxy_tensor import (
19
+ _temp_remove_metadata_torch_function_mode,
20
+ ProxyTorchDispatchMode,
21
+ track_tensor_tree,
22
+ )
23
+
24
+
class WhileLoopOp(HigherOrderOperator):
    """The ``while_loop`` HigherOrderOperator.

    Validates that the carried/additional inputs are flat tuples (or lists)
    of allowed leaf types before dispatching to the per-key implementations.
    """

    def __init__(self) -> None:
        super().__init__("while_loop")

    def __call__(
        self,
        cond_fn: Callable,
        body_fn: Callable,
        carried_inputs: tuple[Union[torch.Tensor, int, float, bool], ...],
        additional_inputs: tuple[Union[torch.Tensor, torch.SymInt, int], ...],
        /,
    ):
        if not isinstance(carried_inputs, (tuple, list)):
            raise RuntimeError(
                f"carried_inputs must be a tuple or list, got {type(carried_inputs)}"
            )
        if not isinstance(additional_inputs, (tuple, list)):
            raise RuntimeError(
                f"additional_inputs must be a tuple or list, got {type(additional_inputs)}"
            )

        # Rejects leaf types that the HOP machinery cannot handle.
        validate_subgraph_args_types(carried_inputs)
        validate_subgraph_args_types(additional_inputs)
        return super().__call__(cond_fn, body_fn, carried_inputs, additional_inputs)


# Singleton instance; all dispatch-key implementations below register on it.
while_loop_op = WhileLoopOp()
52
+
53
+
def while_loop(cond_fn, body_fn, carried_inputs):
    r"""
    Run body_fn(*carried_inputs) while cond_fn(*carried_inputs) returns a True scalar tensor. Returns the output of body_fn or
    initial carried_inputs.

    .. warning::
        `torch.while_loop` is a prototype feature in PyTorch. It has limited support for input and output types and
        doesn't support training currently. Please look forward to a more stable implementation in a future version of PyTorch.
        Read more about feature classification at: https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype

    `while_loop` is a structured control flow operator. It preserves the loop semantic across the torch.compile and torch.export.

    `while_loop` is equivalent to the following:

        def while_loop(cond_fn, body_fn, carried_inputs):
            val = carried_inputs
            while cond_fn(*val):
                val = body_fn(*val)
            return val

    Args:
        cond_fn (Callable): A callable function that returns a boolean Scalar tensor or a python boolean.

        body_fn (Callable): A callable function that takes the same inputs as `cond_fn` and returns a tuple of tensors or ints

        carried_inputs (Tuple of possibly nested dict/list/tuple of tensors or ints): A tuple of inputs to cond_fn and body_fn.
            It's also the initial value of states that are carried across iterations. Note that when pass an integer as carry,
            the corresponding return of while_loop will be another int with unknown values because we don't know how many
            iterations while_loop will run.

    Example 1:

        def cond_fn(iter, x):
            return iter.sum() < 10

        def body_fn(iter, x):
            return iter + 1, x.sin()

        while_loop(cond_fn, body_fn, (torch.zeros(1), torch.randn(3, 4)))

    Example 2:

        def cond_fn(int_iter, x):
            return 2 * int_iter < x.shape[0]

        def body_fn(int_iter, x):
            return int_iter + 1, x + int_iter

        while_loop(cond_fn, body_fn, (0, torch.randn(3, 4)))

    Restrictions:

    - body_fn must return tensors or int with the same metadata (e.g. shape, dtype) as inputs.

    - body_fn and cond_fn must not in-place mutate the carried_inputs. A clone before the mutation is required.

    - body_fn and cond_fn must not mutate python variables (e.g. list/dict) created outside of the body_fn.

    - body_fn and cond_fn's output cannot alias any of the inputs. A clone is required.

    .. warning::
        Temporal Limitations:

        - 'while_loop' only supports **inference** right now. Autograd will be supported in the future.

    """
    from torch._dynamo.backends.debugging import (
        make_eager_backend_with_torch_function_mode,
    )

    # Currently, additional_inputs is not a user-facing input. It will be automatically set in dynamo.
    # parameters and buffers accessed in cond_fn or body_fn or tensor closures will become additional_inputs.
    additional_inputs: tuple = ()

    # The reason we flatten the output before calling into dynamo is that
    # we want to create a consistent input ordering for cond_fn and body_fn.
    # and we also want to the input ordering matches the output ordering.
    # Also see NOTE: [why we cannot use "automatic" for while_loop]
    # Construct flat cond_fn and flat_body_fn, which takes flattened inputs
    flat_inputs, in_spec = pytree.tree_flatten((carried_inputs, additional_inputs))

    def flat_cond_fn(*flat_args):
        carried, additional = pytree.tree_unflatten(flat_args, in_spec)
        return cond_fn(*carried, *additional)

    def flat_body_fn(*flat_args):
        carried, additional = pytree.tree_unflatten(flat_args, in_spec)
        return body_fn(*carried, *additional)

    if torch.compiler.is_dynamo_compiling():
        return while_loop_op(flat_cond_fn, flat_body_fn, tuple(flat_inputs), tuple())

    def _validate_input(cond_fn, body_fn, carried_inputs):
        from torch._higher_order_ops.utils import validate_subgraph_args_types

        if not callable(cond_fn) or not callable(body_fn):
            raise RuntimeError("Expect cond_fn and body_fn to be callable.")

        validate_subgraph_args_types(flat_inputs)

        if not pytree.tree_all(
            lambda t: isinstance(t, (torch.Tensor, torch.SymInt, int)), carried_inputs
        ):
            raise RuntimeError(
                "Expect carried_inputs to be a tuple of possibly nested dict/list/tuple that only"
                f"consists of tensor or int leaves, but got {carried_inputs}."
            )

    _validate_input(cond_fn, body_fn, carried_inputs)

    # Dynamo is expecting a callable with "__code__" attribute.
    # We cannot directly pass cond_op to it. So we wrap it in a dummy function.
    def _while_loop_op_wrapper(*args, **kwargs):
        return while_loop_op(*args, **kwargs)

    with _set_compilation_env(), torch._dynamo.utils.disable_cache_limit():
        with _temp_remove_metadata_torch_function_mode() as metadata_mode:
            # NOTE(review): the metadata torch-function mode is removed twice;
            # the inner `with` looks redundant since the outer one has already
            # removed the mode — confirm whether this is intentional.
            with _temp_remove_metadata_torch_function_mode() as metadata_mode:
                if metadata_mode:
                    backend = make_eager_backend_with_torch_function_mode(metadata_mode)
                else:
                    backend = "eager"
                return torch.compile(
                    _while_loop_op_wrapper, backend=backend, fullgraph=True
                )(flat_cond_fn, flat_body_fn, tuple(flat_inputs), tuple())
179
+
180
+
@while_loop_op.py_impl(DispatchKey.CompositeExplicitAutograd)
def while_loop_dense(cond_fn, body_fn, carried_inputs, additional_inputs):
    """Eager implementation of while_loop: a plain Python ``while`` loop that
    re-invokes ``cond_fn``/``body_fn`` until the predicate is falsy, validating
    the predicate type and the body output's shape along the way."""
    if not isinstance(carried_inputs, (tuple, list)):
        raise RuntimeError(
            f"carried_inputs must be a tuple or list but got {type(carried_inputs)}"
        )

    def _check_pred(pred):
        # Accept only a 0-dim boolean tensor or a python bool.
        is_bool_scalar_tensor = (
            isinstance(pred, torch.Tensor)
            and pred.size() == torch.Size([])
            and pred.dtype == torch.bool
        )
        if not (is_bool_scalar_tensor or isinstance(pred, bool)):
            raise RuntimeError(
                f"cond_fn must return a boolean scalar tensor or a boolean but got {pred}"
            )

    carried_vals = carried_inputs
    while True:
        pred = cond_fn(*carried_vals, *additional_inputs)
        if not pred:
            # Falsy predicate terminates the loop without type validation,
            # matching the walrus-based original.
            break
        _check_pred(pred)
        out = body_fn(*carried_vals, *additional_inputs)
        assert isinstance(
            out, tuple
        ), f"body_fn should return a tuple but got {type(out)}"
        assert len(out) == len(
            carried_inputs
        ), "body_fn should return the same number of elements as carried_inputs"
        carried_vals = out
    return carried_vals
213
+
214
+
# while_loop has no autograd formula; this registers a kernel that raises a
# "not implemented" error — presumably deferred until gradients are actually
# requested (deferred_error=True), see autograd_not_implemented in utils.
while_loop_op.py_autograd_impl(
    autograd_not_implemented(while_loop_op, deferred_error=True)
)
218
+
219
+
220
+ def _find_or_create_fake_mode() -> FakeTensorMode:
221
+ from torch.fx.experimental.symbolic_shapes import ShapeEnv
222
+
223
+ fake_mode = torch._guards.detect_fake_mode()
224
+ if fake_mode is None:
225
+ fake_mode = FakeTensorMode(shape_env=ShapeEnv())
226
+
227
+ return fake_mode
228
+
229
+
230
+ def _create_unbacked_symint(
231
+ fake_mode: FakeTensorMode, ignore_fresh_unbacked_symbols: bool
232
+ ) -> torch.SymInt:
233
+ assert (
234
+ fake_mode is not None and fake_mode.shape_env is not None
235
+ ), "Must provide a fake_mode with shape_env."
236
+ ctx = (
237
+ contextlib.nullcontext()
238
+ if not ignore_fresh_unbacked_symbols
239
+ else fake_mode.shape_env.ignore_fresh_unbacked_symbols()
240
+ )
241
+ with ctx:
242
+ return fake_mode.shape_env.create_unbacked_symint()
243
+
244
+
@while_loop_op.py_impl(ProxyTorchDispatchMode)
def while_loop_tracing(mode, cond_fn, body_fn, carried_inputs, additional_inputs):
    """Proxy-mode implementation: trace cond_fn/body_fn into sub-GraphModules
    (with int carries unspecialized to unbacked symints), register them on the
    tracer root, and emit a single ``while_loop`` call_function node."""

    def _trace_while_loop(
        proxy_mode, while_loop_op, cond_fn, body_fn, carried_inputs, additional_inputs
    ):
        # NOTE [unspecialize int carry with unbacked symints]
        # When we support int carry, we'll also need to support int output of body_fn because
        # previous iteration's output is next iteration's input and they must match.
        # For carries, when we start tracing while_loop, they can be
        # - constants e.g. (0, [1, 3])
        # - backed symints (x.shape[0], [x.shape[1] + x.stride[1], x.shape[2]])
        # - unbacked symints e.g. (u0, [u0 + u1, u2])
        # We choose the most conservative design: in all cases, we create new unbacked symints to trace the
        # subgraph. It's possible to do some analysis on initial carry and the output of first
        # iteration to determine a better range for the output unbacked symbol e.g. when input is an unbacked
        # symint >= 0 before the while_loop but in general this is difficult because we don't know
        # the number of iterations. Users would have to re-constrain the unbacked symint in subgraph if needed.
        #
        # For output of fake cond_fn, it could be constant bool or SymBool (e.g. return x.shape[0] < 4,
        # where x.shape[0] can be either static or dynamic). In the case of constant bool, we should do a
        # specialization (NYI).

        # For output of fake body_fn, it could be all three types though from user's point of view,
        # they're all integers e.g.

        # init_carry = (0, s0, u1, t)
        # def body_fn(u0, s0, u1, t):
        #     ...
        #     return (t.shape[0], t.shape[1], t.shape[2], y + 1)
        #
        # It may seem that a constant output isn't possible: users shouldn't write a while_loop
        # that always return 0. But it could be that a shape is not set as dynamic properly (e.g.
        # automatic dynamic hasn't been triggered).
        #
        # For this reason, we treat int, symint outputs in the same way:
        # - they can match against any of int, symint carry
        # - we unspecialize them with new unbacked symints in fake while_loop
        # Similarly, we could do some analysis to refine the output ranges but it's easier to start with
        # fresh unbacked symints. One surprising case can be: an input unbacked symint is constrained by
        # users to be >= 0 (either before while_loop or inside body_fn) and it increments by 1 in each
        # iteration. Ideally, we should know that the final output is >= 0 but we didn't constrain the
        # unbacked symint output of subgraph as of today because this requires a smart range analysis.
        fake_mode: FakeTensorMode = _find_or_create_fake_mode()
        unspecialized_carried_inputs = pytree.tree_map_only(
            (int, torch.SymInt),
            # For temporarily created unbacked symints, we don't need to bind them to any proxy
            lambda _: _create_unbacked_symint(
                fake_mode, ignore_fresh_unbacked_symbols=True
            ),
            carried_inputs,
        )

        cond_graph = reenter_make_fx(cond_fn)(
            *unspecialized_carried_inputs, *additional_inputs
        )
        body_graph = reenter_make_fx(body_fn)(
            *unspecialized_carried_inputs, *additional_inputs
        )

        # Find the first unused "while_loop_cond_graph_{i}" attribute name on
        # the tracer root so multiple while_loops in one trace don't collide.
        next_name = None
        i = 0
        while not next_name:
            candidate = f"while_loop_cond_graph_{i}"
            if hasattr(proxy_mode.tracer.root, candidate):
                i += 1
            else:
                next_name = candidate
        cond_graph_name = next_name
        body_graph_name = f"while_loop_body_graph_{i}"
        assert not hasattr(proxy_mode.tracer.root, body_graph_name)

        proxy_mode.tracer.root.register_module(cond_graph_name, cond_graph)
        proxy_mode.tracer.root.register_module(body_graph_name, body_graph)

        args = (cond_graph, body_graph, carried_inputs, additional_inputs)

        proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, args)

        out_proxy = proxy_mode.tracer.create_proxy(
            "call_function", while_loop_op, proxy_args, {}, name="while_loop"
        )

        # Run the op once on the unspecialized carries to produce example
        # outputs, then associate them with the proxy node.
        out = while_loop_op(
            cond_graph, body_graph, unspecialized_carried_inputs, additional_inputs
        )
        return track_tensor_tree(
            out, out_proxy, constant=None, tracer=proxy_mode.tracer
        )

    return _trace_while_loop(
        mode, while_loop_op, cond_fn, body_fn, carried_inputs, additional_inputs
    )
337
+
338
+
@while_loop_op.py_impl(FakeTensorMode)
def while_loop_fake_tensor_mode(
    mode, cond_fn, body_fn, carried_inputs, additional_inputs
):
    """Fake-tensor implementation: run body_fn once to derive output metadata,
    check it is consistent with the carried inputs, and unspecialize int/SymInt
    outputs to fresh unbacked symints."""
    with mode:
        # NOTE: [Handling unbacked symints in subgraph of while_loop]
        # The idea is that the scope of unbacked symints are limited to the subgraph.
        #
        # We're implementing the fake tensor mode of while_loop operator.
        # and we run body_fn once to get a fake output.
        # Let's first consider the case that unbacked symints are tensor shapes:
        #
        # Case 1:
        # if the unbacked symints is local to the subgraph e.g.
        #   def body_fn(it, x):
        #       nz = x.nonzero()
        #       return it+1, nz.sum()
        # we can just ignore the newly created unbacked symints because it has
        # no effect on the output of while_loop and it's tracked when we tracing.
        # the subgraph.
        #
        # Case 2:
        # if the unbacked symints are shape of output of while_loop e.g.
        #   def body_fn(it, x):
        #       nz = x.nonzero()
        #       return it+1, nz
        # This will fail the shape check because in each iteration, the carried_input's shape
        # must match the output shape as nz.shape contains newly allocated unbacked symint, this
        # won't match the carried_input's shape.
        #
        # Case 3:
        # if the unbacked symints are shape of carried_inputs e.g.
        #   nz = a.nonzero()
        #   body_fn(it, nz):
        #       return it+1, nz.sin() + 1
        # There's no new unbacked symints allocated in subgraph, so we're safe.
        #
        # NOTE(review): this assumes ``mode.shape_env`` is not None — confirm
        # that callers always construct the fake mode with a ShapeEnv.
        with mode.shape_env.ignore_fresh_unbacked_symbols():
            # body_fn return output with the same pytree and tensor meta data as carried_inputs
            # so we could just return the output after one iteration.
            body_outs = body_fn(*carried_inputs, *additional_inputs)
            check_meta_consistency(
                carried_inputs,
                body_outs,
                "carried_inputs",
                "body_output",
                include_contiguity=False,
            )
        # See NOTE [unspecialize int carry with unbacked symints]
        return pytree.tree_map_only(
            (int, torch.SymInt),
            # For while_loop's unbacked symint output, we want them to be bound
            # to the proxy of while_loop's output.
            lambda _: _create_unbacked_symint(
                mode, ignore_fresh_unbacked_symbols=False
            ),
            body_outs,
        )
396
+
397
+
@while_loop_op.py_functionalize_impl
def while_loop_func(ctx, cond_fn, body_fn, carried_inputs, additional_inputs):
    """Functionalization: unwrap functional tensors, verify cond_fn/body_fn
    neither mutate nor alias their inputs, redispatch to the next layer, and
    re-wrap the results."""
    from torch._higher_order_ops.utils import _check_alias_and_mutation

    unwrapped_carried_inputs = ctx.unwrap_tensors(carried_inputs)
    unwrapped_additional_inputs = ctx.unwrap_tensors(additional_inputs)
    unwrapped_inputs = unwrapped_carried_inputs + unwrapped_additional_inputs
    with ctx.redispatch_to_next():
        functional_cond_fn = ctx.functionalize(_maybe_run_with_interpreter(cond_fn))
        functional_body_fn = ctx.functionalize(_maybe_run_with_interpreter(body_fn))
        # `mode` (and hence pre_dispatch) only exists on some functionalization
        # ctx implementations, so probe for it.
        pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch
        for fn, fn_name in [
            (cond_fn, "cond_fn"),
            (body_fn, "body_fn"),
        ]:
            # while_loop requires both subgraphs to be mutation- and alias-free
            # w.r.t. their inputs (see the restrictions in while_loop's doc).
            _check_alias_and_mutation(fn, unwrapped_inputs, fn_name, pre_dispatch)
        ret = while_loop_op(
            functional_cond_fn,
            functional_body_fn,
            unwrapped_carried_inputs,
            unwrapped_additional_inputs,
        )
        return ctx.wrap_tensors(ret)
archive/.venv/Lib/site-packages/torch/_higher_order_ops/wrap.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import inspect
3
+ import itertools
4
+ import logging
5
+ from typing import Optional
6
+
7
+ from torch._logging import warning_once
8
+ from torch._ops import HigherOrderOperator
9
+ from torch.types import _dtype
10
+
11
+
12
+ log = logging.getLogger(__name__)
13
+
14
+ uid = itertools.count(1)
15
+
16
+
# Used for testing the HigherOrderOperator mechanism
class Wrap(HigherOrderOperator):
    """Identity HOP: runs ``func(*args, **kwargs)`` outside of dynamo tracing."""

    def __init__(self) -> None:
        super().__init__("wrap")

    def __call__(self, func, *args, **kwargs):
        # Dynamo already traces the body of HigherOrderOp beforehand when it
        # encounters one, so there is no need to trace into it here.
        import torch._dynamo  # noqa: F401
        from torch._dynamo import disable

        # @disable keeps dynamo from tracing the eager execution of func.
        @disable
        def wrapper():
            result = func(*args, **kwargs)
            return result

        return wrapper()


wrap = Wrap()
37
+
38
+
class WrapWithSetGradEnabled(HigherOrderOperator):
    """HOP that runs ``wrapped_func`` under ``torch.set_grad_enabled(enable_grad)``."""

    def __init__(self) -> None:
        super().__init__("wrap_with_set_grad_enabled")

    def __call__(self, enable_grad, wrapped_func, *args, **kwargs):
        # Dynamo already traces the body of HigherOrderOp beforehand when it
        # encounters one, so there is no need to trace into it here.
        # Note: `import torch._dynamo` also binds the name `torch` locally,
        # which the set_grad_enabled call below relies on.
        import torch._dynamo  # noqa: F401
        from torch._dynamo import disable

        @disable
        def wrapper():
            with torch.set_grad_enabled(enable_grad):
                return wrapped_func(*args, **kwargs)

        return wrapper()


wrap_with_set_grad_enabled = WrapWithSetGradEnabled()
58
+
59
+
class WrapWithAutocast(HigherOrderOperator):
    """HOP that runs ``wrapped_func`` inside a ``torch.autocast`` context.

    The first four arguments mirror torch.autocast's parameters
    (device_type, dtype, enabled, cache_enabled).
    """

    def __init__(self):
        super().__init__("wrap_with_autocast")

    def __call__(
        self,
        device_type: str,
        dtype: Optional[_dtype],
        enabled: bool,
        cache_enabled: Optional[bool],
        wrapped_func,
        *args,
        **kwargs,
    ):
        # Dynamo already traces the body of HigherOrderOp beforehand when it
        # encounters one, so there is no need to trace into it here.
        # Note: `import torch._dynamo` also binds the name `torch` locally,
        # which the autocast call below relies on.
        import torch._dynamo  # noqa: F401
        from torch._dynamo import disable

        @disable
        def wrapper():
            with torch.autocast(device_type, dtype, enabled, cache_enabled):
                return wrapped_func(*args, **kwargs)

        return wrapper()


wrap_with_autocast = WrapWithAutocast()
88
+
89
+
# This HOP allows you to bypass dynamo tracing of the wrapper function while
# still tracing the inner function.
# Takes two callables: The first, `wrapper_fn`, accepts `inner_fn` and returns a
# callable with the same signature. The second is the `inner_fn` itself. Any
# extra *args and **kwargs are forwarded to `wrapper_fn(inner_fn)` when it is
# executed.
class DynamoBypassingWrapper(HigherOrderOperator):
    """Runs ``wrapper_fn(inner_fn)(*args, **kwargs)`` with the wrapper itself
    hidden from dynamo tracing."""

    def __init__(self):
        super().__init__("dynamo_bypassing_wrapper")

    def __call__(
        self,
        wrapper_fn_or_key,
        inner_fn,
        *args,
        **kwargs,
    ):
        # Dynamo already traces the body of HigherOrderOp beforehand, so there
        # is no need to trace into it here. Importing torch._dynamo also binds
        # the name `torch` locally for the GraphModule check below.
        import torch._dynamo  # noqa: F401
        from torch._dynamo import disable

        if isinstance(wrapper_fn_or_key, str):
            # Compiled path: the real wrapper was stashed on the traced
            # GraphModule's meta dict under this string key.
            assert isinstance(inner_fn, torch.fx.GraphModule)
            wrapper_fn = inner_fn.meta[wrapper_fn_or_key]
        else:
            # Eager path: the wrapper callable is passed in directly.
            wrapper_fn = wrapper_fn_or_key

        @disable
        def _invoke_outside_dynamo():
            return wrapper_fn(inner_fn)(*args, **kwargs)

        return _invoke_outside_dynamo()


dynamo_bypassing_wrapper = DynamoBypassingWrapper()
127
+
128
+
129
+ class WrapActivationCheckpoint(HigherOrderOperator):
130
+ """
131
+ This operator is used to wrap torch.utils.checkpoint. This avoids
132
+ TorchDynamo to look into saved tensor hooks and directly passes the control
133
+ to AOT Autograd, which is ok with tracing saved tensor hooks. As a result of
134
+ AOT tracing torch.utils.checkpoint code, we have a backward graph with
135
+ recomputed forward nodes.
136
+
137
+ However, we might deprecate this operator soon. The difficulty arises in the
138
+ functionalization of rng ops. Today, there are two different
139
+ functionalization of rng ops - one at AOT autograd and other at Inductor.
140
+ And they are difficult to map to each other. The rng states also complicate
141
+ pattern matching in Inductor. Due to the ease of implementation, we are
142
+ currently inclined towards functionalization at Inductor level, which means
143
+ that duplication/recomputation is done as a compiler pass in the
144
+ partitioners. See TagActivationCheckpoint for more information.
145
+ """
146
+
147
+ def __init__(self) -> None:
148
+ super().__init__("wrap_activation_checkpoint", cacheable=False)
149
+
150
+ def __call__(self, function, *args, **kwargs):
151
+ # use_reentrant is set to False because this op is going to be traced.
152
+ # And we ensure that AOT Autograd traces through the non reentrant
153
+ # version of checkpointing.
154
+ import torch.fx.traceback as fx_traceback
155
+ from torch.fx import Interpreter
156
+
157
+ kwargs["use_reentrant"] = False
158
+ kwargs["preserve_rng_state"] = False
159
+ # Using interpreter allows preservation of metadata through torch.compile stack.
160
+ with fx_traceback.preserve_node_meta():
161
+ from torch.utils.checkpoint import checkpoint
162
+
163
+ return checkpoint(Interpreter(function).run, *args, **kwargs)
164


# Module-level singleton instance of the HOP; call as
# wrap_activation_checkpoint(function, *args, **kwargs).
wrap_activation_checkpoint = WrapActivationCheckpoint()
167
+
168
+
169
+ class TagActivationCheckpoint(HigherOrderOperator):
170
+ """
171
+ This operator is supposed to be used only with torch.compile stack. This
172
+ accepts a Fx graph module which needs to be checkpointed. This operator adds
173
+ "recomputable" tag to the nodes of the Fx graph that should be recomputed.
174
+
175
+ The goal is to:
176
+ 1. Avoid using Dynamo to trace through saved tensor hooks.
177
+ 2. For selective checkpointing case, let AOTAutograd trace through
178
+ saved tensor hooks but has special logic with TorchDispatchMode to override
179
+ the usual saved_tensor_hooks fn logic in order to tag the nodes.
180
+ 3. Rely on the partitioners to actually duplicate the nodes.
181
+ This sits well in the torch.compile stack, because by the time graph
182
+ reaches partitioner, inductor has already run its functionalization of rng
183
+ ops (by setting fixed seed for each random op, see `replace_random_passes`).
184
+ Therefore, the duplication of nodes, by design, respects the rng states in
185
+ the forward and recomputed forward in backward.
186
+ """
187
+
188
+ def __init__(self) -> None:
189
+ super().__init__("tag_activation_checkpoint", cacheable=False)
190
+
191
+ @staticmethod
192
+ def divide_kwargs(kwargs):
193
+ """
194
+ checkpoint fn can have mixed kwargs between checkpointed fn and
195
+ checkpoint fn itself. For example
196
+ >> def gn(x, y, z=None):
197
+ >> a = torch.matmul(x, y)
198
+ >> if z is not None:
199
+ >> return torch.matmul(a, z)
200
+ >> return a
201
+ >> def fn(x, y, z):
202
+ >> return torch.cos(checkpoint(gn, x, y, use_reentrant=False, z=z))
203
+ In the above case, z belongs to checkpointed function gn, but
204
+ use_reentrant belongs to the checkpoint function. This function splits
205
+ the kwargs into checkpoint_kwargs and gmod_kwargs (or
206
+ checkpointed_fn_kwargs).
207
+ We do sorting to ensure same graph from run to run for better
208
+ debuggability. It is not required for correctness.
209
+ """
210
+ from torch.utils.checkpoint import checkpoint
211
+
212
+ ckpt_signature = inspect.signature(checkpoint)
213
+ checkpoint_keys = set()
214
+ for name in ckpt_signature.parameters:
215
+ if name in ("function", "args", "kwargs"):
216
+ continue
217
+ checkpoint_keys.add(name)
218
+
219
+ # `preserve_rng_state` is not a regular kwarg
220
+ checkpoint_keys.add("preserve_rng_state")
221
+
222
+ checkpoint_kwargs = {
223
+ name: kwargs[name] for name in kwargs.keys() if name in checkpoint_keys
224
+ }
225
+ gmod_kwargs = {
226
+ name: kwargs[name] for name in kwargs.keys() if name not in checkpoint_keys
227
+ }
228
+ return checkpoint_kwargs, gmod_kwargs
229
+
230
+ def tag_nodes(self, gmod, is_sac):
231
+ from torch.utils.checkpoint import CheckpointPolicy
232
+
233
+ unique_graph_id = next(uid)
234
+ for node in gmod.graph.nodes:
235
+ if node.op in ("call_function", "call_method", "call_module"):
236
+ node.meta["ac_graph_id"] = unique_graph_id
237
+ if is_sac:
238
+ # For selective checkpointing, we will populate this tag later in _CachingTorchDispatchMode.
239
+ node.meta["recompute"] = None
240
+ else:
241
+ # Under vanilla activation checkpointing, all nodes should be recomputed.
242
+ node.meta["recompute"] = CheckpointPolicy.PREFER_RECOMPUTE
243
+ return gmod
244
+
245
+ def __call__(self, gmod, *args, **kwargs):
246
+ import torch.fx.traceback as fx_traceback
247
+ from torch.fx import Interpreter
248
+
249
+ if "_checkpoint_context_fn" in gmod.meta:
250
+ warning_once(
251
+ log,
252
+ """
253
+ Detected that context_fn is passed to torch.utils.checkpoint under torch.compile.
254
+ Please make sure the checkpointed region does not contain in-place ops (e.g. torch.relu_).
255
+ """,
256
+ )
257
+ # use_reentrant is set to False because this op is going to be traced.
258
+ # And we ensure that AOT Autograd traces through the non reentrant
259
+ # version of checkpointing.
260
+ kwargs["use_reentrant"] = False
261
+ # preserve_rng_state is set to False because we want to prevent AOTAutograd from tracing through
262
+ # `torch.random.fork_rng` op (which is not supported yet under CUDA).
263
+ # This doesn't mean that we don't preserve RNG state. Instead, we will always preserve RNG state
264
+ # regardless of this flag (by doing RNG functionalization via `replace_random_passes` in Inductor
265
+ # instead of in AOTAutograd).
266
+ kwargs["preserve_rng_state"] = False
267
+ kwargs["context_fn"] = gmod.meta["_checkpoint_context_fn"]
268
+ # We first tag all nodes as "recompute" in this graph, and then we undo the "recompute" tag
269
+ # for specific nodes in _CachingTorchDispatchMode in torch/utils/checkpoint.py.
270
+ gmod = self.tag_nodes(gmod, is_sac=True)
271
+ # Using interpreter allows preservation of metadata through torch.compile stack.
272
+ with fx_traceback.preserve_node_meta():
273
+ from torch.utils.checkpoint import checkpoint
274
+
275
+ return checkpoint(Interpreter(gmod).run, *args, **kwargs)
276
+ else:
277
+ gmod = self.tag_nodes(gmod, is_sac=False)
278
+ # Using interpreter allows preservation of metadata through torch.compile stack.
279
+ # TODO: We want to use the same `checkpoint(Interpreter(gmod).run, *args, **kwargs)` here
280
+ # as the `context_fn != None` case, but that depends on in-place op support in TorchDispatchMode + torch.compile.
281
+ # (for details on in-place op issue, run `test_compile_selective_checkpoint_inplace_op` unit test)
282
+ with fx_traceback.preserve_node_meta():
283
+ return Interpreter(gmod).run(*args)
284


# Module-level singleton instance of the HOP; call as
# tag_activation_checkpoint(gmod, *args, **kwargs).
tag_activation_checkpoint = TagActivationCheckpoint()