koichi12 commited on
Commit
44b4c93
·
verified ·
1 Parent(s): 70fbf20

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc +3 -0
  3. .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc +3 -0
  4. .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc +3 -0
  5. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/b2b_gemm.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/micro_pipeline_tp.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py +1 -0
  26. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_attention.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_decoding.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_scaled.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/bmm.py +192 -0
  37. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/conv.py +679 -0
  38. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_attention.py +1843 -0
  39. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_decoding.py +570 -0
  40. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm.py +776 -0
  41. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_common.py +466 -0
  42. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_plus_mm.py +248 -0
  43. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_scaled.py +311 -0
  44. .venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py +87 -0
  45. .venv/lib/python3.11/site-packages/torch/_inductor/runtime/__init__.py +0 -0
  46. .venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/__init__.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/autotune_cache.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/benchmarking.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/compile_tasks.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/coordinate_descent_tuner.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -142,3 +142,6 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
142
  .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/simd.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
143
  .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/halide.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
144
  .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/wrapper.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
 
 
142
  .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/simd.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
143
  .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/halide.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
144
  .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/wrapper.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
145
+ .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
146
+ .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
147
+ .venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee19b8e0b980d0895a7af50aa7c3244d133ce110a196485ab8cec5fa7b9767d4
3
+ size 121452
.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8596ce3d305b9ea76fd93737e3fda25769b1901142db9efff0fde9757b03517
3
+ size 262897
.venv/lib/python3.11/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de158b207dec0ef6dd7cca5acc1db68fcc605b0046ed6c5ffcf0d9b8f34d3b82
3
+ size 138985
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (198 Bytes). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/b2b_gemm.cpython-311.pyc ADDED
Binary file (26.6 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-311.pyc ADDED
Binary file (13.2 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-311.pyc ADDED
Binary file (8.35 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-311.pyc ADDED
Binary file (5.27 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-311.pyc ADDED
Binary file (14.5 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-311.pyc ADDED
Binary file (10.6 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-311.pyc ADDED
Binary file (41.9 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-311.pyc ADDED
Binary file (75.8 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-311.pyc ADDED
Binary file (34.2 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/micro_pipeline_tp.cpython-311.pyc ADDED
Binary file (42 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-311.pyc ADDED
Binary file (6.81 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-311.pyc ADDED
Binary file (63.5 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-311.pyc ADDED
Binary file (10.6 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-311.pyc ADDED
Binary file (37.4 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-311.pyc ADDED
Binary file (59.3 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-311.pyc ADDED
Binary file (40.2 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-311.pyc ADDED
Binary file (84.2 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-311.pyc ADDED
Binary file (31.8 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-311.pyc ADDED
Binary file (7.48 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from . import mm, mm_common, mm_plus_mm, unpack_mixed_mm
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (321 Bytes). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-311.pyc ADDED
Binary file (9.36 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-311.pyc ADDED
Binary file (25.6 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_attention.cpython-311.pyc ADDED
Binary file (67.6 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/flex_decoding.cpython-311.pyc ADDED
Binary file (21.2 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-311.pyc ADDED
Binary file (30.2 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-311.pyc ADDED
Binary file (20.7 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-311.pyc ADDED
Binary file (7.92 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/mm_scaled.cpython-311.pyc ADDED
Binary file (11.9 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-311.pyc ADDED
Binary file (3.59 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/bmm.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import logging
3
+
4
+ import torch
5
+
6
+ from .. import ir, lowering as L
7
+ from ..select_algorithm import (
8
+ autotune_select_algorithm,
9
+ ExternKernelChoice,
10
+ TritonTemplate,
11
+ )
12
+ from ..utils import (
13
+ ceildiv as cdiv,
14
+ use_aten_gemm_kernels,
15
+ use_cutlass_template,
16
+ use_triton_template,
17
+ )
18
+ from ..virtualized import V
19
+ from .mm import _is_static_problem
20
+ from .mm_common import addmm_epilogue, mm_args, mm_configs, mm_options
21
+
22
+
23
+ log = logging.getLogger(__name__)
24
+ aten = torch.ops.aten
25
+
26
+
27
+ def bmm_grid(b, m, n, meta):
28
+ return (cdiv(m, meta["BLOCK_M"]) * cdiv(n, meta["BLOCK_N"]), b, 1)
29
+
30
+
31
+ bmm_template = TritonTemplate(
32
+ name="bmm",
33
+ grid=bmm_grid,
34
+ source=r"""
35
+ {{def_kernel("A", "B")}}
36
+ M = {{size("A", -2)}}
37
+ N = {{size("B", -1)}}
38
+ K = {{size("A", -1)}}
39
+
40
+ stride_aq = {{stride("A", 0)}}
41
+ stride_am = {{stride("A", 1)}}
42
+ stride_ak = {{stride("A", 2)}}
43
+
44
+ stride_bq = {{stride("B", 0)}}
45
+ stride_bk = {{stride("B", 1)}}
46
+ stride_bn = {{stride("B", 2)}}
47
+
48
+ # based on triton.ops.matmul
49
+ pid = tl.program_id(0)
50
+ grid_m = (M + BLOCK_M - 1) // BLOCK_M
51
+ grid_n = (N + BLOCK_N - 1) // BLOCK_N
52
+
53
+ # re-order program ID for better L2 performance
54
+ width = GROUP_M * grid_n
55
+ group_id = pid // width
56
+ group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
57
+ pid_m = group_id * GROUP_M + (pid % group_size)
58
+ pid_n = (pid % width) // (group_size)
59
+
60
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
61
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
62
+ if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
63
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
64
+ else:
65
+ ram = rm % M
66
+ if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
67
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
68
+ else:
69
+ rbn = rn % N
70
+
71
+ rk = tl.arange(0, BLOCK_K)
72
+
73
+ idx_q = tl.program_id(1) # batch dimension for BMM
74
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak + idx_q*stride_aq)
75
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn + idx_q*stride_bq)
76
+
77
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
78
+ for k in range(K, 0, -BLOCK_K):
79
+ if EVEN_K:
80
+ a = tl.load(A)
81
+ b = tl.load(B)
82
+ else:
83
+ a = tl.load(A, mask=rk[None, :] < k, other=0.)
84
+ b = tl.load(B, mask=rk[:, None] < k, other=0.)
85
+ acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
86
+ A += BLOCK_K * stride_ak
87
+ B += BLOCK_K * stride_bk
88
+
89
+ # rematerialize rm and rn to save registers
90
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
91
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
92
+ idx_q = tl.program_id(1) # batch dimension for BMM
93
+ idx_m = rm[:, None]
94
+ idx_n = rn[None, :]
95
+ mask = (idx_m < M) & (idx_n < N)
96
+
97
+ # inductor generates a suffix
98
+ {{store_output(("idx_q", "idx_m", "idx_n"), "acc", "mask")}}
99
+ """,
100
+ )
101
+
102
+ aten_bmm = ExternKernelChoice(torch.bmm, "at::bmm_out")
103
+ aten_baddbmm = ExternKernelChoice(torch.baddbmm, "at::baddbmm_out")
104
+
105
+
106
+ @L.register_lowering(aten.bmm)
107
+ def tuned_bmm(mat1, mat2, *, layout=None):
108
+ if all(x.get_device().type == "cpu" for x in [mat1, mat2]):
109
+ # decompose to small ops when memory bound
110
+ if mat1.get_size()[1] == 1 or mat2.get_size()[2] == 1:
111
+ mat1 = L.unsqueeze(mat1, -1)
112
+ mat2 = L.unsqueeze(mat2, 1)
113
+ return L.sum_(L.mul(mat1, mat2), axis=2)
114
+
115
+ def is_valid_to_require_contiguous(t):
116
+ if not ir.is_storage_and_layout(t):
117
+ return True
118
+ _, layout = ir.as_storage_and_layout(t, freeze=False)
119
+ return isinstance(layout, ir.FlexibleLayout)
120
+
121
+ def is_preferred_layout_as_bmm_input(sizes, strides):
122
+ # contiguous on one of the last two dims
123
+ return (
124
+ strides[-1] == 1 and (sizes[-2] == 1 or strides[-2] >= sizes[-1])
125
+ ) or (strides[-2] == 1 and (sizes[-1] == 1 or strides[-1] >= sizes[-2]))
126
+
127
+ # Make the input of bmm contiguous
128
+ # if it is not contiguous on either of the last two dims,
129
+ # because bmm cpu implementation would do contiguous() if not.
130
+ # This is to avoid additional copies in bmm.
131
+ def may_require_contiguous(t, meta_t):
132
+ sizes = meta_t.meta["val"].size()
133
+ strides = meta_t.meta["val"].stride()
134
+ if not is_preferred_layout_as_bmm_input(sizes, strides):
135
+ t = ir.ExternKernel.require_contiguous(t)
136
+ return t
137
+
138
+ if is_valid_to_require_contiguous(mat1):
139
+ meta_mat1 = V.graph.current_node.args[0]
140
+ mat1 = may_require_contiguous(mat1, meta_mat1)
141
+ if is_valid_to_require_contiguous(mat2):
142
+ meta_mat2 = V.graph.current_node.args[1]
143
+ mat2 = may_require_contiguous(mat2, meta_mat2)
144
+
145
+ m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
146
+
147
+ # options to tune from
148
+ choices = [aten_bmm.bind((mat1, mat2), layout)] if use_aten_gemm_kernels() else []
149
+ if use_triton_template(layout):
150
+ for config in mm_configs(m, n, k):
151
+ bmm_template.maybe_append_choice(
152
+ choices,
153
+ input_nodes=(mat1, mat2),
154
+ layout=layout,
155
+ **mm_options(config, m, n, k, layout),
156
+ )
157
+ static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout)
158
+ if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k):
159
+ from ..codegen.cuda.gemm_template import CUTLASS3xGemmTemplate
160
+
161
+ CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(choices, layout, [mat1, mat2])
162
+
163
+ if len(choices) == 0:
164
+ log.warning("No choices for GEMM, using ATen backend as fallback")
165
+ choices.append(aten_bmm.bind((mat1, mat2), layout))
166
+
167
+ return autotune_select_algorithm("bmm", choices, [mat1, mat2], layout)
168
+
169
+
170
+ # Don't register this since it is slower than decomposing it
171
+ # @L.register_lowering(aten.baddbmm)
172
+ def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
173
+ m, n, k, layout, mat1, mat2, inp = mm_args(mat1, mat2, inp, layout=layout)
174
+
175
+ # options to tune from
176
+ choices = (
177
+ [aten_baddbmm.bind((inp, mat1, mat2), layout, alpha=alpha, beta=beta)]
178
+ if use_aten_gemm_kernels()
179
+ else []
180
+ )
181
+ if use_triton_template(layout):
182
+ for config in mm_configs(m, n, k):
183
+ bmm_template.maybe_append_choice(
184
+ choices,
185
+ input_nodes=(inp, mat1, mat2),
186
+ layout=layout,
187
+ **mm_options(config, m, n, k, layout),
188
+ prefix_args=1,
189
+ epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta),
190
+ )
191
+
192
+ return autotune_select_algorithm("baddbmm", choices, [inp, mat1, mat2], layout)
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/conv.py ADDED
@@ -0,0 +1,679 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-decorators
2
+ # mypy: allow-untyped-defs
3
+ from __future__ import annotations
4
+
5
+ import functools
6
+ import logging
7
+ from typing import cast, List, Optional, Sequence, Tuple, TYPE_CHECKING, TypedDict
8
+
9
+ import torch
10
+
11
+ from .. import config, ir
12
+ from ..lowering import (
13
+ add_layout_constraint,
14
+ constrain_to_fx_strides,
15
+ lowerings as L,
16
+ register_lowering,
17
+ )
18
+ from ..select_algorithm import (
19
+ autotune_select_algorithm,
20
+ ExternKernelChoice,
21
+ TritonTemplate,
22
+ )
23
+ from ..utils import (
24
+ ceildiv,
25
+ is_ones,
26
+ is_zeros,
27
+ pad_listlike,
28
+ sympy_product,
29
+ use_triton_template,
30
+ )
31
+ from ..virtualized import V
32
+ from .mm_common import filtered_configs
33
+
34
+
35
+ if TYPE_CHECKING:
36
+ from ..ir import TensorBox
37
+
38
+ log = logging.getLogger(__name__)
39
+
40
+
41
+ aten = torch.ops.aten
42
+
43
+
44
+ def conv2d_grid(n, c, h, w, meta):
45
+ return (
46
+ ceildiv(n * h * w, meta["BLOCK_M"]),
47
+ ceildiv(c, meta["BLOCK_N"]),
48
+ meta["GROUPS"],
49
+ )
50
+
51
+
52
+ def conv3d_grid(n, c, d, h, w, meta):
53
+ return (
54
+ ceildiv(n * d * h * w, meta["BLOCK_M"]),
55
+ ceildiv(c, meta["BLOCK_N"]),
56
+ meta["GROUPS"],
57
+ )
58
+
59
+
60
+ # List of dictionaries to store the kernel configs. Configs that evaluate to true
61
+ # will be utilised on the target platform
62
+ kernel_configs = [
63
+ # "BLOCK_M", "BLOCK_N", "BLOCK_K", "num_stages", "num_warps"
64
+ {"config": (64, 256, 16, 2, 4), "cond": True},
65
+ {"config": (256, 64, 16, 2, 4), "cond": True},
66
+ {"config": (1024, 16, 16, 1, 8), "cond": True},
67
+ {"config": (128, 128, 32, 2, 8), "cond": True},
68
+ {"config": (64, 64, 32, 2, 4), "cond": True},
69
+ {"config": (64, 256, 32, 2, 8), "cond": True},
70
+ {"config": (256, 64, 32, 2, 8), "cond": True},
71
+ ]
72
+
73
+ # Create filtered list of configs based on conv
74
+ platform_configs = tuple(
75
+ cast(Tuple[int, int, int, int, int], config["config"])
76
+ for config in kernel_configs
77
+ if config["cond"]
78
+ )
79
+
80
+ # On ROCm convert num_stages to 1 as pipelining provides no benefit
81
+ if torch.version.hip:
82
+ platform_configs = tuple(
83
+ (config[0], config[1], config[2], 1, config[4]) for config in platform_configs
84
+ )
85
+
86
+ conv_configs = functools.partial(
87
+ filtered_configs,
88
+ configs=platform_configs,
89
+ )
90
+
91
+ LOOP_BODY_2D = """
92
+ idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
93
+ idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
94
+ idx_x_c = tl.arange(0, BLOCK_K) + k
95
+
96
+ x_ptrs = x_base + (
97
+ (idx_x_h * stride_xh)[:, None]
98
+ + (idx_x_w * stride_xw)[:, None]
99
+ + (idx_x_c * stride_xc)[None, :]
100
+ )
101
+ mask_x = (
102
+ (idx_n < BATCH)[:, None]
103
+ & (idx_x_h >= 0)[:, None]
104
+ & (idx_x_h < IN_H)[:, None]
105
+ & (idx_x_w >= 0)[:, None]
106
+ & (idx_x_w < IN_W)[:, None]
107
+ & (idx_x_c < GROUP_IN_C)[None, :]
108
+ )
109
+ matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)
110
+
111
+ w_ptrs = w_base + (
112
+ (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
113
+ )
114
+ mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
115
+ matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
116
+ acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
117
+ """
118
+
119
+ """
120
+ This is a relatively simple conv implementation that can likely be
121
+ improved. Many alternate conv versions can be found here:
122
+ https://github.com/pytorch/torchdynamo/pull/971
123
+ """
124
+ conv2d_template = TritonTemplate(
125
+ name="convolution2d",
126
+ grid=conv2d_grid,
127
+ source=r"""
128
+ {{def_kernel("X", "W")}}
129
+ # Tensor dimensions
130
+ BATCH = {{size("X", 0)}}
131
+ IN_C = {{size("X", 1)}}
132
+ IN_H = {{size("X", 2)}}
133
+ IN_W = {{size("X", 3)}}
134
+ OUT_C = {{size(None, 1)}}
135
+ OUT_H = {{size(None, 2)}}
136
+ OUT_W = {{size(None, 3)}}
137
+
138
+ # Strides:
139
+ stride_xn = {{stride("X", 0)}}
140
+ stride_xc = {{stride("X", 1)}}
141
+ stride_xh = {{stride("X", 2)}}
142
+ stride_xw = {{stride("X", 3)}}
143
+ stride_wc_out = {{stride("W", 0)}}
144
+ stride_wc_in = {{stride("W", 1)}}
145
+ stride_wh = {{stride("W", 2)}}
146
+ stride_ww = {{stride("W", 3)}}
147
+
148
+ nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
149
+ idx_y_w = nhw % OUT_W
150
+ nh = nhw // OUT_W
151
+ idx_y_h = nh % OUT_H
152
+ idx_n = nh // OUT_H
153
+ idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
154
+
155
+ {% if GROUPS == 1 %}
156
+ group = 0
157
+ GROUP_IN_C = IN_C
158
+ GROUP_OUT_C = OUT_C
159
+ {% else %}
160
+ group = tl.program_id(2)
161
+ GROUP_IN_C = IN_C // GROUPS
162
+ GROUP_OUT_C = OUT_C // GROUPS
163
+ {% endif %}
164
+
165
+ x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
166
+ w_base = (
167
+ W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
168
+ )
169
+
170
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
171
+
172
+ {% if UNROLL %}
173
+ {% for i in range(KERNEL_H) %}
174
+ {% for j in range(KERNEL_W) %}
175
+ i = {{i}}
176
+ j = {{j}}
177
+ for k in range(0, GROUP_IN_C, BLOCK_K):
178
+ """
179
+ + LOOP_BODY_2D
180
+ + """
181
+ {% endfor %}
182
+ {% endfor %}
183
+ {% else %}
184
+ # Could be simplified, but slightly slower:
185
+ # for i in range(KERNEL_H):
186
+ # for j in range(KERNEL_W):
187
+ # for k in range(0, GROUP_IN_C, BLOCK_K):
188
+ BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
189
+ for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
190
+ k = (ijk % BLOCK_K_COUNT) * BLOCK_K
191
+ ij = ijk // BLOCK_K_COUNT
192
+ i = ij // KERNEL_W
193
+ j = ij % KERNEL_W
194
+ """
195
+ + LOOP_BODY_2D
196
+ + """
197
+ {% endif %}
198
+
199
+ mask = (
200
+ (idx_n < BATCH)[:, None]
201
+ & (idx_y_h < OUT_H)[:, None]
202
+ & (idx_y_w < OUT_W)[:, None]
203
+ & (idx_y_c < GROUP_OUT_C)[None, :]
204
+ )
205
+ idx_n = idx_n[:, None]
206
+ idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
207
+ idx_h = idx_y_h[:, None]
208
+ idx_w = idx_y_w[:, None]
209
+
210
+ # inductor generates a suffix
211
+ {{store_output(("idx_n", "idx_c", "idx_h", "idx_w"), "acc", "mask")}}
212
+ """,
213
+ )
214
+
215
+ LOOP_BODY_3D = """
216
+ idx_x_d = d - PADDING_D + idx_y_d * STRIDE_D
217
+ idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
218
+ idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
219
+ idx_x_c = tl.arange(0, BLOCK_K) + k
220
+
221
+ x_ptrs = x_base + (
222
+ (idx_x_d * stride_xd)[:, None]
223
+ + (idx_x_h * stride_xh)[:, None]
224
+ + (idx_x_w * stride_xw)[:, None]
225
+ + (idx_x_c * stride_xc)[None, :]
226
+ )
227
+ mask_x = (
228
+ (idx_n < BATCH)[:, None]
229
+ & (idx_x_d >= 0)[:, None]
230
+ & (idx_x_d < IN_D)[:, None]
231
+ & (idx_x_h >= 0)[:, None]
232
+ & (idx_x_h < IN_H)[:, None]
233
+ & (idx_x_w >= 0)[:, None]
234
+ & (idx_x_w < IN_W)[:, None]
235
+ & (idx_x_c < GROUP_IN_C)[None, :]
236
+ )
237
+ matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)
238
+
239
+ w_ptrs = w_base + (
240
+ (idx_x_c * stride_wc_in)[:, None] +
241
+ (d * stride_wd) + (i * stride_wh) + (j * stride_ww)
242
+ )
243
+ mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
244
+ matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
245
+ acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
246
+ """
247
+
248
+ conv3d_template = TritonTemplate(
249
+ name="convolution3d",
250
+ grid=conv3d_grid,
251
+ source=r"""
252
+ {{def_kernel("X", "W")}}
253
+ # Tensor dimensions
254
+ BATCH = {{size("X", 0)}}
255
+ IN_C = {{size("X", 1)}}
256
+ IN_D = {{size("X", 2)}}
257
+ IN_H = {{size("X", 3)}}
258
+ IN_W = {{size("X", 4)}}
259
+ OUT_C = {{size(None, 1)}}
260
+ OUT_D = {{size(None, 2)}}
261
+ OUT_H = {{size(None, 3)}}
262
+ OUT_W = {{size(None, 4)}}
263
+
264
+ # Strides:
265
+ stride_xn = {{stride("X", 0)}}
266
+ stride_xc = {{stride("X", 1)}}
267
+ stride_xd = {{stride("X", 2)}}
268
+ stride_xh = {{stride("X", 3)}}
269
+ stride_xw = {{stride("X", 4)}}
270
+ stride_wc_out = {{stride("W", 0)}}
271
+ stride_wc_in = {{stride("W", 1)}}
272
+ stride_wd = {{stride("W", 2)}}
273
+ stride_wh = {{stride("W", 3)}}
274
+ stride_ww = {{stride("W", 4)}}
275
+
276
+ ndhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
277
+ idx_y_w = ndhw % OUT_W
278
+ ndh = ndhw // OUT_W
279
+ idx_y_h = ndh % OUT_H
280
+ nd = ndh // OUT_H
281
+ idx_y_d = nd % OUT_D
282
+ idx_n = nd // OUT_D
283
+ idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
284
+
285
+ {% if GROUPS == 1 %}
286
+ group = 0
287
+ GROUP_IN_C = IN_C
288
+ GROUP_OUT_C = OUT_C
289
+ {% else %}
290
+ group = tl.program_id(2)
291
+ GROUP_IN_C = IN_C // GROUPS
292
+ GROUP_OUT_C = OUT_C // GROUPS
293
+ {% endif %}
294
+
295
+ x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
296
+ w_base = (
297
+ W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
298
+ )
299
+
300
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
301
+
302
+ {% if UNROLL %}
303
+ {% for d in range(KERNEL_D) %}
304
+ {% for i in range(KERNEL_H) %}
305
+ {% for j in range(KERNEL_W) %}
306
+ d = {{d}}
307
+ i = {{i}}
308
+ j = {{j}}
309
+ for k in range(0, GROUP_IN_C, BLOCK_K):
310
+ """
311
+ + LOOP_BODY_3D
312
+ + """
313
+ {% endfor %}
314
+ {% endfor %}
315
+ {% endfor %}
316
+ {% else %}
317
+ # Could be simplified, but slightly slower:
318
+ # for d in range(KERNEL_D):
319
+ # for i in range(KERNEL_H):
320
+ # for j in range(KERNEL_W):
321
+ # for k in range(0, GROUP_IN_C, BLOCK_K):
322
+ BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
323
+ for dijk in range(KERNEL_D * KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
324
+ k = (dijk % BLOCK_K_COUNT) * BLOCK_K
325
+ dij = dijk // BLOCK_K_COUNT
326
+ j = dij % KERNEL_W
327
+ di = dij // KERNEL_W
328
+ i = di % KERNEL_H
329
+ d = di // KERNEL_H
330
+ """
331
+ + LOOP_BODY_3D
332
+ + """
333
+ {% endif %}
334
+
335
+ mask = (
336
+ (idx_n < BATCH)[:, None]
337
+ & (idx_y_d < OUT_D)[:, None]
338
+ & (idx_y_h < OUT_H)[:, None]
339
+ & (idx_y_w < OUT_W)[:, None]
340
+ & (idx_y_c < GROUP_OUT_C)[None, :]
341
+ )
342
+ idx_n = idx_n[:, None]
343
+ idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
344
+ idx_d = idx_y_d[:, None]
345
+ idx_h = idx_y_h[:, None]
346
+ idx_w = idx_y_w[:, None]
347
+
348
+ # inductor generates a suffix
349
+ {{store_output(("idx_n", "idx_c", "idx_d", "idx_h", "idx_w"), "acc", "mask")}}
350
+ """,
351
+ )
352
+
353
+ aten_convolution = ExternKernelChoice(
354
+ torch.convolution,
355
+ "at::convolution",
356
+ has_out_variant=False,
357
+ op_overload=aten.convolution.default,
358
+ )
359
+
360
+
361
+ def conv1x1_via_mm(x, w, *, out):
362
+ w = torch.squeeze(torch.squeeze(w, -1), -1)
363
+ return torch.matmul(
364
+ x.permute(0, 2, 3, 1), w.permute(1, 0), out=out.permute(0, 2, 3, 1)
365
+ )
366
+
367
+
368
+ aten_conv1x1_via_mm = ExternKernelChoice(conv1x1_via_mm, None)
369
+
370
+
371
+ class ConvLayoutParams(TypedDict):
372
+ stride: tuple[int, ...]
373
+ padding: tuple[int, ...]
374
+ dilation: tuple[int, ...]
375
+ transposed: bool
376
+ output_padding: tuple[int, ...]
377
+ groups: int
378
+
379
+
380
def conv_layout(
    x: TensorBox,
    weight: TensorBox,
    bias: Optional[TensorBox],
    stride: Sequence[int],
    padding: tuple[int, ...],
    dilation: tuple[int, ...],
    transposed: bool,
    output_padding: tuple[int, ...],
    groups: int,
) -> ir.Layout:
    """Determine output layout for a convolution.

    Runs the real aten.convolution under the graph's fake mode to let the
    eager meta/fake implementation decide the output sizes and strides, then
    freezes those into a FixedLayout on x's device/dtype.
    """
    with V.graph.fake_mode:
        output = torch.ops.aten.convolution(
            ir.ir_node_to_tensor(x, guard_shape=True),
            ir.ir_node_to_tensor(weight, guard_shape=True),
            ir.ir_node_to_tensor(bias, guard_shape=True),
            # size_hints: concrete ints are required by the fake call
            V.graph.sizevars.size_hints(stride),  # type: ignore[arg-type]
            V.graph.sizevars.size_hints(padding),  # type: ignore[arg-type]
            V.graph.sizevars.size_hints(dilation),  # type: ignore[arg-type]
            transposed,
            V.graph.sizevars.size_hints(output_padding),  # type: ignore[arg-type]
            groups,
        )
        sizes = ir.convert_shape_to_inductor(output.size())
        # NOTE: rebinds the `stride` parameter with the *output* strides
        stride = ir.convert_shape_to_inductor(output.stride())  # type: ignore[assignment]

    return ir.FixedLayout(
        x.get_device(),
        x.get_dtype(),
        sizes,
        stride,
    )
413
+
414
+
415
def channels_last_order(rank):
    """Return the channels-last fill order for a tensor of the given rank.

    Dimensions are listed from innermost (smallest stride) outwards, with the
    channel dimension (dim 1) moved to the second slot, e.g. rank 4 -> [3, 0, 2, 1].
    """
    dims = list(range(rank - 1, -1, -1))
    channel_dim = dims.pop()
    dims.insert(1, channel_dim)
    return dims
419
+
420
+
421
def convert_1x1_conv_to_mm(x, weight, bias):
    """Lower a 1x1 (all spatial dims == 1) convolution as a matmul.

    Squeezes the singleton kernel dims off `weight`, forces `x` into
    channels-last order, flattens batch+spatial into one matmul dim, runs
    mm/addmm, then reshapes and permutes the result back to NC...-style order.
    """
    # special case for 1x1 convolution, which is actually just a matmul
    rank = len(weight.get_size())
    # drop each trailing singleton spatial dim: (O, I, 1, ..., 1) -> (O, I)
    for _ in range(rank - 2):
        weight = L[aten.squeeze](weight, dim=-1)
    weight = L[aten.permute](weight, [1, 0])

    x = ir.ExternKernel.require_stride_order(x, channels_last_order(rank))
    # move the channel dim (1) to the end: N, C, *spatial -> N, *spatial, C
    x_permute = list(range(rank))
    x_permute.append(x_permute.pop(1))
    x = L[aten.permute](x, x_permute)
    *sizes, in_chan = x.get_size()
    x = L[aten.reshape](x, [sympy_product(sizes), in_chan])
    if bias is None:
        result = L[aten.mm](x, weight)
    else:
        result = L[aten.addmm](bias, x, weight)
    result = L[aten.reshape](result, [*sizes, -1])
    # move the output-channel dim back to position 1
    result_permute = list(range(rank))
    result_permute.insert(1, result_permute.pop(-1))
    return L[aten.permute](result, result_permute)
442
+
443
+
444
@register_lowering(aten.convolution)
def convolution(
    x: TensorBox,
    weight: TensorBox,
    bias: TensorBox,
    stride: List[int],
    padding: List[int],
    dilation: List[int],
    transposed: bool,
    output_padding: List[int],
    groups: int,
):
    """Lowering for aten.convolution.

    Normalizes the arguments, handles the unbatched and 1x1-as-matmul special
    cases, chooses a memory layout, and then builds a list of autotuning
    choices (ATen extern kernel, 1x1-via-mm extern kernel, and the Triton
    conv2d/conv3d templates) which is handed to autotune_select_algorithm.
    """
    stride = tuple(stride)
    padding = tuple(padding)
    dilation = tuple(dilation)
    output_padding = tuple(output_padding)
    if not isinstance(groups, int):
        # groups may arrive as a symbolic value; force it to a concrete int
        groups = V.graph.sizevars.evaluate_static_shape(groups)
    assert isinstance(groups, int)

    # Need use hint for triton template since the template does not
    # work with a dynamic shape.
    #
    # No need to evaluate_static_shape for dilation and output_padding
    # since the template is only used when dilation is 1 and output_padding
    # is 0.
    stride = tuple(V.graph.sizevars.evaluate_static_shapes(stride))
    padding = tuple(V.graph.sizevars.evaluate_static_shapes(padding))

    kwargs: ConvLayoutParams = {
        "stride": stride,
        "padding": padding,
        "dilation": dilation,
        "transposed": transposed,
        "output_padding": output_padding,
        "groups": groups,
    }

    if len(x.get_size()) == len(weight.get_size()) - 1:
        # add batch dimension to simplify rest of function
        return L[aten.squeeze](
            convolution(L[aten.expand](x, [1, *x.get_size()]), weight, bias, **kwargs),
            dim=0,
        )

    out_chan, in_chan, *kernel_shape = V.graph.sizevars.evaluate_static_shapes(
        weight.get_size()
    )
    ndim = len(kernel_shape)
    # broadcast scalar/short stride-like params to one entry per spatial dim
    stride = pad_listlike(stride, ndim)
    padding = pad_listlike(padding, ndim)
    dilation = pad_listlike(dilation, ndim)
    output_padding = pad_listlike(output_padding, ndim)

    def channels_last_conv():
        # True when layout optimization forces channels-last for 2d convs,
        # or when the inferred output layout is already NHWC-ordered.
        if V.graph.layout_opt and ndim == 2:
            return True

        layout = conv_layout(x, weight, None, **kwargs)
        req_stride_order = ir.get_stride_order(
            V.graph.sizevars.size_hints(layout.stride)
        )
        return req_stride_order == ir.NHWC_STRIDE_ORDER

    autotuning_gemm = config.max_autotune or config.max_autotune_gemm

    # 1x1, stride-1, unpadded, undilated, ungrouped, non-transposed conv on a
    # non-empty input is exactly a matmul.
    if (
        (config.conv_1x1_as_mm or (autotuning_gemm and channels_last_conv()))
        and is_ones(kernel_shape)
        and is_ones(stride)
        and is_zeros(padding)
        and is_ones(dilation)
        and not transposed
        and is_zeros(output_padding)
        and groups == 1
        and V.graph.sizevars.statically_known_gt(sympy_product(x.get_size()), 0)
    ):
        return convert_1x1_conv_to_mm(x, weight, bias)

    if bias is not None and ir.get_device_type(x) != "cpu":
        # peel off the bias, cudnn is slower with it
        result = convolution(x, weight, None, **kwargs)
        return L[aten.add](
            result, L[aten.view](bias, [result.get_size()[1]] + ndim * [1])
        )

    x.realize()
    weight.realize()

    # ndim can be 1 for convolution in models such as demucs
    # TODO: check if it's beneficial to convert Conv1d to Conv2d and then
    # apply channels last.
    if V.graph.layout_opt and ndim == 2:
        V.graph.num_channels_last_conv += 1
        x = ir.ExternKernel.require_channels_last(x)
        # TODO maybe we can convert weights to channels last just once before
        # running the model.
        weight = ir.ExternKernel.require_channels_last(weight)
        layout = conv_layout(x, weight, None, **kwargs)
    else:
        layout = conv_layout(x, weight, None, **kwargs)
        req_stride_order = ir.get_stride_order(
            V.graph.sizevars.size_hints(layout.stride)
        )
        x = ir.ExternKernel.require_stride_order(x, req_stride_order)
        weight = ir.ExternKernel.require_stride_order(weight, req_stride_order)

    ordered_kwargs_for_cpp_kernel = [
        "stride",
        "padding",
        "dilation",
        "transposed",
        "output_padding",
        "groups",
    ]
    if bias is None:
        args = [x, weight]
        # the C++ extern kernel takes bias as an explicit (None) kwarg
        kwargs["bias"] = None  # type: ignore[typeddict-unknown-key]
        ordered_kwargs_for_cpp_kernel.insert(0, "bias")
    else:
        args = [x, weight, bias]
        bias.realize()
        bias.freeze_layout()
        V.graph.sizevars.evaluate_static_shapes(bias.get_size())

    choices = []
    if torch._inductor.utils._use_conv_autotune_backend("ATEN"):
        choices = [
            aten_convolution.bind(
                args,
                layout,
                ordered_kwargs_for_cpp_kernel,
                **kwargs,
            )
        ]

    if (
        torch._inductor.utils._use_conv_autotune_backend("TRITON")
        and use_triton_template(layout)
        # templates only support these:
        and is_ones(dilation)
        and not transposed
        and is_zeros(output_padding)
        # there are some odd models where this check fails (e.g. shufflenet_v2_x1_0)
        and V.graph.sizevars.statically_known_equals(in_chan, x.get_size()[1])  # type: ignore[arg-type]
    ):
        if (
            is_ones(kernel_shape)
            and is_ones(stride)
            and is_zeros(padding)
            and groups == 1
        ):
            choices.append(aten_conv1x1_via_mm.bind(args, layout))

        for cfg in conv_configs(
            sympy_product([x.get_size()[0], *x.get_size()[2:]]),
            out_chan,
            in_chan,
        ):
            if ndim == 2:
                conv2d_template.maybe_append_choice(
                    choices,
                    input_nodes=(x, weight),
                    layout=layout,
                    KERNEL_H=kernel_shape[0],
                    KERNEL_W=kernel_shape[1],
                    STRIDE_H=stride[0],
                    STRIDE_W=stride[1],
                    PADDING_H=padding[0],
                    PADDING_W=padding[1],
                    GROUPS=groups,
                    # TODO(jansel): try unroll for bigger kernels once fixed:
                    # https://github.com/openai/triton/issues/1254
                    UNROLL=is_ones(kernel_shape),
                    ALLOW_TF32=torch.backends.cudnn.allow_tf32,
                    num_stages=cfg.num_stages,
                    num_warps=cfg.num_warps,
                    **cfg.kwargs,
                )
            elif ndim == 3:
                conv3d_template.maybe_append_choice(
                    choices,
                    input_nodes=(x, weight),
                    layout=layout,
                    KERNEL_D=kernel_shape[0],
                    KERNEL_H=kernel_shape[1],
                    KERNEL_W=kernel_shape[2],
                    STRIDE_D=stride[0],
                    STRIDE_H=stride[1],
                    STRIDE_W=stride[2],
                    PADDING_D=padding[0],
                    PADDING_H=padding[1],
                    PADDING_W=padding[2],
                    GROUPS=groups,
                    # TODO(jansel): try unroll for bigger kernels once fixed:
                    # https://github.com/openai/triton/issues/1254
                    UNROLL=is_ones(kernel_shape),
                    ALLOW_TF32=torch.backends.cudnn.allow_tf32,
                    num_stages=cfg.num_stages,
                    num_warps=cfg.num_warps,
                    **cfg.kwargs,
                )

    return autotune_select_algorithm("convolution", choices, args, layout)
648
+
649
+
650
@register_lowering(aten._convolution)
def _convolution(
    x,
    weight,
    bias,
    stride,
    padding,
    dilation,
    transposed,
    output_padding,
    groups,
    benchmark,
    deterministic,
    cudnn_enabled,
    allow_tf32,
):
    """Lowering for aten._convolution: delegate to the aten.convolution lowering.

    The trailing backend hints (benchmark, deterministic, cudnn_enabled,
    allow_tf32) are accepted for schema compatibility but intentionally ignored.
    """
    return convolution(
        x, weight, bias, stride, padding, dilation, transposed, output_padding, groups
    )
669
+
670
+
671
def constrain_conv_to_fx_strides(fx_node, *args, **kwargs):
    """Layout constraint for aten.convolution nodes.

    When layout optimization is on, inductor picks its own strides, so the
    args pass through untouched; otherwise fall back to the generic
    constrain-to-fx-strides behavior.
    """
    assert fx_node.target == torch.ops.aten.convolution.default
    if not V.graph.layout_opt:
        return constrain_to_fx_strides(fx_node, *args, **kwargs)
    return args, kwargs
677
+
678
+
679
+ add_layout_constraint(aten.convolution, constrain_conv_to_fx_strides)
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_attention.py ADDED
@@ -0,0 +1,1843 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ """ Triton Implementation of the flex_attention Kernel"""
3
+
4
+ import logging
5
+ import math
6
+ from typing import Any, List, Optional, Sequence, Tuple
7
+
8
+ import sympy
9
+
10
+ import torch
11
+ from torch._inductor.virtualized import V
12
+ from torch.utils._pytree import tree_map
13
+
14
+ from .. import config
15
+ from ..ir import (
16
+ ComputedBuffer,
17
+ ExternKernel,
18
+ FixedLayout,
19
+ FlexibleLayout,
20
+ get_stride_order,
21
+ InputBuffer,
22
+ IRNode,
23
+ StorageBox,
24
+ stride_order2fill_order,
25
+ Subgraph,
26
+ TensorBox,
27
+ )
28
+ from ..lowering import empty, empty_strided, lowerings, register_lowering
29
+ from ..select_algorithm import autotune_select_algorithm, realize_inputs, TritonTemplate
30
+
31
+
32
# Module-level logger and short aliases used throughout this file.
log = logging.getLogger(__name__)
aten = torch.ops.aten
Expr = sympy.Expr
35
+
36
+
37
def construct_strides(
    sizes: Sequence[int],
    fill_order: Sequence[int],
) -> Sequence[int]:
    """Build strides for a tensor of *sizes* laid out per *fill_order*.

    *fill_order* lists dimension indices from innermost (stride 1) to
    outermost; each dimension's stride is the product of the sizes of all
    dimensions filled before it.
    """
    assert len(sizes) == len(
        fill_order
    ), "Length of sizes must match the length of the fill order"
    strides = [0] * len(sizes)
    running = 1
    for dim in fill_order:
        strides[dim], running = running, running * sizes[dim]
    return strides
57
+
58
+
59
def flex_attention_grid(batch_size, q_heads, num_queries, d_model, meta):
    """How is this kernel parallelized?

    We create a grid of (ceil_div(n_queries, query_block_size),
    batch_size * num_heads, 1).  Each block is responsible for iterating over
    blocks of keys and values, calculating the final attention output.
    """
    import triton

    return (triton.cdiv(num_queries, meta["BLOCK_M"]), batch_size * q_heads, 1)
68
+
69
+
70
def create_placeholder(
    name: str, dtype: torch.dtype, device: torch.device
) -> TensorBox:
    """Create a scalar (0-d) placeholder input buffer for producing subgraph output."""
    # Empty size/stride lists: the placeholder stands in for a scalar value.
    input_buffer = InputBuffer(name, FixedLayout(device, dtype, [], []))
    return TensorBox.create(input_buffer)
76
+
77
+
78
def maybe_realize(args: List[Optional[IRNode]]):
    """Accepts a list of optional IRNodes and returns a list of realized IRNodes.

    None entries are preserved as-is; tree_map keeps the container structure.
    """
    return tree_map(lambda x: realize_inputs(x) if x is not None else None, args)
81
+
82
+
83
def get_float32_precision():
    """Return the Triton input_precision literal for fp32 matmuls.

    'ieee' when full precision is requested (or on ROCm builds, where
    torch.version.hip is truthy); otherwise 'tf32'.  The value is a quoted
    string because it is substituted into generated kernel source.
    """
    full_precision = (
        torch.get_float32_matmul_precision() == "highest" or torch.version.hip
    )
    return "'ieee'" if full_precision else "'tf32'"
88
+
89
+
90
def build_subgraph_buffer(
    args: List[TensorBox],
    subgraph: Subgraph,
):
    """This function's goal is to take in the required args and produce the subgraph buffer.

    The subgraph buffer is a ComputedBuffer that will be inlined into the
    triton template.

    Args:
        args: The args that are passed into the subgraph. Contains both fixed
            and lifted inputs.
        subgraph: The Subgraph ir for which to produce the output node.

    Returns:
        A ComputedBuffer (or a pytree of them mirroring the subgraph's output
        structure); None entries in the output map to None.

    Raises:
        ValueError: if the subgraph's fx graph contains no output node.
    """
    cnt = 0
    env = {}
    for node in subgraph.graph_module.graph.nodes:
        # There are two classes of placeholder inputs that we need
        # to handle differently. For the first n_scalar_inps inputs
        # we expect that these placeholders were generated by the make_fx call
        # in the flex Attention HOP. So we need to create a new placeholder
        # TensorBox for each of these inputs. For the rest of the inputs we
        # expect that these are lifted inputs that fill up the '*other_buffers'
        # tuple and already have corresponding TensorBoxes passed in as args.
        if node.op == "placeholder":
            env[node] = args[cnt]
            cnt += 1
        elif node.op == "call_function":
            # For call_function we use the default lowerings and pass in the
            # already created TensorBoxes as args.
            # NOTE(review): this rebinding shadows the `args` parameter; safe
            # only because fx places all placeholder nodes first in graph order.
            args, kwargs = tree_map(
                lambda x: env[x] if x in env else x, (node.args, node.kwargs)
            )
            env[node] = lowerings[node.target](*args, **kwargs)
        elif node.op == "output":

            def convert_output_node_to_buffer(output):
                # Wrap one realized output TensorBox in a ComputedBuffer with
                # a FlexibleLayout so the template can inline it.
                if output is None:
                    return None
                output_node = output
                output_buffer = env[output_node]
                assert isinstance(output_buffer, TensorBox), (
                    "The output node for flex attention's subgraph must be a TensorBox, but got: ",
                    type(output_buffer),
                )
                assert isinstance(output_buffer.data, StorageBox), (
                    "The output node for the flex attention subgraph must be a StorageBox, but got: ",
                    type(output_buffer),
                )
                subgraph_buffer = ComputedBuffer(
                    name=None,
                    layout=FlexibleLayout(
                        device=output_buffer.data.get_device(),
                        dtype=output_buffer.data.get_dtype(),
                        size=output_buffer.data.get_size(),
                    ),
                    data=output_buffer.data.data,  # type: ignore[arg-type]
                )
                return subgraph_buffer

            # node.args[0] is either a single element or a list of elements
            # representing all outputs of the function.
            return tree_map(convert_output_node_to_buffer, node.args[0])

    raise ValueError("FlexAttention was passed a subgraph with no output node!")
153
+
154
+
155
# Inner Triton functions shared by flex_attention & split-k decoding kernels.
# get_offset_for_next_block computes how far to advance the K/V block pointers:
# within a sparse block it steps by BLOCK; when crossing into the next sparse
# block it jumps by the distance between consecutive entries of col_indices.
compute_next_offset_func = r"""
@triton.jit
def get_offset_for_next_block(loop_iter, col_indices, total_blocks, SPARSE_BLOCK, SPARSE_BLOCK_MULTIPLE, BLOCK):
    cur_block_idx = loop_iter // SPARSE_BLOCK_MULTIPLE
    cur_block = tl.load(col_indices + cur_block_idx, eviction_policy="evict_last")
    next_block = tl.load(col_indices + cur_block_idx + 1, eviction_policy="evict_last", mask=cur_block_idx + 1 < total_blocks)
    needs_jump = (loop_iter + 1) % SPARSE_BLOCK_MULTIPLE == 0
    jump_to_block = (next_block - cur_block ) * SPARSE_BLOCK - (SPARSE_BLOCK_MULTIPLE - 1) * BLOCK

    offset = jump_to_block * needs_jump + (1 - needs_jump) * BLOCK
    return offset
"""
168
+
169
+ compute_flex_attention = r"""
170
+ {{def_kernel("Q", "K", "V", "LSE", "KV_NUM_BLKS", "KV_IDX", "FULL_KV_NUM_BLKS", "FULL_KV_IDX")}}
171
+ # Sub notation for this kernel:
172
+ #
173
+ # Q: Query, K: Key, V: Value
174
+ # M: Number of queries, N: Number of keys/values, D: Model dimension
175
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
176
+ # V_HEAD_DIM: The dimension of the value embeddings
177
+ # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head
178
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
179
+ #
180
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
181
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
182
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
183
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
184
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
185
+ #
186
+ # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad
187
+ #
188
+ # (Modifiable) Performance tuning options
189
+ # BLOCK_M: The thread block size across the seqlen dim of Q.
190
+ # BLOCK_N: Iterate over BLOCK_N across the seqlen dim of K/V in each thread block.
191
+
192
+ # The below are kernel options that can be applied for certain score_mods,
193
+ # or involve a numerics vs. perf tradeoff
194
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
195
+ # about 20% more numerical error, but slightly faster.
196
+ # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row
197
+ # is not masked out? If so, we can skip an extra safety check
198
+
199
+ tl.static_assert(SPARSE_Q_BLOCK_SIZE >= BLOCK_M and SPARSE_Q_BLOCK_SIZE % BLOCK_M == 0)
200
+ tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0)
201
+
202
+ # Define strides of inputs
203
+ stride_qz, stride_qh, stride_qm, stride_qk = {{stride("Q")}}
204
+ stride_kz, stride_kh, stride_kn, stride_kk = {{stride("K")}}
205
+ stride_vz, stride_vh, stride_vn, stride_vk = {{stride("V")}}
206
+
207
+ Z = {{size("Q", 0)}}
208
+ HQ = {{size("Q", 1)}}
209
+ Q_LEN = {{size("Q", 2)}}
210
+ KV_LEN = {{size("K", 2)}}
211
+
212
+ MATMUL_PRECISION = Q.dtype.element_ty
213
+
214
+ q_start = tl.program_id(0)
215
+ off_z = tl.program_id(1) // HQ
216
+ off_hq = tl.program_id(1) % HQ
217
+ off_hkv = off_hq // GQA_SHARED_HEADS
218
+ off_g = off_hq % GQA_SHARED_HEADS
219
+
220
+ q_offset = off_z * stride_qz + off_hq * stride_qh
221
+ k_offset = off_z * stride_kz + off_hkv * stride_kh
222
+ v_offset = off_z * stride_vz + off_hkv * stride_vh
223
+
224
+ Q = Q + q_offset
225
+ K = K + k_offset
226
+ V = V + v_offset
227
+
228
+ SPARSE_Z = {{size("KV_NUM_BLKS", 0)}}
229
+ SPARSE_HQ = {{size("KV_NUM_BLKS", 1)}}
230
+
231
+ sparse_idx_z = off_z % SPARSE_Z
232
+ sparse_idx_hq = off_hq % SPARSE_HQ
233
+
234
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M)
235
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
236
+
237
+ SPARSE_Q_BLOCK_CNT: tl.constexpr = tl.cdiv(Q_LEN, SPARSE_Q_BLOCK_SIZE)
238
+ SPARSE_KV_BLOCK_CNT: tl.constexpr = tl.cdiv(KV_LEN, SPARSE_KV_BLOCK_SIZE)
239
+
240
+ # initialize pointer to m and l
241
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
242
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
243
+ acc = tl.zeros([BLOCK_M, V_HEAD_DIM], dtype=tl.float32)
244
+
245
+ offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
246
+
247
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
248
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq
249
+ sparse_kv_num_blks_offset = sparse_hz_offset * SPARSE_Q_BLOCK_CNT + q_start // SPARSE_Q_MULTIPLE
250
+ sparse_kv_idx_offset = sparse_hz_offset * SPARSE_Q_BLOCK_CNT * SPARSE_KV_BLOCK_CNT + (q_start // SPARSE_Q_MULTIPLE) * SPARSE_KV_BLOCK_CNT # noqa: B950
251
+
252
+ Q_block_ptr = tl.make_block_ptr(
253
+ base=Q,
254
+ shape=(Q_LEN, QK_HEAD_DIM),
255
+ strides=(stride_qm, stride_qk),
256
+ offsets=(q_start * BLOCK_M, 0),
257
+ block_shape=(BLOCK_M, QK_HEAD_DIM),
258
+ order=(1, 0)
259
+ )
260
+
261
+ # load q: it stays in SRAM throughout the inner loop.
262
+ if IS_DIVISIBLE:
263
+ q = tl.load(Q_block_ptr)
264
+ else:
265
+ # boundary check is not free, so we only do it when necessary.
266
+ q = tl.load(Q_block_ptr, boundary_check=(0,), padding_option = "zero")
267
+
268
+ # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
269
+ # We don't know anything "special" about these blocks, so we need to apply
270
+ # both score_mod and mask_mod to it
271
+ kv_indices = KV_IDX + sparse_kv_idx_offset
272
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
273
+ kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
274
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
275
+
276
+ K_block_ptr = tl.make_block_ptr(
277
+ base=K,
278
+ shape=(QK_HEAD_DIM, KV_LEN),
279
+ strides=(stride_kk, stride_kn),
280
+ offsets=(0, kv_start),
281
+ block_shape=(QK_HEAD_DIM, BLOCK_N),
282
+ order=(0, 1)
283
+ )
284
+ V_block_ptr = tl.make_block_ptr(
285
+ base=V,
286
+ shape=(KV_LEN, V_HEAD_DIM),
287
+ strides=(stride_vn, stride_vk),
288
+ offsets=(kv_start, 0),
289
+ block_shape=(BLOCK_N, V_HEAD_DIM),
290
+ order=(1, 0)
291
+ )
292
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
293
+
294
+ acc, l_i, m_i = forward_inner(
295
+ {{gen_argdefs()}},
296
+ q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN,
297
+ acc, l_i, m_i,
298
+ off_z, off_hq, offs_m[:, None], offs_n[None, :],
299
+ kv_indices, kv_num_blocks,
300
+ 0, block_n_end,
301
+ MATMUL_PRECISION,
302
+ IS_FULL_BLOCKS=False,
303
+ )
304
+
305
+ # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
306
+ # We know these blocks are guaranteed to be "full", so we don't need to
307
+ # apply mask_mod to them - only score_mod
308
+ if HAS_FULL_BLOCKS:
309
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
310
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
311
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
312
+ kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
313
+ block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
314
+
315
+ K_block_ptr = tl.make_block_ptr(
316
+ base=K,
317
+ shape=(QK_HEAD_DIM, KV_LEN),
318
+ strides=(stride_kk, stride_kn),
319
+ offsets=(0, kv_start),
320
+ block_shape=(QK_HEAD_DIM, BLOCK_N),
321
+ order=(0, 1)
322
+ )
323
+ V_block_ptr = tl.make_block_ptr(
324
+ base=V,
325
+ shape=(KV_LEN, V_HEAD_DIM),
326
+ strides=(stride_vn, stride_vk),
327
+ offsets=(kv_start, 0),
328
+ block_shape=(BLOCK_N, V_HEAD_DIM),
329
+ order=(1, 0)
330
+ )
331
+ offs_n = kv_start + tl.arange(0, BLOCK_N)
332
+
333
+ acc, l_i, m_i = forward_inner(
334
+ {{gen_argdefs()}},
335
+ q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN,
336
+ acc, l_i, m_i,
337
+ off_z, off_hq, offs_m[:, None], offs_n[None, :],
338
+ kv_indices, kv_num_blocks,
339
+ 0, block_n_end,
340
+ MATMUL_PRECISION,
341
+ IS_FULL_BLOCKS=True,
342
+ )
343
+
344
+
345
+ # [Note] Handle fully masked out rows:
346
+ # Li will be the sum(e^(-inf)) == 0.0 for masked out rows, mi will be -inf.
347
+ # We set Li to 1.0 which will result in lse/out = 0.0 | after the log(li) + mi(0.0) step
348
+ l_i = tl.where(l_i == 0.0, 1, l_i)
349
+
350
+ acc = acc / l_i[:, None]
351
+ idx_z = tl.program_id(1) // HQ
352
+ idx_hq = tl.program_id(1) % HQ
353
+ idx_m = offs_m[:, None]
354
+ idx_d = tl.arange(0, V_HEAD_DIM)[None, :]
355
+
356
+ mask = idx_m < Q_LEN
357
+ # TODO generalize and add proper mask support
358
+ {{store_output(("idx_z", "idx_hq", "idx_m", "idx_d"), "acc", "mask")}}
359
+
360
+ # TODO dont want to write this if we dont require grad
361
+ if OUTPUT_LOGSUMEXP:
362
+ off_hz = tl.program_id(1)
363
+ l_ptrs = LSE + off_hz * Q_LEN + offs_m
364
+ lse = m_i + tl.math.log2(l_i)
365
+ if IS_DIVISIBLE:
366
+ tl.store(l_ptrs, lse)
367
+ else:
368
+ tl.store(l_ptrs, lse, mask=offs_m < Q_LEN)
369
+ """
370
+
371
+
372
+ compute_forward_inner = r"""
373
+ @triton.jit
374
+ def forward_inner(
375
+ {{gen_argdefs()}},
376
+ q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN,
377
+ # accumulated values
378
+ acc, l_i, m_i,
379
+ # Offsets used as inputs to score_mod & mask_mod
380
+ # of size [BLOCK_M, BLOCK_N] or scalar.
381
+ off_z, off_h, offs_m, offs_n,
382
+ # blocksparse data
383
+ kv_indices, kv_num_blocks,
384
+ # start kv and end kv block
385
+ block_n_start, block_n_end,
386
+ MATMUL_PRECISION,
387
+ IS_FULL_BLOCKS,
388
+ ):
389
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
390
+ {{gen_defines() | indent_except_first(1)}}
391
+
392
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
393
+ RCP_LN2: tl.constexpr = 1.44269504
394
+
395
+ if PRESCALE_QK:
396
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
397
+
398
+ # loop over k, v and update accumulator until block_n_end
399
+ for start_n in range(block_n_start, block_n_end):
400
+ if IS_DIVISIBLE:
401
+ acc, l_i, m_i = forward_block_mn(
402
+ {{gen_argdefs()}},
403
+ q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN,
404
+ # accumulated values
405
+ acc, l_i, m_i,
406
+ # Offsets
407
+ off_z, off_h, offs_m, offs_n,
408
+ MATMUL_PRECISION, RCP_LN2,
409
+ IS_FULL_BLOCKS,
410
+ )
411
+ else:
412
+ # Benchmark shows even we applied mod & mask to each block for non divisible seqlen,
413
+ # it's on par or slightly faster than only applying to the last block in fwd.
414
+ # However, we choose different strategy for bwd, where we only apply mod & mask
415
+ # to the last block because it's faster a lot.
416
+ acc, l_i, m_i = forward_block_mn(
417
+ {{gen_argdefs()}},
418
+ q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN,
419
+ # accumulated values
420
+ acc, l_i, m_i,
421
+ # Offsets
422
+ off_z, off_h, offs_m, offs_n,
423
+ MATMUL_PRECISION, RCP_LN2,
424
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
425
+ )
426
+
427
+ # update pointers
428
+ offset = get_offset_for_next_block(
429
+ start_n, kv_indices, kv_num_blocks,
430
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N
431
+ )
432
+
433
+ V_block_ptr = tl.advance(V_block_ptr, (offset, 0))
434
+ K_block_ptr = tl.advance(K_block_ptr, (0, offset))
435
+
436
+ offs_n = offs_n + offset
437
+
438
+ return acc, l_i, m_i
439
+
440
+ """
441
+
442
+
443
+ compute_forward_block_mn = r"""
444
+ @triton.jit
445
+ def forward_block_mn(
446
+ {{gen_argdefs()}},
447
+ q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN,
448
+ # accumulated values
449
+ acc, l_i, m_i,
450
+ # Offsets
451
+ off_z, off_h, offs_m, offs_n,
452
+ MATMUL_PRECISION, RCP_LN2,
453
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,
454
+ ):
455
+ # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
456
+ {{gen_defines() | indent_except_first(1)}}
457
+
458
+ # -- load k --
459
+ if IS_DIVISIBLE:
460
+ k = tl.load(K_block_ptr)
461
+ else:
462
+ k = tl.load(K_block_ptr, boundary_check=(1,), padding_option = "zero")
463
+ # -- compute qk ---
464
+ qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
465
+ if not PRESCALE_QK:
466
+ qk *= SM_SCALE
467
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
468
+ if CHECK_BLOCK_BOUNDARY:
469
+ # If this is the last block of a non divisible seqlen, we still need to load [BLOCK_M, BLOCK_N] elements,
470
+ # which is larger than the actual number of elements. To avoid access memory out of bound,
471
+ # we need to mask out the elements that are out of Q_LEN & KV_LEN.
472
+ m = offs_m % Q_LEN
473
+ n = offs_n % KV_LEN
474
+ else:
475
+ m = offs_m
476
+ n = offs_n
477
+
478
+ {{ modification(
479
+ subgraph_number=0,
480
+ output_name="post_mod_scores",
481
+ score="qk",
482
+ b="off_z",
483
+ h="off_h",
484
+ m="m",
485
+ n="n",
486
+ out="qk"
487
+ ) | indent_except_first(1) }}
488
+
489
+ if CHECK_BLOCK_BOUNDARY:
490
+ # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
491
+ post_mod_scores = tl.where(offs_n < KV_LEN, post_mod_scores, float("-inf"))
492
+
493
+ if not IS_FULL_BLOCKS:
494
+ {{ modification(
495
+ subgraph_number=1,
496
+ output_name="mask_mod_output",
497
+ score="qk",
498
+ b="off_z",
499
+ h="off_h",
500
+ m="m",
501
+ n="n",
502
+ ) | indent_except_first(2) }}
503
+
504
+ if CHECK_BLOCK_BOUNDARY:
505
+ mask_mod_output = tl.where(offs_n < KV_LEN, mask_mod_output, float("-inf"))
506
+ # apply mask for partially unmasked blocks
507
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
508
+
509
+ # TODO: In the case that score_mod is linear, this can be LICMed
510
+ if not PRESCALE_QK:
511
+ post_mod_scores *= RCP_LN2
512
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
513
+
514
+ # -- compute scaling constant ---
515
+ m_ij = tl.maximum(m_i, tl.max(post_mod_scores, 1))
516
+ if not ROWS_GUARANTEED_SAFE:
517
+ masked_out_rows = (m_ij == float("-inf"))
518
+ m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
519
+ else:
520
+ m_ij_masked = m_ij
521
+
522
+ alpha = tl.math.exp2(m_i - m_ij_masked)
523
+ p = tl.math.exp2(post_mod_scores - m_ij_masked[:, None])
524
+
525
+ # NB: l_i update is pulled up here since it's a bit faster
526
+ # NB: For headdim=256, it's faster to move it back down to after m_i =
527
+ # m_ij
528
+ l_i = l_i * alpha + tl.sum(p, 1)
529
+ # # -- scale and update acc --
530
+ acc = acc * alpha[:, None]
531
+
532
+ if IS_DIVISIBLE:
533
+ v = tl.load(V_block_ptr)
534
+ else:
535
+ v = tl.load(V_block_ptr, boundary_check=(0,), padding_option = "zero")
536
+ acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION)
537
+
538
+ # -- update m_i
539
+ m_i = m_ij
540
+
541
+ return acc, l_i, m_i
542
+
543
+ """
544
+
545
+
546
+ flex_attention_template = TritonTemplate(
547
+ name="flex_attention",
548
+ grid=flex_attention_grid,
549
+ source=compute_flex_attention
550
+ + compute_forward_inner
551
+ + compute_next_offset_func
552
+ + compute_forward_block_mn,
553
+ )
554
+
555
+
556
+ def _use_flex_decoding(query, kernel_options):
557
+ # Decide which kernel to use, return true if use flex decoding kernel.
558
+ return (
559
+ not kernel_options.get("FORCE_USE_FLEX_ATTENTION", False)
560
+ ) and V.graph.sizevars.evaluate_expr(sympy.Lt(query.get_size()[-2], 128))
561
+
562
+
563
+ _h100_default_config = {
564
+ (torch.float32, 64): (128, 32, 4, 3),
565
+ (torch.float32, 128): (32, 64, 4, 3),
566
+ (torch.float32, 256): (32, 32, 4, 3),
567
+ (torch.bfloat16, 64): (128, 128, 4, 3),
568
+ (torch.bfloat16, 128): (128, 64, 8, 3),
569
+ (torch.bfloat16, 256): (64, 32, 4, 3),
570
+ (torch.float16, 64): (128, 128, 4, 3),
571
+ (torch.float16, 128): (128, 128, 8, 3),
572
+ (torch.float16, 256): (64, 32, 4, 3),
573
+ }
574
+
575
+ _a100_default_config = {
576
+ (torch.float32, 64): (128, 32, 4, 3),
577
+ (torch.float32, 128): (128, 32, 4, 3),
578
+ (torch.float32, 256): (64, 16, 4, 3),
579
+ (torch.bfloat16, 64): (128, 64, 4, 3),
580
+ (torch.bfloat16, 128): (128, 64, 8, 3),
581
+ (torch.bfloat16, 256): (32, 64, 4, 3),
582
+ (torch.float16, 64): (128, 64, 4, 3),
583
+ (torch.float16, 128): (128, 64, 8, 3),
584
+ (torch.float16, 256): (32, 64, 4, 3),
585
+ }
586
+
587
+
588
+ def _get_default_config_fwd(query) -> Tuple[int, int, int, int]:
589
+ dtype = query.get_dtype()
590
+ head_dim = query.get_size()[-1]
591
+ default_config = None
592
+
593
+ if head_dim <= 256 and torch.cuda.get_device_capability() >= (9, 0): # H100
594
+ if dtype == torch.float32:
595
+ default_config = (64, 64, 4, 3)
596
+ else:
597
+ default_config = (128, 64, 4, 3)
598
+ default_config = _h100_default_config.get((dtype, head_dim), default_config)
599
+ elif head_dim <= 256 and torch.cuda.get_device_capability() >= (8, 0): # A100
600
+ if dtype == torch.float32:
601
+ default_config = (64, 64, 4, 3)
602
+ else:
603
+ default_config = (128, 64, 4, 3)
604
+ default_config = _a100_default_config.get((dtype, head_dim), default_config)
605
+ else: # modest hardware or extremely large head_dim
606
+ if dtype == torch.float32:
607
+ default_config = (32, 16, 4, 3)
608
+ else:
609
+ default_config = (64, 32, 4, 3)
610
+
611
+ return default_config
612
+
613
+
614
+ def _get_default_config_bwd(query) -> Tuple[int, int, int, int]:
615
+ head_dim = query.get_size()[-1]
616
+ dtype = query.get_dtype()
617
+
618
+ if dtype == torch.float32:
619
+ return (16, 16, 4, 1)
620
+ if head_dim <= 256 and torch.cuda.get_device_capability() >= (9, 0): # H100
621
+ if head_dim == 64:
622
+ return (64, 64, 4, 3)
623
+ elif head_dim == 128:
624
+ return (64, 128, 8, 3)
625
+ else:
626
+ return (64, 64, 4, 2)
627
+ elif torch.cuda.get_device_capability() >= (8, 0): # A100
628
+ if head_dim == 64:
629
+ return (32, 128, 4, 3)
630
+ elif head_dim == 128:
631
+ return (64, 128, 8, 3)
632
+ else:
633
+ return (64, 64, 4, 2)
634
+ else: # modest hardware or extremely large head_dim
635
+ return (16, 16, 4, 1)
636
+
637
+
638
+ def create_num_blocks_fake_generator(sparse_indices):
639
+ # The idea here is that we need to create a real tensor with real data
640
+ # that's representative for benchmarking.
641
+ # For example, returning all zeros for the `kv_num_blocks` input would mean
642
+ # that we are computing 0 blocks for each row, which would provide bogus
643
+ # autotuning results.
644
+ #
645
+ # In this case, we choose to use min(16, max_block) blocks, because I
646
+ # (Horace) think it'll probably result in pretty representative performance.
647
+ # If it's too short then prefetching won't help. If it's too long then
648
+ # autotuning will take longer for no good reason.
649
+ def create_num_blocks_fake(x) -> torch.Tensor:
650
+ num_blocks_for_autotuning = min(16, sparse_indices.shape[-1])
651
+ return torch.full(
652
+ x.get_size(),
653
+ int(num_blocks_for_autotuning),
654
+ dtype=x.get_dtype(),
655
+ device=x.get_device(),
656
+ )
657
+
658
+ return create_num_blocks_fake
659
+
660
+
661
+ def create_indices_fake(x) -> torch.Tensor:
662
+ indices = torch.arange(
663
+ 0, int(x.get_size()[-1]), dtype=x.get_dtype(), device=x.get_device()
664
+ )
665
+ indices = indices.expand(x.get_size()).contiguous()
666
+ return indices
667
+
668
+
669
+ from torch._inductor.kernel.flex_decoding import create_flex_decoding_kernel
670
+
671
+
672
+ # TODO: We probably also need a layout constraint?
673
+ @register_lowering(torch.ops.higher_order.flex_attention, type_promotion_kind=None)
674
+ def flex_attention(
675
+ query,
676
+ key,
677
+ value,
678
+ subgraph,
679
+ block_mask,
680
+ scale,
681
+ kernel_options,
682
+ score_mod_other_buffers,
683
+ mask_mod_other_buffers,
684
+ ):
685
+ (
686
+ kv_num_blocks,
687
+ kv_indices,
688
+ full_kv_num_blocks,
689
+ full_kv_indices,
690
+ q_num_blocks,
691
+ q_indices,
692
+ full_q_num_blocks,
693
+ full_q_indices,
694
+ SPARSE_KV_BLOCK_SIZE,
695
+ SPARSE_Q_BLOCK_SIZE,
696
+ mask_graph,
697
+ ) = block_mask
698
+ placeholder_inps = [
699
+ create_placeholder(name, dtype, query.get_device())
700
+ for name, dtype in [
701
+ ("score", query.get_dtype()),
702
+ ("b", torch.int32),
703
+ ("h", torch.int32),
704
+ ("m", torch.int32),
705
+ ("n", torch.int32),
706
+ ]
707
+ ]
708
+ subgraph_buffer = build_subgraph_buffer(
709
+ placeholder_inps + list(score_mod_other_buffers), subgraph
710
+ )
711
+ mask_graph_placeholder_inps = [
712
+ create_placeholder(name, dtype, query.get_device())
713
+ for name, dtype in [
714
+ ("b", torch.int32),
715
+ ("h", torch.int32),
716
+ ("m", torch.int32),
717
+ ("n", torch.int32),
718
+ ]
719
+ ]
720
+ mask_graph_buffer = build_subgraph_buffer(
721
+ mask_graph_placeholder_inps + list(mask_mod_other_buffers), mask_graph
722
+ )
723
+ kernel_options = dict(kernel_options)
724
+ kernel_options.setdefault("FLOAT32_PRECISION", get_float32_precision())
725
+ if _use_flex_decoding(query, kernel_options):
726
+ return create_flex_decoding_kernel(
727
+ query,
728
+ key,
729
+ value,
730
+ block_mask,
731
+ scale,
732
+ kernel_options,
733
+ subgraph_buffer,
734
+ mask_graph_buffer,
735
+ score_mod_other_buffers,
736
+ mask_mod_other_buffers,
737
+ )
738
+
739
+ (
740
+ query,
741
+ key,
742
+ value,
743
+ kv_num_blocks,
744
+ kv_indices,
745
+ full_kv_num_blocks,
746
+ full_kv_indices,
747
+ q_num_blocks,
748
+ q_indices,
749
+ full_q_num_blocks,
750
+ full_q_indices,
751
+ ) = maybe_realize(
752
+ [
753
+ query,
754
+ key,
755
+ value,
756
+ kv_num_blocks,
757
+ kv_indices,
758
+ full_kv_num_blocks,
759
+ full_kv_indices,
760
+ q_num_blocks,
761
+ q_indices,
762
+ full_q_num_blocks,
763
+ full_q_indices,
764
+ ]
765
+ )
766
+
767
+ Bq, Hq, seq_len_q, qk_head_dim = query.get_size()
768
+ Bkv, Hkv, seq_len_kv, v_head_dim = value.get_size()
769
+ assert Bq == Bkv, "Batch dimension must match"
770
+ B = Bq
771
+
772
+ if seq_len_q % 128 != 0 or seq_len_kv % 128 != 0:
773
+ kernel_options.setdefault("IS_DIVISIBLE", False)
774
+ else:
775
+ kernel_options.setdefault("IS_DIVISIBLE", True)
776
+
777
+ # Reuse query strides for output layout despite different last dimension.
778
+ # This works because only the last dim differs and we check it is contiguous.
779
+ q_strides = query.get_stride()
780
+ assert q_strides[-1] == 1, "Query must be contiguous in the last dimension"
781
+
782
+ # Construct output layout with strides matching the query.
783
+ out_size = [B, Hq, seq_len_q, v_head_dim]
784
+ stride_order = get_stride_order(query.get_stride())
785
+ fill_order = stride_order2fill_order(stride_order)
786
+ out_strides = construct_strides(out_size, fill_order)
787
+
788
+ layout = FixedLayout(
789
+ query.get_device(),
790
+ query.get_dtype(),
791
+ [B, Hq, seq_len_q, v_head_dim],
792
+ stride=out_strides,
793
+ )
794
+ # see NOTE:[TritonTemplates with multiple outputs]
795
+ logsumexp_shape = [B, Hq, seq_len_q]
796
+ logsumexp = empty_strided(
797
+ logsumexp_shape,
798
+ None,
799
+ dtype=torch.float32, # The logsumexp is always stored in fp32 regardless of the input dtype
800
+ device=query.get_device(),
801
+ )
802
+ kernel_options.setdefault("SM_SCALE", scale)
803
+
804
+ # Determine GQA broadcast factor.
805
+ gqa_shared_heads = Hq // Hkv
806
+ kernel_options.setdefault("GQA_SHARED_HEADS", gqa_shared_heads)
807
+
808
+ # Inside of Triton kernel, only apply partial masking if partial blocks are computed.
809
+ # full_kv_num_blocks is None if partial blocks are not computed
810
+ has_full_blocks = full_kv_num_blocks is not None
811
+ kernel_options.setdefault("HAS_FULL_BLOCKS", has_full_blocks)
812
+ if not has_full_blocks:
813
+ full_kv_num_blocks, full_kv_indices = (
814
+ empty(0, device=query.get_device()) for _ in range(2)
815
+ )
816
+ kernel_options.setdefault("QK_HEAD_DIM", qk_head_dim)
817
+ kernel_options.setdefault("V_HEAD_DIM", v_head_dim)
818
+
819
+ choices: List[Any] = []
820
+ configs: List[Tuple[int, int, int, int]] = []
821
+ configs.append(_get_default_config_fwd(query))
822
+ if config.max_autotune:
823
+ configs += [
824
+ (128, 64, 4, 3),
825
+ (128, 128, 4, 3),
826
+ (128, 128, 8, 2),
827
+ (64, 128, 4, 3),
828
+ (64, 64, 4, 3),
829
+ ]
830
+
831
+ # Note, we don't need to pass in the captured buffers explicitly
832
+ # because they're implicitly added by the score_mod function
833
+ # We do need to explicitly pass it in for autotuning though.
834
+
835
+ for BLOCK_M, BLOCK_N, num_warps, num_stages in configs:
836
+ if SPARSE_KV_BLOCK_SIZE % BLOCK_N != 0 or SPARSE_Q_BLOCK_SIZE % BLOCK_M != 0:
837
+ continue
838
+ # Work around https://github.com/pytorch/pytorch/issues/129625
839
+ if num_stages == 2:
840
+ continue
841
+
842
+ # Performance tuning
843
+ kernel_options.setdefault("BLOCK_M", BLOCK_M)
844
+ kernel_options.setdefault("BLOCK_N", BLOCK_N)
845
+ # Blocksparse options
846
+ kernel_options.setdefault("SPARSE_Q_BLOCK_SIZE", SPARSE_Q_BLOCK_SIZE)
847
+ kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE)
848
+
849
+ flex_attention_template.maybe_append_choice(
850
+ choices=choices,
851
+ input_nodes=[
852
+ query,
853
+ key,
854
+ value,
855
+ logsumexp,
856
+ kv_num_blocks,
857
+ kv_indices,
858
+ full_kv_num_blocks,
859
+ full_kv_indices,
860
+ ],
861
+ layout=layout,
862
+ subgraphs=[
863
+ subgraph_buffer,
864
+ mask_graph_buffer,
865
+ ],
866
+ mutated_inputs=[
867
+ logsumexp,
868
+ ],
869
+ num_stages=num_stages,
870
+ num_warps=num_warps,
871
+ call_sizes=query.get_size(),
872
+ **kernel_options,
873
+ )
874
+ inputs_for_autotuning = (
875
+ [
876
+ query,
877
+ key,
878
+ value,
879
+ logsumexp,
880
+ kv_num_blocks,
881
+ kv_indices,
882
+ full_kv_num_blocks,
883
+ full_kv_indices,
884
+ ]
885
+ + list(score_mod_other_buffers)
886
+ + list(mask_mod_other_buffers)
887
+ )
888
+ input_gen_fns = {
889
+ 4: create_num_blocks_fake_generator(kv_indices),
890
+ 5: create_indices_fake,
891
+ 6: create_num_blocks_fake_generator(full_kv_indices),
892
+ 7: create_indices_fake,
893
+ }
894
+ return (
895
+ autotune_select_algorithm(
896
+ "flex_attention",
897
+ choices,
898
+ inputs_for_autotuning,
899
+ layout,
900
+ input_gen_fns=input_gen_fns,
901
+ ),
902
+ logsumexp,
903
+ )
904
+
905
+
906
+ # ---------------------------- Backward HOP Implementation ----------------------------
907
+
908
+
909
+ def flex_attention_backward_grid(
910
+ batch_size, q_heads, num_queries, d_model, kv_heads, num_key_value, meta
911
+ ):
912
+ """How is this kernel parallelized?
913
+ Currently this is only parallelizing over batch* kv_heads, but we can, and want to
914
+ parallelize over ceil_div(q_heads//kv_heads * num_key_value, key_value_block_size).
915
+ To do this will either require atomic updates to some grad values or to have a two pass kernel design.
916
+ """
917
+ import triton
918
+
919
+ return (
920
+ triton.cdiv(num_queries, meta["BLOCK_M2"]) * (q_heads // kv_heads)
921
+ + triton.cdiv(num_key_value, meta["BLOCK_N1"]),
922
+ 1,
923
+ batch_size * kv_heads,
924
+ )
925
+
926
+
927
+ flex_attention_backward_template = TritonTemplate(
928
+ name="flex_attention_backward",
929
+ grid=flex_attention_backward_grid,
930
+ source=r"""
931
+ {{def_kernel("Q", "K", "V", "LSE", "DELTA", "DO", "DQ", "DV", "KV_NUM_BLKS", "KV_IDX", "Q_NUM_BLKS", "Q_IDX", "FULL_KV_NUM_BLKS", "FULL_KV_IDX", "FULL_Q_NUM_BLKS", "FULL_Q_IDX")}}
932
+ # Sub notation for this kernel:
933
+ #
934
+ # Q: Query, K: Key, V: Value
935
+ # LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype)
936
+ # DELTA: Precomputed sum(OUT*DO, axis=-1)
937
+ # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value
938
+ # DK: Derivative of Key, is the written to via the store_output call due to some limitations with
939
+ # inductor codegen
940
+ # M: Number of queries, N: Number of keys/values
941
+ # QK_HEAD_DIM: The dimension of the query and key embeddings
942
+ # V_HEAD_DIM: The dimension of the value embeddings
943
+ # z: Batch size, h: Number of heads, m: Number of queries or keys/values, d: Head dim
944
+ # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
945
+ # (Modifiable) Performance tuning options
946
+ # BLOCK_M1: when calculating DK & DV, iterate over BLOCK_M1 across the seqlen dim of Q in each thread block.
947
+ # BLOCK_N1: when calculating DK & DV, the thread block size across the seqlen dim of K/V.
948
+ # BLOCK_M2: when calculating DQ, the thread block size across the seqlen dim of Q.
949
+ # BLOCK_N2: when calculating DQ, iterate over BLOCK_N2 across the seqlen dim of K/V in each thread block.
950
+ #
951
+ # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
952
+ # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
953
+ # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
954
+ # Q_NUM_BLKS: The number of Q blocks (that may or may not require masking) for each query.
955
+ # Q_IDX: The indices of Q blocks (that may or may not require masking) for each query.
956
+ # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
957
+ # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
958
+ # FULL_Q_NUM_BLKS: The number of fully unmasked Q blocks (so we don't need masking) for each query.
959
+ # FULL_Q_IDX: The indices of fully unmasked Q blocks (so we don't need masking) for each query.
960
+
961
+ # The below are kernel options that can be applied for certain score_mods,
962
+ # or involve a numerics vs. perf tradeoff
963
+ # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
964
+ # about 20% more numerical error, but slightly faster.
965
+
966
+ # Define strides of inputs
967
+ stride_qz, stride_qh, stride_qm, stride_qd = {{stride("Q")}}
968
+ stride_kz, stride_kh, stride_kn, stride_kd = {{stride("K")}}
969
+ stride_vz, stride_vh, stride_vn, stride_vd = {{stride("V")}}
970
+ stride_doz, stride_doh, stride_dom, stride_dod = {{stride("DO")}}
971
+
972
+ stride_dqz, stride_dqh, stride_dqm, stride_dqd = {{stride("DQ")}}
973
+ stride_dvz, stride_dvh, stride_dvm, stride_dvd = {{stride("DV")}}
974
+
975
+ Z = {{size("Q", 0)}}
976
+ HQ = {{size("Q", 1)}}
977
+ HKV = {{size("K", 1)}}
978
+ Q_LEN = {{size("Q", 2)}}
979
+ KV_LEN = {{size("K", 2)}}
980
+
981
+ MATMUL_PRECISION = Q.dtype.element_ty
982
+
983
+ pid = tl.program_id(0)
984
+ NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1)
985
+ NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2)
986
+
987
+ off_hz = tl.program_id(2)
988
+ off_z = off_hz // HKV # batch idx
989
+ off_hkv = off_hz % HKV # kv head idx
990
+
991
+ SPARSE_Z = {{size("KV_NUM_BLKS", 0)}}
992
+ SPARSE_HQ = {{size("KV_NUM_BLKS", 1)}}
993
+
994
+ sparse_idx_z = off_z % SPARSE_Z
995
+
996
+ k_adj = (stride_kh * off_hkv + stride_kz * off_z).to(tl.int64)
997
+ v_adj = (stride_vh * off_hkv + stride_vz * off_z).to(tl.int64)
998
+ dv_adj = (stride_dvh * off_hkv + stride_dvz * off_z).to(tl.int64)
999
+
1000
+ # offset K, V, DV pointers for batch/kv-head
1001
+ K += k_adj
1002
+ V += v_adj
1003
+ DV += dv_adj
1004
+
1005
+ RCP_LN2 = 1.44269504
1006
+ offs_k = tl.arange(0, QK_HEAD_DIM)
1007
+ offs_v = tl.arange(0, V_HEAD_DIM)
1008
+
1009
+ if pid >= NUM_KV_BLOCKS:
1010
+ off_pid = pid - NUM_KV_BLOCKS
1011
+ # THIS BLOCK DOES DQ
1012
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M2)
1013
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
1014
+ off_hq2 = off_pid // NUM_Q_BLOCKS + off_hkv * GQA_SHARED_HEADS
1015
+ start_m2_block = off_pid % NUM_Q_BLOCKS
1016
+ off_pid_mask = start_m2_block // SPARSE_Q_MULTIPLE
1017
+ stride_kv_num_blks_h = {{stride("KV_NUM_BLKS", 1)}}
1018
+ stride_kv_idx_h = {{stride("KV_IDX", 1)}}
1019
+ stride_kv_idx_m = {{stride("KV_IDX", 2)}}
1020
+
1021
+ sparse_idx_hq2 = off_hq2 % SPARSE_HQ
1022
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq2
1023
+
1024
+ sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask
1025
+ sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950
1026
+
1027
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offseted by query heads.
1028
+ q_adj2 = (stride_qh * off_hq2 + stride_qz * off_z).to(tl.int64)
1029
+ do_adj2 = (stride_doh * off_hq2 + stride_doz * off_z).to(tl.int64)
1030
+ dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_z).to(tl.int64)
1031
+ off_chz2 = ((off_z * HQ + off_hq2) * Q_LEN).to(tl.int64)
1032
+
1033
+ Q2 = Q + q_adj2
1034
+ DO2 = DO + do_adj2
1035
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
1036
+ # if Q is broadcasted)
1037
+ DQ2 = DQ + dq_adj2
1038
+ LSE2 = LSE + off_chz2
1039
+ DELTA2 = DELTA + off_chz2
1040
+
1041
+ dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM], dtype=tl.float32)
1042
+
1043
+ start_m2 = start_m2_block * BLOCK_M2
1044
+ offs_m2 = start_m2 + tl.arange(0, BLOCK_M2)
1045
+
1046
+ # load Q and do: they stay in SRAM throughout the inner loop.
1047
+ if IS_DIVISIBLE:
1048
+ q = tl.load(Q2 + offs_m2[:, None] * stride_qm + offs_k[None, :] * stride_qd)
1049
+ do = tl.load(DO2 + offs_m2[:, None] * stride_dom + offs_v[None, :] * stride_dod)
1050
+ else:
1051
+ q = tl.load(Q2 + offs_m2[:, None] * stride_qm + offs_k[None, :] * stride_qd, mask=offs_m2[:, None] < Q_LEN)
1052
+ do = tl.load(DO2 + offs_m2[:, None] * stride_dom + offs_v[None, :] * stride_dod, mask=offs_m2[:, None] < Q_LEN)
1053
+
1054
+ if PRESCALE_QK:
1055
+ q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
1056
+
1057
+ if IS_DIVISIBLE:
1058
+ Di = tl.load(DELTA2 + offs_m2)
1059
+ lse = tl.load(LSE2 + offs_m2)
1060
+ else:
1061
+ Di = tl.load(DELTA2 + offs_m2, mask=offs_m2 < Q_LEN)
1062
+ lse = tl.load(LSE2 + offs_m2, mask=offs_m2 < Q_LEN)
1063
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
1064
+ lse = lse[:, None]
1065
+
1066
+ # ~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1067
+ # KV_IDX and KV_NUM_BLKS are always contiguous.
1068
+ kv_indices = KV_IDX + sparse_kv_idx_offset
1069
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
1070
+ sparse_kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)
1071
+
1072
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
1073
+ dq = bwd_dq_inner(
1074
+ {{gen_argdefs()}},
1075
+ K, V,
1076
+ dq, q, do, Di, lse,
1077
+ off_z, off_hq2, offs_m2, offs_n2,
1078
+ stride_kn, stride_kd, stride_vn, stride_vd,
1079
+ kv_indices, sparse_kv_num_blocks,
1080
+ MATMUL_PRECISION,
1081
+ IS_FULL_BLOCKS=False,
1082
+ )
1083
+
1084
+ if HAS_FULL_BLOCKS:
1085
+ # ~~~~~~~~~~~ partial unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1086
+ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
1087
+ kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
1088
+ kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
1089
+ sparse_kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
1090
+
1091
+ offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
1092
+ dq = bwd_dq_inner(
1093
+ {{gen_argdefs()}},
1094
+ K, V,
1095
+ dq, q, do, Di, lse,
1096
+ off_z, off_hq2, offs_m2, offs_n2,
1097
+ stride_kn, stride_kd, stride_vn, stride_vd,
1098
+ kv_indices, sparse_kv_num_blocks,
1099
+ MATMUL_PRECISION,
1100
+ IS_FULL_BLOCKS=True,
1101
+ )
1102
+
1103
+ # Write back dQ.
1104
+ dq_ptrs = DQ2 + offs_m2[:, None] * stride_dqm + offs_k[None, :] * stride_dqd
1105
+ dq *= SM_SCALE
1106
+ if IS_DIVISIBLE:
1107
+ tl.store(dq_ptrs, dq)
1108
+ else:
1109
+ tl.store(dq_ptrs, dq, mask=offs_m2[:, None] < Q_LEN)
1110
+ else:
1111
+ # THIS BLOCK DOES DK & DV
1112
+ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
1113
+ SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N1)
1114
+
1115
+ pid_mask = pid // SPARSE_KV_MULTIPLE
1116
+
1117
+ stride_q_num_blks_h = {{stride("Q_NUM_BLKS", 1)}}
1118
+ stride_q_idx_h = {{stride("Q_IDX", 1)}}
1119
+ stride_q_idx_n = {{stride("Q_IDX", 2)}}
1120
+
1121
+ dv = tl.zeros([BLOCK_N1, V_HEAD_DIM], dtype=tl.float32)
1122
+ dk = tl.zeros([BLOCK_N1, QK_HEAD_DIM], dtype=tl.float32)
1123
+
1124
+ start_n1 = pid * BLOCK_N1
1125
+ offs_n1 = start_n1 + tl.arange(0, BLOCK_N1)
1126
+
1127
+ # load K and V: they stay in SRAM throughout the inner loop.
1128
+ if IS_DIVISIBLE:
1129
+ k = tl.load(K + offs_n1[:, None] * stride_kn + offs_k[None, :] * stride_kd)
1130
+ v = tl.load(V + offs_n1[:, None] * stride_vn + offs_v[None, :] * stride_vd)
1131
+ else:
1132
+ k = tl.load(K + offs_n1[:, None] * stride_kn + offs_k[None, :] * stride_kd, mask=offs_n1[:, None] < KV_LEN)
1133
+ v = tl.load(V + offs_n1[:, None] * stride_vn + offs_v[None, :] * stride_vd, mask=offs_n1[:, None] < KV_LEN)
1134
+ if PRESCALE_QK:
1135
+ k = (k * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)
1136
+
1137
+ for off_g in range(0, GQA_SHARED_HEADS):
1138
+ off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g
1139
+
1140
+ # Offset Q, DQ, DO, DELTA & LSE. These inputs are offseted by query heads.
1141
+ q_adj1 = (stride_qh * off_hq1 + stride_qz * off_z).to(tl.int64)
1142
+ do_adj1 = (stride_doh * off_hq1 + stride_doz * off_z).to(tl.int64)
1143
+ dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_z).to(tl.int64)
1144
+ off_chz1 = ((off_z * HQ + off_hq1) * Q_LEN).to(tl.int64)
1145
+
1146
+ Q1 = Q + q_adj1
1147
+ DO1 = DO + do_adj1
1148
+ # TODO: This does not work if DQ is not the same layout as Q (for example,
1149
+ # if Q is broadcasted)
1150
+ LSE1 = LSE + off_chz1
1151
+ DELTA1 = DELTA + off_chz1
1152
+
1153
+ sparse_idx_hq1 = off_hq1 % SPARSE_HQ
1154
+ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq1
1155
+
1156
+ sparse_q_num_blks_offset = sparse_hz_offset * stride_q_num_blks_h + pid_mask
1157
+ sparse_q_idx_offset = sparse_hz_offset * stride_q_idx_h + pid_mask * stride_q_idx_n # noqa: B950
1158
+
1159
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1160
+ # Q_IDX and Q_NUM_BLKS are always contiguous.
1161
+ q_indices = Q_IDX + sparse_q_idx_offset
1162
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
1163
+ sparse_q_num_blocks = tl.load(Q_NUM_BLKS + sparse_q_num_blks_offset)
1164
+
1165
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
1166
+ dk, dv = bwd_dkdv_inner(
1167
+ {{gen_argdefs()}},
1168
+ Q1, DO1, DELTA1, LSE1,
1169
+ dk, dv, k, v,
1170
+ off_z, off_hq1, offs_n1, offs_m1,
1171
+ stride_qm, stride_qd, stride_dom, stride_dod,
1172
+ q_indices, sparse_q_num_blocks,
1173
+ MATMUL_PRECISION,
1174
+ IS_FULL_BLOCKS=False,
1175
+ )
1176
+
1177
+
1178
+ if HAS_FULL_BLOCKS:
1179
+ # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1180
+ # FULL_Q_IDX and FULL_Q_NUM_BLKS are always contiguous.
1181
+ q_indices = FULL_Q_IDX + sparse_q_idx_offset
1182
+ q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
1183
+ sparse_q_num_blocks = tl.load(FULL_Q_NUM_BLKS + sparse_q_num_blks_offset)
1184
+
1185
+ offs_m1 = q_start + tl.arange(0, BLOCK_M1)
1186
+ dk, dv = bwd_dkdv_inner(
1187
+ {{gen_argdefs()}},
1188
+ Q1, DO1, DELTA1, LSE1,
1189
+ dk, dv, k, v,
1190
+ off_z, off_hq1, offs_n1, offs_m1,
1191
+ stride_qm, stride_qd, stride_dom, stride_dod,
1192
+ q_indices, sparse_q_num_blocks,
1193
+ MATMUL_PRECISION,
1194
+ IS_FULL_BLOCKS=True,
1195
+ )
1196
+
1197
+ # Write back dV and dK.
1198
+ dv_ptrs = DV + offs_n1[:, None] * stride_dvm + offs_v[None, :] * stride_dvd
1199
+
1200
+ index_n = offs_n1[:, None]
1201
+ index_k = offs_k[None, :]
1202
+
1203
+ if IS_DIVISIBLE:
1204
+ tl.store(dv_ptrs, dv)
1205
+ else:
1206
+ tl.store(dv_ptrs, dv, mask=index_n < KV_LEN)
1207
+
1208
+ dk *= SM_SCALE
1209
+ mask = index_n < KV_LEN
1210
+ {{store_output(("off_z", "off_hkv", "index_n", "index_k"), "dk", "mask", indent_width=8)}}
1211
+
1212
+ @triton.jit
1213
+ def bwd_dq_inner(
1214
+ {{gen_argdefs()}},
1215
+ K, V, # pointers
1216
+ dq, q, do, Di, lse,
1217
+ off_z, off_hq, offs_m2, offs_n2,
1218
+ stride_kn, stride_kd, stride_vn, stride_vd,
1219
+ kv_indices, sparse_kv_num_blocks,
1220
+ MATMUL_PRECISION,
1221
+ IS_FULL_BLOCKS,
1222
+ ):
1223
+ {{gen_defines() | indent_except_first(1) }}
1224
+ SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
1225
+ RCP_LN2: tl.constexpr = 1.44269504
1226
+ Q_LEN = {{size("Q", 2)}}
1227
+ KV_LEN = {{size("K", 2)}}
1228
+
1229
+ offs_k = tl.arange(0, QK_HEAD_DIM)
1230
+ offs_v = tl.arange(0, V_HEAD_DIM)
1231
+
1232
+ kT_ptrs = K + offs_n2[None, :] * stride_kn + offs_k[:, None] * stride_kd
1233
+ vT_ptrs = V + offs_n2[None, :] * stride_vn + offs_v[:, None] * stride_vd
1234
+ # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
1235
+ tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0)
1236
+
1237
+ hi = tl.minimum(sparse_kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N2), 1))
1238
+ if not IS_DIVISIBLE:
1239
+ if hi >= 1:
1240
+ for start_n in range(0, hi - 1):
1241
+ dq = bwd_dq_block_mn(
1242
+ {{gen_argdefs()}},
1243
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
1244
+ off_z, off_hq, offs_m2, offs_n2,
1245
+ stride_kn, stride_kd, stride_vn, stride_vd,
1246
+ kv_indices, sparse_kv_num_blocks,
1247
+ MATMUL_PRECISION, RCP_LN2,
1248
+ IS_FULL_BLOCKS,
1249
+ )
1250
+
1251
+ # Increment pointers.
1252
+ offset = get_offset_for_next_block(
1253
+ start_n, kv_indices, sparse_kv_num_blocks,
1254
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2
1255
+ )
1256
+
1257
+ kT_ptrs += offset * stride_kn
1258
+ vT_ptrs += offset * stride_vn
1259
+
1260
+ offs_n2 += offset
1261
+
1262
+ dq = bwd_dq_block_mn(
1263
+ {{gen_argdefs()}},
1264
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
1265
+ off_z, off_hq, offs_m2, offs_n2,
1266
+ stride_kn, stride_kd, stride_vn, stride_vd,
1267
+ kv_indices, sparse_kv_num_blocks,
1268
+ MATMUL_PRECISION, RCP_LN2,
1269
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
1270
+ )
1271
+ else:
1272
+ for start_n in range(0, hi):
1273
+ dq = bwd_dq_block_mn(
1274
+ {{gen_argdefs()}},
1275
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
1276
+ off_z, off_hq, offs_m2, offs_n2,
1277
+ stride_kn, stride_kd, stride_vn, stride_vd,
1278
+ kv_indices, sparse_kv_num_blocks,
1279
+ MATMUL_PRECISION, RCP_LN2,
1280
+ IS_FULL_BLOCKS,
1281
+ )
1282
+
1283
+ # Increment pointers.
1284
+ offset = get_offset_for_next_block(
1285
+ start_n, kv_indices, sparse_kv_num_blocks,
1286
+ SPARSE_KV_BLOCK_SIZE, SPARSE_KV_MULTIPLE, BLOCK_N2
1287
+ )
1288
+
1289
+ kT_ptrs += offset * stride_kn
1290
+ vT_ptrs += offset * stride_vn
1291
+
1292
+ offs_n2 += offset
1293
+
1294
+ return dq
1295
+
1296
+
1297
+ @triton.jit
1298
+ def bwd_dq_block_mn(
1299
+ {{gen_argdefs()}},
1300
+ dq, q, kT_ptrs, vT_ptrs, do, Di, lse, Q_LEN, KV_LEN,
1301
+ off_z, off_hq, offs_m2, offs_n2,
1302
+ stride_kn, stride_kd, stride_vn, stride_vd,
1303
+ kv_indices, sparse_kv_num_blocks,
1304
+ MATMUL_PRECISION, RCP_LN2,
1305
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,
1306
+ ):
1307
+ {{gen_defines() | indent_except_first(1)}}
1308
+
1309
+ if IS_DIVISIBLE:
1310
+ kT = tl.load(kT_ptrs)
1311
+ else:
1312
+ kT = tl.load(kT_ptrs, mask=offs_n2[None, :] < KV_LEN)
1313
+ qk = tl.dot(q, kT, input_precision=FLOAT32_PRECISION)
1314
+ if not PRESCALE_QK:
1315
+ qk *= SM_SCALE
1316
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
1317
+ pre_mod_scores = qk
1318
+ if CHECK_BLOCK_BOUNDARY:
1319
+ m = offs_m2[:, None] % Q_LEN
1320
+ n = offs_n2[None, :] % KV_LEN
1321
+ else:
1322
+ m = offs_m2[:, None]
1323
+ n = offs_n2[None, :]
1324
+ {{ modification(
1325
+ subgraph_number=0,
1326
+ output_name="post_mod_scores",
1327
+ score="qk",
1328
+ b="off_z",
1329
+ h="off_hq",
1330
+ m="m",
1331
+ n="n",
1332
+ out="qk"
1333
+ ) | indent_except_first(1) }}
1334
+
1335
+ if CHECK_BLOCK_BOUNDARY:
1336
+ # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
1337
+ post_mod_scores = tl.where(offs_n2[None, :] < KV_LEN, post_mod_scores, float("-inf"))
1338
+
1339
+ if not IS_FULL_BLOCKS:
1340
+ {{ modification(
1341
+ subgraph_number=2,
1342
+ output_name="mask_mod_output",
1343
+ score="qk",
1344
+ b="off_z",
1345
+ h="off_hq",
1346
+ m="m",
1347
+ n="n",
1348
+ ) | indent_except_first(2) }}
1349
+
1350
+ if CHECK_BLOCK_BOUNDARY:
1351
+ mask_mod_output = tl.where(offs_n2[None, :] < KV_LEN, mask_mod_output, float("-inf"))
1352
+ # apply mask for partial masked block
1353
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
1354
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1355
+ if not PRESCALE_QK:
1356
+ post_mod_scores *= RCP_LN2
1357
+ p = tl.math.exp2(post_mod_scores - lse)
1358
+ # Compute dP and dS.
1359
+ if IS_DIVISIBLE:
1360
+ vT = tl.load(vT_ptrs)
1361
+ else:
1362
+ vT = tl.load(vT_ptrs, mask=offs_n2[None, :] < KV_LEN)
1363
+ dp = tl.dot(do, vT, input_precision=FLOAT32_PRECISION)
1364
+ ds = p * (dp - Di[:, None])
1365
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
1366
+ {{ modification(
1367
+ subgraph_number=1,
1368
+ output_name = "grad_scores",
1369
+ score="pre_mod_scores",
1370
+ b="off_z",
1371
+ h="off_hq",
1372
+ m="m",
1373
+ n="n",
1374
+ grad_score_mod="ds"
1375
+ ) | indent_except_first(1) }}
1376
+ if CHECK_BLOCK_BOUNDARY:
1377
+ grad_scores = tl.where(offs_n2[None, :] < KV_LEN, grad_scores, 0.0)
1378
+
1379
+ ds = grad_scores
1380
+
1381
+ if not IS_FULL_BLOCKS:
1382
+ if CHECK_BLOCK_BOUNDARY:
1383
+ mask_mod_output = tl.where(offs_n2[None, :] < KV_LEN, mask_mod_output, float("-inf"))
1384
+ # (grads) apply mask for partially unmasked block
1385
+ ds = tl.where(mask_mod_output, ds, 0.0)
1386
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1387
+ ds = ds.to(MATMUL_PRECISION)
1388
+ # Compute dQ.
1389
+ dq += tl.dot(ds, tl.trans(kT), input_precision=FLOAT32_PRECISION)
1390
+
1391
+ return dq
1392
+
1393
+
1394
+ @triton.jit
1395
+ def bwd_dkdv_inner(
1396
+ {{gen_argdefs()}},
1397
+ Q, DO, DELTA, LSE, # pointers
1398
+ dk, dv, k, v,
1399
+ off_z, off_hq, offs_n1, offs_m1,
1400
+ stride_qm, stride_qd, stride_dom, stride_dod,
1401
+ q_indices, sparse_q_num_blocks,
1402
+ MATMUL_PRECISION,
1403
+ IS_FULL_BLOCKS,
1404
+ ):
1405
+ {{gen_defines() | indent_except_first(1) }}
1406
+ SPARSE_Q_MULTIPLE: tl.constexpr = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
1407
+ RCP_LN2: tl.constexpr = 1.44269504
1408
+ Q_LEN = {{size("Q", 2)}}
1409
+ KV_LEN = {{size("K", 2)}}
1410
+
1411
+ offs_k = tl.arange(0, QK_HEAD_DIM)
1412
+ offs_v = tl.arange(0, V_HEAD_DIM)
1413
+
1414
+ qT_ptrs = Q + offs_m1[None, :] * stride_qm + offs_k[:, None] * stride_qd
1415
+ do_ptrs = DO + offs_m1[:, None] * stride_dom + offs_v[None, :] * stride_dod
1416
+ # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work.
1417
+ tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0)
1418
+ hi = tl.minimum(sparse_q_num_blocks * SPARSE_Q_MULTIPLE, tl.maximum(tl.cdiv(Q_LEN, BLOCK_M1), 1))
1419
+
1420
+ if not IS_DIVISIBLE:
1421
+ if hi >= 1:
1422
+ for start_m in range(0, hi - 1):
1423
+ dk, dv = bwd_dkdv_block_mn(
1424
+ {{gen_argdefs()}},
1425
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
1426
+ off_z, off_hq, offs_n1, offs_m1,
1427
+ stride_qm, stride_qd, stride_dom, stride_dod,
1428
+ q_indices, sparse_q_num_blocks,
1429
+ MATMUL_PRECISION, RCP_LN2,
1430
+ IS_FULL_BLOCKS,
1431
+ )
1432
+ # Increment pointers.
1433
+ offset = get_offset_for_next_block(
1434
+ start_m, q_indices, sparse_q_num_blocks,
1435
+ SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1
1436
+ )
1437
+
1438
+ qT_ptrs += offset * stride_qm
1439
+ do_ptrs += offset * stride_dom
1440
+
1441
+ offs_m1 += offset
1442
+
1443
+ dk, dv = bwd_dkdv_block_mn(
1444
+ {{gen_argdefs()}},
1445
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
1446
+ off_z, off_hq, offs_n1, offs_m1,
1447
+ stride_qm, stride_qd, stride_dom, stride_dod,
1448
+ q_indices, sparse_q_num_blocks,
1449
+ MATMUL_PRECISION, RCP_LN2,
1450
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
1451
+ )
1452
+ else:
1453
+ for start_m in range(0, hi):
1454
+ dk, dv = bwd_dkdv_block_mn(
1455
+ {{gen_argdefs()}},
1456
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
1457
+ off_z, off_hq, offs_n1, offs_m1,
1458
+ stride_qm, stride_qd, stride_dom, stride_dod,
1459
+ q_indices, sparse_q_num_blocks,
1460
+ MATMUL_PRECISION, RCP_LN2,
1461
+ IS_FULL_BLOCKS,
1462
+ )
1463
+ # Increment pointers.
1464
+ offset = get_offset_for_next_block(
1465
+ start_m, q_indices, sparse_q_num_blocks,
1466
+ SPARSE_Q_BLOCK_SIZE, SPARSE_Q_MULTIPLE, BLOCK_M1
1467
+ )
1468
+
1469
+ qT_ptrs += offset * stride_qm
1470
+ do_ptrs += offset * stride_dom
1471
+
1472
+ offs_m1 += offset
1473
+
1474
+ return dk, dv
1475
+
1476
+
1477
+ @triton.jit
1478
+ def bwd_dkdv_block_mn(
1479
+ {{gen_argdefs()}},
1480
+ dk, dv, qT_ptrs, k, v, do_ptrs, DELTA, LSE, Q_LEN, KV_LEN,
1481
+ off_z, off_hq, offs_n1, offs_m1,
1482
+ stride_qm, stride_qd, stride_dom, stride_dod,
1483
+ q_indices, sparse_q_num_blocks,
1484
+ MATMUL_PRECISION, RCP_LN2,
1485
+ IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,
1486
+ ):
1487
+ {{gen_defines() | indent_except_first(1) }}
1488
+
1489
+ # Load LSE before computing qk to reduce pipeline stall.
1490
+ if IS_DIVISIBLE:
1491
+ qT = tl.load(qT_ptrs)
1492
+ lse = tl.load(LSE + offs_m1)
1493
+ else:
1494
+ qT = tl.load(qT_ptrs, mask=offs_m1[None, :] < Q_LEN)
1495
+ lse = tl.load(LSE + offs_m1, mask=offs_m1 < Q_LEN)
1496
+ lse = tl.where(lse == -float("inf"), 0.0, lse)
1497
+ qkT = tl.dot(k, qT, input_precision=FLOAT32_PRECISION)
1498
+ if not PRESCALE_QK:
1499
+ qkT *= SM_SCALE
1500
+ # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
1501
+ if CHECK_BLOCK_BOUNDARY:
1502
+ m = offs_m1[None, :] % Q_LEN
1503
+ n = offs_n1[:, None] % KV_LEN
1504
+ else:
1505
+ m = offs_m1[None, :]
1506
+ n = offs_n1[:, None]
1507
+ pre_mod_scores = qkT
1508
+ {{ modification(
1509
+ subgraph_number=0,
1510
+ output_name="post_mod_scores",
1511
+ score="qkT",
1512
+ b="off_z",
1513
+ h="off_hq",
1514
+ m="m",
1515
+ n="n",
1516
+ out="qkT"
1517
+ ) | indent_except_first(1) }}
1518
+
1519
+ if CHECK_BLOCK_BOUNDARY:
1520
+ # Mask out the elements that are out of the KV_LEN for non divisible seqlen.
1521
+ post_mod_scores = tl.where(offs_n1[:, None] < KV_LEN, post_mod_scores, float("-inf"))
1522
+
1523
+ if not IS_FULL_BLOCKS:
1524
+ {{ modification(
1525
+ subgraph_number=2,
1526
+ output_name="mask_mod_output",
1527
+ score="qkT",
1528
+ b="off_z",
1529
+ h="off_hq",
1530
+ m="m",
1531
+ n="n",
1532
+ ) | indent_except_first(2) }}
1533
+ if CHECK_BLOCK_BOUNDARY:
1534
+ mask_mod_output = tl.where(offs_n1[:, None] < KV_LEN, mask_mod_output, float("-inf"))
1535
+ # (grads) apply mask for fully masked block
1536
+ post_mod_scores = tl.where(mask_mod_output, post_mod_scores, float("-inf"))
1537
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1538
+ if not PRESCALE_QK:
1539
+ post_mod_scores *= RCP_LN2
1540
+ pT = tl.math.exp2(post_mod_scores - lse[None, :])
1541
+ if IS_DIVISIBLE:
1542
+ do = tl.load(do_ptrs)
1543
+ else:
1544
+ do = tl.load(do_ptrs, mask=offs_m1[:, None] < Q_LEN)
1545
+ # Compute dV.
1546
+ ppT = pT
1547
+ dv += tl.dot(ppT.to(MATMUL_PRECISION), do, input_precision=FLOAT32_PRECISION)
1548
+ if IS_DIVISIBLE:
1549
+ Di = tl.load(DELTA + offs_m1)
1550
+ else:
1551
+ Di = tl.load(DELTA + offs_m1, mask=offs_m1 < Q_LEN)
1552
+ # Compute dP and dS.
1553
+ dpT = tl.dot(v, tl.trans(do), input_precision=FLOAT32_PRECISION)
1554
+ dsT = pT * (dpT - Di[None, :])
1555
+ # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~
1556
+ {{ modification(
1557
+ subgraph_number=1,
1558
+ output_name = "grad_scores",
1559
+ score="pre_mod_scores",
1560
+ b="off_z",
1561
+ h="off_hq",
1562
+ m="m",
1563
+ n="n",
1564
+ grad_score_mod="dsT"
1565
+ ) | indent_except_first(1) }}
1566
+ if CHECK_BLOCK_BOUNDARY:
1567
+ grad_scores = tl.where(offs_n1[:, None] < KV_LEN, grad_scores, 0.0)
1568
+
1569
+ dsT = grad_scores
1570
+ if not IS_FULL_BLOCKS:
1571
+ if CHECK_BLOCK_BOUNDARY:
1572
+ mask_mod_output = tl.where(offs_n1[:, None] < KV_LEN, mask_mod_output, float("-inf"))
1573
+ # (grads) apply mask for partially unmasked block
1574
+ dsT = tl.where(mask_mod_output, dsT, 0.0)
1575
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1576
+ dk += tl.dot(dsT.to(MATMUL_PRECISION), tl.trans(qT), input_precision=FLOAT32_PRECISION)
1577
+
1578
+ return dk, dv
1579
+ """
1580
+ + compute_next_offset_func,
1581
+ )
1582
+
1583
+
1584
# TODO: We probably also need a layout constraint?
@register_lowering(
    torch.ops.higher_order.flex_attention_backward, type_promotion_kind=None
)
def flex_attention_backward(*args, **kwargs):
    """Lower the ``flex_attention_backward`` higher-order op to a Triton template.

    Unpacks the saved forward tensors and the block-sparse mask metadata,
    builds Triton subgraph buffers for the score_mod forward, the joint
    (backward) graph, and the mask_mod graph, computes the ``delta`` tensor
    needed by the backward kernel, and autotunes
    ``flex_attention_backward_template`` over a set of block-size configs.

    Returns a ``(grad_query, grad_key, grad_value)`` triple. Only ``grad_key``
    is produced via the template's ``store_output``; ``grad_query`` and
    ``grad_value`` are pre-allocated here and mutated by the kernel
    (see the ``mutated_inputs`` argument below).
    """
    (
        query,
        key,
        value,
        out,
        logsumexp,
        grad_out,
        grad_logsumexp,
        fw_graph,
        joint_graph,
        block_mask,
        scale,
        kernel_options,
        score_mod_other_buffers,
        mask_mod_other_buffers,
    ) = args
    # Block-sparse mask metadata: per-(b, h) counts and indices of KV/Q blocks
    # that need masking, plus the "full" (mask-free) variants and block sizes.
    (
        kv_num_blocks,
        kv_indices,
        full_kv_num_blocks,
        full_kv_indices,
        q_num_blocks,
        q_indices,
        full_q_num_blocks,
        full_q_indices,
        SPARSE_KV_BLOCK_SIZE,
        SPARSE_Q_BLOCK_SIZE,
        mask_graph,
    ) = block_mask

    # Realize all tensor inputs so they have concrete buffers before being
    # handed to the Triton template.
    (
        query,
        key,
        value,
        grad_out,
        kv_num_blocks,
        kv_indices,
        full_kv_num_blocks,
        full_kv_indices,
        q_num_blocks,
        q_indices,
        full_q_num_blocks,
        full_q_indices,
    ) = maybe_realize(
        [
            query,
            key,
            value,
            grad_out,
            kv_num_blocks,
            kv_indices,
            full_kv_num_blocks,
            full_kv_indices,
            q_num_blocks,
            q_indices,
            full_q_num_blocks,
            full_q_indices,
        ]
    )

    device = query.get_device()
    dtype = query.get_dtype()
    Bq, Hq, seq_len_q, qk_head_dim = query.get_size()
    Bkv, Hkv, seq_len_kv, v_head_dim = value.get_size()
    assert Bq == Bkv, "Batch dimension must match"
    B = Bq

    # Copy so our setdefault() calls below don't mutate the caller's dict.
    kernel_options = dict(kernel_options)
    kernel_options.setdefault("FLOAT32_PRECISION", get_float32_precision())
    # IS_DIVISIBLE lets the kernel skip boundary masking when both sequence
    # lengths are multiples of 128.
    if seq_len_q % 128 != 0 or seq_len_kv % 128 != 0:
        kernel_options.setdefault("IS_DIVISIBLE", False)
    else:
        kernel_options.setdefault("IS_DIVISIBLE", True)

    # Placeholders for the score_mod subgraph: (score, b, h, m, n).
    fwd_placeholder_inps = [
        create_placeholder(name, dtype, device)
        for name, dtype in [
            ("score", dtype),
            ("b", torch.int32),
            ("h", torch.int32),
            ("m", torch.int32),
            ("n", torch.int32),
        ]
    ]
    fw_subgraph_buffer = build_subgraph_buffer(
        fwd_placeholder_inps + list(score_mod_other_buffers), fw_graph
    )

    # The joint (backward) graph additionally receives the incoming gradient.
    joint_placeholder_inps = fwd_placeholder_inps + [
        create_placeholder("grad_score_mod", dtype, device)
    ]
    # NOTE(review): this unpacks only the first output of the joint subgraph
    # buffer, while fw_subgraph_buffer/mask_graph_buffer take the full result
    # — presumably the joint graph has multiple outputs; confirm upstream.
    joint_subgraph_buffer, *_ = build_subgraph_buffer(
        joint_placeholder_inps + list(score_mod_other_buffers), joint_graph
    )

    # mask_mod only sees index placeholders (b, h, m, n) — no score input.
    mask_graph_placeholder_inps = [
        create_placeholder(name, dtype, query.get_device())
        for name, dtype in [
            ("b", torch.int32),
            ("h", torch.int32),
            ("m", torch.int32),
            ("n", torch.int32),
        ]
    ]
    mask_graph_buffer = build_subgraph_buffer(
        mask_graph_placeholder_inps + list(mask_mod_other_buffers), mask_graph
    )

    # The template's store_output writes grad_key, so its layout mirrors key.
    layout_k = FixedLayout(
        key.get_device(),
        key.get_dtype(),
        key.get_size(),
        key.get_stride(),
    )

    # Compute delta, which is needed by the backward kernel:
    # delta = sum(out * grad_out, dim=-1) - grad_logsumexp / ln(2).
    grad_lse_exp2 = lowerings[aten.mul](grad_logsumexp, 1 / math.log(2))
    mul_delta = lowerings[aten.mul](out, grad_out)
    delta = lowerings[aten.sum](mul_delta, axis=-1)
    delta = lowerings[aten.sub](delta, grad_lse_exp2)
    delta = ExternKernel.require_contiguous(delta)

    grad_lse_exp2, delta = maybe_realize([grad_lse_exp2, delta])

    # see NOTE:[TritonTemplates with multiple outputs]
    # grad_query / grad_value are allocated here and filled in-place by the
    # kernel (passed via mutated_inputs below).
    grad_query = empty_strided(
        query.get_size(), query.get_stride(), dtype=dtype, device=device
    )
    grad_value = empty_strided(
        value.get_size(), value.get_stride(), dtype=dtype, device=device
    )

    kernel_options.setdefault("SM_SCALE", scale)

    # Determine GQA factor
    gqa_shared_heads = Hq // Hkv
    kernel_options.setdefault("GQA_SHARED_HEADS", gqa_shared_heads)

    # Inside of Triton kernel, only apply partial masking if partial blocks are computed.
    # full_kv_num_blocks is torch.zeros([1, 1, 1]) if partial blocks are not computed.
    has_full_blocks = full_kv_num_blocks is not None
    kernel_options.setdefault("HAS_FULL_BLOCKS", has_full_blocks)
    if not has_full_blocks:
        # Placeholder empty tensors so the template signature stays uniform.
        full_kv_num_blocks, full_kv_indices, full_q_num_blocks, full_q_indices = (
            empty(0, device=query.get_device()) for _ in range(4)
        )
    kernel_options.setdefault("QK_HEAD_DIM", qk_head_dim)
    kernel_options.setdefault("V_HEAD_DIM", v_head_dim)

    choices: List[Any] = []
    configs: List[Tuple[int, int, int, int]] = []
    configs.append(_get_default_config_bwd(query))
    if config.max_autotune:
        # Candidate (BLOCK1, BLOCK2, num_warps, num_stages) tuples; BLOCK2
        # must be a multiple of BLOCK1 (kernel static_assert requires it).
        configs.extend(
            [
                (BLOCK1, BLOCK2, w, s)
                for BLOCK1 in [32, 64]
                for BLOCK2 in [32, 64, 128]
                for w in [4, 8]
                for s in [1, 3, 4, 5]
                if BLOCK2 % BLOCK1 == 0
            ]
        )

    for BLOCK1, BLOCK2, num_warps, num_stages in configs:
        # Sparse block sizes must tile evenly by both kernel block sizes.
        if (
            SPARSE_KV_BLOCK_SIZE % BLOCK1 != 0
            or SPARSE_Q_BLOCK_SIZE % BLOCK1 != 0
            or SPARSE_KV_BLOCK_SIZE % BLOCK2 != 0
            or SPARSE_Q_BLOCK_SIZE % BLOCK2 != 0
        ):
            continue

        # Performance tuning
        # NOTE(review): setdefault on a dict shared across loop iterations
        # means only the first valid config's BLOCK_* values ever stick; later
        # configs vary only in num_warps/num_stages. Confirm this is intended.
        kernel_options.setdefault("BLOCK_M1", BLOCK1)
        kernel_options.setdefault("BLOCK_N1", BLOCK2)
        kernel_options.setdefault("BLOCK_M2", BLOCK2)
        kernel_options.setdefault("BLOCK_N2", BLOCK1)
        # Blocksparse options
        kernel_options.setdefault("SPARSE_Q_BLOCK_SIZE", SPARSE_Q_BLOCK_SIZE)
        kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE)

        flex_attention_backward_template.maybe_append_choice(
            choices=choices,
            input_nodes=[
                query,
                key,
                value,
                logsumexp,
                delta,
                grad_out,
                grad_query,
                grad_value,
                kv_num_blocks,
                kv_indices,
                q_num_blocks,
                q_indices,
                full_kv_num_blocks,
                full_kv_indices,
                full_q_num_blocks,
                full_q_indices,
            ],
            layout=layout_k,  # We use store_output only for grad_key
            subgraphs=[fw_subgraph_buffer, joint_subgraph_buffer, mask_graph_buffer],
            mutated_inputs=[grad_query, grad_value],
            call_sizes=query.get_size() + key.get_size()[1:3],
            num_stages=num_stages,
            num_warps=num_warps,
            **kernel_options,
        )
    inputs_for_autotuning = (
        [
            query,
            key,
            value,
            logsumexp,
            delta,
            grad_out,
            grad_query,
            grad_value,
            kv_num_blocks,
            kv_indices,
            q_num_blocks,
            q_indices,
            full_kv_num_blocks,
            full_kv_indices,
            full_q_num_blocks,
            full_q_indices,
        ]
        + list(score_mod_other_buffers)
        + list(mask_mod_other_buffers)
    )
    # Fake-tensor generators for the block-sparse inputs (positions 8-15 in
    # inputs_for_autotuning) so benchmarking uses plausible sparse metadata.
    input_gen_fns = {
        8: create_num_blocks_fake_generator(kv_indices),  # kv_num_blocks
        9: create_indices_fake,
        10: create_num_blocks_fake_generator(q_indices),  # q_num_blocks
        11: create_indices_fake,
        12: create_num_blocks_fake_generator(full_kv_indices),  # full_kv_num_blocks
        13: create_indices_fake,
        14: create_num_blocks_fake_generator(full_q_indices),  # full_q_num_blocks
        15: create_indices_fake,
    }

    grad_key = autotune_select_algorithm(
        "flex_attention_backward",
        choices,
        inputs_for_autotuning,
        layout_k,
        input_gen_fns=input_gen_fns,
    )
    return (
        grad_query,
        grad_key,
        grad_value,
    )
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/flex_decoding.py ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ """ Triton Implementation of the flex_attention Kernel for short query length (FlexDecoding)"""
3
+ from typing import Any, List, Tuple
4
+
5
+ import sympy
6
+
7
+ import torch
8
+ from torch._inductor.virtualized import V
9
+
10
+ from .. import config, ir
11
+ from ..ir import FixedLayout, FlexibleLayout
12
+ from ..lowering import empty, empty_strided, lowerings
13
+ from ..runtime.runtime_utils import is_power_of_2, next_power_of_2
14
+ from ..select_algorithm import autotune_select_algorithm, TritonTemplate
15
+ from .flex_attention import (
16
+ compute_forward_block_mn,
17
+ compute_forward_inner,
18
+ compute_next_offset_func,
19
+ create_indices_fake,
20
+ create_num_blocks_fake_generator,
21
+ maybe_realize,
22
+ )
23
+
24
+
25
# Shorthand aliases for the ATen and prims operator namespaces.
aten = torch.ops.aten
prims = torch.ops.prims
27
+
28
+
29
def flex_decoding_grid(batch_size, kv_heads, gqa_group_size, n_keys, d_model, meta):
    """How is this kernel parallelized?

    Launch a (batch_size * kv_heads, SPLIT_KV, 1) grid: each program owns one
    (batch, kv-head) pair and one of the SPLIT_KV key/value splits, computing
    a local output for its tile of keys/values over the full query length.
    Groups of SPLIT_KV programs later combine their partial outputs into the
    final result. ``gqa_group_size``, ``n_keys`` and ``d_model`` are accepted
    for signature compatibility but unused here.
    """
    batch_head_programs = batch_size * kv_heads
    return (batch_head_programs, meta["SPLIT_KV"], 1)
38
+
39
+
40
# Templated Triton kernel for the flex-decoding (split-KV) forward pass.
# Each program handles one (batch, kv-head) pair and one KV split (see
# flex_decoding_grid); it accumulates a partial output ACC plus per-row
# max (M) and sumexp (L) buffers that a later step reduces across splits.
# The shared forward helpers are concatenated onto the template source below.
flex_decoding_template = TritonTemplate(
    name="flex_decoding",
    grid=flex_decoding_grid,
    source=r"""
    {{def_kernel("Q", "K", "V", "M", "L", "KV_NUM_BLKS", "KV_IDX", "FULL_KV_NUM_BLKS", "FULL_KV_IDX")}}
    # Sub notation for this kernel:
    # Q: Query, K: Key, V: Value
    # reduction buffers: M rowmax across local KV split, L local sumexp across local KV split
    # M: Number of queries, N: Number of keys/values
    # QK_HEAD_DIM: The dimension of the query and key embeddings
    # V_HEAD_DIM: The dimension of the value embeddings
    # BLOCK_M, QK_HEAD_DIM: M, and D dimemsion are always assigned to the same block
    # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head t: Number of kv splits
    # (Modifiable) Config options:
    # SPLIT_KV: number of blocks K & V are split into
    # TILE_KV: length of each local KV split
    # BLOCK_M: block size that Q is padded along seqlen dim.
    # BLOCK_N: block size of K & V along N dimension.
    # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
    #
    # change of base out of the loop
    # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row
    # is not masked out? If so, we can skip an extra safety check
    # SAFE_M_BOUNDARY: Is Q seqlen a multiple of BLOCK_M? If so, we can skip an extra boundary check for loading query.
    # SAFE_N_BOUNDARY: Is KV seqlen a multiple of BLOCK_N? If so, we can skip an extra boundary check for loading key/value.

    # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base.
    #
    # SPARSE_KV_BLOCK_SIZE: sparse mask block size along KV seqlen dim.
    # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
    # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
    #
    #
    # Output: ACC output accumulated across local KV split.

    tl.static_assert(SPARSE_KV_BLOCK_SIZE >= BLOCK_N and SPARSE_KV_BLOCK_SIZE % BLOCK_N == 0)

    # Define Q Strides
    stride_qz, stride_qh, stride_qg, stride_qm, stride_qk = {{stride("Q")}}
    stride_kz, stride_kh, stride_kn, stride_kk = {{stride("K")}}
    stride_vz, stride_vh, stride_vn, stride_vk = {{stride("V")}}
    stride_mz, stride_mt, stride_mh, stride_mm = {{stride("M")}}
    stride_lz, stride_lt, stride_lh, stride_lm = {{stride("L")}}


    Z = {{size("Q", 0)}}
    HKV = {{size("Q", 1)}}
    G: tl.constexpr = GQA_SHARED_HEADS
    HQ = HKV * G
    Q_LEN = {{size("Q", 3)}}
    KV_LEN = {{size("K", 2)}}

    MATMUL_PRECISION = Q.dtype.element_ty

    # Make sure each split is a multiple of BLOCK_N
    TILE_KV_OG = tl.cdiv(KV_LEN, SPLIT_KV)
    TILE_KV = tl.cdiv(TILE_KV_OG, BLOCK_N) * BLOCK_N
    TILE_KV_MULTIPLE: tl.constexpr = (TILE_KV // BLOCK_N)

    off_z = tl.program_id(0) // HKV
    off_hkv = tl.program_id(0) % HKV
    off_t = tl.program_id(1)

    q_offset = off_z * stride_qz + off_hkv * stride_qh
    k_offset = off_z * stride_kz + off_hkv * stride_kh
    v_offset = off_z * stride_vz + off_hkv * stride_vh

    SPARSE_Z = {{size("KV_NUM_BLKS", 0)}}
    SPARSE_HQ = {{size("KV_NUM_BLKS", 1)}}

    sparse_idx_z = off_z % SPARSE_Z
    # TODO: support masks not broadcasted along the head dimension.
    tl.device_assert(SPARSE_HQ == 1)
    sparse_idx_h = 0

    SPARSE_KV_MULTIPLE: tl.constexpr = (SPARSE_KV_BLOCK_SIZE // BLOCK_N)
    SPARSE_KV_BLOCK_CNT = tl.cdiv(KV_LEN, SPARSE_KV_BLOCK_SIZE)

    # initialize pointer to m and l
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, V_HEAD_DIM], dtype=tl.float32)

    # initialize offsets
    tl.device_assert(BLOCK_M % G == 0)
    BLOCK_M_PER_HQ: tl.constexpr = BLOCK_M // G
    off_g = tl.arange(0, G) # [G]
    offs_g = tl.ravel(tl.broadcast_to(off_g[:, None], [G, BLOCK_M_PER_HQ])) # [BLOCK_M]
    offs_hq = offs_g + off_hkv * G
    off_m = tl.arange(0, BLOCK_M_PER_HQ) # [BLOCK_M_PER_HQ]
    offs_m = tl.ravel(tl.broadcast_to(off_m[None, :], [G, BLOCK_M_PER_HQ])) # [BLOCK_M]
    offs_d = tl.arange(0, QK_HEAD_DIM)
    offs_vd = tl.arange(0, V_HEAD_DIM)

    # KV_IDX / FULL_KV_IDX and KV_NUM_BLKS / FULL_KV_NUM_BLKS are always contiguous.
    sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_h

    # Calculate KV blocks that belong this CTA.
    block_n_start = off_t * TILE_KV_MULTIPLE # n_offset inside sparse block
    block_n_end = block_n_start + TILE_KV_MULTIPLE # end BLOCK_N

    q_range = stride_qg * off_g[:, None, None] + stride_qm * off_m[None, :, None] + stride_qk * offs_d[None, None, :]

    if SAFE_M_BOUNDARY:
        q = tl.load(Q + q_offset + q_range)
    else:
        mask = off_m[None, :, None] < Q_LEN
        q = tl.load(Q + q_offset + q_range, mask)

    q = tl.reshape(q, [BLOCK_M, QK_HEAD_DIM])


    # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Apply both score_mod and mask_mod

    # find first kv block we are loading and the number of blocks we are loading
    kv_indices = KV_IDX + sparse_hz_offset * SPARSE_KV_BLOCK_CNT
    kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_hz_offset)
    indices_idx = block_n_start // SPARSE_KV_MULTIPLE
    off_n_block_in_sparse = block_n_start % SPARSE_KV_MULTIPLE
    off_n = tl.load(kv_indices + indices_idx) * SPARSE_KV_BLOCK_SIZE + off_n_block_in_sparse * BLOCK_N
    # first kv block we're loading

    # last valid block according to sparse mask
    block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))

    K_block_ptr = tl.make_block_ptr(
        base=K + k_offset,
        shape=(QK_HEAD_DIM, KV_LEN), # (d, N)
        strides=(stride_kk, stride_kn),
        offsets=(0, off_n),
        block_shape=(QK_HEAD_DIM, BLOCK_N),
        order=(0, 1)
    )
    V_block_ptr = tl.make_block_ptr(
        base=V + v_offset,
        shape=(KV_LEN, V_HEAD_DIM),
        strides=(stride_vn, stride_vk),
        offsets=(off_n, 0),
        block_shape=(BLOCK_N, V_HEAD_DIM),
        order=(1, 0)
    )
    offs_n = tl.arange(0, BLOCK_N) + off_n

    acc, l_i, m_i = forward_inner(
        {{gen_argdefs()}},
        q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN,
        # accumulatd values
        acc, l_i, m_i,
        #offsets
        off_z, offs_hq[:, None], offs_m[:, None], offs_n[None, :],
        #block sparse data
        kv_indices, kv_num_blocks,
        block_n_start, block_n_end if block_n_end <= block_n_last_valid else block_n_last_valid,
        MATMUL_PRECISION,
        IS_FULL_BLOCKS=False,
    )


    # ~~~~~~~~~~~~~~ "full" blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # We know these blocks are guaranteed to be "full", so we don't need to
    # apply mask_mod to them - only score_mod
    if HAS_FULL_BLOCKS:
        kv_indices = FULL_KV_IDX + sparse_hz_offset * SPARSE_KV_BLOCK_CNT
        kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_hz_offset)
        indices_idx = block_n_start // SPARSE_KV_MULTIPLE
        off_n_block_in_sparse = block_n_start % SPARSE_KV_MULTIPLE
        off_n = tl.load(kv_indices + indices_idx) * SPARSE_KV_BLOCK_SIZE + off_n_block_in_sparse * BLOCK_N

        # last valid block according to sparse mask
        block_n_last_valid = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))

        K_block_ptr = tl.make_block_ptr(
            base=K + k_offset,
            shape=(QK_HEAD_DIM, KV_LEN), # (d, N)
            strides=(stride_kk, stride_kn),
            offsets=(0, off_n),
            block_shape=(QK_HEAD_DIM, BLOCK_N),
            order=(0, 1)
        )
        V_block_ptr = tl.make_block_ptr(
            base=V + v_offset,
            shape=(KV_LEN, V_HEAD_DIM),
            strides=(stride_vn, stride_vk),
            offsets=(off_n, 0),
            block_shape=(BLOCK_N, V_HEAD_DIM),
            order=(1, 0)
        )
        offs_n = tl.arange(0, BLOCK_N) + off_n

        acc, l_i, m_i = forward_inner(
            {{gen_argdefs()}},
            q, K_block_ptr, V_block_ptr, Q_LEN, KV_LEN,
            # accumulatd values
            acc, l_i, m_i,
            #offsets
            off_z, offs_hq[:, None], offs_m[:, None], offs_n[None, :],
            #block sparse data
            kv_indices, kv_num_blocks,
            block_n_start, block_n_end if block_n_end <= block_n_last_valid else block_n_last_valid,
            MATMUL_PRECISION,
            IS_FULL_BLOCKS=True,
        )

    m_offset = off_t * stride_mt + off_z * stride_mz
    l_offset = off_t * stride_lt + off_z * stride_lz

    M_block_ptr = tl.make_block_ptr(
        base=M + m_offset,
        shape=(G, Q_LEN), # (G, M)
        strides=(stride_mh, stride_mm),
        offsets=(off_hkv*G, 0),
        block_shape=(G, BLOCK_M_PER_HQ),
        order=(1, 0)
    )
    L_block_ptr = tl.make_block_ptr(
        base=L + l_offset,
        shape=(G, Q_LEN), # (G, M)
        strides=(stride_lh, stride_lm),
        offsets=(off_hkv*G, 0),
        block_shape=(G, BLOCK_M_PER_HQ),
        order=(1, 0)
    )

    # Store output, logsumexp and rowmax for cross CTA reduction. (all in float32, even when input data are in fp16)
    m_i = m_i.reshape(G, BLOCK_M_PER_HQ)
    l_i = l_i.reshape(G, BLOCK_M_PER_HQ)
    if SAFE_M_BOUNDARY:
        tl.store(M_block_ptr, m_i)
        tl.store(L_block_ptr, l_i)
    else:
        tl.store(M_block_ptr, m_i, boundary_check=(1,))
        tl.store(L_block_ptr, l_i, boundary_check=(1,))

    # -- store output
    idx_z = off_z
    idx_t = off_t
    idx_hq = off_hkv*G + off_g[:, None, None]
    idx_m = off_m[None, :, None]
    idx_d = offs_vd[None, None, :]

    mask = (idx_m < Q_LEN)
    acc = acc.reshape(G, BLOCK_M_PER_HQ, V_HEAD_DIM)
    {{store_output(("idx_z", "idx_t", "idx_hq", "idx_m", "idx_d"), "acc", "mask")}}
 """
    + compute_forward_inner
    + compute_next_offset_func
    + compute_forward_block_mn,
)
289
+
290
+
291
def get_split_k(B: int, H: int, Mk: int, SM: int = 128) -> int:
    """Heuristic (from xFormers) for the number of KV splits.

    Distributes the ``SM`` streaming multiprocessors over the ``B * H``
    (batch, head) programs so every SM gets at least one block, and always
    returns at least one split. ``Mk`` is accepted but currently unused.
    """
    batch_heads = max(B * H, 1)  # NOTE: Handle B*h=0 case
    return max(SM // batch_heads, 1)
298
+
299
+
300
def _get_decoding_default_config(key) -> Tuple[int, int, int]:
    """Choose a default (BLOCK_N, num_warps, num_stages) config for decoding.

    On devices with compute capability >= 9.0, three pipeline stages are used
    unless the key tensor is float32 with head_dim > 128; older devices and
    that heavy fp32 case fall back to a single stage.
    """
    base_cfg = (64, 2, 1)
    if torch.cuda.get_device_capability() < (9, 0):
        return base_cfg
    # SM 9.0+: more stages, except the register-heavy wide-fp32 case.
    if key.get_dtype() == torch.float32 and key.get_size()[-1] > 128:
        return base_cfg
    return (64, 2, 3)
310
+
311
+
312
+ def create_flex_decoding_kernel(*args, **kwargs):
313
+ (
314
+ query,
315
+ key,
316
+ value,
317
+ block_mask,
318
+ scale,
319
+ kernel_options,
320
+ score_mod_subgraph,
321
+ mask_mod_subgraph,
322
+ score_mod_other_buffers,
323
+ mask_mod_other_buffers,
324
+ ) = args
325
+ (
326
+ kv_num_blocks,
327
+ kv_indices,
328
+ full_kv_num_blocks, # full_kv_num_blocks,
329
+ full_kv_indices, # full_kv_indices,
330
+ _, # q_num_blocks
331
+ _, # q_indices
332
+ _, # full_q_num_blocks,
333
+ _, # full_q_indices,
334
+ SPARSE_KV_BLOCK_SIZE,
335
+ _, # SPARSE_Q_BLOCK_SIZE,
336
+ _,
337
+ ) = block_mask
338
+
339
+ Bq, Hq, seq_len_q, qk_head_dim = query.get_size()
340
+ Bkv, Hkv, seq_len_kv, v_head_dim = value.get_size()
341
+ assert Bq == Bkv, "Batch dimension must match"
342
+ B = Bq
343
+ kernel_options = dict(kernel_options)
344
+
345
+ # TODO: Fix flex decoding non-divisible case!
346
+ if seq_len_q % 128 != 0 or seq_len_kv % 128 != 0:
347
+ kernel_options.setdefault("IS_DIVISIBLE", False)
348
+ else:
349
+ kernel_options.setdefault("IS_DIVISIBLE", True)
350
+
351
+ # Calculate GQA head sharing
352
+ gqa_shared_heads = Hq // Hkv
353
+ if not is_power_of_2(gqa_shared_heads):
354
+ raise ValueError(
355
+ "Number of shared query heads sharing the same KV head must be power of 2. "
356
+ )
357
+ kernel_options.setdefault("GQA_SHARED_HEADS", gqa_shared_heads)
358
+
359
+ # Determine if there are "full" blocks where we only need to apply score_mod, and can skip mask_mod
360
+ has_full_blocks = full_kv_num_blocks is not None
361
+ kernel_options.setdefault("HAS_FULL_BLOCKS", has_full_blocks)
362
+ if not has_full_blocks:
363
+ # Create a plackeholder full block list in case it is empty
364
+ full_kv_num_blocks, full_kv_indices = (
365
+ empty(0, device=query.get_device()) for _ in range(2)
366
+ )
367
+
368
+ (
369
+ query,
370
+ key,
371
+ value,
372
+ kv_num_blocks,
373
+ kv_indices,
374
+ full_kv_num_blocks,
375
+ full_kv_indices,
376
+ ) = maybe_realize(
377
+ [
378
+ query,
379
+ key,
380
+ value,
381
+ kv_num_blocks,
382
+ kv_indices,
383
+ full_kv_num_blocks,
384
+ full_kv_indices,
385
+ ]
386
+ )
387
+
388
+ choices: List[Any] = []
389
+ configs: List[Tuple[int, int, int]] = []
390
+ configs.append(_get_decoding_default_config(key))
391
+ # Note: max_autotune is not supported yet. Causes error in lowering the dynamic shape in reduction ops.
392
+ if config.max_autotune:
393
+ configs += [
394
+ (64, 2, 2),
395
+ (32, 2, 3),
396
+ (128, 2, 3),
397
+ ]
398
+ # TODO: fix autotuning.
399
+
400
+ kernel_options.setdefault("SM_SCALE", scale)
401
+ kernel_options.setdefault("SPLIT_KV", get_split_k(B, Hkv, seq_len_kv))
402
+ MAX_SPLIT_KV = kernel_options["SPLIT_KV"]
403
+
404
+ # create config dependent intermediate buffers
405
+ buf_ACC_shape = [B, MAX_SPLIT_KV, Hq, seq_len_q, v_head_dim]
406
+ buf_ML_shape = buf_ACC_shape[:-1]
407
+ buf_M = empty_strided(
408
+ buf_ML_shape,
409
+ None,
410
+ dtype=torch.float32, # The rowmax is always stored in fp32 regardless of the input dtype
411
+ device=query.get_device(),
412
+ )
413
+ buf_L = empty_strided(
414
+ buf_ML_shape,
415
+ None,
416
+ dtype=torch.float32, # The intermediate sumexp is always stored in fp32 regardless of the input dtype
417
+ device=query.get_device(),
418
+ )
419
+
420
+ layout_acc = FixedLayout(
421
+ query.get_device(),
422
+ torch.float32,
423
+ buf_ACC_shape,
424
+ FlexibleLayout.contiguous_strides(buf_ACC_shape),
425
+ )
426
+
427
+ kernel_options.setdefault("QK_HEAD_DIM", qk_head_dim)
428
+ kernel_options.setdefault("V_HEAD_DIM", v_head_dim)
429
+
430
+ kernel_options.setdefault(
431
+ "BLOCK_M",
432
+ (
433
+ # m
434
+ # if V.graph.sizevars.evaluate_expr(sympy.Lt(query.get_size()[-2], 0))
435
+ # else # Always use a BLOCK_M > 16 before Triton fix https://github.com/triton-lang/triton/pull/4061 is in pin
436
+ max(
437
+ next_power_of_2(
438
+ V.graph.sizevars.size_hint(
439
+ seq_len_q, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type]
440
+ )
441
+ * gqa_shared_heads
442
+ ),
443
+ 16,
444
+ )
445
+ ),
446
+ )
447
+
448
+ query = ir.ExternKernel.realize_input(query)
449
+ stride_b, stride_hq, stride_seq_len_q, stride_qk_head_dim = query.get_stride()
450
+
451
+ # Reshape query for GQA: [B, Hq, Mq, D] -> [B, Hkv, G, Mq, D]
452
+ gqa_query_shape = (B, Hkv, gqa_shared_heads, seq_len_q, qk_head_dim)
453
+ gqa_query_stride = (
454
+ stride_b,
455
+ stride_hq * gqa_shared_heads,
456
+ stride_hq,
457
+ stride_seq_len_q,
458
+ stride_qk_head_dim,
459
+ )
460
+ query = lowerings[aten.as_strided](query, gqa_query_shape, gqa_query_stride)
461
+
462
+ V.graph.sizevars.guard_leq(
463
+ seq_len_q * gqa_shared_heads, sympy.Integer(kernel_options["BLOCK_M"])
464
+ )
465
+
466
+ kernel_options.setdefault(
467
+ "SAFE_M_BOUNDARY",
468
+ ((seq_len_q * gqa_shared_heads) % kernel_options["BLOCK_M"]) == 0,
469
+ )
470
+ # TODO: This feels sketchy
471
+ kernel_options.setdefault("SAFE_N_BOUNDARY", True)
472
+
473
+ # Note, we don't need to pass in the captured buffers explicitly
474
+ # because they're implicitly added by the score_mod function
475
+ # We do need to explicitly pass it in for autotuning though.
476
+ for BLOCK_N, num_warps, num_stages in configs:
477
+ if SPARSE_KV_BLOCK_SIZE % BLOCK_N != 0:
478
+ continue
479
+
480
+ # Performance tuning
481
+ kernel_options.setdefault("BLOCK_N", BLOCK_N)
482
+ kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE)
483
+
484
+ # Work around https://github.com/pytorch/pytorch/issues/129625
485
+ if num_stages == 2:
486
+ continue
487
+ flex_decoding_template.maybe_append_choice(
488
+ choices=choices,
489
+ input_nodes=[
490
+ query,
491
+ key,
492
+ value,
493
+ buf_M,
494
+ buf_L,
495
+ kv_num_blocks,
496
+ kv_indices,
497
+ full_kv_num_blocks,
498
+ full_kv_indices,
499
+ ],
500
+ layout=layout_acc,
501
+ subgraphs=[
502
+ score_mod_subgraph,
503
+ mask_mod_subgraph,
504
+ ],
505
+ mutated_inputs=[buf_M, buf_L],
506
+ num_stages=num_stages,
507
+ num_warps=num_warps,
508
+ call_sizes=query.get_size(),
509
+ **kernel_options,
510
+ )
511
+
512
+ inputs_for_flex_decoding = (
513
+ [
514
+ query,
515
+ key,
516
+ value,
517
+ buf_M,
518
+ buf_L,
519
+ kv_num_blocks,
520
+ kv_indices,
521
+ full_kv_num_blocks,
522
+ full_kv_indices,
523
+ ]
524
+ + list(score_mod_other_buffers)
525
+ + list(mask_mod_other_buffers)
526
+ )
527
+
528
+ input_gen_fns = {
529
+ 5: create_num_blocks_fake_generator(kv_indices),
530
+ 6: create_indices_fake,
531
+ 7: create_num_blocks_fake_generator(full_kv_indices),
532
+ 8: create_indices_fake,
533
+ }
534
+
535
+ buf_ACC = autotune_select_algorithm(
536
+ "flex_decoding",
537
+ choices,
538
+ inputs_for_flex_decoding,
539
+ layout_acc,
540
+ input_gen_fns=input_gen_fns,
541
+ )
542
+
543
+ # Reduction
544
+
545
+ g_M = lowerings[aten.max](buf_M, dim=1, keepdim=True)[0]
546
+ # See [Note] Handle fully masked out rows:
547
+ # g_M Is the global max among split kv blocks.
548
+ masked_rows = lowerings[aten.eq](g_M, -float("inf"))
549
+ adj_M = lowerings[aten.sub](buf_M, g_M)
550
+ adj_M = lowerings[aten.where](masked_rows, 0, adj_M)
551
+ alpha = lowerings[aten.exp2](adj_M)
552
+
553
+ buf_L = lowerings[aten.mul](buf_L, alpha)
554
+ g_L = lowerings[aten.sum](buf_L, axis=1)
555
+ masked_rows_squeezed = lowerings[aten.squeeze](masked_rows, dim=1)
556
+ g_L = lowerings[aten.where](masked_rows_squeezed, 1.0, g_L)
557
+ logsumexp = lowerings[aten.log2](g_L)
558
+ logsumexp = lowerings[aten.add](logsumexp, lowerings[aten.squeeze](g_M, dim=1))
559
+
560
+ alpha_unseq = lowerings[aten.unsqueeze](alpha, 4)
561
+ buf_ACC = lowerings[aten.mul](buf_ACC, alpha_unseq)
562
+ output = lowerings[aten.sum](buf_ACC, axis=1)
563
+ L_unseq = lowerings[aten.unsqueeze](g_L, 3)
564
+ output = lowerings[aten.div](output, L_unseq)
565
+ output = lowerings[prims.convert_element_type](output, query.get_dtype())
566
+
567
+ return (
568
+ output,
569
+ logsumexp,
570
+ )
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm.py ADDED
@@ -0,0 +1,776 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import functools
3
+ import logging
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import torch
7
+ from torch._inductor.autoheuristic.autoheuristic import AutoHeuristicSelectAlgorithm
8
+ from torch._inductor.autoheuristic.autoheuristic_utils import (
9
+ AHContext,
10
+ context_add_strides,
11
+ context_add_using_tf32,
12
+ get_mixedmm_precondition,
13
+ mixed_mm_operations,
14
+ mm_operations,
15
+ )
16
+ from torch._inductor.codegen.cpp_gemm_template import CppPackedGemmTemplate
17
+ from torch._inductor.virtualized import V
18
+
19
+ from .. import config as inductor_config
20
+ from ..codegen.common import BackendFeature
21
+ from ..codegen.cuda.gemm_template import CUTLASS2xGemmTemplate, CUTLASS3xGemmTemplate
22
+ from ..codegen.rocm.ck_universal_gemm_template import CKGemmTemplate
23
+ from ..codegen.wrapper import WrapperCodeGen
24
+ from ..ir import FlexibleLayout, is_triton
25
+ from ..lowering import register_lowering
26
+ from ..select_algorithm import (
27
+ autotune_select_algorithm,
28
+ ExternKernelChoice,
29
+ NoValidChoicesError,
30
+ TritonTemplate,
31
+ )
32
+ from ..utils import (
33
+ get_gpu_shared_memory,
34
+ use_aten_gemm_kernels,
35
+ use_ck_template,
36
+ use_cpp_packed_gemm_template,
37
+ use_cutlass_template,
38
+ use_max_autotune,
39
+ use_triton_template,
40
+ )
41
+ from .mm_common import (
42
+ addmm_epilogue,
43
+ extra_mm_configs,
44
+ int8_mm_configs,
45
+ mixed_mm_configs,
46
+ mm_args,
47
+ mm_configs,
48
+ mm_grid,
49
+ mm_options,
50
+ triton_config,
51
+ )
52
+
53
+
54
+ log = logging.getLogger(__name__)
55
+ aten = torch.ops.aten
56
+
57
+ mm_template = TritonTemplate(
58
+ name="mm",
59
+ grid=mm_grid,
60
+ source=r"""
61
+ {{def_kernel("A", "B")}}
62
+ M = {{size("A", 0)}}
63
+ N = {{size("B", 1)}}
64
+ K = {{size("A", 1)}}
65
+ if M * N == 0:
66
+ # early exit due to zero-size input(s)
67
+ return
68
+ stride_am = {{stride("A", 0)}}
69
+ stride_ak = {{stride("A", 1)}}
70
+ stride_bk = {{stride("B", 0)}}
71
+ stride_bn = {{stride("B", 1)}}
72
+
73
+ # based on triton.ops.matmul
74
+ pid = tl.program_id(0)
75
+ grid_m = (M + BLOCK_M - 1) // BLOCK_M
76
+ grid_n = (N + BLOCK_N - 1) // BLOCK_N
77
+
78
+ # re-order program ID for better L2 performance
79
+ width = GROUP_M * grid_n
80
+ group_id = pid // width
81
+ group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
82
+ pid_m = group_id * GROUP_M + (pid % group_size)
83
+ pid_n = (pid % width) // (group_size)
84
+
85
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
86
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
87
+ if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
88
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
89
+ else:
90
+ ram = rm % M
91
+ if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
92
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
93
+ else:
94
+ rbn = rn % N
95
+ rk = tl.arange(0, BLOCK_K)
96
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
97
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
98
+
99
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
100
+ for k in range(K, 0, -BLOCK_K):
101
+ if EVEN_K:
102
+ a = tl.load(A)
103
+ b = tl.load(B)
104
+ else:
105
+ a = tl.load(A, mask=rk[None, :] < k, other=0.)
106
+ b = tl.load(B, mask=rk[:, None] < k, other=0.)
107
+ if B_PROLOGUE_CAST_TYPE is not None:
108
+ b = b.to(B_PROLOGUE_CAST_TYPE)
109
+ acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
110
+ A += BLOCK_K * stride_ak
111
+ B += BLOCK_K * stride_bk
112
+
113
+ # rematerialize rm and rn to save registers
114
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
115
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
116
+ idx_m = rm[:, None]
117
+ idx_n = rn[None, :]
118
+ mask = (idx_m < M) & (idx_n < N)
119
+
120
+ # inductor generates a suffix
121
+ {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
122
+ """,
123
+ )
124
+
125
# ATen fallback kernels. Each ExternKernelChoice wraps an eager-mode op so it
# can be autotuned against the generated Triton/CUTLASS/CK template choices.
aten_mm = ExternKernelChoice(torch.mm, "at::mm_out")

aten_addmm = ExternKernelChoice(
    torch.addmm, "at::addmm_out", op_overload=aten.addmm.default
)

aten__int_mm = ExternKernelChoice(torch._int_mm, "at::_int_mm")

# _sparse_semi_structured_mm has no out= variant in ATen.
aten__sparse_semi_structured_mm = ExternKernelChoice(
    torch._sparse_semi_structured_mm,
    "at::_sparse_semi_structured_mm",
    has_out_variant=False,
)
138
+
139
+
140
+ def _is_int8_mat(mat):
141
+ return mat.get_dtype() in (torch.int8, torch.uint8)
142
+
143
+
144
def bias_addmm(inp, mat1, mat2, *, out=None, alpha=1, beta=1):
    """
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood. There are a few shapes where this is slower,
    but they are rare.
    """
    # A broadcast (zero-stride) or single-row bias can be collapsed to its
    # first row, which triggers the 1D-bias cublasLt path.
    collapse_bias = inp.stride(0) == 0 or inp.size(0) == 1
    bias = inp[0] if collapse_bias else inp
    return torch.addmm(bias, mat1, mat2, out=out, alpha=alpha, beta=beta)
153
+
154
+
155
# Extern choice that routes addmm through bias_addmm (the 1D-bias fast path).
aten_bias_addmm = ExternKernelChoice(bias_addmm, None)
156
+
157
+
158
@register_lowering(aten.mm, type_promotion_kind=None)
def tuned_mm(mat1, mat2, *, layout=None):
    """Autotuned lowering for ``aten.mm``.

    Collects candidate implementations (ATen extern kernel, Triton template
    configs, CUTLASS, CK, CPP packed GEMM), optionally re-ranks them with
    AutoHeuristic, and hands them to ``autotune_select_algorithm``.
    Falls back to the ATen kernel when no (valid) choices remain and
    ``inductor_config.autotune_fallback_to_aten`` is set.
    """
    m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
    name = "mm"

    # Without max-autotune, give the ATen kernel a FlexibleLayout so the
    # output strides can still be padded/chosen later.
    aten_layout = layout
    if not use_max_autotune():
        aten_layout = FlexibleLayout(
            device=layout.device, dtype=layout.dtype, size=layout.size
        )

    # options to tune from
    choices = (
        [aten_mm.bind((mat1, mat2), aten_layout)] if use_aten_gemm_kernels() else []
    )
    static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout)
    if is_nonzero and use_triton_template(layout):
        for config in mm_configs(m, n, k):
            mm_template.maybe_append_choice(
                choices,
                input_nodes=(mat1, mat2),
                layout=layout,
                **mm_options(config, m, n, k, layout),
            )
    # CUTLASS needs fully static shapes; CK and CPP templates have their own gates.
    if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k):
        CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(choices, layout, [mat1, mat2])

    if is_nonzero and use_ck_template(layout, m, n, k):
        CKGemmTemplate.add_ck_gemm_choices(choices, layout, [mat1, mat2])

    if use_cpp_packed_gemm_template(layout, mat1, mat2):
        CppPackedGemmTemplate.add_choices(
            choices,
            layout,
            [mat1, mat2],
        )

    input_nodes = [mat1, mat2]
    if (
        is_nonzero
        and use_triton_template(layout)
        and torch._inductor.config.run_autoheuristic(name)
        and is_triton(mat1)
    ):
        always_included = []
        if use_aten_gemm_kernels():
            always_included.append("extern_mm")
        # Remember the boundary so the extra configs can be dropped again if
        # AutoHeuristic returns nothing.
        num_choices_before_extra_configs = len(choices)
        for config in extra_mm_configs(m, n, k):
            mm_template.maybe_append_choice(
                choices,
                input_nodes=(mat1, mat2),
                layout=layout,
                **mm_options(config, m, n, k, layout),
            )

        # using AutoHeuristic for ranking
        ah_choices = mm_autoheuristic(
            mat1,
            mat2,
            m,
            n,
            k,
            choices,
            name,
            input_nodes,
            mm_operations(),
            None,
            top_k=10,
            always_included=always_included,
        )
        if not torch._inductor.config.collect_autoheuristic(name):
            # if we are collecting data, we do not want to modify choices
            if ah_choices is not None and len(ah_choices) > 0:
                # the order in which autoheuristic returns choices is not the same
                # as the order of choices, which affects things like epilogue fusion.
                # once epilogue fusion benchmarks choices in sorted order, I think we can
                # just use the order returned by autoheuristic
                choices = [choice for choice in choices if choice in ah_choices]
            else:
                choices = choices[:num_choices_before_extra_configs]

    if (
        len(choices) == 0
        and not use_aten_gemm_kernels()
        and inductor_config.autotune_fallback_to_aten
    ):
        log.warning("No choices for GEMM, using ATen backend as fallback")
        return aten_mm.bind((mat1, mat2), aten_layout).output_node()

    try:
        return autotune_select_algorithm(name, choices, [mat1, mat2], layout)
    except NoValidChoicesError:
        if not inductor_config.autotune_fallback_to_aten:
            raise
        log.warning("All choices for GEMM were invalid, using ATen backend as fallback")
        return aten_mm.bind((mat1, mat2), aten_layout).output_node()
255
+
256
+
257
def _is_static_problem(inputs_tensors, layout):
    """Return ``(static_shape, nonzero)`` for the output *layout*.

    ``static_shape`` is True when every output dimension is statically known.
    ``nonzero`` is False only when some dimension is (statically) known to
    be zero, i.e. the GEMM is degenerate.
    """
    static_size = WrapperCodeGen.statically_known_list_of_ints_or_none(layout.size)
    if static_size is None:
        # Partly-dynamic shape: not static; degenerate only if some dim is
        # statically proven to be 0 (unknown dims count as nonzero).
        nonzero = all(
            WrapperCodeGen.statically_known_int_or_none(s) != 0 for s in layout.size
        )
        return False, nonzero
    numel = 1
    for dim in static_size:
        numel *= dim
    return True, numel > 0
276
+
277
+
278
@register_lowering(aten._int_mm, type_promotion_kind=None)
def tuned_int_mm(mat1, mat2, *, layout=None):
    """Autotuned lowering for ``aten._int_mm`` (int8 x int8 -> int32 GEMM).

    Builds candidate choices from the ATen extern kernel, CUTLASS (static
    shapes only) and the Triton mm template, then autotunes among them.
    Falls back to the ATen kernel when no valid choice exists and
    ``inductor_config.autotune_fallback_to_aten`` is set.
    """
    m, n, k, layout, mat1, mat2 = mm_args(
        mat1, mat2, layout=layout, out_dtype=torch.int32
    )
    static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout)
    use_cutlass = static_shape and is_nonzero and use_cutlass_template(layout, m, n, k)

    choices = (
        [aten__int_mm.bind((mat1, mat2), layout)] if use_aten_gemm_kernels() else []
    )

    # TODO: Re-enable eager mode implementation once cuBLAS is fixed
    # (the ATen choice is dropped whenever a template backend is usable).
    if use_cutlass or use_triton_template(layout, enable_int32=True):
        choices = []

    if use_cutlass:
        CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(
            choices, layout, [mat1, mat2], fuseable=True, non_fuseable=True
        )
    if is_nonzero and use_triton_template(layout, enable_int32=True):
        for config in int8_mm_configs(m, n, k):
            mm_template.maybe_append_choice(
                choices,
                input_nodes=(mat1, mat2),
                layout=layout,
                **mm_options(config, m, n, k, layout),
            )
    if len(choices) == 0:
        # Fix: log message previously misspelled "available" as "avaialbe".
        log.warning(
            "No choices for integer GEMM available using configured backends, using ATen backend as fallback"
        )
        choices = [aten__int_mm.bind((mat1, mat2), layout)]

    try:
        return autotune_select_algorithm("int_mm", choices, [mat1, mat2], layout)
    except NoValidChoicesError:
        if not inductor_config.autotune_fallback_to_aten:
            raise
        log.warning("All choices for GEMM were invalid, using ATen backend as fallback")
        choices = [aten__int_mm.bind((mat1, mat2), layout)]
        return autotune_select_algorithm("int_mm", choices, [mat1, mat2], layout)
320
+
321
+
322
@register_lowering(aten.addmm, type_promotion_kind=None)
def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
    """Autotuned lowering for ``aten.addmm``: beta * inp + alpha * (mat1 @ mat2).

    When not autotuning (or the problem is degenerate) only the ATen kernel
    is considered; otherwise Triton/CUTLASS/CK/CPP template choices and the
    cublasLt bias fast path are added before autotuning.
    """
    ordered_kwargs_for_cpp_kernel = ("beta", "alpha")
    m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout)
    static_shape, is_nonzero = _is_static_problem([inp, mat1, mat2], layout)
    if (not is_nonzero) or (not use_max_autotune()):
        # Use a FlexibleLayout if we are not autotuning.
        # This allows padding strides for the output.
        from torch._inductor.ir import FixedLayout, FlexibleLayout

        if isinstance(layout, FixedLayout):
            layout = FlexibleLayout(
                device=layout.device, dtype=layout.dtype, size=layout.size
            )
        choices = (
            [
                aten_addmm.bind(
                    (inp, mat1, mat2),
                    layout,
                    alpha=alpha,
                    beta=beta,
                )
            ]
            if use_aten_gemm_kernels()
            else []
        )
        return autotune_select_algorithm("addmm", choices, [inp, mat1, mat2], layout)

    choices = (
        [
            aten_addmm.bind(
                (inp_expanded, mat1, mat2),
                layout,
                alpha=alpha,
                beta=beta,
            )
        ]
        if use_aten_gemm_kernels()
        else []
    )

    if (
        use_aten_gemm_kernels()
        and inp_expanded.get_stride()[0] == 0
        and inp_expanded.get_device().type == "cuda"
        and inductor_config.triton.autotune_cublasLt
    ):
        # unexpand inp to make sure fused addmm from cublasLt is used
        choices.insert(
            0,
            aten_bias_addmm.bind(
                (inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta
            ),
        )

    if is_nonzero and use_triton_template(layout):
        for config in mm_configs(m, n, k):
            # prefix_args=1: the bias input is consumed by the epilogue, not
            # by the main GEMM loop.
            mm_template.maybe_append_choice(
                choices,
                input_nodes=(inp_expanded, mat1, mat2),
                layout=layout,
                **mm_options(config, m, n, k, layout),
                prefix_args=1,
                epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta),
            )

    if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k):
        # Filter out a known cause of CUDA illegal memory access errors
        # broadcasting on the last dim of the bias term seems not to be working
        # in the linear GEMM epilogue used by addmm.
        if (
            WrapperCodeGen.statically_known_int_or_none(inp_expanded.layout.stride[-1])
            != 0
        ):
            CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(
                choices,
                layout,
                [mat1, mat2, inp_expanded],
                alpha=alpha,
                beta=beta,
            )

    if is_nonzero and use_ck_template(layout, m, n, k):
        CKGemmTemplate.add_ck_gemm_choices(
            choices,
            layout,
            [mat1, mat2, inp_expanded],
            alpha=alpha,
            beta=beta,
        )

    if use_cpp_packed_gemm_template(layout, mat1, mat2):
        CppPackedGemmTemplate.add_choices(
            choices,
            layout,
            [inp_expanded, mat1, mat2],
            alpha=alpha,
            beta=beta,
            has_bias=True,
        )

    add_aten_fallback = False
    if len(choices) == 0:
        log.warning("No choices for GEMM, using ATen backend as fallback")
        add_aten_fallback = True

    if add_aten_fallback:
        choices.append(
            aten_addmm.bind(
                (inp_expanded, mat1, mat2),
                layout,
                ordered_kwargs_for_cpp_kernel,
                alpha=alpha,
                beta=beta,
            )
        )

        if (
            inp_expanded.get_stride()[0] == 0
            and inp_expanded.get_device().type == "cuda"
            and inductor_config.triton.autotune_cublasLt
        ):
            # unexpand inp to make sure fused addmm from cublasLt is used
            choices.insert(
                0,
                aten_bias_addmm.bind(
                    (inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta
                ),
            )
    try:
        return autotune_select_algorithm(
            "addmm", choices, [inp_expanded, mat1, mat2], layout
        )
    except NoValidChoicesError:
        if not inductor_config.autotune_fallback_to_aten:
            raise
        log.warning("All choices for GEMM were invalid, using ATen backend as fallback")
        fallback_choice = aten_addmm.bind(
            (inp, mat1, mat2),
            layout,
            ordered_kwargs_for_cpp_kernel,
            alpha=alpha,
            beta=beta,
        )
        return fallback_choice.output_node()
467
+
468
+
469
@register_lowering(aten._sparse_semi_structured_mm, type_promotion_kind=None)
def tuned_sparse_semi_structured_mm(
    mat1, mat1_meta, mat2, *, out_dtype=None, layout=None
):
    """Autotuned lowering for 2:4 sparse ``_sparse_semi_structured_mm``.

    mat1 stores the compressed sparse operand (half the columns), mat1_meta
    its metadata; hence the ``2 * k1 == k2`` guard below.
    """
    from torch._inductor.select_algorithm import realize_inputs

    mat1, mat1_meta, mat2 = realize_inputs(mat1, mat1_meta, mat2)
    m1, k1 = mat1.get_size()
    m2, _ = mat1_meta.get_size()
    k2, n = mat2.get_size()
    m = V.graph.sizevars.guard_equals(m1, m2)
    # Compressed K is half the logical K of the dense operand.
    k = V.graph.sizevars.guard_equals(2 * k1, k2)

    if layout is None:
        from torch._inductor.ir import FixedLayout

        # Contiguous [m, n] output in mat2's dtype unless overridden.
        layout = FixedLayout(
            mat2.get_device(),
            out_dtype if out_dtype else mat2.get_dtype(),
            [m, n],
            [n, 1],
        )
    else:
        assert out_dtype is None, "out_dtype is ignored if layout is specified."

    choices = (
        [
            aten__sparse_semi_structured_mm.bind(
                (mat1, mat1_meta, mat2), layout, out_dtype=out_dtype
            )
        ]
        if use_aten_gemm_kernels()
        else []
    )

    if m * n != 0 and use_cutlass_template(layout, m, n, k):
        CUTLASS2xGemmTemplate.add_cutlass_gemm_choices(
            choices, layout, [mat1, mat2, mat1_meta], fuseable=True, non_fuseable=True
        )

    return autotune_select_algorithm(
        "sparse_semi_structured_mm", choices, [mat1, mat1_meta, mat2], layout
    )
512
+
513
+
514
def fallback_mixed_mm(mat1, mat2, *, out):
    """Eager fallback for mixed-dtype mm: upcast mat2 to mat1's dtype, then mm."""
    upcast_mat2 = mat2.to(mat1.dtype)
    return torch.mm(mat1, upcast_mat2, out=out)
516
+
517
+
518
# Extern choice wrapping the upcast-then-mm fallback for mixed-dtype matmul.
aten_fallback_mixed_mm = ExternKernelChoice(fallback_mixed_mm, None)
519
+
520
+
521
@functools.lru_cache(None)
def _is_sm7x_or_older_gpu(index: Optional[int]) -> bool:
    """True when the CUDA device at *index* (default 0) has compute capability <= 7."""
    device_index = index if index else 0
    return torch.cuda.get_device_properties(device_index).major <= 7
525
+
526
+
527
def dims_are_int(dims):
    """Return True when every entry of *dims* is an ``int`` (i.e. no symbolic sizes)."""
    for dim in dims:
        if not isinstance(dim, int):
            return False
    return True
529
+
530
+
531
def try_heuristic(m, n, k, choices, mat1, mat2, mat2_dtype, layout):
    """Hand-tuned mixed-mm config picker for fp16 inputs on A100.

    Returns a triton config for the (m, n, k) bucket, or None when the
    heuristic does not apply (symbolic dims, non-fp16, non-A100 hardware,
    or shapes outside the tuned buckets).
    """
    m, n, k = get_size_hints(mat1, mat2, m, n, k)
    if not dims_are_int([m, n, k]):
        return None

    if mat1.dtype != torch.float16:
        return None

    # only use heuristic if we are running on an A100
    # torch.cuda.get_device_capability() >= (8, 0) returns true for A10G
    # which does not have enough shared memory for one of the configs
    if (
        not torch.cuda.get_device_capability() >= (8, 0)
    ) or get_gpu_shared_memory() != 166912:
        return None

    # Degenerate single-row GEMMs need 16-aligned n and k for these configs.
    if m == 1 and (n % 16 != 0 or k % 16 != 0):
        return None

    # Bucketed by m; all buckets require large n and k.
    if m <= 16 and n >= 4096 and k >= 4096:
        return triton_config(
            BLOCK_M=16,
            BLOCK_N=64,
            BLOCK_K=128,
            num_stages=5,
            num_warps=4,
        )
    elif m > 16 and m <= 32 and n >= 4096 and k >= 4096:
        return triton_config(
            BLOCK_M=32,
            BLOCK_N=32,
            BLOCK_K=128,
            num_stages=5,
            num_warps=4,
        )
    elif m > 32 and m <= 64 and n >= 4096 and k >= 4096:
        return triton_config(
            BLOCK_M=64,
            BLOCK_N=32,
            BLOCK_K=128,
            num_stages=5,
            num_warps=4,
        )
    return None
575
+
576
+
577
def mm_autoheuristic(
    mat1,
    mat2,
    m,
    n,
    k,
    choices,
    name,
    input_nodes,
    ops,
    precondition,
    top_k: Optional[int] = None,
    always_included=None,
):
    """Rank GEMM *choices* with AutoHeuristic.

    Builds a feature context (sizes, dtypes, strides, contiguity) and asks
    AutoHeuristicSelectAlgorithm either for the single best choice or, when
    *top_k* is given, for the top-k choices (optionally forcing
    *always_included* entries in). Returns None when any dim is symbolic.
    """
    m, n, k = get_size_hints(mat1, mat2, m, n, k)
    if not dims_are_int([m, n, k]):
        return None
    mat1_stride, mat2_stride = get_size_hints_strides(mat1, mat2)

    def get_context(m, k, n, mat1, mat2, mat1_stride, mat2_stride):
        # Feature vector describing this GEMM for the learned heuristic.
        context = AHContext()
        context.add_feature("m", m)
        context.add_feature("k", k)
        context.add_feature("n", n)
        context.add_feature("mat1_dtype", mat1.layout.dtype, is_categorical=True)
        context.add_feature("mat2_dtype", mat2.layout.dtype, is_categorical=True)
        context_add_strides(context, "mat1", mat1_stride)
        context_add_strides(context, "mat2", mat2_stride)
        context.add_feature(
            "mat1_iscontig", mat1.layout.is_contiguous(), is_categorical=True
        )
        context.add_feature(
            "mat2_iscontig", mat2.layout.is_contiguous(), is_categorical=True
        )
        if name == "mm":
            # for mixed_mm, we only consider fp16
            context_add_using_tf32(context, mat1.layout.dtype)
        return context

    def fallback():
        # No heuristic decision available.
        return None

    context = get_context(m, k, n, mat1, mat2, mat1_stride, mat2_stride)
    autoheuristic = AutoHeuristicSelectAlgorithm(
        fallback=fallback,
        choices=choices,
        input_nodes=input_nodes,
        context=context,
        name=name,
        augment_context=ops,
        precondition=precondition,
    )

    if top_k is not None:
        # TODO: is there a cleaner way to ensure aten.mm is always included?
        return autoheuristic.get_top_k_choices_caller(
            top_k, always_included=always_included
        )

    return autoheuristic.get_choice_caller()
637
+
638
+
639
def get_size_hints(mat1, mat2, m, n, k):
    """Replace symbolic m/n/k with concrete integer size hints.

    m and k come from mat1's size hints, k and n from mat2's.
    NOTE(review): when n is symbolic but m/k are not, the second branch still
    re-derives k from mat2 — presumably mat1 and mat2 agree on k, but the
    mat2 hint wins here; verify this is intentional.
    """
    if not isinstance(m, int) or not isinstance(k, int):
        (m, k) = V.graph.sizevars.size_hints(
            mat1.get_size(),
            fallback=torch._inductor.config.unbacked_symint_fallback,
        )

    if not isinstance(n, int) or not isinstance(k, int):
        (k, n) = V.graph.sizevars.size_hints(
            mat2.get_size(),
            fallback=torch._inductor.config.unbacked_symint_fallback,
        )
    return m, n, k
652
+
653
+
654
def get_size_hints_strides(mat1, mat2):
    """Return (mat1_strides, mat2_strides) with symbolic strides hinted to ints.

    NOTE(review): each ``stride`` here is a *list* of per-dim strides, so the
    ``isinstance(stride, int)`` guard below is effectively always False and
    ``size_hints`` is always called — looks like the intent was a per-element
    check; confirm before relying on the guard.
    """
    mat1_stride = mat1.layout.stride
    mat2_stride = mat2.layout.stride
    strides = [mat1_stride, mat2_stride]
    strides_hints = []
    for stride in strides:
        if not isinstance(stride, int):
            stride = V.graph.sizevars.size_hints(
                stride,
                fallback=torch._inductor.config.unbacked_symint_fallback,
            )
        strides_hints.append(stride)
    return strides_hints[0], strides_hints[1]
667
+
668
+
669
def tuned_mixed_mm(mat1, mat2, mat2_dtype):
    """Autotuned lowering for mixed-dtype mm (mat2 upcast to mat1's dtype).

    Depending on ``inductor_config.mixed_mm_choice`` ("default"/"aten"/
    "triton"/"heuristic") and hardware gates, tunes among the upcast-and-mm
    fallback, Triton template configs and CUTLASS choices, optionally ranked
    by AutoHeuristic.
    """
    m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None)
    static_shape, is_nonzero = _is_static_problem([mat1, mat2], layout)

    fallback = aten_fallback_mixed_mm.bind((mat1, mat2), layout)

    choices = [fallback]

    # can't use triton kernel unless one of these is true or if running on v100 (numerical issues)
    skip_triton = (
        (
            mat1.layout.dtype != torch.float32
            and not (mat2.layout.is_contiguous() or mat2.layout.is_transposed())
        )
        or _is_sm7x_or_older_gpu(layout.device.index)
        or inductor_config.mixed_mm_choice == "aten"
        or not V.graph.has_feature(layout.device, BackendFeature.TRITON_TEMPLATES)
        or (
            mat1.layout.dtype == torch.float32 and torch.backends.cuda.matmul.allow_tf32
        )
        or (mat1.layout.dtype == torch.bfloat16 and mat2.layout.dtype == torch.uint8)
    )

    # "triton" forces template-only tuning: drop the ATen fallback.
    if inductor_config.mixed_mm_choice == "triton":
        choices = []

    if not skip_triton:
        # Template option to cast B tiles before the dot, e.g. "tl.uint8".
        b_prologue_cast_type = f"tl.{mat2_dtype}".replace("torch.", "")
        if static_shape and inductor_config.mixed_mm_choice == "heuristic":
            choices = []
            config = try_heuristic(m, n, k, choices, mat1, mat2, mat2_dtype, layout)
            if config is not None:
                mm_template.maybe_append_choice(
                    choices,
                    input_nodes=(mat1, mat2),
                    layout=layout,
                    **mm_options(config, m, n, k, layout, b_prologue_cast_type),
                )
            choices.append(fallback)

        has_int8_tensor = _is_int8_mat(mat1) or _is_int8_mat(mat2)
        for config in mixed_mm_configs(m, n, k, has_int8_tensor=has_int8_tensor):
            mm_template.maybe_append_choice(
                choices,
                input_nodes=(mat1, mat2),
                layout=layout,
                **mm_options(config, m, n, k, layout, b_prologue_cast_type),
            )

    if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k):
        CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(
            choices, layout, [mat1, mat2], fuseable=True, non_fuseable=True
        )
        CUTLASS2xGemmTemplate.add_cutlass_gemm_choices(
            choices, layout, [mat1, mat2], fuseable=True, non_fuseable=True
        )

    # Never leave the choice list empty when triton was skipped.
    if skip_triton and not choices:
        choices = [fallback]

    name = "mixed_mm"
    input_nodes = [mat1, mat2]
    if torch._inductor.config.run_autoheuristic(name):
        choice = mm_autoheuristic(
            mat1,
            mat2,
            m,
            n,
            k,
            choices,
            name,
            input_nodes,
            mixed_mm_operations(),
            get_mixedmm_precondition,
        )
        if (
            not skip_triton
            and inductor_config.mixed_mm_choice == "heuristic"
            and choice is not None
        ):
            # Heuristic winner goes first so it is preferred by the tuner.
            choices.insert(0, choice)
    return autotune_select_algorithm(name, choices, input_nodes, layout)
751
+
752
+
753
# This op is a special case of the int_mm op which we use based on the pattern
# _int_mm -> mul (defined in ../fx_passes/post_grad.py) in order to prevent
# realization of the int32 _int_mm output by forcing fusion with the mul op.
# This is only used when config.force_fuse_int_mm_with_mul = True
def tuned_fused_int_mm_mul(mat1, mat2, mat3, out_dtype, *, layout=None):
    """Autotuned lowering for fused ``_int_mm(mat1, mat2) * mat3``.

    Only Triton template choices are generated (no ATen fallback); the mul
    is applied as the template epilogue via ``suffix_args=1``.
    """
    # Default output dtype follows int32 promotion against mat3's dtype.
    out_dtype = (
        torch.promote_types(mat3.get_dtype(), torch.int32)
        if out_dtype is None
        else out_dtype
    )
    m, n, k, layout, mat1, mat2, mat3 = mm_args(
        mat1, mat2, mat3, layout=layout, out_dtype=out_dtype
    )
    choices: List[Dict[Any, Any]] = []
    for config in int8_mm_configs(m, n, k):
        # Accumulate in int32; mat3 is the epilogue (suffix) operand.
        mm_template.maybe_append_choice(
            choices,
            input_nodes=(mat1, mat2, mat3),
            layout=layout,
            **dict(mm_options(config, m, n, k, layout), ACC_TYPE="tl.int32"),
            suffix_args=1,
            epilogue_fn=V.ops.mul,
        )
    return autotune_select_algorithm("int_mm", choices, [mat1, mat2, mat3], layout)
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_common.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import functools
3
+ import itertools
4
+ import logging
5
+ from typing import cast, List, Tuple
6
+
7
+ import sympy
8
+
9
+ import torch
10
+ from torch._inductor.select_algorithm import realize_inputs
11
+ from torch._inductor.virtualized import V
12
+
13
+ from .. import config as inductor_config
14
+ from ..runtime.runtime_utils import next_power_of_2
15
+ from ..utils import ceildiv as cdiv
16
+
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+
21
def triton_config(num_stages, num_warps, **kwargs):
    """Build a ``triton.Config`` from block-size kwargs plus stage/warp counts."""
    # Imported lazily so this module can be imported without triton installed.
    from triton import Config

    cfg = Config(kwargs, num_stages=num_stages, num_warps=num_warps)
    return cfg
25
+
26
+
27
def filtered_configs(
    m: int,
    n: int,
    k: int,
    configs: List[Tuple[int, int, int, int, int]],
    has_int8_tensor=False,
):
    """Heuristic to shrink configs when they are bigger than the input size"""

    # tl.dot needs at least 16 per dimension, so never tile below 16 in M/N.
    min_block_size = 16
    # block_k=16 seems to be causing issues
    # see: https://github.com/triton-lang/triton/issues/2156#issuecomment-1695897424
    min_block_size_k = 32 if has_int8_tensor else 16
    # Round each problem dimension up to a power of two (using a size hint for
    # unbacked/dynamic symbols) so block sizes can be clamped against it below.
    m = max(
        next_power_of_2(
            V.graph.sizevars.size_hint(
                m, fallback=torch._inductor.config.unbacked_symint_fallback  # type: ignore[arg-type]
            )
        ),
        min_block_size,
    )
    n = max(
        next_power_of_2(
            V.graph.sizevars.size_hint(
                n, fallback=torch._inductor.config.unbacked_symint_fallback  # type: ignore[arg-type]
            )
        ),
        min_block_size,
    )
    k = max(
        next_power_of_2(
            V.graph.sizevars.size_hint(
                k, fallback=torch._inductor.config.unbacked_symint_fallback  # type: ignore[arg-type]
            )
        ),
        min_block_size_k,
    )
    # Deduplicate after clamping: distinct input configs can collapse to the
    # same shrunken config for small problem sizes.
    used = set()
    for block_m, block_n, block_k, num_stages, num_warps in configs:
        # shrink configs for small sizes
        block_m = max(min(block_m, m), min_block_size)
        block_n = max(min(block_n, n), min_block_size)
        block_k = max(min(block_k, k), min_block_size_k)
        # each warp computes 16x16 tile = 256
        num_warps = min(num_warps, block_m * block_n // 256)
        if torch.version.hip:
            # On ROCm additionally sweep matrix_instr_nonkdim
            # (0 = leave the choice to the compiler, 16 = force 16x16).
            for matrix_instr_nonkdim in [0, 16]:
                if matrix_instr_nonkdim != 0 and (
                    block_m % matrix_instr_nonkdim != 0
                    or block_n % matrix_instr_nonkdim != 0
                ):
                    # block_m and block_n must be a multiple of matrix_instr_nonkdim
                    continue
                if (
                    block_m,
                    block_n,
                    block_k,
                    num_stages,
                    num_warps,
                    matrix_instr_nonkdim,
                ) not in used:
                    used.add(
                        (
                            block_m,
                            block_n,
                            block_k,
                            num_stages,
                            num_warps,
                            matrix_instr_nonkdim,
                        )
                    )
                    yield triton_config(
                        BLOCK_M=block_m,
                        BLOCK_N=block_n,
                        BLOCK_K=block_k,
                        num_stages=num_stages,
                        num_warps=num_warps,
                        matrix_instr_nonkdim=matrix_instr_nonkdim,
                    )
        else:
            if (block_m, block_n, block_k, num_stages, num_warps, 0) not in used:
                used.add((block_m, block_n, block_k, num_stages, num_warps, 0))
                yield triton_config(
                    BLOCK_M=block_m,
                    BLOCK_N=block_n,
                    BLOCK_K=block_k,
                    num_stages=num_stages,
                    num_warps=num_warps,
                )
116
+
117
+
118
# List of dictionaries to store the kernel configs. Configs that evaluate to true
# will be utilised on the target platform. The configs are as follows:
# (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps)
# When max_autotune_gemm_search_space == "EXHAUSTIVE" the curated list is
# replaced by the full cartesian product of block sizes, stages and warps.
mm_kernel_configs = (
    [
        {"config": (32, 32, 16, 1, 2), "cond": True},
        {"config": (32, 32, 128, 2, 4), "cond": torch.version.hip is None},
        {"config": (32, 64, 32, 5, 8), "cond": True},
        {"config": (64, 32, 32, 5, 8), "cond": True},
        {"config": (64, 32, 128, 5, 4), "cond": True},
        {"config": (64, 64, 16, 2, 4), "cond": True},
        {"config": (64, 64, 32, 2, 4), "cond": True},
        {"config": (64, 64, 64, 3, 8), "cond": True},
        {"config": (64, 64, 128, 5, 4), "cond": True},
        {"config": (64, 128, 32, 3, 4), "cond": True},
        {"config": (64, 128, 32, 4, 8), "cond": True},
        {"config": (64, 128, 64, 3, 4), "cond": True},
        {"config": (64, 128, 128, 4, 4), "cond": True},
        {"config": (128, 64, 32, 3, 4), "cond": True},
        {"config": (128, 64, 32, 4, 8), "cond": True},
        {"config": (128, 128, 32, 2, 8), "cond": True},
        {"config": (128, 128, 32, 3, 4), "cond": True},
        {"config": (128, 128, 64, 3, 4), "cond": True},
        {"config": (128, 128, 64, 5, 8), "cond": True},
    ]
    if inductor_config.max_autotune_gemm_search_space != "EXHAUSTIVE"
    else [
        {"config": (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps), "cond": True}
        for BLOCK_M, BLOCK_N, BLOCK_K in itertools.product(
            [16, 32, 64, 128, 256], repeat=3
        )
        for num_stages in [1, 2, 3, 4, 5]
        for num_warps in [2, 4, 8]
    ]
)
153
+
154
+ # these are only used in tuned_mm when AutoHeuristic is enabled
155
+ # the idea is that when AutoHeuristic collects data to learn a heuristic, more configs are autotuned
156
+ # when the learned heuristic is used, the learned heuristic reduces the number of configs down to 10
157
+ # which saves compilation time (since less configs are autotuned) and potentially increase performance
158
# because the learned heuristic might predict a config that is not part of mm_configs
159
# Additional (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps) candidates,
# autotuned only while AutoHeuristic is enabled (see the comment block above).
extra_mm_kernel_configs = [
    {"config": (16, 32, 16, 3, 2), "cond": True},
    {"config": (16, 32, 32, 4, 2), "cond": True},
    {"config": (16, 32, 32, 5, 2), "cond": True},
    {"config": (64, 64, 128, 3, 4), "cond": True},
    {"config": (128, 64, 32, 2, 2), "cond": True},
    {"config": (128, 64, 64, 3, 8), "cond": True},
    {"config": (128, 64, 128, 4, 8), "cond": True},
    {"config": (128, 128, 32, 4, 4), "cond": True},
    {"config": (128, 128, 64, 3, 8), "cond": True},
    {"config": (128, 128, 64, 5, 4), "cond": True},
]
171
+
172
# (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps) candidates for int8 matmuls.
int8_mm_kernel_configs = [
    {"config": (64, 64, 32, 2, 4), "cond": True},
    {"config": (64, 128, 32, 3, 4), "cond": True},
    {"config": (128, 64, 32, 3, 4), "cond": True},
    {"config": (64, 128, 32, 4, 8), "cond": True},
    {"config": (128, 64, 32, 4, 8), "cond": True},
    {"config": (64, 32, 32, 5, 8), "cond": True},
    {"config": (32, 64, 32, 5, 8), "cond": True},
    {"config": (128, 128, 32, 2, 8), "cond": True},
    {"config": (64, 64, 64, 3, 8), "cond": True},
    # {"config": (32, 32, 128, 2, 4), "cond": True},
    # {"config": (64, 64, 16, 2, 4), "cond": True},
    # {"config": (32, 32, 16, 1, 2), "cond": True},
    # The largest tiles are enabled only off-ROCm.
    {"config": (128, 256, 128, 3, 8), "cond": torch.version.hip is None},
    {"config": (256, 128, 128, 3, 8), "cond": torch.version.hip is None},
]
188
+
189
# Mixed precision kernel configs for small sizes of m for mm's like (16, 8192) x (8192, 8192).
mixed_mm_kernel_configs_small_m = [
    {"config": (16, 128, 256, 3, 4), "cond": True},
    {"config": (16, 128, 256, 5, 8), "cond": True},
]

# In EXHAUSTIVE mode mm_kernel_configs already covers the full sweep, so the
# small-m extras are appended only for the default search space.
mixed_mm_kernel_configs = (
    mm_kernel_configs + mixed_mm_kernel_configs_small_m
    if inductor_config.max_autotune_gemm_search_space != "EXHAUSTIVE"
    else mm_kernel_configs
)
200
+
201
# (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps) candidates for scaled
# (fp8) matmuls: a hand-picked set of large tiles followed by a systematic
# small-M sweep (BLOCK_M in {16, 32}, BLOCK_N in {32, 64, 128, 256},
# BLOCK_K in {32, 64}) for num_stages 2..6.
scaled_mm_kernel_configs = [
    {"config": (128, 256, 32, 3, 8), "cond": True},
    {"config": (256, 128, 32, 3, 8), "cond": True},
    {"config": (256, 64, 32, 4, 4), "cond": True},
    {"config": (64, 256, 32, 4, 4), "cond": True},
    {"config": (128, 128, 32, 4, 4), "cond": True},
    {"config": (128, 64, 32, 4, 4), "cond": True},
    {"config": (64, 128, 32, 4, 4), "cond": True},
    {"config": (128, 32, 32, 4, 4), "cond": True},
    {"config": (64, 32, 32, 5, 2), "cond": True},
    {"config": (256, 128, 128, 3, 8), "cond": True},
    {"config": (256, 64, 128, 4, 4), "cond": True},
    {"config": (64, 256, 128, 4, 4), "cond": True},
    {"config": (128, 128, 128, 4, 4), "cond": True},
    {"config": (128, 64, 64, 4, 4), "cond": True},
    {"config": (64, 128, 64, 4, 4), "cond": True},
    {"config": (128, 32, 64, 4, 4), "cond": True},
    {"config": (64, 32, 64, 5, 2), "cond": True},
    {"config": (16, 32, 32, 2, 2), "cond": True},
    {"config": (16, 64, 32, 2, 2), "cond": True},
    {"config": (16, 128, 32, 2, 4), "cond": True},
    {"config": (16, 256, 32, 2, 4), "cond": True},
    {"config": (16, 32, 64, 2, 2), "cond": True},
    {"config": (16, 64, 64, 2, 2), "cond": True},
    {"config": (16, 128, 64, 2, 4), "cond": True},
    {"config": (16, 256, 64, 2, 4), "cond": True},
    {"config": (32, 32, 32, 2, 2), "cond": True},
    {"config": (32, 64, 32, 2, 2), "cond": True},
    {"config": (32, 128, 32, 2, 4), "cond": True},
    {"config": (32, 256, 32, 2, 4), "cond": True},
    {"config": (32, 32, 64, 2, 2), "cond": True},
    {"config": (32, 64, 64, 2, 2), "cond": True},
    {"config": (32, 128, 64, 2, 4), "cond": True},
    {"config": (32, 256, 64, 2, 4), "cond": True},
    {"config": (16, 32, 32, 3, 2), "cond": True},
    {"config": (16, 64, 32, 3, 2), "cond": True},
    {"config": (16, 128, 32, 3, 4), "cond": True},
    {"config": (16, 256, 32, 3, 4), "cond": True},
    {"config": (16, 32, 64, 3, 2), "cond": True},
    {"config": (16, 64, 64, 3, 2), "cond": True},
    {"config": (16, 128, 64, 3, 4), "cond": True},
    {"config": (16, 256, 64, 3, 4), "cond": True},
    {"config": (32, 32, 32, 3, 2), "cond": True},
    {"config": (32, 64, 32, 3, 2), "cond": True},
    {"config": (32, 128, 32, 3, 4), "cond": True},
    {"config": (32, 256, 32, 3, 4), "cond": True},
    {"config": (32, 32, 64, 3, 2), "cond": True},
    {"config": (32, 64, 64, 3, 2), "cond": True},
    {"config": (32, 128, 64, 3, 4), "cond": True},
    {"config": (32, 256, 64, 3, 4), "cond": True},
    {"config": (16, 32, 32, 4, 2), "cond": True},
    {"config": (16, 64, 32, 4, 2), "cond": True},
    {"config": (16, 128, 32, 4, 4), "cond": True},
    {"config": (16, 256, 32, 4, 4), "cond": True},
    {"config": (16, 32, 64, 4, 2), "cond": True},
    {"config": (16, 64, 64, 4, 2), "cond": True},
    {"config": (16, 128, 64, 4, 4), "cond": True},
    {"config": (16, 256, 64, 4, 4), "cond": True},
    {"config": (32, 32, 32, 4, 2), "cond": True},
    {"config": (32, 64, 32, 4, 2), "cond": True},
    {"config": (32, 128, 32, 4, 4), "cond": True},
    {"config": (32, 256, 32, 4, 4), "cond": True},
    {"config": (32, 32, 64, 4, 2), "cond": True},
    {"config": (32, 64, 64, 4, 2), "cond": True},
    {"config": (32, 128, 64, 4, 4), "cond": True},
    {"config": (32, 256, 64, 4, 4), "cond": True},
    {"config": (16, 32, 32, 5, 2), "cond": True},
    {"config": (16, 64, 32, 5, 2), "cond": True},
    {"config": (16, 128, 32, 5, 4), "cond": True},
    {"config": (16, 256, 32, 5, 4), "cond": True},
    {"config": (16, 32, 64, 5, 2), "cond": True},
    {"config": (16, 64, 64, 5, 2), "cond": True},
    {"config": (16, 128, 64, 5, 4), "cond": True},
    {"config": (16, 256, 64, 5, 4), "cond": True},
    {"config": (32, 32, 32, 5, 2), "cond": True},
    {"config": (32, 64, 32, 5, 2), "cond": True},
    {"config": (32, 128, 32, 5, 4), "cond": True},
    {"config": (32, 256, 32, 5, 4), "cond": True},
    {"config": (32, 32, 64, 5, 2), "cond": True},
    {"config": (32, 64, 64, 5, 2), "cond": True},
    {"config": (32, 128, 64, 5, 4), "cond": True},
    {"config": (32, 256, 64, 5, 4), "cond": True},
    {"config": (16, 32, 32, 6, 2), "cond": True},
    {"config": (16, 64, 32, 6, 2), "cond": True},
    {"config": (16, 128, 32, 6, 4), "cond": True},
    {"config": (16, 256, 32, 6, 4), "cond": True},
    {"config": (16, 32, 64, 6, 2), "cond": True},
    {"config": (16, 64, 64, 6, 2), "cond": True},
    {"config": (16, 128, 64, 6, 4), "cond": True},
    {"config": (16, 256, 64, 6, 4), "cond": True},
    {"config": (32, 32, 32, 6, 2), "cond": True},
    {"config": (32, 64, 32, 6, 2), "cond": True},
    {"config": (32, 128, 32, 6, 4), "cond": True},
    {"config": (32, 256, 32, 6, 4), "cond": True},
    {"config": (32, 32, 64, 6, 2), "cond": True},
    {"config": (32, 64, 64, 6, 2), "cond": True},
    {"config": (32, 128, 64, 6, 4), "cond": True},
    {"config": (32, 256, 64, 6, 4), "cond": True},
]
300
+
301
+
302
# Create filtered list of configs based on cond evaluation
# (each table above is reduced to plain (BLOCK_M, BLOCK_N, BLOCK_K,
# num_stages, num_warps) tuples for the entries whose "cond" holds).
mm_platform_configs = tuple(
    cast(Tuple[int, int, int, int, int], config["config"])
    for config in mm_kernel_configs
    if config["cond"]
)
extra_mm_platform_configs = tuple(
    cast(Tuple[int, int, int, int, int], config["config"])
    for config in extra_mm_kernel_configs
    if config["cond"]
)
int8_platform_configs = tuple(
    cast(Tuple[int, int, int, int, int], config["config"])
    for config in int8_mm_kernel_configs
    if config["cond"]
)
mixed_mm_platform_configs = tuple(
    cast(Tuple[int, int, int, int, int], config["config"])
    for config in mixed_mm_kernel_configs
    if config["cond"]
)
scaled_mm_platform_configs = tuple(
    cast(Tuple[int, int, int, int, int], config["config"])
    for config in scaled_mm_kernel_configs
    if config["cond"]
)
328
+
329
# On ROCm convert num_stages to 0 to enable software pipelining
# (each 5-tuple keeps its block sizes and warp count, only num_stages changes).
if torch.version.hip:
    mm_platform_configs = tuple(
        (config[0], config[1], config[2], 0, config[4])
        for config in mm_platform_configs
    )
    extra_mm_platform_configs = tuple(
        (config[0], config[1], config[2], 0, config[4])
        for config in extra_mm_platform_configs
    )
    int8_platform_configs = tuple(
        (config[0], config[1], config[2], 0, config[4])
        # BUGFIX: this comprehension previously iterated mm_platform_configs,
        # which silently replaced the int8-specific configs with the generic
        # mm configs on ROCm.
        for config in int8_platform_configs
    )
    mixed_mm_platform_configs = tuple(
        (config[0], config[1], config[2], 0, config[4])
        for config in mixed_mm_platform_configs
    )
    scaled_mm_platform_configs = tuple(
        (config[0], config[1], config[2], 0, config[4])
        for config in scaled_mm_platform_configs
    )
351
+
352
# Thin wrappers over filtered_configs with the per-op config table baked in;
# callers pass only (m, n, k) plus optionally has_int8_tensor.
mm_configs = functools.partial(
    filtered_configs,
    configs=mm_platform_configs,
)

extra_mm_configs = functools.partial(
    filtered_configs,
    configs=extra_mm_platform_configs,
)

int8_mm_configs = functools.partial(
    filtered_configs,
    configs=int8_platform_configs,
)

mixed_mm_configs = functools.partial(
    filtered_configs,
    configs=mixed_mm_platform_configs,
)

scaled_mm_configs = functools.partial(
    filtered_configs,
    configs=scaled_mm_platform_configs,
)
376
+
377
+
378
def mm_grid(m, n, meta):
    """Return the CUDA launch grid for matmul triton templates."""
    blocks_m = cdiv(m, meta["BLOCK_M"])
    blocks_n = cdiv(n, meta["BLOCK_N"])
    # One program per (BLOCK_M x BLOCK_N) output tile, flattened onto axis 0.
    return (blocks_m * blocks_n, 1, 1)
383
+
384
+
385
def acc_type(dtype):
    """Return the triton accumulator type string for a given torch dtype."""
    # Half-precision inputs accumulate in fp32.
    if dtype is torch.float16 or dtype is torch.bfloat16:
        return "tl.float32"
    # e.g. torch.float32 -> "tl.float32", torch.int32 -> "tl.int32"
    return "tl." + str(dtype).removeprefix("torch.")
389
+
390
+
391
def mm_options(config, sym_m, sym_n, sym_k, layout, b_prologue_cast_type=None):
    """
    Common options to matmul triton templates.
    """
    block_k = config.kwargs["BLOCK_K"]
    # K divides into whole BLOCK_K chunks iff gcd(K, BLOCK_K) == BLOCK_K.
    # (it isn't worth guarding on this)
    even_k_symbolic = sympy.gcd(sym_k, block_k) == block_k
    # When force_same_precision is set, TF32 is only allowed for shapes with
    # 16/16/8 alignment — presumably to match eager's TF32 eligibility.
    aligned = (sym_m % 16) == 0 and (sym_n % 16) == 0 and (sym_k % 8) == 0
    allow_tf32 = torch.backends.cuda.matmul.allow_tf32 and (
        not inductor_config.force_same_precision or aligned
    )
    return dict(
        GROUP_M=8,
        EVEN_K=even_k_symbolic,
        ALLOW_TF32=allow_tf32,
        ACC_TYPE=acc_type(layout.dtype),
        B_PROLOGUE_CAST_TYPE=b_prologue_cast_type,
        num_stages=config.num_stages,
        num_warps=config.num_warps,
        **config.kwargs,
    )
414
+
415
+
416
def mm_args(
    mat1,
    mat2,
    *others,
    layout=None,
    out_dtype=None,
    use_4x2_dim=False,
    mat2_transposed=False,
):
    """
    Common arg processing for mm,bmm,addmm,etc

    Realizes the inputs, guards that batch and K dimensions agree, and builds
    an output layout when one is not supplied.  Returns
    ``[m, n, k, layout, mat1, mat2, *others]`` where ``others`` (e.g. an addmm
    bias) have been expanded to the output size and realized.
    """
    mat1, mat2 = realize_inputs(mat1, mat2)
    *b1, m, k1 = mat1.get_size()
    if mat2_transposed:
        *b2, n, k2 = mat2.get_size()
    else:
        *b2, k2, n = mat2.get_size()
    # Guard that any leading batch dimensions match (bmm case).
    b = [V.graph.sizevars.guard_equals(a, b) for a, b in zip(b1, b2)]
    if use_4x2_dim:
        # mat2 packs two sub-byte values per element, so its K counts double —
        # TODO(review) confirm against the uint4x2 caller.
        k2 = k2 * 2
    k = V.graph.sizevars.guard_equals(k1, k2)
    if layout is None:
        from torch._inductor.ir import FixedLayout

        if out_dtype is None:
            out_dtype = mat1.get_dtype()

        layout = FixedLayout(
            mat1.get_device(),
            out_dtype,
            [*b, m, n],
        )
    else:
        assert out_dtype is None, "out_dtype is ignored if layout is specified."
    from ..lowering import expand

    others = [realize_inputs(expand(x, layout.size)) for x in others]

    return [m, n, k, layout, mat1, mat2, *others]
456
+
457
+
458
def addmm_epilogue(dtype, alpha, beta):
    """Return an epilogue computing ``alpha * acc + beta * bias`` via V.ops."""

    def epilogue(acc, bias):
        scaled_acc = acc
        scaled_bias = bias
        # Skip the multiply entirely when the scale factor is the identity.
        if alpha != 1:
            scaled_acc = V.ops.mul(scaled_acc, V.ops.constant(alpha, dtype))
        if beta != 1:
            scaled_bias = V.ops.mul(scaled_bias, V.ops.constant(beta, dtype))
        return V.ops.add(scaled_acc, scaled_bias)

    return epilogue
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_plus_mm.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import functools
3
+
4
+ import torch
5
+
6
+ from ..lowering import lowerings
7
+ from ..select_algorithm import (
8
+ autotune_select_algorithm,
9
+ ExternKernelChoice,
10
+ TritonTemplate,
11
+ )
12
+ from ..utils import use_aten_gemm_kernels, use_triton_template
13
+ from ..virtualized import V
14
+ from .mm_common import mm_args, mm_grid, mm_options
15
+
16
+
17
aten = torch.ops.aten

# ATen fallback choice: the fused torch.ops.inductor._mm_plus_mm kernel.
aten_mm_plus_mm = ExternKernelChoice(
    torch.ops.inductor._mm_plus_mm, "torch::inductor::_mm_plus_mm"
)

# Triton template computing mm(A, B) + mm(C, D) into a single output tile.
# NOTE(review): both K loops iterate over K1, so the template assumes the two
# products share the same K; tuned_mm_plus_mm below only uses it when the
# input sizes are statically known to match.
mm_plus_mm_template = TritonTemplate(
    name="mm_plus_mm",
    grid=mm_grid,
    debug=False,
    source=r"""
{{def_kernel("A", "B", "C", "D")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K1 = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    # K2 = {{size("C", 1)}}
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}
    stride_cm = {{stride("C", 0)}}
    stride_ck = {{stride("C", 1)}}
    stride_dk = {{stride("D", 0)}}
    stride_dn = {{stride("D", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

    if (((stride_am == 1 and stride_ak == M) or (stride_am == K1 and stride_ak == 1))
        and ((stride_cm == 1 and stride_ck == M) or (stride_cm == K1 and stride_ck == 1))):
        ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        ram = rm % M

    if (((stride_bk == 1 and stride_bn == K1) or (stride_bk == N and stride_bn == 1))
        and ((stride_dk == 1 and stride_dn == K1) or (stride_dk == N and stride_dn == 1))):
        rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        rbn = rn % N

    rk = tl.arange(0, BLOCK_K)
    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
    C = C + (ram[:, None] * stride_cm + rk[None, :] * stride_ck)
    D = D + (rk[:, None] * stride_dk + rbn[None, :] * stride_dn)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    for k1 in range(K1, 0, -BLOCK_K):
        # First matmul with A @ B
        if EVEN_K:
            a = tl.load(A)
            b = tl.load(B)
        else:
            a = tl.load(A, mask=rk[None, :] < k1, other=0.)
            b = tl.load(B, mask=rk[:, None] < k1, other=0.)
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
        A += BLOCK_K * stride_ak
        B += BLOCK_K * stride_bk

    for k2 in range(K1, 0, -BLOCK_K):

        # Second matmul with C @ D
        if EVEN_K:
            c = tl.load(C)
            d = tl.load(D)
        else:
            c = tl.load(C, mask=rk[None, :] < k2, other=0.)
            d = tl.load(D, mask=rk[:, None] < k2, other=0.)
        acc += tl.dot(c, d, allow_tf32=ALLOW_TF32)
        C += BLOCK_K * stride_ck
        D += BLOCK_K * stride_dk


    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
""",
)
113
+
114
+
115
@functools.lru_cache(None)
def mm_configs():
    """Return the filtered triton config list for the mm_plus_mm template.

    Cached for the process lifetime: the candidate table is static and the
    only runtime input is the platform (CUDA vs ROCm).
    """
    import triton

    # List of dictionaries to store the kernel configs. Configs that evaluate to true
    # will be utilised on the target platform
    mm_triton_configs = [
        {
            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
            "num_stages": 2,
            "num_warps": 4,
            "cond": True,
        },
        {
            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
            "num_stages": 3,
            "num_warps": 8,
            "cond": True,
        },
        {
            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
            "num_stages": 4,
            "num_warps": 16,
            "cond": True,
        },
        {
            "config": {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32},
            "num_stages": 4,
            "num_warps": 8,
            "cond": True,
        },
        {
            "config": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32},
            "num_stages": 4,
            "num_warps": 8,
            "cond": True,
        },
        {
            "config": {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32},
            "num_stages": 1,
            "num_warps": 8,
            "cond": True,
        },
        {
            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64},
            "num_stages": 1,
            "num_warps": 8,
            "cond": True,
        },
        {
            # Large BLOCK_K config is CUDA-only.
            "config": {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 128},
            "num_stages": 1,
            "num_warps": 8,
            "cond": torch.version.hip is None,
        },
        {
            "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16},
            "num_stages": 2,
            "num_warps": 4,
            "cond": True,
        },
        {
            "config": {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16},
            "num_stages": 1,
            "num_warps": 2,
            "cond": True,
        },
    ]

    # Filter out configs in which cond evaluates to true
    # On ROCm convert num_stages to 1 as pipelining provides no benefit
    if torch.version.hip:
        filtered_configs = [
            triton.Config(c["config"], num_stages=1, num_warps=c["num_warps"])
            for c in mm_triton_configs
            if c["cond"]
        ]
    else:
        filtered_configs = [
            triton.Config(
                c["config"], num_stages=c["num_stages"], num_warps=c["num_warps"]
            )
            for c in mm_triton_configs
            if c["cond"]
        ]

    return filtered_configs
202
+
203
+
204
def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None):
    """
    Computes mm(mat1, mat2) + mm(mat3, mat4)

    Falls back to two separate mm lowerings plus an add whenever the fused
    kernel's preconditions (non-empty outputs, statically matching sizes)
    do not hold.
    """
    m1, n1, k1, layout1, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
    m2, n2, _, layout2, mat3, mat4 = mm_args(mat3, mat4, layout=layout)
    # Optimization is optional, because we can always just not do the fusion
    if (
        m1 * n1 == 0
        or m2 * n2 == 0
        or not V.graph.sizevars.statically_known_list_equals(
            mat1.get_size(), mat3.get_size()
        )
        or not V.graph.sizevars.statically_known_list_equals(
            mat2.get_size(), mat4.get_size()
        )
    ):
        # TODO(jansel): support different K values when this is fixed:
        # https://github.com/openai/triton/issues/967
        return lowerings[aten.add](
            lowerings[aten.mm](mat1, mat2), lowerings[aten.mm](mat3, mat4)
        )

    # Equal input sizes imply equal output layouts from mm_args.
    assert layout1 == layout2
    # options to tune from
    choices = (
        [aten_mm_plus_mm.bind((mat1, mat2, mat3, mat4), layout1)]
        if use_aten_gemm_kernels()
        else []
    )
    if use_triton_template(layout1):
        for config in mm_configs():
            # see https://github.com/openai/triton/issues/1298
            # BLOCK_K = K causes llvm error
            if V.graph.sizevars.statically_known_lt(config.kwargs["BLOCK_K"], k1):
                mm_plus_mm_template.maybe_append_choice(
                    choices,
                    input_nodes=(mat1, mat2, mat3, mat4),
                    layout=layout1,
                    **mm_options(config, m1, n1, k1, layout1),
                )

    return autotune_select_algorithm(
        "mm_plus_mm", choices, [mat1, mat2, mat3, mat4], layout1
    )
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/mm_scaled.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ import sympy
5
+
6
+ import torch
7
+
8
+ from .. import config as inductor_config
9
+ from ..ir import ChoiceCaller, Layout, StorageBox, TensorBox
10
+ from ..lowering import add_layout_constraint, constrain_to_fx_strides, register_lowering
11
+ from ..select_algorithm import (
12
+ autotune_select_algorithm,
13
+ ExternKernelChoice,
14
+ NoValidChoicesError,
15
+ realize_inputs,
16
+ TritonTemplate,
17
+ )
18
+ from ..utils import use_aten_gemm_kernels, use_triton_template
19
+ from .mm import _is_static_problem # TODO(yangsiyu) move to mm_common
20
+ from .mm_common import mm_args, mm_grid, scaled_mm_configs
21
+
22
+
23
+ log = logging.getLogger(__name__)
24
+ aten = torch.ops.aten
25
+
26
+
27
# Triton template for scaled (fp8) matmul: computes A @ B, then multiplies by
# the inverse scales — scalars for tensor-wise scaling, or per-row/per-column
# vectors when SCALING_ROWWISE is set.
scaled_mm_template = TritonTemplate(
    name="scaled_mm",
    grid=mm_grid,
    source=r"""
{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    rk = tl.arange(0, BLOCK_K)
    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    for k in range(K, 0, -BLOCK_K):
        if EVEN_K:
            a = tl.load(A)
            b = tl.load(B)
        else:
            a = tl.load(A, mask=rk[None, :] < k, other=0.)
            b = tl.load(B, mask=rk[:, None] < k, other=0.)
        if B_PROLOGUE_CAST_TYPE is not None:
            b = b.to(B_PROLOGUE_CAST_TYPE)
        if USE_FAST_ACCUM:
            acc = tl.dot(a, b, acc, out_dtype=ACC_TYPE)
        else:
            acc += tl.dot(a, b, out_dtype=ACC_TYPE)
        A += BLOCK_K * stride_ak
        B += BLOCK_K * stride_bk

    if SCALING_ROWWISE:
        inv_a_scale_row = tl.load(A_inverse_scale + rm, mask=rm < M)
        inv_b_scale_row = tl.load(B_inverse_scale + rn, mask=rn < N)
        inv_scale_row = inv_a_scale_row[:, None] * inv_b_scale_row[None, :]
        acc *= inv_scale_row
    else:
        # for tensor-wise scaling, the scales are scalars
        inv_a_scale = tl.load(A_inverse_scale)
        inv_b_scale = tl.load(B_inverse_scale)
        inv_scale = inv_a_scale * inv_b_scale
        acc *= inv_scale

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
""",
)
104
+
105
+
106
# Inductor does not allow optional tensor input arguments currently (pass None as an
# input node to template choices), but since for _scaled_mm there is only one such arg
# (bias), work around by having a second template when bias is provided.
# NOTE(review): keep this in sync with scaled_mm_template above — the only
# difference is the bias load/add after the scaling step.
scaled_mm_bias_template = TritonTemplate(
    name="scaled_mm_bias",
    grid=mm_grid,
    source=r"""
{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale", "bias_ptr")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    rk = tl.arange(0, BLOCK_K)
    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    for k in range(K, 0, -BLOCK_K):
        if EVEN_K:
            a = tl.load(A)
            b = tl.load(B)
        else:
            a = tl.load(A, mask=rk[None, :] < k, other=0.)
            b = tl.load(B, mask=rk[:, None] < k, other=0.)
        if B_PROLOGUE_CAST_TYPE is not None:
            b = b.to(B_PROLOGUE_CAST_TYPE)
        if USE_FAST_ACCUM:
            acc = tl.dot(a, b, acc, out_dtype=ACC_TYPE)
        else:
            acc += tl.dot(a, b, out_dtype=ACC_TYPE)
        A += BLOCK_K * stride_ak
        B += BLOCK_K * stride_bk

    if SCALING_ROWWISE:
        inv_a_scale_row = tl.load(A_inverse_scale + rm, mask=rm < M)
        inv_b_scale_row = tl.load(B_inverse_scale + rn, mask=rn < N)
        inv_scale_row = inv_a_scale_row[:, None] * inv_b_scale_row[None, :]
        acc *= inv_scale_row
    else:
        # for tensor-wise scaling, the scales are scalars
        inv_a_scale = tl.load(A_inverse_scale)
        inv_b_scale = tl.load(B_inverse_scale)
        inv_scale = inv_a_scale * inv_b_scale
        acc *= inv_scale

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

    # bias
    bias = tl.load(bias_ptr + rn, mask=rn < N)
    acc += bias

    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
""",
)
190
+
191
+
192
+ aten__fp8_mm = ExternKernelChoice(torch._scaled_mm, "at::_scaled_mm")
193
+
194
+
195
+ def are_compatible_scales(size_a: List[int], size_b: List[int]) -> bool:
196
+ # Same sized scales are compatable
197
+ if len(size_a) == len(size_b):
198
+ return True
199
+
200
+ # Both need to be scalars or len(1) tensors
201
+ if len(size_a) <= 1 and len(size_b) <= 1:
202
+ return True
203
+
204
+ return False
205
+
206
+
207
+ def scaled_mm_options( # type: ignore[no-untyped-def]
208
+ config, # triton.Config
209
+ sym_m: sympy.core.numbers.Integer,
210
+ sym_n: sympy.core.numbers.Integer,
211
+ sym_k: sympy.core.numbers.Integer,
212
+ layout: Layout,
213
+ scale_a: StorageBox,
214
+ scale_b: StorageBox,
215
+ use_fast_accum: bool,
216
+ b_prologue_cast_type: Optional[str] = None,
217
+ ) -> Dict[str, Any]:
218
+ even_k_symbolic = (
219
+ sympy.gcd(sym_k, config.kwargs["BLOCK_K"]) == config.kwargs["BLOCK_K"]
220
+ )
221
+
222
+ size_a, size_b = scale_a.get_size(), scale_b.get_size()
223
+ assert are_compatible_scales(size_a, size_b), (
224
+ "Expect scale_a and scale_b to be either both scalars (including single-element tensors) "
225
+ f"or 1-dimensional tensors with the same size. Got scale_a: {len(size_a)} and scale_b: {len(size_b)}."
226
+ )
227
+ return dict(
228
+ GROUP_M=8,
229
+ EVEN_K=even_k_symbolic,
230
+ ACC_TYPE="tl.float32",
231
+ B_PROLOGUE_CAST_TYPE=b_prologue_cast_type,
232
+ USE_FAST_ACCUM=use_fast_accum,
233
+ num_stages=config.num_stages,
234
+ num_warps=config.num_warps,
235
+ # tensor-wise scaling if scalar scales
236
+ SCALING_ROWWISE=len(scale_a.get_size()) == 2,
237
+ **config.kwargs,
238
+ )
239
+
240
+
241
+ add_layout_constraint(aten._scaled_mm.default, constrain_to_fx_strides)
242
+
243
+
244
+ @register_lowering(aten._scaled_mm.default, type_promotion_kind=None) # type: ignore[misc]
245
+ def tuned_scaled_mm(
246
+ mat_a: TensorBox,
247
+ mat_b: TensorBox,
248
+ scale_a: TensorBox,
249
+ scale_b: TensorBox,
250
+ bias: Optional[TensorBox] = None,
251
+ scale_result: Optional[TensorBox] = None,
252
+ out_dtype: Optional[torch.dtype] = None,
253
+ use_fast_accum: bool = False,
254
+ layout: Optional[Layout] = None,
255
+ ) -> TensorBox:
256
+ m, n, k, layout, mat_a, mat_b = mm_args(
257
+ mat_a, mat_b, layout=layout, out_dtype=out_dtype
258
+ )
259
+ scale_a, scale_b = realize_inputs(scale_a, scale_b)
260
+
261
+ input_nodes: Tuple[Any, ...]
262
+ # workaround for Inductor not supporting optional tensor input arguments
263
+ if bias is None:
264
+ input_nodes = (mat_a, mat_b, scale_a, scale_b)
265
+ triton_template = scaled_mm_template
266
+ else:
267
+ bias = realize_inputs(bias)
268
+ input_nodes = (mat_a, mat_b, scale_a, scale_b, bias)
269
+ triton_template = scaled_mm_bias_template
270
+
271
+ aten_choice = aten__fp8_mm.bind(
272
+ input_nodes, layout, out_dtype=out_dtype, use_fast_accum=use_fast_accum
273
+ )
274
+
275
+ choices: List[ChoiceCaller] = []
276
+ if use_aten_gemm_kernels():
277
+ choices.append(aten_choice)
278
+
279
+ static_shape, is_nonzero = _is_static_problem([mat_a, mat_b], layout)
280
+ if is_nonzero and use_triton_template(layout, enable_float8=True):
281
+ for config in scaled_mm_configs(m, n, k):
282
+ if k == 16 and config.kwargs["BLOCK_M"] >= 64:
283
+ continue # Triton crashes in this case
284
+ kwargs = scaled_mm_options(
285
+ config, m, n, k, layout, scale_a, scale_b, use_fast_accum
286
+ )
287
+ # possibly appends a TritonTemplateCaller to choices
288
+ triton_template.maybe_append_choice(
289
+ choices,
290
+ input_nodes=input_nodes,
291
+ layout=layout,
292
+ **kwargs,
293
+ )
294
+
295
+ if (
296
+ len(choices) == 0
297
+ and not use_aten_gemm_kernels()
298
+ and inductor_config.autotune_fallback_to_aten
299
+ ):
300
+ log.warning("No choices for scaled_mm, using ATen backend as fallback")
301
+ return aten_choice.output_node()
302
+
303
+ try:
304
+ return autotune_select_algorithm("scaled_mm", choices, input_nodes, layout)
305
+ except NoValidChoicesError:
306
+ if not inductor_config.autotune_fallback_to_aten:
307
+ raise
308
+ log.warning(
309
+ "All choices for scaled_mm were invalid, using ATen backend as fallback"
310
+ )
311
+ return aten_choice.output_node()
.venv/lib/python3.11/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import logging
3
+ from typing import List, TYPE_CHECKING
4
+
5
+ from ..select_algorithm import autotune_select_algorithm, TritonTemplate
6
+ from .mm_common import mm_args, mm_configs, mm_grid, mm_options
7
+
8
+
9
+ if TYPE_CHECKING:
10
+ from ..ir import ChoiceCaller
11
+
12
+ log = logging.getLogger(__name__)
13
+
14
+ uint4x2_mixed_mm_template = TritonTemplate(
15
+ name="uint4x2_mixed_mm",
16
+ grid=mm_grid,
17
+ source=r"""
18
+ {{def_kernel("A", "B")}}
19
+ M = {{size("A", 0)}}
20
+ N = {{size("B", 1)}}
21
+ K = {{size("A", 1)}}
22
+ stride_am = {{stride("A", 0)}}
23
+ stride_ak = {{stride("A", 1)}}
24
+ stride_bk = {{stride("B", 0)}}
25
+ stride_bn = {{stride("B", 1)}}
26
+
27
+ # based on triton.ops.matmul
28
+ pid = tl.program_id(0)
29
+ grid_m = (M + BLOCK_M - 1) // BLOCK_M
30
+ grid_n = (N + BLOCK_N - 1) // BLOCK_N
31
+
32
+ # re-order program ID for better L2 performance
33
+ width = GROUP_M * grid_n
34
+ group_id = pid // width
35
+ group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
36
+ pid_m = group_id * GROUP_M + (pid % group_size)
37
+ pid_n = (pid % width) // (group_size)
38
+
39
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
40
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
41
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
42
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
43
+ rk = tl.arange(0, BLOCK_K)
44
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
45
+ B = B + (rk[:, None]//2 * stride_bk + rbn[None, :] * stride_bn)
46
+ b_shifts = 4*(rk%2)
47
+ b_subs = 8*(1-(rk%2))
48
+
49
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
50
+ for k in range(K, 0, -BLOCK_K):
51
+ if EVEN_K:
52
+ a = tl.load(A)
53
+ b = tl.load(B)
54
+ else:
55
+ a = tl.load(A, mask=rk[None, :] < k, other=0.)
56
+ b = tl.load(B, mask=rk[:, None] < k, other=0.)
57
+ b = ((b >> b_shifts[:, None]) & 0xF) - 8
58
+ b = b.to(B_PROLOGUE_CAST_TYPE)
59
+ acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
60
+ A += BLOCK_K * stride_ak
61
+ B += BLOCK_K//2 * stride_bk
62
+
63
+ # rematerialize rm and rn to save registers
64
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
65
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
66
+ idx_m = rm[:, None]
67
+ idx_n = rn[None, :]
68
+ mask = (idx_m < M) & (idx_n < N)
69
+
70
+ # inductor generates a suffix
71
+ {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
72
+ """,
73
+ )
74
+
75
+
76
+ def tuned_uint4x2_mixed_mm(mat1, mat2, mat2_mm_shape, mat2_dtype):
77
+ m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None, use_4x2_dim=True)
78
+ choices: List[ChoiceCaller] = []
79
+ b_prologue_cast_type = f"tl.{mat2_dtype}".replace("torch.", "")
80
+ for config in mm_configs(m, n, k):
81
+ uint4x2_mixed_mm_template.maybe_append_choice(
82
+ choices,
83
+ input_nodes=(mat1, mat2),
84
+ layout=layout,
85
+ **mm_options(config, m, n, k, layout, b_prologue_cast_type),
86
+ )
87
+ return autotune_select_algorithm("uint4x2_mixed_mm", choices, [mat1, mat2], layout)
.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (196 Bytes). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/autotune_cache.cpython-311.pyc ADDED
Binary file (11.5 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/benchmarking.cpython-311.pyc ADDED
Binary file (11.2 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/compile_tasks.cpython-311.pyc ADDED
Binary file (4.34 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_inductor/runtime/__pycache__/coordinate_descent_tuner.cpython-311.pyc ADDED
Binary file (11.7 kB). View file