danieldk HF Staff committed on
Commit
4298e26
·
verified ·
1 Parent(s): 828edc5

Build uploaded using `kernels`.

Browse files
Files changed (44) hide show
  1. build/torch-cuda/__init__.py +24 -0
  2. build/torch-cuda/_ops.py +8 -0
  3. build/torch-cuda/ampere_helpers.py +103 -0
  4. build/torch-cuda/barrier.py +71 -0
  5. build/torch-cuda/benchmark.py +268 -0
  6. build/torch-cuda/blackwell_helpers.py +1089 -0
  7. build/torch-cuda/block_info.py +108 -0
  8. build/torch-cuda/block_sparse_utils.py +1476 -0
  9. build/torch-cuda/block_sparsity.py +440 -0
  10. build/torch-cuda/cache_utils.py +307 -0
  11. build/torch-cuda/compute_block_sparsity.py +378 -0
  12. build/torch-cuda/copy_utils.py +372 -0
  13. build/torch-cuda/cute_dsl_ptxas.py +151 -0
  14. build/torch-cuda/cute_dsl_utils.py +167 -0
  15. build/torch-cuda/fast_math.py +21 -0
  16. build/torch-cuda/flash_attn4/__init__.py +26 -0
  17. build/torch-cuda/flash_bwd.py +1264 -0
  18. build/torch-cuda/flash_bwd_postprocess.py +585 -0
  19. build/torch-cuda/flash_bwd_preprocess.py +361 -0
  20. build/torch-cuda/flash_bwd_sm100.py +0 -0
  21. build/torch-cuda/flash_bwd_sm90.py +1591 -0
  22. build/torch-cuda/flash_fwd.py +0 -0
  23. build/torch-cuda/flash_fwd_combine.py +692 -0
  24. build/torch-cuda/flash_fwd_sm100.py +0 -0
  25. build/torch-cuda/interface.py +1855 -0
  26. build/torch-cuda/mask.py +653 -0
  27. build/torch-cuda/metadata.json +8 -0
  28. build/torch-cuda/mma_sm100_desc.py +296 -0
  29. build/torch-cuda/named_barrier.py +32 -0
  30. build/torch-cuda/pack_gqa.py +165 -0
  31. build/torch-cuda/paged_kv.py +214 -0
  32. build/torch-cuda/pipeline.py +440 -0
  33. build/torch-cuda/quack/__init__.py +0 -0
  34. build/torch-cuda/quack/activation.py +568 -0
  35. build/torch-cuda/quack/compile_utils.py +19 -0
  36. build/torch-cuda/quack/copy_utils.py +1007 -0
  37. build/torch-cuda/quack/cute_dsl_utils.py +165 -0
  38. build/torch-cuda/quack/layout_utils.py +297 -0
  39. build/torch-cuda/quack/sm90_utils.py +161 -0
  40. build/torch-cuda/seqlen_info.py +138 -0
  41. build/torch-cuda/softmax.py +592 -0
  42. build/torch-cuda/testing.py +456 -0
  43. build/torch-cuda/tile_scheduler.py +727 -0
  44. build/torch-cuda/utils.py +698 -0
build/torch-cuda/__init__.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flash Attention CUTE (CUDA Template Engine) implementation."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ # Update when syncing again.
6
+ __version__ = "4.0.0.beta4"
7
+
8
+ import cutlass.cute as cute
9
+
10
+ from .interface import (
11
+ flash_attn_func,
12
+ flash_attn_varlen_func,
13
+ )
14
+
15
+ from .cute_dsl_utils import cute_compile_patched
16
+
17
+ # Patch cute.compile to optionally dump SASS
18
+ cute.compile = cute_compile_patched
19
+
20
+
21
+ __all__ = [
22
+ "flash_attn_func",
23
+ "flash_attn_varlen_func",
24
+ ]
build/torch-cuda/_ops.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import torch

# Handle to the torch custom-op namespace registered for this specific build
# (the hash suffix keeps different builds from colliding).
ops = torch.ops._flash_attn4_c07a63b


def add_op_namespace_prefix(op_name: str) -> str:
    """
    Prefix op by namespace.

    Returns the fully qualified op name "_flash_attn4_c07a63b::<op_name>"
    used when registering/looking up torch custom ops for this build.
    """
    return f"_flash_attn4_c07a63b::{op_name}"
build/torch-cuda/ampere_helpers.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+ from typing import Type, Callable, Optional
3
+
4
+ import cutlass
5
+ import cutlass.cute as cute
6
+
7
+
8
def get_smem_layout_atom(dtype: Type[cutlass.Numeric], k_dim: int) -> cute.ComposedLayout:
    """Build a swizzled shared-memory layout atom for a tile with K extent ``k_dim``.

    The K block size (in elements) is the largest of 128/64/32/16 bytes that
    divides the row size, and the swizzle parameters are derived from that
    block size and the element width.
    """
    dtype_byte = cutlass.const_expr(dtype.width // 8)
    bytes_per_row = cutlass.const_expr(k_dim * dtype_byte)
    # Pick the largest byte granularity (128/64/32/16) dividing the row, then
    # convert back to an element count.
    smem_k_block_size = (
        cutlass.const_expr(
            128
            if bytes_per_row % 128 == 0
            else (64 if bytes_per_row % 64 == 0 else (32 if bytes_per_row % 32 == 0 else 16))
        )
        // dtype_byte
    )
    # Wider K blocks get more swizzle bits: 128 -> 4, 64 -> 3, 32 -> 2, else 1.
    swizzle_bits = (
        4
        if smem_k_block_size == 128
        else (3 if smem_k_block_size == 64 else (2 if smem_k_block_size == 32 else 1))
    )
    # Swizzle base depends on the element width (4B -> 2, 2B -> 3, else 4).
    swizzle_base = 2 if dtype_byte == 4 else (3 if dtype_byte == 2 else 4)
    return cute.make_composed_layout(
        cute.make_swizzle(swizzle_bits, swizzle_base, swizzle_base),
        0,
        # K-major ordered layout; 8 rows when k_dim is a multiple of 32, else 16.
        cute.make_ordered_layout(
            (8 if cutlass.const_expr(k_dim % 32 == 0) else 16, smem_k_block_size), order=(1, 0)
        ),
    )
32
+
33
+
34
@cute.jit
def gemm(
    tiled_mma: cute.TiledMma,
    acc: cute.Tensor,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    tCsA: cute.Tensor,
    tCsB: cute.Tensor,
    smem_thr_copy_A: cute.TiledCopy,
    smem_thr_copy_B: cute.TiledCopy,
    hook_fn: Optional[Callable] = None,
    A_in_regs: cutlass.Constexpr[bool] = False,
    B_in_regs: cutlass.Constexpr[bool] = False,
    swap_AB: cutlass.Constexpr[bool] = False,
) -> None:
    """Accumulate ``acc += A @ B`` over the K blocks of the given tiles.

    Operands are staged from shared memory into registers one K block ahead of
    the MMA that consumes them (software pipelining), unless already in
    registers (``A_in_regs`` / ``B_in_regs``). ``hook_fn``, if given, is
    invoked once after the first K-block MMA is issued.
    """
    if cutlass.const_expr(swap_AB):
        # Re-enter with the A/B operands (and their copy atoms / in-regs flags)
        # exchanged; swap_AB=False terminates the recursion.
        gemm(
            tiled_mma,
            acc,
            tCrB,
            tCrA,
            tCsB,
            tCsA,
            smem_thr_copy_B,
            smem_thr_copy_A,
            hook_fn,
            A_in_regs=B_in_regs,
            B_in_regs=A_in_regs,
            swap_AB=False,
        )
    else:
        tCrA_copy_view = smem_thr_copy_A.retile(tCrA)
        tCrB_copy_view = smem_thr_copy_B.retile(tCrB)
        # Prologue: stage K block 0 before entering the main loop.
        if cutlass.const_expr(not A_in_regs):
            cute.copy(smem_thr_copy_A, tCsA[None, None, 0], tCrA_copy_view[None, None, 0])
        if cutlass.const_expr(not B_in_regs):
            cute.copy(smem_thr_copy_B, tCsB[None, None, 0], tCrB_copy_view[None, None, 0])
        for k in cutlass.range_constexpr(cute.size(tCsA.shape[2])):
            # Prefetch K block k+1 while the MMA below consumes block k.
            if k < cute.size(tCsA.shape[2]) - 1:
                if cutlass.const_expr(not A_in_regs):
                    cute.copy(
                        smem_thr_copy_A, tCsA[None, None, k + 1], tCrA_copy_view[None, None, k + 1]
                    )
                if cutlass.const_expr(not B_in_regs):
                    cute.copy(
                        smem_thr_copy_B, tCsB[None, None, k + 1], tCrB_copy_view[None, None, k + 1]
                    )
            cute.gemm(tiled_mma, acc, tCrA[None, None, k], tCrB[None, None, k], acc)
            if cutlass.const_expr(k == 0 and hook_fn is not None):
                hook_fn()
84
+
85
+
86
@cute.jit
def gemm_rs(
    tiled_mma: cute.TiledMma,
    acc: cute.Tensor,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    tCsB: cute.Tensor,
    smem_thr_copy_B: cute.TiledCopy,
    hook_fn: Optional[Callable] = None,
) -> None:
    """Register-A x smem-B gemm: A is already in registers, B is staged from
    shared memory one K block ahead of the consuming MMA.

    ``hook_fn``, if given, is invoked once after the first K-block MMA.
    """
    tCrB_copy_view = smem_thr_copy_B.retile(tCrB)
    # Prologue: stage B's K block 0.
    cute.copy(smem_thr_copy_B, tCsB[None, None, 0], tCrB_copy_view[None, None, 0])
    for k in cutlass.range_constexpr(cute.size(tCrA.shape[2])):
        # Prefetch B's next K block while the MMA consumes block k.
        if cutlass.const_expr(k < cute.size(tCrA.shape[2]) - 1):
            cute.copy(smem_thr_copy_B, tCsB[None, None, k + 1], tCrB_copy_view[None, None, k + 1])
        cute.gemm(tiled_mma, acc, tCrA[None, None, k], tCrB[None, None, k], acc)
        if cutlass.const_expr(k == 0 and hook_fn is not None):
            hook_fn()
build/torch-cuda/barrier.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cutlass
2
+ import cutlass.cute as cute
3
+ from cutlass import Int32
4
+ from cutlass.cutlass_dsl import T, dsl_user_op
5
+ from cutlass._mlir.dialects import llvm
6
+
7
+
8
@dsl_user_op
def ld_acquire(lock_ptr: cute.Pointer, *, loc=None, ip=None) -> cutlass.Int32:
    """Acquire-load a 32-bit value from global memory at ``lock_ptr``.

    Emits ``ld.global.acquire.gpu.b32``, i.e. a load with acquire ordering at
    GPU scope, and returns the loaded value as Int32.
    """
    lock_ptr_i64 = lock_ptr.toint(loc=loc, ip=ip).ir_value()
    state = llvm.inline_asm(
        T.i32(),
        [lock_ptr_i64],
        "ld.global.acquire.gpu.b32 $0, [$1];",
        "=r,l",  # "=r": 32-bit result register; "l": 64-bit address operand
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
    return cutlass.Int32(state)
21
+
22
+
23
@dsl_user_op
def red_relaxed(
    lock_ptr: cute.Pointer, val: cutlass.Constexpr[Int32], *, loc=None, ip=None
) -> None:
    """Atomically add ``val`` to the s32 at ``lock_ptr`` with relaxed ordering.

    Emits ``red.relaxed.gpu.global.add.s32`` (a reduction op: no value is
    returned to the thread).
    """
    lock_ptr_i64 = lock_ptr.toint(loc=loc, ip=ip).ir_value()
    llvm.inline_asm(
        None,
        [lock_ptr_i64, Int32(val).ir_value(loc=loc, ip=ip)],
        "red.relaxed.gpu.global.add.s32 [$0], $1;",
        "l,r",  # "l": 64-bit address; "r": 32-bit addend
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
37
+
38
+
39
@dsl_user_op
def red_release(
    lock_ptr: cute.Pointer, val: cutlass.Constexpr[Int32], *, loc=None, ip=None
) -> None:
    """Atomically add ``val`` to the s32 at ``lock_ptr`` with release ordering.

    Emits ``red.release.gpu.global.add.s32``; the release ordering pairs with
    ``ld_acquire`` on the reader side.
    """
    lock_ptr_i64 = lock_ptr.toint(loc=loc, ip=ip).ir_value()
    llvm.inline_asm(
        None,
        [lock_ptr_i64, Int32(val).ir_value(loc=loc, ip=ip)],
        "red.release.gpu.global.add.s32 [$0], $1;",
        "l,r",  # "l": 64-bit address; "r": 32-bit addend
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
53
+
54
+
55
@cute.jit
def wait_eq(lock_ptr: cute.Pointer, thread_idx: int | Int32, flag_offset: int, val: Int32) -> None:
    """Spin (thread 0 only) until the flag at ``lock_ptr + flag_offset`` equals ``val``.

    Uses acquire loads so that writes made before the matching release
    increment are visible after the wait completes. Threads other than
    thread 0 return immediately; no barrier is issued here.
    """
    flag_ptr = lock_ptr + flag_offset
    if thread_idx == 0:
        read_val = Int32(0)
        while read_val != val:
            read_val = ld_acquire(flag_ptr)
62
+
63
+
64
@cute.jit
def arrive_inc(
    lock_ptr: cute.Pointer, thread_idx: int | Int32, flag_offset: int, val: cutlass.Constexpr[Int32]
) -> None:
    """Signal arrival: thread 0 adds ``val`` to the flag at ``lock_ptr + flag_offset``.

    The release-ordered reduction pairs with ``wait_eq``'s acquire loads.
    """
    flag_ptr = lock_ptr + flag_offset
    if thread_idx == 0:
        red_release(flag_ptr, val)
        # red_relaxed(flag_ptr, val)
build/torch-cuda/benchmark.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, Tri Dao.
2
+ """Useful functions for writing test code."""
3
+
4
+ import torch
5
+ import torch.utils.benchmark as benchmark
6
+
7
+
8
def benchmark_forward(
    fn, *inputs, repeats=10, desc="", verbose=True, amp=False, amp_dtype=torch.float16, **kwinputs
):
    """Use Pytorch Benchmark on the forward pass of an arbitrary function.

    Returns the (Timer, Measurement) pair from torch.utils.benchmark.
    """
    if verbose:
        print(desc, "- Forward pass")

    def _wrapped(*call_args, **call_kwargs):
        # Run under autocast so AMP timings can be measured when requested;
        # with amp=False the context is a no-op.
        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
            fn(*call_args, **call_kwargs)

    timer = benchmark.Timer(
        stmt="fn_amp(*inputs, **kwinputs)",
        globals={"fn_amp": _wrapped, "inputs": inputs, "kwinputs": kwinputs},
        num_threads=torch.get_num_threads(),
    )
    measurement = timer.timeit(repeats)
    if verbose:
        print(measurement)
    return timer, measurement
28
+
29
+
30
def benchmark_backward(
    fn,
    *inputs,
    grad=None,
    repeats=10,
    desc="",
    verbose=True,
    amp=False,
    amp_dtype=torch.float16,
    **kwinputs,
):
    """Use Pytorch Benchmark on the backward pass of an arbitrary function.

    Runs one forward pass (outside the timer) to get an output, then times
    repeated ``backward`` calls against it. Returns (Timer, Measurement).
    """
    if verbose:
        print(desc, "- Backward pass")
    # Single forward pass to obtain a graph output to backprop through.
    with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
        out = fn(*inputs, **kwinputs)
        if type(out) is tuple:
            out = out[0]
    if grad is None:
        grad = torch.randn_like(out)
    elif grad.shape != out.shape:
        raise RuntimeError("Grad shape does not match output shape")

    def _run_backward(*fn_inputs, y, grad):
        # Set .grad to None to avoid extra operation of gradient accumulation
        for tensor in fn_inputs:
            if isinstance(tensor, torch.Tensor):
                tensor.grad = None
        y.backward(grad, retain_graph=True)

    timer = benchmark.Timer(
        stmt="f(*inputs, y=y, grad=grad)",
        globals={"f": _run_backward, "inputs": inputs, "y": out, "grad": grad},
        num_threads=torch.get_num_threads(),
    )
    measurement = timer.timeit(repeats)
    if verbose:
        print(measurement)
    return timer, measurement
70
+
71
+
72
def benchmark_combined(
    fn,
    *inputs,
    grad=None,
    repeats=10,
    desc="",
    verbose=True,
    amp=False,
    amp_dtype=torch.float16,
    **kwinputs,
):
    """Use Pytorch Benchmark on the forward+backward pass of an arbitrary function.

    Each timed iteration re-runs the forward pass and then backprops through
    it. Returns (Timer, Measurement).
    """
    if verbose:
        print(desc, "- Forward + Backward pass")
    # One untimed forward pass to derive the gradient shape.
    with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
        out = fn(*inputs, **kwinputs)
        if type(out) is tuple:
            out = out[0]
    if grad is None:
        grad = torch.randn_like(out)
    elif grad.shape != out.shape:
        raise RuntimeError("Grad shape does not match output shape")

    def _fwd_bwd_step(grad, *fn_inputs, **fn_kwinputs):
        # Clear stale .grad so accumulation cost is not measured.
        for tensor in fn_inputs:
            if isinstance(tensor, torch.Tensor):
                tensor.grad = None
        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
            y = fn(*fn_inputs, **fn_kwinputs)
            if type(y) is tuple:
                y = y[0]
        y.backward(grad, retain_graph=True)

    timer = benchmark.Timer(
        stmt="f(grad, *inputs, **kwinputs)",
        globals={"f": _fwd_bwd_step, "fn": fn, "inputs": inputs, "grad": grad, "kwinputs": kwinputs},
        num_threads=torch.get_num_threads(),
    )
    measurement = timer.timeeit(repeats) if False else timer.timeit(repeats)
    if verbose:
        print(measurement)
    return timer, measurement
115
+
116
+
117
def benchmark_fwd_bwd(
    fn,
    *inputs,
    grad=None,
    repeats=10,
    desc="",
    verbose=True,
    amp=False,
    amp_dtype=torch.float16,
    **kwinputs,
):
    """Use Pytorch Benchmark on the forward+backward pass of an arbitrary function.

    Returns a pair: the forward (Timer, Measurement) and the backward one.
    """
    shared = dict(repeats=repeats, desc=desc, verbose=verbose, amp=amp, amp_dtype=amp_dtype)
    fwd = benchmark_forward(fn, *inputs, **shared, **kwinputs)
    bwd = benchmark_backward(fn, *inputs, grad=grad, **shared, **kwinputs)
    return fwd, bwd
152
+
153
+
154
def benchmark_all(
    fn,
    *inputs,
    grad=None,
    repeats=10,
    desc="",
    verbose=True,
    amp=False,
    amp_dtype=torch.float16,
    **kwinputs,
):
    """Use Pytorch Benchmark on the forward+backward pass of an arbitrary function.

    Returns three (Timer, Measurement) pairs: forward only, backward only,
    and combined forward+backward.
    """
    shared = dict(repeats=repeats, desc=desc, verbose=verbose, amp=amp, amp_dtype=amp_dtype)
    fwd = benchmark_forward(fn, *inputs, **shared, **kwinputs)
    bwd = benchmark_backward(fn, *inputs, grad=grad, **shared, **kwinputs)
    both = benchmark_combined(fn, *inputs, grad=grad, **shared, **kwinputs)
    return fwd, bwd, both
200
+
201
+
202
def pytorch_profiler(
    fn,
    *inputs,
    trace_filename=None,
    backward=False,
    amp=False,
    amp_dtype=torch.float16,
    cpu=False,
    verbose=True,
    **kwinputs,
):
    """Wrap benchmark functions in Pytorch profiler to see CUDA information.

    Warms up with 30 untimed iterations, then runs one profiled
    forward (and optionally backward) pass. Prints the aggregated table when
    ``verbose`` and exports a Chrome trace when ``trace_filename`` is given.
    """
    if backward:
        # One forward pass just to size the gradient tensor for backward.
        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
            out = fn(*inputs, **kwinputs)
            if type(out) is tuple:
                out = out[0]
        g = torch.randn_like(out)
    for _ in range(30):  # Warm up
        if backward:
            # Clear stale grads so accumulation doesn't pollute the profile.
            for x in inputs:
                if isinstance(x, torch.Tensor):
                    x.grad = None
        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
            out = fn(*inputs, **kwinputs)
            if type(out) is tuple:
                out = out[0]
        # Backward should be done outside autocast
        if backward:
            out.backward(g, retain_graph=True)
    # Always profile CUDA; CPU activity only on request.
    activities = ([torch.profiler.ProfilerActivity.CPU] if cpu else []) + [
        torch.profiler.ProfilerActivity.CUDA
    ]
    with torch.profiler.profile(
        activities=activities,
        record_shapes=True,
        # profile_memory=True,
        with_stack=True,
    ) as prof:
        if backward:
            for x in inputs:
                if isinstance(x, torch.Tensor):
                    x.grad = None
        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
            out = fn(*inputs, **kwinputs)
            if type(out) is tuple:
                out = out[0]
        if backward:
            out.backward(g, retain_graph=True)
    if verbose:
        # print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=50))
        print(prof.key_averages().table(row_limit=50))
    if trace_filename is not None:
        prof.export_chrome_trace(trace_filename)
256
+
257
+
258
+ def benchmark_memory(fn, *inputs, desc="", verbose=True, **kwinputs):
259
+ torch.cuda.empty_cache()
260
+ torch.cuda.reset_peak_memory_stats()
261
+ torch.cuda.synchronize()
262
+ fn(*inputs, **kwinputs)
263
+ torch.cuda.synchronize()
264
+ mem = torch.cuda.max_memory_allocated() / ((2**20) * 1000)
265
+ if verbose:
266
+ print(f"{desc} max memory: {mem}GB")
267
+ torch.cuda.empty_cache()
268
+ return mem
build/torch-cuda/blackwell_helpers.py ADDED
@@ -0,0 +1,1089 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+ from typing import Optional, Tuple
3
+
4
+ import cutlass
5
+ import cutlass.cute as cute
6
+ from cutlass import Int32, Boolean, const_expr
7
+ from cutlass.cute.nvgpu import tcgen05
8
+ from cutlass._mlir.dialects import llvm
9
+
10
+ from . import mma_sm100_desc as sm100_desc
11
+
12
+
13
@cute.jit
def gemm_w_idx(
    tiled_mma: cute.TiledMma,
    acc: cute.Tensor,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    A_idx: Optional[Int32] = None,
    B_idx: Optional[Int32] = None,
    zero_init: bool | Boolean = False,
    swap_AB: bool = False,
    num_unroll_groups: int = 1,
) -> None:
    """Accumulate ``acc += A @ B`` over K blocks, optionally selecting a stage
    of A/B via ``A_idx`` / ``B_idx`` (4th tensor mode).

    ``zero_init`` clears the accumulator on the first K block;
    ``num_unroll_groups`` splits the K loop unrolling into that many groups.
    """
    if const_expr(swap_AB):
        # Fix: forward num_unroll_groups — previously dropped here, silently
        # resetting the caller's unroll grouping to the default of 1.
        return gemm_w_idx(
            tiled_mma,
            acc,
            tCrB,
            tCrA,
            B_idx,
            A_idx,
            zero_init=zero_init,
            swap_AB=False,
            num_unroll_groups=num_unroll_groups,
        )
    else:
        # Select the current pipeline stage when an index is provided.
        rA = tCrA if const_expr(A_idx is None) else tCrA[None, None, None, A_idx]
        rB = tCrB if const_expr(B_idx is None) else tCrB[None, None, None, B_idx]

        mma_atom = cute.make_mma_atom(tiled_mma.op)
        for k in cutlass.range(
            cute.size(tCrA.shape[2]), unroll=cute.size(tCrA.shape[2]) // num_unroll_groups
        ):
            # ACCUMULATE is off only for the very first K block when zero_init.
            mma_atom.set(tcgen05.Field.ACCUMULATE, not zero_init or k != 0)
            cute.gemm(mma_atom, acc, rA[None, None, k], rB[None, None, k], acc)
39
+
40
+
41
@cute.jit
def gemm_ptx_w_idx(
    tiled_mma: cute.TiledMma,
    acc: cute.Tensor,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    sA: Optional[cute.Tensor],
    sB: cute.Tensor,
    A_idx: Optional[Int32] = None,
    B_idx: Optional[Int32] = None,
    zero_init: bool | Boolean = False,
    cta_group: int = 1,
    **kwargs,
) -> None:
    """Stage-indexed wrapper over ``gemm_ptx_partial``.

    Selects the current pipeline stage of the register/smem operands (via the
    4th tensor mode when ``A_idx``/``B_idx`` are given) and dispatches to the
    hand-written PTX MMA path. Extra ``kwargs`` are forwarded unchanged.
    """
    rA = tCrA if const_expr(A_idx is None) else tCrA[None, None, None, A_idx]
    rB = tCrB if const_expr(B_idx is None) else tCrB[None, None, None, B_idx]
    sA_cur = None
    if const_expr(sA is not None):
        sA_cur = sA if const_expr(A_idx is None) else sA[None, None, None, A_idx]
    sB_cur = sB if const_expr(B_idx is None) else sB[None, None, None, B_idx]
    mma_atom = cute.make_mma_atom(tiled_mma.op)
    # TMEM address of the accumulator, passed as a raw integer to the PTX path.
    acc_tmem_addr = acc.iterator.toint()
    gemm_ptx_partial(
        mma_atom.op,
        acc_tmem_addr,
        rA,
        rB,
        sA_cur,
        sB_cur,
        zero_init=zero_init,
        cta_group=cta_group,
        **kwargs,
    )
+
75
+
76
@cute.jit
def gemm(
    tiled_mma: cute.TiledMma,
    acc: cute.Tensor,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    zero_init: bool | Boolean = False,
) -> None:
    """Accumulate ``acc += A @ B`` over all K blocks with a tcgen05 MMA atom.

    When ``zero_init`` is set, ACCUMULATE is disabled for the first K block so
    the accumulator is overwritten rather than added to.
    """
    mma_atom = cute.make_mma_atom(tiled_mma.op)
    for k in cutlass.range_constexpr(cute.size(tCrA.shape[2])):
        mma_atom.set(tcgen05.Field.ACCUMULATE, not zero_init or k != 0)
        cute.gemm(mma_atom, acc, tCrA[None, None, k], tCrB[None, None, k], acc)
88
+
89
+
90
def i64_to_i32x2(i: int) -> Tuple[int, int]:
    """Split a 64-bit integer into its (low, high) 32-bit halves."""
    mask = (1 << 32) - 1
    low = i & mask
    high = (i >> 32) & mask
    return low, high
93
+
94
+
95
@cute.jit
def gemm_ptx(
    op: cute.nvgpu.tcgen05.mma.MmaOp,
    acc: cute.Tensor,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    sA: Optional[cute.Tensor],
    sB: cute.Tensor,
    zero_init: bool | Boolean = False,
) -> None:
    """Issue one tcgen05 MMA per K block via hand-written inline PTX.

    A may come from TMEM ("TS" form) or from shared memory via an SMEM
    descriptor; B always comes from shared memory. One tcgen05.mma instruction
    is emitted per K block by a single elected thread. ``zero_init`` makes the
    first K block overwrite the accumulator (ACCUMULATE predicate false).
    """
    is_ts = op.a_src == cute.nvgpu.tcgen05.OperandSource.TMEM
    if const_expr(not is_ts):
        assert sA is not None, "sA must be provided when a_src is not TMEM"
    sA_layout = sA.layout if sA is not None else None
    sB_layout = sB.layout
    # Instruction descriptor encoding the MMA shape/dtype, fixed at compile time.
    idesc: int = const_expr(sm100_desc.mma_op_to_idesc(op))
    if const_expr(not is_ts):
        # Build the compile-time part of A's SMEM descriptor (swizzle + layout);
        # only the start address varies at run time.
        sA_swizzle = sA.iterator.type.swizzle_type
        smem_desc_base_a: int = const_expr(
            sm100_desc.make_smem_desc_base(
                cute.recast_layout(128, op.a_dtype.width, sA_layout[0]),
                sA_swizzle,
                sm100_desc.Major.K
                if const_expr(op.a_major_mode == cute.nvgpu.tcgen05.mma.OperandMajorMode.K)
                else sm100_desc.Major.MN,
            )
        )
        smem_desc_base_a_lo, smem_desc_a_hi = i64_to_i32x2(smem_desc_base_a)
        smem_desc_base_a_lo = const_expr(smem_desc_base_a_lo)
        smem_desc_a_hi = const_expr(smem_desc_a_hi)
    else:
        smem_desc_base_a = None
        smem_desc_base_a_lo, smem_desc_a_hi = None, None
    # B's SMEM descriptor base, analogous to A's.
    sB_swizzle = sB.iterator.type.swizzle_type
    smem_desc_base_b: int = const_expr(
        sm100_desc.make_smem_desc_base(
            cute.recast_layout(128, op.b_dtype.width, sB_layout[0]),
            sB_swizzle,
            sm100_desc.Major.K
            if const_expr(op.b_major_mode == cute.nvgpu.tcgen05.mma.OperandMajorMode.K)
            else sm100_desc.Major.MN,
        )
    )
    smem_desc_base_b_lo, smem_desc_b_hi = i64_to_i32x2(smem_desc_base_b)
    smem_desc_base_b_lo = const_expr(smem_desc_base_b_lo)
    smem_desc_b_hi = const_expr(smem_desc_b_hi)

    # Fold the runtime SMEM start address into the low word of each descriptor.
    if const_expr(not is_ts):
        smem_desc_start_a_lo = Int32(smem_desc_base_a_lo) | sm100_desc.make_smem_desc_start_addr(
            sA[None, None, 0].iterator
        )
    else:
        smem_desc_start_a_lo = None
    smem_desc_start_b_lo = Int32(smem_desc_base_b_lo) | sm100_desc.make_smem_desc_start_addr(
        sB[None, None, 0].iterator
    )
    for k in cutlass.range_constexpr(cute.size(tCrA.shape[2])):
        # Per-K-block descriptor: add the byte offset of K block k (>> 4 since
        # descriptor addresses are in 16-byte units).
        if const_expr(not is_ts):
            smem_desc_a_lo = smem_desc_start_a_lo + (
                (cute.crd2idx((0, 0, k), sA_layout) * sA.element_type.width // 8) >> 4
            )
        smem_desc_b_lo = smem_desc_start_b_lo + (
            (cute.crd2idx((0, 0, k), sB_layout) * sB.element_type.width // 8) >> 4
        )
        # with cute.arch.elect_one():
        # cute.printf("smem_desc_a_lo = {}, smem_desc_b_lo = {}", smem_desc_a_lo, smem_desc_b_lo)
        # cute.printf("smem_desc_a_lo_correct = {}, smem_desc_b_lo_correct = {}", smem_desc_a_lo_correct, smem_desc_b_lo_correct)
        with cute.arch.elect_one():
            if const_expr(not is_ts):
                # SS form: both operands via 64-bit SMEM descriptors.
                llvm.inline_asm(
                    None,
                    [
                        acc.iterator.toint().ir_value(),
                        smem_desc_a_lo.ir_value(),
                        smem_desc_b_lo.ir_value(),
                        Int32(not zero_init or k != 0).ir_value(),
                    ],
                    "{\n\t"
                    ".reg .pred p;\n\t"
                    ".reg .b64 smem_desc_a, smem_desc_b;\n\t"
                    ".reg .b32 idesc;\n\t"
                    f"mov.b32 idesc, {hex(idesc)};\n\t"
                    f"mov.b64 smem_desc_a, {{$1, {hex(smem_desc_a_hi)}}};\n\t"
                    f"mov.b64 smem_desc_b, {{$2, {hex(smem_desc_b_hi)}}};\n\t"
                    "setp.ne.b32 p, $3, 0;\n\t"
                    f"tcgen05.mma.cta_group::1.kind::f16 [$0], smem_desc_a, smem_desc_b, idesc, p;\n\t"
                    "}\n",
                    "r,r,r,r",
                    has_side_effects=True,
                    is_align_stack=False,
                    asm_dialect=llvm.AsmDialect.AD_ATT,
                )
            else:
                # TS form: A addressed directly in TMEM, B via SMEM descriptor.
                llvm.inline_asm(
                    None,
                    [
                        acc.iterator.toint().ir_value(),
                        tCrA[None, None, k].iterator.toint().ir_value(),
                        smem_desc_b_lo.ir_value(),
                        Int32(not zero_init or k != 0).ir_value(),
                    ],
                    "{\n\t"
                    ".reg .pred p;\n\t"
                    ".reg .b64 smem_desc_b;\n\t"
                    f"mov.b64 smem_desc_b, {{$2, {hex(smem_desc_b_hi)}}};\n\t"
                    "setp.ne.b32 p, $3, 0;\n\t"
                    f"tcgen05.mma.cta_group::1.kind::f16 [$0], [$1], smem_desc_b, {hex(idesc)}, p;\n\t"
                    "}\n",
                    "r,r,r,r",
                    has_side_effects=True,
                    is_align_stack=False,
                    asm_dialect=llvm.AsmDialect.AD_ATT,
                )
208
+
209
+
210
@cute.jit
def gemm_ptx_loop(
    op: cute.nvgpu.tcgen05.mma.MmaOp,
    acc: cute.Tensor,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    sA: Optional[cute.Tensor],
    sB: cute.Tensor,
    zero_init: bool | Boolean = False,
) -> None:
    """Issue tcgen05 MMAs for all K blocks from a single inline-PTX blob.

    Unlike ``gemm_ptx`` (one inline_asm per K block), this emits one asm blob
    that unrolls the whole K loop, advancing the SMEM descriptors by
    compile-time per-block offset deltas inside the assembly.
    """
    is_ts = op.a_src == cute.nvgpu.tcgen05.OperandSource.TMEM
    if const_expr(not is_ts):
        assert sA is not None, "sA must be provided when a_src is not TMEM"
    sA_layout = sA.layout if sA is not None else tCrA.layout
    sB_layout = sB.layout
    idesc: int = const_expr(sm100_desc.mma_op_to_idesc(op))
    if const_expr(not is_ts):
        # Compile-time part of A's SMEM descriptor (swizzle + layout).
        sA_swizzle = sA.iterator.type.swizzle_type
        smem_desc_base_a: int = const_expr(
            sm100_desc.make_smem_desc_base(
                cute.recast_layout(128, op.a_dtype.width, sA_layout[0]),
                sA_swizzle,
                sm100_desc.Major.K
                if const_expr(op.a_major_mode == cute.nvgpu.tcgen05.mma.OperandMajorMode.K)
                else sm100_desc.Major.MN,
            )
        )
        smem_desc_base_a_lo, smem_desc_a_hi = i64_to_i32x2(smem_desc_base_a)
        smem_desc_base_a_lo = const_expr(smem_desc_base_a_lo)
        smem_desc_a_hi = const_expr(smem_desc_a_hi)
    else:
        smem_desc_base_a = None
        smem_desc_base_a_lo, smem_desc_a_hi = None, None
    sB_swizzle = sB.iterator.type.swizzle_type
    smem_desc_base_b: int = const_expr(
        sm100_desc.make_smem_desc_base(
            cute.recast_layout(128, op.b_dtype.width, sB_layout[0]),
            sB_swizzle,
            sm100_desc.Major.K
            if const_expr(op.b_major_mode == cute.nvgpu.tcgen05.mma.OperandMajorMode.K)
            else sm100_desc.Major.MN,
        )
    )
    smem_desc_base_b_lo, smem_desc_b_hi = i64_to_i32x2(smem_desc_base_b)
    smem_desc_base_b_lo = const_expr(smem_desc_base_b_lo)
    smem_desc_b_hi = const_expr(smem_desc_b_hi)

    # Per-K-block operand offsets, computed at compile time. SMEM offsets are
    # in 16-byte descriptor units (>> 4); TMEM offsets in 32-bit words.
    if const_expr(not is_ts):
        offset_a = [
            (cute.crd2idx((0, 0, k), sA_layout) * sA.element_type.width // 8) >> 4
            for k in cutlass.range_constexpr(cute.size(tCrA.shape[2]))
        ]
    else:
        offset_a = [
            cute.crd2idx((0, 0, k), sA_layout) * op.a_dtype.width // 32
            for k in cutlass.range_constexpr(cute.size(tCrA.shape[2]))
        ]
    offset_a_diff = [
        offset_a[k] - offset_a[k - 1] for k in cutlass.range_constexpr(1, cute.size(tCrA.shape[2]))
    ]
    offset_b = [
        (cute.crd2idx((0, 0, k), sB_layout) * sB.element_type.width // 8) >> 4
        for k in cutlass.range_constexpr(cute.size(tCrB.shape[2]))
    ]
    offset_b_diff = [
        offset_b[k] - offset_b[k - 1] for k in cutlass.range_constexpr(1, cute.size(tCrB.shape[2]))
    ]

    # Fold the runtime SMEM start address into the low descriptor word.
    if const_expr(not is_ts):
        smem_desc_start_a_lo = Int32(
            smem_desc_base_a_lo | sm100_desc.make_smem_desc_start_addr(sA[None, None, 0].iterator)
        )
    else:
        smem_desc_start_a_lo = None
    smem_desc_start_b_lo = Int32(
        smem_desc_base_b_lo | sm100_desc.make_smem_desc_start_addr(sB[None, None, 0].iterator)
    )
    # First MMA's ACCUMULATE predicate: runtime "p" when zero_init is dynamic,
    # else the literal "0"/"1"; all later K blocks always accumulate ("1").
    pred_str = "p" if isinstance(zero_init, Boolean) else "0" if zero_init else "1"
    if const_expr(not is_ts):
        # SS form: both operands via SMEM descriptors, advanced inside the asm.
        llvm.inline_asm(
            None,
            [
                acc.iterator.toint().ir_value(),
                Int32(cute.arch.make_warp_uniform(smem_desc_start_a_lo)).ir_value(),
                Int32(cute.arch.make_warp_uniform(smem_desc_start_b_lo)).ir_value(),
                Int32(not zero_init).ir_value(),
            ],
            "{\n\t"
            ".reg .pred leader_thread;\n\t"
            ".reg .pred p;\n\t"
            ".reg .b32 idesc;\n\t"
            ".reg .b32 smem_desc_a_lo, smem_desc_b_lo;\n\t"
            ".reg .b32 smem_desc_a_hi, smem_desc_b_hi;\n\t"
            ".reg .b64 smem_desc_a, smem_desc_b;\n\t"
            "elect.sync _|leader_thread, -1;\n\t"
            f"mov.b32 idesc, {hex(idesc)};\n\t"
            "mov.b32 smem_desc_a_lo, $1;\n\t"
            "mov.b32 smem_desc_b_lo, $2;\n\t"
            f"mov.b32 smem_desc_a_hi, {hex(smem_desc_a_hi)};\n\t"
            f"mov.b32 smem_desc_b_hi, {hex(smem_desc_b_hi)};\n\t"
            f"mov.b64 smem_desc_a, {{smem_desc_a_lo, smem_desc_a_hi}};\n\t"
            f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
            "setp.ne.b32 p, $3, 0;\n\t"
            f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], smem_desc_a, smem_desc_b, idesc, {pred_str};\n\t"
            + "".join(
                (
                    f"add.u32 smem_desc_a_lo, smem_desc_a_lo, {hex(offset_a_diff[k - 1])};\n\t"
                    f"add.u32 smem_desc_b_lo, smem_desc_b_lo, {hex(offset_b_diff[k - 1])};\n\t"
                    f"mov.b64 smem_desc_a, {{smem_desc_a_lo, smem_desc_a_hi}};\n\t"
                    f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                    f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], smem_desc_a, smem_desc_b, idesc, 1;\n\t"
                )
                for k in cutlass.range_constexpr(1, cute.size(tCrA.shape[2]))
            )
            + "}\n",
            "r,r,r,r",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    else:
        # TS form: A addressed in TMEM (per-block immediate offset), B via
        # SMEM descriptor advanced inside the asm.
        llvm.inline_asm(
            None,
            [
                acc.iterator.toint().ir_value(),
                Int32(tCrA[None, None, 0].iterator.toint()).ir_value(),
                Int32(smem_desc_start_b_lo).ir_value(),
                Int32(not zero_init).ir_value(),
            ],
            "{\n\t"
            ".reg .pred leader_thread;\n\t"
            ".reg .pred p;\n\t"
            ".reg .b32 idesc;\n\t"
            ".reg .b32 tmem_a;\n\t"
            ".reg .b32 smem_desc_b_lo;\n\t"
            ".reg .b32 smem_desc_b_hi;\n\t"
            ".reg .b64 smem_desc_b;\n\t"
            "elect.sync _|leader_thread, -1;\n\t"
            f"mov.b32 idesc, {hex(idesc)};\n\t"
            "mov.b32 tmem_a, $1;\n\t"
            "mov.b32 smem_desc_b_lo, $2;\n\t"
            f"mov.b32 smem_desc_b_hi, {hex(smem_desc_b_hi)};\n\t"
            f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
            "setp.ne.b32 p, $3, 0;\n\t"
            f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a], smem_desc_b, idesc, {pred_str};\n\t"
            + "".join(
                (
                    # f"add.u32 tmem_a, tmem_a, {hex(offset_a_diff[k - 1])};\n\t"
                    f"add.u32 smem_desc_b_lo, smem_desc_b_lo, {hex(offset_b_diff[k - 1])};\n\t"
                    f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                    # f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a], smem_desc_b, idesc, 1;\n\t"
                    f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [$0], [tmem_a + {hex(offset_a[k])}], smem_desc_b, idesc, 1;\n\t"
                )
                for k in cutlass.range_constexpr(1, cute.size(tCrA.shape[2]))
            )
            + "}\n",
            "r,r,r,r",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
+ )
371
+
372
+
373
@cute.jit
def gemm_ptx_partial(
    op: cute.nvgpu.tcgen05.mma.MmaOp,
    acc_tmem_addr: Int32,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    sA: Optional[cute.Tensor],
    sB: cute.Tensor,
    mbar_ptr: Optional[cutlass.Pointer] = None,
    mbar_phase: Optional[Int32] = None,
    split_arrive: Optional[int] = None,
    zero_init: bool | Boolean = False,
    # sA_offset: Int32 = 0,
    # acc_offset: Int32 = 0,
    tA_addr: Optional[Int32] = None,
    cta_group: int = 1,
) -> None:
    """Issue a K-tiled sequence of ``tcgen05.mma`` instructions via raw inline PTX.

    Two code paths, selected by ``op.a_src``:

    - SS (A in SMEM): both A and B are addressed through 64-bit SMEM matrix
      descriptors whose low words are bumped by precomputed per-K-tile offsets.
    - TS (A in TMEM): A is addressed directly by TMEM address (``[tmem_a + off]``)
      and only B uses an SMEM descriptor.  Optionally, the TS path can busy-wait
      on an mbarrier in the middle of the K loop (after ``split_arrive`` elements
      of K have been issued) before issuing the remaining MMAs.

    Only a single elected leader thread (``elect.sync``) actually issues the MMA
    instructions; the whole warp executes the inline-asm block.

    Args:
        op: The tcgen05 MMA op; supplies dtypes, major modes and A's source.
        acc_tmem_addr: TMEM address of the accumulator.
        tCrA, tCrB: MMA-partitioned views of A/B; shape[2] is the K-tile count.
        sA: SMEM tensor for A (required unless A comes from TMEM).
        sB: SMEM tensor for B.
        mbar_ptr / mbar_phase / split_arrive: optional mid-loop mbarrier wait
            (TS path only); ``split_arrive`` is in units of K elements and is
            converted to a K-tile index via ``op.shape_mnk[2]``.
        zero_init: if a runtime ``Boolean``, the accumulate flag of the first MMA
            comes from a predicate register; if a Python bool it is baked in.
        tA_addr: explicit TMEM address for A (see workaround note below).
        cta_group: CTA group for the MMA instruction (1 or 2).
    """
    # acc_tmem_addr += acc_offset
    is_ts = op.a_src == cute.nvgpu.tcgen05.OperandSource.TMEM
    if const_expr(not is_ts):
        assert sA is not None, "sA must be provided when a_src is not TMEM"
    sA_layout = sA.layout if sA is not None else tCrA.layout
    sB_layout = sB.layout
    # Instruction descriptor is fully determined by the op -> compile-time constant.
    idesc: int = const_expr(sm100_desc.mma_op_to_idesc(op))
    if const_expr(not is_ts):
        sA_swizzle = sA.iterator.type.swizzle_type
        # Base (address-free) SMEM descriptor for A; the start address is OR'd in later.
        smem_desc_base_a: int = const_expr(
            sm100_desc.make_smem_desc_base(
                cute.recast_layout(128, op.a_dtype.width, sA_layout[0]),
                sA_swizzle,
                sm100_desc.Major.K
                if const_expr(op.a_major_mode == cute.nvgpu.tcgen05.mma.OperandMajorMode.K)
                else sm100_desc.Major.MN,
            )
        )
        smem_desc_base_a_lo, smem_desc_a_hi = i64_to_i32x2(smem_desc_base_a)
        smem_desc_base_a_lo = const_expr(smem_desc_base_a_lo)
        smem_desc_a_hi = const_expr(smem_desc_a_hi)
    else:
        smem_desc_base_a = None
        smem_desc_base_a_lo, smem_desc_a_hi = None, None
    sB_swizzle = sB.iterator.type.swizzle_type
    smem_desc_base_b: int = const_expr(
        sm100_desc.make_smem_desc_base(
            cute.recast_layout(128, op.b_dtype.width, sB_layout[0]),
            sB_swizzle,
            sm100_desc.Major.K
            if const_expr(op.b_major_mode == cute.nvgpu.tcgen05.mma.OperandMajorMode.K)
            else sm100_desc.Major.MN,
        )
    )
    smem_desc_base_b_lo, smem_desc_b_hi = i64_to_i32x2(smem_desc_base_b)
    smem_desc_base_b_lo = const_expr(smem_desc_base_b_lo)
    smem_desc_b_hi = const_expr(smem_desc_b_hi)

    # For TMEM-sourced A, offsets are computed in 32-bit TMEM words.
    tCrA_layout = (
        tCrA.layout
        if const_expr(not is_ts)
        else cute.recast_layout(32, tCrA.element_type.width, tCrA.layout)
    )
    # Per-K-tile linear offsets for A/B and their successive differences
    # (all compile-time constants that get baked into the PTX text).
    offset_a = [cute.crd2idx((0, 0, k), tCrA_layout) for k in range(cute.size(tCrA.shape[2]))]
    offset_a_diff = [offset_a[k] - offset_a[k - 1] for k in range(1, cute.size(tCrA.shape[2]))]
    offset_b = [cute.crd2idx((0, 0, k), tCrB.layout) for k in range(cute.size(tCrB.shape[2]))]
    offset_b_diff = [offset_b[k] - offset_b[k - 1] for k in range(1, cute.size(tCrB.shape[2]))]

    if const_expr(not is_ts):
        # Full descriptor low word = constant base OR runtime start address bits.
        smem_desc_start_a_lo = Int32(
            smem_desc_base_a_lo | sm100_desc.make_smem_desc_start_addr(sA[None, None, 0].iterator)
        )
        # ) + sA_offset
    else:
        smem_desc_start_a_lo = None
    smem_desc_start_b_lo = Int32(
        smem_desc_base_b_lo | sm100_desc.make_smem_desc_start_addr(sB[None, None, 0].iterator)
    )
    # First-MMA accumulate flag: predicate register if runtime, else literal 0/1.
    pred_str = "p" if isinstance(zero_init, Boolean) else "0" if zero_init else "1"
    if const_expr(not is_ts):
        # --- SS path: A and B both via SMEM descriptors; no mbarrier support here.
        assert mbar_ptr is None, "mbar_ptr must be None when a_src is not TMEM"
        llvm.inline_asm(
            None,
            [
                # acc.iterator.toint().ir_value(),
                Int32(cute.arch.make_warp_uniform(smem_desc_start_a_lo)).ir_value(),
                Int32(cute.arch.make_warp_uniform(smem_desc_start_b_lo)).ir_value(),
                Int32(not zero_init).ir_value(),
                Int32(cute.arch.make_warp_uniform(acc_tmem_addr)).ir_value(),
            ],
            # $0 = A desc lo, $1 = B desc lo, $2 = accumulate flag, $3 = acc TMEM addr
            "{\n\t"
            ".reg .pred leader_thread;\n\t"
            ".reg .pred p;\n\t"
            ".reg .b32 idesc;\n\t"
            ".reg .b32 tmem_acc;\n\t"
            ".reg .b32 smem_desc_a_lo_start, smem_desc_b_lo_start;\n\t"
            ".reg .b32 smem_desc_a_lo, smem_desc_b_lo;\n\t"
            ".reg .b32 smem_desc_a_hi, smem_desc_b_hi;\n\t"
            ".reg .b64 smem_desc_a, smem_desc_b;\n\t"
            "elect.sync _|leader_thread, -1;\n\t"
            f"mov.b32 idesc, {hex(idesc)};\n\t"
            # f"mov.b32 tmem_acc, {hex(acc_tmem_addr)};\n\t"
            f"mov.b32 tmem_acc, $3;\n\t"
            "mov.b32 smem_desc_a_lo_start, $0;\n\t"
            "mov.b32 smem_desc_b_lo_start, $1;\n\t"
            f"mov.b32 smem_desc_a_hi, {hex(smem_desc_a_hi)};\n\t"
            f"mov.b32 smem_desc_b_hi, {hex(smem_desc_b_hi)};\n\t"
            f"mov.b64 smem_desc_a, {{smem_desc_a_lo_start, smem_desc_a_hi}};\n\t"
            f"mov.b64 smem_desc_b, {{smem_desc_b_lo_start, smem_desc_b_hi}};\n\t"
            "setp.ne.b32 p, $2, 0;\n\t"
            f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, {pred_str};\n\t"
            + "".join(
                (
                    # Offsets are added to the *start* lo word (not chained) so the
                    # adds are independent and can be scheduled freely.
                    # f"add.u32 smem_desc_a_lo, smem_desc_a_lo, {hex(offset_a_diff[k - 1])};\n\t"
                    # f"add.u32 smem_desc_b_lo, smem_desc_b_lo, {hex(offset_b_diff[k - 1])};\n\t"
                    f"add.u32 smem_desc_a_lo, smem_desc_a_lo_start, {hex(offset_a[k])};\n\t"
                    f"add.u32 smem_desc_b_lo, smem_desc_b_lo_start, {hex(offset_b[k])};\n\t"
                    f"mov.b64 smem_desc_a, {{smem_desc_a_lo, smem_desc_a_hi}};\n\t"
                    f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                    f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, 1;\n\t"
                )
                for k in range(1, cute.size(tCrA.shape[2]))
            )
            + "}\n",
            # "r,r,r",
            "r,r,r,r",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    else:
        # --- TS path: A read directly from TMEM, B via SMEM descriptor.
        # For TS gemm, somehow tCrA.iterator.toint() returns 0 no matter what, so we need to
        # explicitly pass in the tA_addr for correctness.
        tA_addr = tCrA[None, None, 0].iterator.toint() if tA_addr is None else tA_addr
        input_args = [
            # Int32(cute.arch.make_warp_uniform(tCrA[None, None, 0].iterator.toint())).ir_value(),
            Int32(cute.arch.make_warp_uniform(tA_addr)).ir_value(),
            Int32(cute.arch.make_warp_uniform(smem_desc_start_b_lo)).ir_value(),
            Int32(not zero_init).ir_value(),
            Int32(cute.arch.make_warp_uniform(acc_tmem_addr)).ir_value(),
        ]
        if const_expr(mbar_ptr is not None):
            assert mbar_phase is not None, "mbar_phase must be provided when mbar_ptr is not None"
            assert split_arrive is not None, (
                "split_arrive must be provided when mbar_ptr is not None"
            )
            # Convert K-element count to a K-tile index at which to insert the wait.
            split_arrive_idx = split_arrive // op.shape_mnk[2]
            input_args.append(mbar_ptr.toint().ir_value())
            input_args.append(Int32(mbar_phase).ir_value())
            # Busy-wait loop on mbarrier parity ($4 = mbar addr, $5 = phase).
            mbar_wait_str = (
                ".reg .pred P1; \n\t"
                "LAB_WAIT: \n\t"
                "mbarrier.try_wait.parity.shared::cta.b64 P1, [$4], $5, 10000000; \n\t"
                "@P1 bra DONE; \n\t"
                "bra LAB_WAIT; \n\t"
                "DONE: \n\t"
            )
        else:
            mbar_wait_str = ""
        llvm.inline_asm(
            None,
            # [
            #     # acc.iterator.toint().ir_value(),
            #     Int32(tCrA[None, None, 0].iterator.toint()).ir_value(),
            #     Int32(smem_desc_start_b_lo).ir_value(),
            #     Int32(not zero_init).ir_value(),
            # ],
            input_args,
            # $0 = A TMEM addr, $1 = B desc lo, $2 = accumulate flag, $3 = acc TMEM addr
            "{\n\t"
            ".reg .pred leader_thread;\n\t"
            ".reg .pred p;\n\t"
            ".reg .b32 idesc;\n\t"
            ".reg .b32 tmem_acc;\n\t"
            ".reg .b32 tmem_a;\n\t"
            ".reg .b32 smem_desc_b_lo_start;\n\t"
            ".reg .b32 smem_desc_b_lo;\n\t"
            ".reg .b32 smem_desc_b_hi;\n\t"
            ".reg .b64 smem_desc_b;\n\t"
            "elect.sync _|leader_thread, -1;\n\t"
            f"mov.b32 idesc, {hex(idesc)};\n\t"
            # f"mov.b32 tmem_acc, {hex(acc_tmem_addr)};\n\t"
            f"mov.b32 tmem_acc, $3;\n\t"
            f"mov.b32 tmem_a, $0;\n\t"
            f"mov.b32 smem_desc_b_lo_start, $1;\n\t"
            f"mov.b32 smem_desc_b_hi, {hex(smem_desc_b_hi)};\n\t"
            f"mov.b64 smem_desc_b, {{smem_desc_b_lo_start, smem_desc_b_hi}};\n\t"
            "setp.ne.b32 p, $2, 0;\n\t"
            f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], [tmem_a], smem_desc_b, idesc, {pred_str};\n\t"
            + "".join(
                (
                    # f"add.u32 tmem_a, tmem_a, {hex(offset_a_diff[k - 1])};\n\t"
                    # f"add.u32 smem_desc_b_lo, smem_desc_b_lo, {hex(offset_b_diff[k - 1])};\n\t"
                    f"add.u32 smem_desc_b_lo, smem_desc_b_lo_start, {hex(offset_b[k])};\n\t"
                    f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                    # f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], [tmem_a], smem_desc_b, idesc, 1;\n\t"
                    f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], [tmem_a + {hex(offset_a[k])}], smem_desc_b, idesc, 1;\n\t"
                )
                # Issue up to split_arrive_idx K-tiles before the (optional) mbar wait.
                for k in range(
                    1,
                    cute.size(tCrA.shape[2]) if const_expr(mbar_ptr is None) else split_arrive_idx,
                )
            )
            + mbar_wait_str
            + (
                # Remaining K-tiles after the wait; B lo word advances by deltas here.
                "".join(
                    (
                        f"add.u32 smem_desc_b_lo, smem_desc_b_lo, {hex(offset_b_diff[k - 1])};\n\t"
                        f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                        f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], [tmem_a + {hex(offset_a[k])}], smem_desc_b, idesc, 1;\n\t"
                    )
                    for k in range(split_arrive_idx, cute.size(tCrA.shape[2]))
                )
                if const_expr(mbar_ptr is not None)
                else ""
            )
            + "}\n",
            "r,r,r,r" if const_expr(mbar_ptr is None) else "r,r,r,r,r,r",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
591
+
592
+
593
@cute.jit
def gemm_ptx_partial1(
    op: cute.nvgpu.tcgen05.mma.MmaOp,
    acc_tmem_addr: cutlass.Constexpr[int],
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    sA_base_addr_for_desc: Int32,
    sA_addr_offset_for_desc: cutlass.Constexpr[int],
    sA_stage: Int32,
    sB_base_addr_for_desc: Int32,
    sB_addr_offset_for_desc: cutlass.Constexpr[int],
    sB_stage: Int32,
    sA_layout: Optional[cute.Layout],
    sB_layout: Optional[cute.Layout],
    sA_swizzle: Optional[cute.Swizzle],
    sB_swizzle: cute.Swizzle,
    zero_init: bool | Boolean = False,
) -> None:
    """Variant of :func:`gemm_ptx_partial` with masked ``tcgen05.mma`` and
    stage-indexed SMEM descriptor addresses.

    Instead of a runtime descriptor low word, the SMEM start addresses are
    computed inside the asm as ``base + stage * addr_offset`` (``mad.lo.u32``),
    so the caller can pass a pipeline stage index directly.  The MMA form used
    here carries a 4-register mask operand (all zeros here).

    Args mirror gemm_ptx_partial; additionally:
        sA_base_addr_for_desc / sB_base_addr_for_desc: descriptor-encoded base
            addresses (runtime).
        sA_addr_offset_for_desc / sB_addr_offset_for_desc: per-stage address
            stride in descriptor units (compile-time).
        sA_stage / sB_stage: pipeline stage indices.
    """
    is_ts = op.a_src == cute.nvgpu.tcgen05.OperandSource.TMEM
    if const_expr(not is_ts):
        assert sA_layout is not None, "sA_layout must be provided when a_src is not TMEM"
        assert sA_swizzle is not None, "sA_swizzle must be provided when a_src is not TMEM"
    # Instruction descriptor is fully determined by the op -> compile-time constant.
    idesc: int = const_expr(sm100_desc.mma_op_to_idesc(op))
    if const_expr(not is_ts):
        smem_desc_base_a: int = const_expr(
            sm100_desc.make_smem_desc_base(
                cute.recast_layout(128, op.a_dtype.width, sA_layout[0]),
                sA_swizzle,
                sm100_desc.Major.K
                if const_expr(op.a_major_mode == cute.nvgpu.tcgen05.mma.OperandMajorMode.K)
                else sm100_desc.Major.MN,
            )
        )
        smem_desc_base_a_lo, smem_desc_a_hi = i64_to_i32x2(smem_desc_base_a)
        smem_desc_base_a_lo = const_expr(smem_desc_base_a_lo)
        smem_desc_a_hi = const_expr(smem_desc_a_hi)
    else:
        smem_desc_base_a = None
        smem_desc_base_a_lo, smem_desc_a_hi = None, None
    smem_desc_base_b: int = const_expr(
        sm100_desc.make_smem_desc_base(
            cute.recast_layout(128, op.b_dtype.width, sB_layout[0]),
            sB_swizzle,
            sm100_desc.Major.K
            if const_expr(op.b_major_mode == cute.nvgpu.tcgen05.mma.OperandMajorMode.K)
            else sm100_desc.Major.MN,
        )
    )
    smem_desc_base_b_lo, smem_desc_b_hi = i64_to_i32x2(smem_desc_base_b)
    smem_desc_base_b_lo = const_expr(smem_desc_base_b_lo)
    smem_desc_b_hi = const_expr(smem_desc_b_hi)
    # Mask operand of the masked MMA form; all-zero here (no masking).
    mask = [Int32(0)] * 4

    # Per-K-tile offsets in descriptor units (16B granules, hence >> 4) for SMEM,
    # or 32-bit TMEM words for TMEM-sourced A.
    # NOTE(review): the TS branch also indexes sA_layout, which is Optional and
    # only asserted non-None for the SS path — confirm callers always pass it.
    if const_expr(not is_ts):
        offset_a = [
            (cute.crd2idx((0, 0, k), sA_layout) * op.a_dtype.width // 8) >> 4
            for k in range(cute.size(tCrA.shape[2]))
        ]
    else:
        offset_a = [
            cute.crd2idx((0, 0, k), sA_layout) * op.a_dtype.width // 32
            for k in range(cute.size(tCrA.shape[2]))
        ]
    offset_a_diff = [offset_a[k] - offset_a[k - 1] for k in range(1, cute.size(tCrA.shape[2]))]
    offset_b = [
        (cute.crd2idx((0, 0, k), sB_layout) * op.b_dtype.width // 8) >> 4
        for k in range(cute.size(tCrB.shape[2]))
    ]
    offset_b_diff = [offset_b[k] - offset_b[k - 1] for k in range(1, cute.size(tCrB.shape[2]))]

    if const_expr(not is_ts):
        # Start lo word is just the constant base; the runtime address part is
        # folded in by the mad.lo.u32 inside the asm.
        # smem_desc_start_a_lo = Int32(smem_desc_base_a_lo | sm100_desc.make_smem_desc_start_addr(sA[None, None, 0].iterator))
        smem_desc_start_a_lo = const_expr(smem_desc_base_a_lo)
    else:
        smem_desc_start_a_lo = None
    # smem_desc_start_b_lo = Int32(smem_desc_base_b_lo | sm100_desc.make_smem_desc_start_addr(sB[None, None, 0].iterator))
    smem_desc_start_b_lo = const_expr(smem_desc_base_b_lo)
    # First-MMA accumulate flag: predicate register if runtime, else literal 0/1.
    pred_str = "p" if isinstance(zero_init, Boolean) else "0" if zero_init else "1"
    if const_expr(not is_ts):
        # --- SS path: descriptor lo words built as base + stage * stride + runtime base.
        llvm.inline_asm(
            None,
            [
                # $0 = A base, $1 = A stage, $2 = B base, $3 = B stage,
                # $4 = accumulate flag, $5-$8 = mask registers
                Int32(sA_base_addr_for_desc).ir_value(),
                Int32(sA_stage).ir_value(),
                Int32(sB_base_addr_for_desc).ir_value(),
                Int32(sB_stage).ir_value(),
                Int32(not zero_init).ir_value(),
                mask[0].ir_value(),
                mask[1].ir_value(),
                mask[2].ir_value(),
                mask[3].ir_value(),
            ],
            "{\n\t"
            ".reg .pred leader_thread;\n\t"
            ".reg .pred p;\n\t"
            ".reg .b32 idesc;\n\t"
            ".reg .b32 tmem_acc;\n\t"
            ".reg .b32 smem_desc_a_lo, smem_desc_b_lo;\n\t"
            ".reg .b32 smem_desc_a_hi, smem_desc_b_hi;\n\t"
            ".reg .b64 smem_desc_a, smem_desc_b;\n\t"
            "elect.sync _|leader_thread, -1;\n\t"
            f"mov.b32 idesc, {hex(idesc)};\n\t"
            f"mov.b32 tmem_acc, {hex(acc_tmem_addr)};\n\t"
            # lo = stage * per-stage stride + base (address part of the descriptor).
            f"mad.lo.u32 smem_desc_a_lo, $1, {hex(sA_addr_offset_for_desc)}, $0;\n\t"
            f"mad.lo.u32 smem_desc_b_lo, $3, {hex(sB_addr_offset_for_desc)}, $2;\n\t"
            f"mov.b32 smem_desc_a_hi, {hex(smem_desc_a_hi)};\n\t"
            f"mov.b32 smem_desc_b_hi, {hex(smem_desc_b_hi)};\n\t"
            f"mov.b64 smem_desc_a, {{smem_desc_a_lo, smem_desc_a_hi}};\n\t"
            f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
            "setp.ne.b32 p, $4, 0;\n\t"
            f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, {{$5, $6, $7, $8}}, {pred_str};\n\t"
            + "".join(
                (
                    f"add.u32 smem_desc_a_lo, smem_desc_a_lo, {hex(offset_a_diff[k - 1])};\n\t"
                    f"add.u32 smem_desc_b_lo, smem_desc_b_lo, {hex(offset_b_diff[k - 1])};\n\t"
                    f"mov.b64 smem_desc_a, {{smem_desc_a_lo, smem_desc_a_hi}};\n\t"
                    f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                    f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, {{$5, $6, $7, $8}}, 1;\n\t"
                )
                for k in range(1, cute.size(tCrA.shape[2]))
            )
            + "}\n",
            "r,r,r,r,r,r,r,r,r",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    else:
        # --- TS path: A read directly from TMEM, B via SMEM descriptor.
        # BUGFIX: the accumulator used to be expected as operand $0 of the asm
        # template, but it was dropped from the input list without renumbering the
        # remaining operands: the template referenced a nonexistent $7, the
        # constraint string declared 8 operands for 7 inputs, and tmem_a /
        # smem_desc_b_lo / the predicate were read from the wrong registers.
        # Since acc_tmem_addr is a compile-time constant here, embed it as an
        # immediate (as the SS branch above does) and renumber $0-$6.
        llvm.inline_asm(
            None,
            [
                # $0 = A TMEM addr, $1 = B desc lo, $2 = accumulate flag,
                # $3-$6 = mask registers
                Int32(tCrA[None, None, 0].iterator.toint()).ir_value(),
                Int32(smem_desc_start_b_lo).ir_value(),
                Int32(not zero_init).ir_value(),
                mask[0].ir_value(),
                mask[1].ir_value(),
                mask[2].ir_value(),
                mask[3].ir_value(),
            ],
            "{\n\t"
            ".reg .pred leader_thread;\n\t"
            ".reg .pred p;\n\t"
            ".reg .b32 idesc;\n\t"
            ".reg .b32 tmem_acc;\n\t"
            ".reg .b32 tmem_a;\n\t"
            ".reg .b32 smem_desc_b_lo;\n\t"
            ".reg .b32 smem_desc_b_hi;\n\t"
            ".reg .b64 smem_desc_b;\n\t"
            "elect.sync _|leader_thread, -1;\n\t"
            f"mov.b32 idesc, {hex(idesc)};\n\t"
            f"mov.b32 tmem_acc, {hex(acc_tmem_addr)};\n\t"
            "mov.b32 tmem_a, $0;\n\t"
            "mov.b32 smem_desc_b_lo, $1;\n\t"
            f"mov.b32 smem_desc_b_hi, {hex(smem_desc_b_hi)};\n\t"
            f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
            "setp.ne.b32 p, $2, 0;\n\t"
            f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], [tmem_a], smem_desc_b, idesc, {{$3, $4, $5, $6}}, {pred_str};\n\t"
            + "".join(
                (
                    f"add.u32 tmem_a, tmem_a, {hex(offset_a_diff[k - 1])};\n\t"
                    f"add.u32 smem_desc_b_lo, smem_desc_b_lo, {hex(offset_b_diff[k - 1])};\n\t"
                    f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                    f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], [tmem_a], smem_desc_b, idesc, {{$3, $4, $5, $6}}, 1;\n\t"
                )
                for k in range(1, cute.size(tCrA.shape[2]))
            )
            + "}\n",
            "r,r,r,r,r,r,r",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
770
+
771
+
772
@cute.jit
def gemm_ptx_precomputed(
    acc_tmem_addr: Int32,
    smem_desc_start_a: Int32,  # If TS, then this is the tmem start address for A
    smem_desc_start_b: Int32,
    idesc: int,
    smem_desc_base_a: Optional[int],
    smem_desc_base_b: int,
    tCrA_layout: cute.Layout,
    tCrB_layout: cute.Layout,
    mbar_ptr: Optional[cutlass.Pointer] = None,
    mbar_phase: Optional[Int32] = None,
    zero_init: bool | Boolean = False,
    cta_group: int = 1,
) -> None:
    """Issue a K-tiled ``tcgen05.mma`` sequence from precomputed descriptor parts.

    Like :func:`gemm_ptx_partial`, but the caller supplies the instruction
    descriptor and the base/start words of the SMEM descriptors directly,
    instead of tensors/ops — so no layout inspection of A/B tensors happens
    here.  A is TMEM-sourced ("TS") iff ``smem_desc_base_a is None``.

    In the TS+mbarrier case the wait is inserted at a fixed 3/4 point of the
    K loop (``num_k_tile // 4 * 3``) rather than at a caller-chosen split.

    Args:
        acc_tmem_addr: TMEM address of the accumulator.
        smem_desc_start_a: address bits for A's descriptor lo word (or A's TMEM
            start address in the TS case — see inline parameter comment).
        smem_desc_start_b: address bits for B's descriptor lo word.
        idesc: precomputed MMA instruction descriptor.
        smem_desc_base_a / smem_desc_base_b: address-free descriptor bases;
            ``smem_desc_base_a is None`` selects the TS path.
        tCrA_layout / tCrB_layout: partitioned layouts; shape[2] is the K-tile
            count (assumed equal for A and B).
        mbar_ptr / mbar_phase: optional mid-loop mbarrier wait (TS path only).
        zero_init: if a runtime ``Boolean``, first-MMA accumulate flag comes
            from a predicate register; otherwise baked in as a literal.
        cta_group: CTA group for the MMA instruction (1 or 2).
    """
    # acc_tmem_addr += acc_offset
    is_ts = const_expr(smem_desc_base_a is None)
    num_k_tile = cute.size(tCrA_layout.shape[2])
    if const_expr(not is_ts):
        smem_desc_base_a_lo, smem_desc_a_hi = i64_to_i32x2(smem_desc_base_a)
    else:
        smem_desc_base_a_lo, smem_desc_a_hi = None, None
    smem_desc_base_b_lo, smem_desc_b_hi = i64_to_i32x2(smem_desc_base_b)

    # For TMEM-sourced A, offsets are computed in 32-bit TMEM words.
    tCrA_layout = (
        tCrA_layout
        if const_expr(not is_ts)
        # else cute.recast_layout(32, tCrA.element_type.width, tCrA_layout)
        # currently hard-coding the width to 16
        else cute.recast_layout(32, 16, tCrA_layout)
    )
    # Per-K-tile offsets and successive deltas, baked into the PTX text.
    offset_a = [cute.crd2idx((0, 0, k), tCrA_layout) for k in range(num_k_tile)]
    offset_a_diff = [offset_a[k] - offset_a[k - 1] for k in range(1, num_k_tile)]
    offset_b = [cute.crd2idx((0, 0, k), tCrB_layout) for k in range(num_k_tile)]
    offset_b_diff = [offset_b[k] - offset_b[k - 1] for k in range(1, num_k_tile)]

    smem_desc_start_a_lo = None
    if const_expr(not is_ts):
        # Full descriptor lo word = constant base OR runtime start-address bits.
        smem_desc_start_a_lo = Int32(smem_desc_base_a_lo | smem_desc_start_a)
        # smem_desc_start_a_lo = smem_desc_start_a
    smem_desc_start_b_lo = Int32(smem_desc_base_b_lo | smem_desc_start_b)
    # First-MMA accumulate flag: predicate register if runtime, else literal 0/1.
    pred_str = "p" if isinstance(zero_init, Boolean) else "0" if zero_init else "1"
    if const_expr(not is_ts):
        # --- SS path: A and B both via SMEM descriptors; no mbarrier support here.
        assert mbar_ptr is None, "mbar_ptr must be None when a_src is not TMEM"
        llvm.inline_asm(
            None,
            [
                # acc.iterator.toint().ir_value(),
                Int32(cute.arch.make_warp_uniform(smem_desc_start_a_lo)).ir_value(),
                Int32(cute.arch.make_warp_uniform(smem_desc_start_b_lo)).ir_value(),
                Int32(not zero_init).ir_value(),
                Int32(cute.arch.make_warp_uniform(acc_tmem_addr)).ir_value(),
            ],
            # $0 = A desc lo, $1 = B desc lo, $2 = accumulate flag, $3 = acc TMEM addr
            "{\n\t"
            ".reg .pred leader_thread;\n\t"
            ".reg .pred p;\n\t"
            ".reg .b32 idesc;\n\t"
            ".reg .b32 tmem_acc;\n\t"
            ".reg .b32 smem_desc_a_lo_start, smem_desc_b_lo_start;\n\t"
            ".reg .b32 smem_desc_a_lo, smem_desc_b_lo;\n\t"
            ".reg .b32 smem_desc_a_hi, smem_desc_b_hi;\n\t"
            ".reg .b64 smem_desc_a, smem_desc_b;\n\t"
            "elect.sync _|leader_thread, -1;\n\t"
            f"mov.b32 idesc, {hex(idesc)};\n\t"
            # f"mov.b32 tmem_acc, {hex(acc_tmem_addr)};\n\t"
            f"mov.b32 tmem_acc, $3;\n\t"
            "mov.b32 smem_desc_a_lo_start, $0;\n\t"
            "mov.b32 smem_desc_b_lo_start, $1;\n\t"
            f"mov.b32 smem_desc_a_hi, {hex(smem_desc_a_hi)};\n\t"
            f"mov.b32 smem_desc_b_hi, {hex(smem_desc_b_hi)};\n\t"
            f"mov.b64 smem_desc_a, {{smem_desc_a_lo_start, smem_desc_a_hi}};\n\t"
            f"mov.b64 smem_desc_b, {{smem_desc_b_lo_start, smem_desc_b_hi}};\n\t"
            "setp.ne.b32 p, $2, 0;\n\t"
            f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, {pred_str};\n\t"
            + "".join(
                (
                    # Offsets are added to the *start* lo word so the adds are
                    # independent rather than a serial chain.
                    # f"add.u32 smem_desc_a_lo, smem_desc_a_lo, {hex(offset_a_diff[k - 1])};\n\t"
                    # f"add.u32 smem_desc_b_lo, smem_desc_b_lo, {hex(offset_b_diff[k - 1])};\n\t"
                    f"add.s32 smem_desc_a_lo, smem_desc_a_lo_start, {hex(offset_a[k])};\n\t"
                    f"add.s32 smem_desc_b_lo, smem_desc_b_lo_start, {hex(offset_b[k])};\n\t"
                    f"mov.b64 smem_desc_a, {{smem_desc_a_lo, smem_desc_a_hi}};\n\t"
                    f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                    f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], smem_desc_a, smem_desc_b, idesc, 1;\n\t"
                )
                for k in range(1, num_k_tile)
            )
            + "}\n",
            # "r,r,r",
            "r,r,r,r",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    else:
        # --- TS path: A read directly from TMEM, B via SMEM descriptor.
        input_args = [
            # $0 = A TMEM addr, $1 = B desc lo, $2 = accumulate flag, $3 = acc TMEM addr
            Int32(cute.arch.make_warp_uniform(smem_desc_start_a)).ir_value(),
            Int32(cute.arch.make_warp_uniform(smem_desc_start_b_lo)).ir_value(),
            Int32(not zero_init).ir_value(),
            Int32(cute.arch.make_warp_uniform(acc_tmem_addr)).ir_value(),
        ]
        if const_expr(mbar_ptr is not None):
            assert mbar_phase is not None, "mbar_phase must be provided when mbar_ptr is not None"
            input_args.append(mbar_ptr.toint().ir_value())
            input_args.append(Int32(mbar_phase).ir_value())
            # Busy-wait loop on mbarrier parity ($4 = mbar addr, $5 = phase).
            mbar_wait_str = (
                ".reg .pred P1; \n\t"
                "LAB_WAIT: \n\t"
                "mbarrier.try_wait.parity.shared::cta.b64 P1, [$4], $5, 10000000; \n\t"
                "@P1 bra DONE; \n\t"
                "bra LAB_WAIT; \n\t"
                "DONE: \n\t"
            )
        else:
            mbar_wait_str = ""
        llvm.inline_asm(
            None,
            # [
            #     # acc.iterator.toint().ir_value(),
            #     Int32(tCrA_layout[None, None, 0].iterator.toint()).ir_value(),
            #     Int32(smem_desc_start_b_lo).ir_value(),
            #     Int32(not zero_init).ir_value(),
            # ],
            input_args,
            "{\n\t"
            ".reg .pred leader_thread;\n\t"
            ".reg .pred p;\n\t"
            ".reg .b32 idesc;\n\t"
            ".reg .b32 tmem_acc;\n\t"
            ".reg .b32 tmem_a;\n\t"
            ".reg .b32 smem_desc_b_lo_start;\n\t"
            ".reg .b32 smem_desc_b_lo;\n\t"
            ".reg .b32 smem_desc_b_hi;\n\t"
            ".reg .b64 smem_desc_b;\n\t"
            "elect.sync _|leader_thread, -1;\n\t"
            f"mov.b32 idesc, {hex(idesc)};\n\t"
            # f"mov.b32 tmem_acc, {hex(acc_tmem_addr)};\n\t"
            f"mov.b32 tmem_acc, $3;\n\t"
            f"mov.b32 tmem_a, $0;\n\t"
            f"mov.b32 smem_desc_b_lo_start, $1;\n\t"
            f"mov.b32 smem_desc_b_hi, {hex(smem_desc_b_hi)};\n\t"
            f"mov.b64 smem_desc_b, {{smem_desc_b_lo_start, smem_desc_b_hi}};\n\t"
            "setp.ne.b32 p, $2, 0;\n\t"
            f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], [tmem_a], smem_desc_b, idesc, {pred_str};\n\t"
            + "".join(
                (
                    # f"add.u32 tmem_a, tmem_a, {hex(offset_a_diff[k - 1])};\n\t"
                    # f"add.u32 smem_desc_b_lo, smem_desc_b_lo, {hex(offset_b_diff[k - 1])};\n\t"
                    f"add.u32 smem_desc_b_lo, smem_desc_b_lo_start, {hex(offset_b[k])};\n\t"
                    f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                    # f"@leader_thread tcgen05.mma.cta_group::1.kind::f16 [tmem_acc], [tmem_a], smem_desc_b, idesc, 1;\n\t"
                    f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], [tmem_a + {hex(offset_a[k])}], smem_desc_b, idesc, 1;\n\t"
                )
                # With an mbarrier, issue the first 3/4 of the K loop before the wait.
                for k in range(
                    1,
                    num_k_tile if const_expr(mbar_ptr is None) else num_k_tile // 4 * 3,
                )
            )
            + mbar_wait_str
            + (
                # Remaining quarter of the K loop, issued after the wait.
                "".join(
                    (
                        # f"add.u32 smem_desc_b_lo, smem_desc_b_lo, {hex(offset_b_diff[k - 1])};\n\t"
                        f"add.u32 smem_desc_b_lo, smem_desc_b_lo_start, {hex(offset_b[k])};\n\t"
                        f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                        f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], [tmem_a + {hex(offset_a[k])}], smem_desc_b, idesc, 1;\n\t"
                    )
                    for k in range(num_k_tile // 4 * 3, num_k_tile)
                )
                if const_expr(mbar_ptr is not None)
                else ""
            )
            + "}\n",
            "r,r,r,r" if const_expr(mbar_ptr is None) else "r,r,r,r,r,r",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
949
+
950
+
951
@cute.jit
def declare_ptx_smem_desc(
    smem_desc_start_a: Int32,  # If TS, then this is the tmem start address for A
    smem_desc_base_a: Optional[int],
    tCrA_layout: cute.Layout,
    var_name_prefix: str = "smem_desc",
) -> None:
    """Pre-declare and initialize one 64-bit SMEM descriptor PTX register per K-tile.

    Emits ``.reg .b64 {prefix}_<num_k_tile>`` plus the moves/adds that build
    ``{prefix}_0 .. {prefix}_{num_k_tile-1}`` from the start lo word and the
    constant per-K-tile offsets.  A later inline-asm block that uses the same
    register names (e.g. :func:`gemm_ptx_precomputed_varname`) can then consume
    them without recomputing.

    NOTE(review): when A is TMEM-sourced (``smem_desc_base_a is None``) this
    function emits nothing at all — confirm that TS callers never rely on the
    registers being declared.

    Args:
        smem_desc_start_a: runtime address bits of the descriptor lo word.
        smem_desc_base_a: constant address-free descriptor base; ``None``
            selects the (currently no-op) TS path.
        tCrA_layout: partitioned layout; shape[2] is the K-tile count.
        var_name_prefix: PTX register name prefix to declare.
    """
    is_ts = const_expr(smem_desc_base_a is None)
    num_k_tile = cute.size(tCrA_layout.shape[2])
    smem_desc_base_a_lo, smem_desc_a_hi = None, None
    if const_expr(not is_ts):
        smem_desc_base_a_lo, smem_desc_a_hi = i64_to_i32x2(smem_desc_base_a)
    # For TMEM-sourced A, offsets would be in 32-bit TMEM words.
    tCrA_layout = (
        tCrA_layout
        if const_expr(not is_ts)
        # else cute.recast_layout(32, tCrA.element_type.width, tCrA_layout)
        # currently hard-coding the width to 16
        else cute.recast_layout(32, 16, tCrA_layout)
    )
    offset_a = [cute.crd2idx((0, 0, k), tCrA_layout) for k in range(num_k_tile)]
    smem_desc_start_a_lo = None
    if const_expr(not is_ts):
        # Full descriptor lo word = constant base OR runtime start-address bits.
        smem_desc_start_a_lo = Int32(smem_desc_base_a_lo | smem_desc_start_a)
    if const_expr(not is_ts):
        llvm.inline_asm(
            None,
            [Int32(cute.arch.make_warp_uniform(smem_desc_start_a_lo)).ir_value()],
            # $0 = descriptor lo word for K-tile 0; each further register pairs
            # ($0 + offset_a[k]) with the constant hi word.
            f".reg .b32 {var_name_prefix}_lo;\n\t"
            f".reg .b64 {var_name_prefix}_<{num_k_tile}>;\n\t"
            f"mov.b64 {var_name_prefix}_0, {{$0, {hex(smem_desc_a_hi)}}};\n\t"
            + "".join(
                (
                    f"add.s32 {var_name_prefix}_lo, $0, {hex(offset_a[k])};\n\t"
                    f"mov.b64 {var_name_prefix}_{k}, {{{var_name_prefix}_lo, {hex(smem_desc_a_hi)}}};\n\t"
                )
                for k in range(1, num_k_tile)
            ),
            "r",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
993
+
994
+
995
@cute.jit
def declare_ptx_idesc(op: cute.nvgpu.tcgen05.mma.MmaOp, var_name: str = "idesc") -> None:
    """Declare a PTX ``.b32`` register named ``var_name`` and preload it with the
    MMA instruction descriptor derived from *op*.

    The register lives outside any scoping braces, so subsequent inline-asm
    blocks that refer to the same name (e.g. ``gemm_ptx_precomputed_varname``)
    can reuse it without re-materializing the constant.
    """
    # The descriptor is a pure function of the op -> a compile-time constant.
    instr_desc = const_expr(sm100_desc.mma_op_to_idesc(op))
    decl_and_init = (
        f".reg .b32 {var_name};\n\t"  # noqa
        f"mov.b32 {var_name}, {hex(instr_desc)};\n\t"
    )
    llvm.inline_asm(
        None,
        [],
        decl_and_init,
        constraints="",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
1008
+
1009
+
1010
@cute.jit
def gemm_ptx_precomputed_varname(
    acc_tmem_addr: Int32,
    smem_desc_start_b: Int32,
    # idesc: int,
    smem_desc_base_b: int,
    tCrB_layout: cute.Layout,
    smem_var_name_prefix: str,
    idesc_var_name: str,
    smem_offset: int,
    zero_init: bool | Boolean = False,
    cta_group: int = 1,
) -> None:
    """Issue a K-tiled ``tcgen05.mma`` sequence reusing pre-declared PTX registers.

    Consumes, by name, PTX registers created by earlier inline-asm blocks:

    - ``{smem_var_name_prefix}_k``: one 64-bit A descriptor per K-tile (from
      :func:`declare_ptx_smem_desc`); each is advanced in place by
      ``smem_offset`` (decimal immediate) before use, so repeated calls walk A
      through pipeline stages.
    - ``{idesc_var_name}``: the instruction descriptor register (from
      :func:`declare_ptx_idesc`).

    B descriptors are built locally per K-tile into ``smem_desc_b_<num_k_tile>``
    from the runtime start lo word plus constant offsets.  All descriptor setup
    is hoisted ahead of the MMA sequence so the MMAs issue back-to-back.

    Only the SS path exists here: ``is_ts`` is hard-coded to ``False``, so A is
    always SMEM-descriptor addressed (no TMEM-A variant).

    Args:
        acc_tmem_addr: TMEM address of the accumulator.
        smem_desc_start_b: runtime address bits of B's descriptor lo word.
        smem_desc_base_b: constant address-free descriptor base for B.
        tCrB_layout: partitioned B layout; shape[2] is the K-tile count.
        smem_var_name_prefix / idesc_var_name: names of the pre-declared regs.
        smem_offset: per-call increment applied to each A descriptor lo word.
        zero_init: if a runtime ``Boolean``, first-MMA accumulate flag comes
            from a predicate register; otherwise baked in as a literal.
        cta_group: CTA group for the MMA instruction (1 or 2).
    """
    is_ts = False
    num_k_tile = cute.size(tCrB_layout.shape[2])
    smem_desc_base_b_lo, smem_desc_b_hi = i64_to_i32x2(smem_desc_base_b)
    # Per-K-tile offsets for B, baked into the PTX text.
    offset_b = [cute.crd2idx((0, 0, k), tCrB_layout) for k in range(num_k_tile)]

    # Full descriptor lo word = constant base OR runtime start-address bits.
    smem_desc_start_b_lo = Int32(smem_desc_base_b_lo | smem_desc_start_b)
    # First-MMA accumulate flag: predicate register if runtime, else literal 0/1.
    pred_str = "p" if isinstance(zero_init, Boolean) else "0" if zero_init else "1"
    if const_expr(not is_ts):
        llvm.inline_asm(
            None,
            [
                # $0 = B desc lo, $1 = accumulate flag, $2 = acc TMEM addr
                Int32(cute.arch.make_warp_uniform(smem_desc_start_b_lo)).ir_value(),
                Int32(not zero_init).ir_value(),
                Int32(cute.arch.make_warp_uniform(acc_tmem_addr)).ir_value(),
            ],
            "{\n\t"
            ".reg .pred leader_thread;\n\t"
            ".reg .pred p;\n\t"
            # ".reg .b32 idesc;\n\t"
            ".reg .b32 tmem_acc;\n\t"
            ".reg .b32 smem_desc_b_lo_start;\n\t"
            ".reg .b32 smem_desc_a_lo, smem_desc_b_lo;\n\t"
            ".reg .b32 smem_desc_a_hi, smem_desc_b_hi;\n\t"
            # ".reg .b64 smem_desc_b;\n\t"
            f".reg .b64 smem_desc_b_<{num_k_tile}>;\n\t"
            "elect.sync _|leader_thread, -1;\n\t"
            # f"mov.b32 idesc, {hex(idesc)};\n\t"
            # f"mov.b32 tmem_acc, {hex(acc_tmem_addr)};\n\t"
            f"mov.b32 tmem_acc, $2;\n\t"
            "mov.b32 smem_desc_b_lo_start, $0;\n\t"
            f"mov.b32 smem_desc_b_hi, {hex(smem_desc_b_hi)};\n\t"
            # Advance the pre-declared A descriptor for K-tile 0 in place by
            # smem_offset (unpack lo/hi, add, repack).
            f"mov.b64 {{smem_desc_a_lo, smem_desc_a_hi}}, {smem_var_name_prefix}_0;\n\t"
            f"add.s32 smem_desc_a_lo, smem_desc_a_lo, {smem_offset};\n\t"
            f"mov.b64 {smem_var_name_prefix}_0, {{smem_desc_a_lo, smem_desc_a_hi}};\n\t"
            f"mov.b64 smem_desc_b_0, {{smem_desc_b_lo_start, smem_desc_b_hi}};\n\t"
            + "".join(
                (
                    # Same in-place A advance for K-tile k, plus building B's
                    # per-K-tile descriptor — all before any MMA issues.
                    f"mov.b64 {{smem_desc_a_lo, smem_desc_a_hi}}, {smem_var_name_prefix}_{k};\n\t"
                    f"add.s32 smem_desc_a_lo, smem_desc_a_lo, {smem_offset};\n\t"
                    f"add.s32 smem_desc_b_lo, smem_desc_b_lo_start, {hex(offset_b[k])};\n\t"
                    f"mov.b64 {smem_var_name_prefix}_{k}, {{smem_desc_a_lo, smem_desc_a_hi}};\n\t"
                    f"mov.b64 smem_desc_b_{k}, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                )
                for k in range(1, num_k_tile)
            )
            + "setp.ne.b32 p, $1, 0;\n\t"
            # f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], {smem_var_name_prefix}_0, smem_desc_b, idesc, {pred_str};\n\t"
            f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], {smem_var_name_prefix}_0, smem_desc_b_0, {idesc_var_name}, {pred_str};\n\t"
            + "".join(
                (
                    # f"mov.b64 {{smem_desc_a_lo, smem_desc_a_hi}}, {smem_var_name_prefix}_{k};\n\t"
                    # f"add.s32 smem_desc_a_lo, smem_desc_a_lo, {smem_offset};\n\t"
                    # f"add.s32 smem_desc_b_lo, smem_desc_b_lo_start, {hex(offset_b[k])};\n\t"
                    # f"mov.b64 {smem_var_name_prefix}_{k}, {{smem_desc_a_lo, smem_desc_a_hi}};\n\t"
                    # f"mov.b64 smem_desc_b, {{smem_desc_b_lo, smem_desc_b_hi}};\n\t"
                    # f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], {smem_var_name_prefix}_{k}, smem_desc_b, idesc, 1;\n\t"
                    # f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], {smem_var_name_prefix}_{k}, smem_desc_b, {idesc_var_name}, 1;\n\t"
                    f"@leader_thread tcgen05.mma.cta_group::{cta_group}.kind::f16 [tmem_acc], {smem_var_name_prefix}_{k}, smem_desc_b_{k}, {idesc_var_name}, 1;\n\t"
                )
                for k in range(1, num_k_tile)
            )
            + "}\n",
            "r,r,r",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
build/torch-cuda/block_info.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
2
+ from typing import Tuple, Optional
3
+ from dataclasses import dataclass
4
+
5
+ import cutlass
6
+ import cutlass.cute as cute
7
+ from cutlass import Int32, const_expr
8
+
9
+ from .seqlen_info import SeqlenInfoQK
10
+
11
+
12
@dataclass(frozen=True)
class BlockInfo:
    """Compile-time description of the attention tiling.

    Given a query tile index (m_block) or key/value tile index (n_block),
    the methods below compute the half-open range of opposite-side tiles
    that can contain unmasked elements under causal / local (sliding-window)
    masking, optionally partitioned for split-KV.

    All ``Constexpr`` fields are baked into the generated kernel; the window
    sizes are runtime ``Int32`` values (or ``None`` when that side is unused).
    """

    # Tile extents along the query (M) and key/value (N) dimensions.
    tile_m: cutlass.Constexpr[int]
    tile_n: cutlass.Constexpr[int]
    # Compile-time masking-mode flags.
    is_causal: cutlass.Constexpr[bool]
    is_local: cutlass.Constexpr[bool] = False
    # Whether the KV block range is further partitioned across splits.
    is_split_kv: cutlass.Constexpr[bool] = False
    # Sliding-window extents for local attention; None disables that side.
    window_size_left: Optional[Int32] = None
    window_size_right: Optional[Int32] = None
    # Pack-GQA factor: number of query heads packed into one KV head's M dim.
    qhead_per_kvhead_packgqa: cutlass.Constexpr[int] = 1

    @cute.jit
    def get_n_block_min_max(
        self,
        seqlen_info: SeqlenInfoQK,
        m_block: Int32,
        split_idx: cutlass.Int32 = 0,
        num_splits: cutlass.Int32 = 1,
    ) -> Tuple[Int32, Int32]:
        """Return ``(n_block_min, n_block_max)`` — the half-open range of KV
        tiles that query tile ``m_block`` can attend to, after applying
        causal / right-window and left-window bounds and (optionally) the
        split-KV partition for ``split_idx``."""
        n_block_max = cute.ceil_div(seqlen_info.seqlen_k, self.tile_n)
        if const_expr(self.is_causal or (self.is_local and self.window_size_right is not None)):
            m_idx_max = (m_block + 1) * self.tile_m
            if const_expr(self.qhead_per_kvhead_packgqa > 1):
                # m indices are in packed (head-interleaved) space; convert back
                # to actual query-row indices.
                m_idx_max = cute.ceil_div(m_idx_max, self.qhead_per_kvhead_packgqa)
            # Align the query rows with the key columns: bottom-right diagonal.
            n_idx = m_idx_max + seqlen_info.seqlen_k - seqlen_info.seqlen_q
            n_idx_right = n_idx if const_expr(self.is_causal) else n_idx + self.window_size_right
            n_block_max = min(n_block_max, cute.ceil_div(n_idx_right, self.tile_n))
        n_block_min = 0
        if const_expr(self.is_local and self.window_size_left is not None):
            m_idx_min = m_block * self.tile_m
            if const_expr(self.qhead_per_kvhead_packgqa > 1):
                m_idx_min = m_idx_min // self.qhead_per_kvhead_packgqa
            n_idx = m_idx_min + seqlen_info.seqlen_k - seqlen_info.seqlen_q
            n_idx_left = n_idx - self.window_size_left
            n_block_min = cutlass.max(n_idx_left // self.tile_n, 0)
        if cutlass.const_expr(self.is_split_kv):
            # Split [n_block_min, n_block_max) evenly (ceil) across num_splits
            # and keep only this split's slice.
            num_n_blocks_per_split = (
                cutlass.Int32(0)
                if n_block_max <= n_block_min
                else (n_block_max - n_block_min + num_splits - 1) // num_splits
            )
            n_block_min = n_block_min + split_idx * num_n_blocks_per_split
            n_block_max = cutlass.min(n_block_min + num_n_blocks_per_split, n_block_max)
        return n_block_min, n_block_max

    @cute.jit
    def get_m_block_min_max(self, seqlen_info: SeqlenInfoQK, n_block: Int32) -> Tuple[Int32, Int32]:
        """Return ``(m_block_min, m_block_max)`` — the half-open range of Q
        tiles whose rows can attend to KV tile ``n_block`` (the transpose of
        ``get_n_block_min_max``)."""
        m_block_max = cute.ceil_div(seqlen_info.seqlen_q, self.tile_m)
        m_block_min = 0
        if const_expr(self.is_causal or (self.is_local and self.window_size_right is not None)):
            n_idx_min = n_block * self.tile_n
            m_idx = n_idx_min + seqlen_info.seqlen_q - seqlen_info.seqlen_k
            m_idx_right = m_idx if const_expr(self.is_causal) else m_idx - self.window_size_right
            m_block_min = max(m_block_min, m_idx_right // self.tile_m)
        if const_expr(self.is_local and self.window_size_left is not None):
            n_idx_max = (n_block + 1) * self.tile_n
            m_idx = n_idx_max + seqlen_info.seqlen_q - seqlen_info.seqlen_k
            m_idx_left = m_idx + self.window_size_left
            m_block_max = min(m_block_max, cute.ceil_div(m_idx_left, self.tile_m))
        return m_block_min, m_block_max

    @cute.jit
    def get_n_block_min_causal_local_mask(
        self,
        seqlen_info: SeqlenInfoQK,
        m_block: Int32,
        n_block_min: Int32,
    ) -> Int32:
        """If we have separate iterations with causal or local masking at the start, where do we stop.

        Returns the first n_block (counting down from n_block_max) that is
        fully below the diagonal / right window, i.e. needs no masking."""
        m_idx_min = m_block * self.tile_m
        if const_expr(self.qhead_per_kvhead_packgqa > 1):
            m_idx_min = m_idx_min // self.qhead_per_kvhead_packgqa
        n_idx = m_idx_min + seqlen_info.seqlen_k - seqlen_info.seqlen_q
        n_idx_right = (
            n_idx
            if const_expr(not self.is_local or self.window_size_right is None)
            else n_idx + self.window_size_right
        )
        return cutlass.max(n_block_min, n_idx_right // self.tile_n)

    @cute.jit
    def get_n_block_min_before_local_mask(
        self,
        seqlen_info: SeqlenInfoQK,
        m_block: Int32,
        n_block_min: Int32,
    ) -> Int32:
        """If we have separate iterations with local masking at the end, where do we stop the non-masked iterations.

        Only relevant for local attention with a left window; otherwise the
        incoming n_block_min is returned unchanged."""
        if const_expr(not self.is_local or self.window_size_left is None):
            return n_block_min
        else:
            m_idx_max = (m_block + 1) * self.tile_m
            if const_expr(self.qhead_per_kvhead_packgqa > 1):
                m_idx_max = cute.ceil_div(m_idx_max, self.qhead_per_kvhead_packgqa)
            n_idx = m_idx_max + seqlen_info.seqlen_k - seqlen_info.seqlen_q
            n_idx_left = n_idx - self.window_size_left
            return cutlass.max(n_block_min, cute.ceil_div(n_idx_left, self.tile_n))
build/torch-cuda/block_sparse_utils.py ADDED
@@ -0,0 +1,1476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Block-sparse runtime utilities for CUTE DSL kernels.
3
+
4
+ This module contains runtime execution functions for block-sparse attention kernels.
5
+ These utilities are used by CUTE DSL kernels to produce and consume block-sparse loads.
6
+ """
7
+
8
+ from typing import Callable, Optional
9
+ from functools import partial
10
+ import math
11
+ import cutlass
12
+ import cutlass.cute as cute
13
+ from cutlass import Float32, Int32, const_expr
14
+
15
+ from .quack import copy_utils
16
+
17
+ # Import data structures from block_sparsity
18
+ from .block_sparsity import BlockSparseTensors
19
+ from .named_barrier import NamedBarrierBwd
20
+
21
+
22
+ # NOTE [SM100 block-sparse empty tiles: mbarrier contract]
23
+ #
24
+ # For block-sparse SM100 forward, a given (m_block, stage) Q tile can have zero active
25
+ # KV blocks (total_block_cnt == 0). In that case there is no seqlen_kv iteration, so
26
+ # the softmax warp-group has no row stats to publish.
27
+ #
28
+ # The correction warp-group seeds fully-masked-row stats and runs the usual correction
29
+ # epilogue so output/LSE have well-defined values. Both warp-groups must still perform
30
+ # the softmax<->correction mbarrier handshake so phases advance correctly across
31
+ # empty->empty and empty->non-empty tile sequences.
32
+ #
33
+ # In the no-sink case, this corresponds to the usual fully-masked-row convention:
34
+ # output is zero and LSE is -inf.
35
+ #
36
+ # Barrier contract (each is `mbar_ptr + <offset> + stage`):
37
+ #
38
+ # Producer/consumer pairs:
39
+ # - `mbar_softmax_corr_full` : softmax arrive -> correction wait
40
+ # - `mbar_softmax_corr_empty` : correction arrive -> softmax wait
41
+ # - `mbar_P_full_O_rescaled` : softmax arrive (+ correction arrive) -> MMA wait
42
+ # - `mbar_P_full_2` : softmax arrive -> MMA wait
43
+ # - `mbar_corr_epi_full_/empty` : correction <-> epilogue (only when epilogue is separate)
44
+ #
45
+ # Empty tile (`total_block_cnt == 0`):
46
+ # - Softmax: skips the seqlen_kv softmax path entirely (no P stores, no `mbar_P_full_*`).
47
+ # It only arrives `mbar_softmax_corr_full` once per stage as a synthetic "no work" signal.
48
+ # At the `softmax_loop` level, softmax unconditionally waits `mbar_softmax_corr_empty`
49
+ # before each tile (when block-sparse) to drain a prior correction arrival and keep
50
+ # phases aligned across non-empty -> empty transitions.
51
+ # - Correction: waits `mbar_softmax_corr_full`, seeds stats + runs `correction_epilogue(scale=0)`,
52
+ # and arrives `mbar_softmax_corr_empty` (and `mbar_corr_epi_full_/empty` when applicable).
53
+ # - No `mbar_P_full_*` barriers are arrived (no P, no MMA O); only the softmax<->correction
54
+ # (and correction<->epilogue) handshakes advance phases.
55
+ #
56
+ # Non-empty tile:
57
+ # - Softmax: runs `softmax_step` (produces P) and uses `mbar_softmax_corr_full/empty` to
58
+ # publish row_max (during seqlen_kv) and final row stats (once per tile), and to advance phases;
59
+ # arrives `mbar_P_full_*` when P is stored.
60
+ # - Correction: waits `mbar_softmax_corr_full`, may rescale/release O, arrives `mbar_softmax_corr_empty`
61
+ # to ack/advance, and arrives `mbar_P_full_O_rescaled` when MMA can proceed.
62
+ #
63
+ # Backward (SM100):
64
+ # - Empty KV tile: for a given `n_block`, `total_m_block_cnt == 0` means no Q tiles contribute.
65
+ # - Both the load and compute loops guard all pipeline work on `process_tile`, so empty tiles
66
+ # skip producer/consumer operations entirely (no per-tile mbarrier phase handshake like forward).
67
+ # - In the `not dKV_postprocess` path, dK/dV for empty KV tiles are explicitly written as zeros
68
+ # even when `process_tile == False` (see `flash_bwd_sm100.py` `should_zero_dKV`).
69
+
70
+
71
@cute.jit
def load_block_list(
    block_indices: cute.Tensor,
    block_count,
    load_q_with_first: cutlass.Constexpr,
    first_block_preloaded: cutlass.Constexpr,
    kv_producer_state,
    load_Q,
    load_K,
    load_V,
    pipeline_k,
    pipeline_v,
    use_tma_q: cutlass.Constexpr,
    tma_q_bytes: cutlass.Constexpr,
    intra_wg_overlap: cutlass.Constexpr,
):
    """Iterate over the sparse blocks and load K, V (and Q) into the pipeline.
    for the intra_wg_overlap case, we overlap the loads of K and V. And this
    means we need to pipeline the last V load from the partial block case,
    with the loads for the full blocks. Set first_block_preloaded when the
    caller has already issued the first K load for the list.

    Note:
        we iterate along the block_n indices in reverse.

    Returns:
        Updated kv_producer_state after processing the block list.

    """
    if block_count > 0:
        if const_expr(not intra_wg_overlap):
            # Peel first iteration: the first block may need to load Q alongside K,
            # Parameters are already Constexpr, so no need to wrap in const_expr()
            n_block_first = block_indices[block_count - 1]
            # Fold Q's TMA byte count into the K stage's expected transaction
            # count so a single barrier tracks both the Q and K copies.
            extra_tx = tma_q_bytes if const_expr(load_q_with_first) and const_expr(use_tma_q) else 0
            pipeline_k.producer_acquire(kv_producer_state, extra_tx_count=extra_tx)

            if const_expr(load_q_with_first and use_tma_q):
                # Q completion is signaled on the same barrier as the first K.
                load_Q(tma_bar_ptr=pipeline_k.producer_get_barrier(kv_producer_state))

            load_K(src_idx=n_block_first, producer_state=kv_producer_state)
            pipeline_v.producer_acquire(kv_producer_state)
            load_V(src_idx=n_block_first, producer_state=kv_producer_state)
            kv_producer_state.advance()

            # Remaining blocks: plain K-then-V per pipeline stage.
            for offset in cutlass.range(1, block_count):
                n_block = block_indices[block_count - 1 - offset]
                pipeline_k.producer_acquire(kv_producer_state)
                load_K(src_idx=n_block, producer_state=kv_producer_state)
                pipeline_v.producer_acquire(kv_producer_state)
                load_V(src_idx=n_block, producer_state=kv_producer_state)
                kv_producer_state.advance()
        else:
            # Overlapped schedule: K for block i+1 is issued before V for
            # block i; the list's final V load is intentionally left pending
            # for the caller (see finish_overlap_v_load).
            n_block_first = block_indices[block_count - 1]
            if const_expr(not first_block_preloaded):
                extra_tx = (
                    tma_q_bytes if const_expr(load_q_with_first) and const_expr(use_tma_q) else 0
                )
                pipeline_k.producer_acquire(kv_producer_state, extra_tx_count=extra_tx)

                if const_expr(load_q_with_first and use_tma_q):
                    load_Q(tma_bar_ptr=pipeline_k.producer_get_barrier(kv_producer_state))

                load_K(src_idx=n_block_first, producer_state=kv_producer_state)

            for idx in cutlass.range(block_count - 1, unroll=1):
                n_block_prev = block_indices[block_count - 1 - idx]
                n_block = block_indices[block_count - 2 - idx]
                # Keep the previous stage alive for V while advancing to the
                # next stage for K.
                kv_producer_state_prev = kv_producer_state.clone()
                kv_producer_state.advance()
                pipeline_k.producer_acquire(kv_producer_state)
                load_K(src_idx=n_block, producer_state=kv_producer_state)
                pipeline_v.producer_acquire(kv_producer_state_prev)
                load_V(src_idx=n_block_prev, producer_state=kv_producer_state_prev)

    return kv_producer_state
147
+
148
+
149
@cute.jit
def finish_overlap_v_load(
    block_indices: cute.Tensor,
    block_count,
    load_V,
    pipeline_v,
    kv_producer_state,
):
    """Drain the pending V copy left behind by an overlapped K/V block list.

    The overlapped schedule in ``load_block_list`` defers the final V load;
    this issues it for the list's last block (index 0, since the list is
    walked in reverse) and steps the producer state past that stage.

    Returns:
        Updated kv_producer_state.
    """
    if block_count > 0:
        tail_n_block = block_indices[0]
        pipeline_v.producer_acquire(kv_producer_state)
        load_V(src_idx=tail_n_block, producer_state=kv_producer_state)
        kv_producer_state.advance()

    return kv_producer_state
165
+
166
+
167
@cute.jit
def sparse_tensor_m_block(
    m_block,
    qhead_per_kvhead: cutlass.Constexpr[int],
    q_subtile_factor: cutlass.Constexpr[int],
):
    """Translate a (possibly pack-GQA'd / subtiled) m_block index into the
    index used by the block-sparse count/index tensors.

    Both divisions are compile-time guarded, so the common 1x/1x case
    compiles to a no-op.
    """
    sparse_block = m_block
    if const_expr(qhead_per_kvhead != 1):
        # Undo pack-GQA head interleaving along M.
        sparse_block = sparse_block // qhead_per_kvhead
    if const_expr(q_subtile_factor != 1):
        # Multiple compute subtiles share one sparse-metadata row.
        sparse_block = sparse_block // q_subtile_factor
    return sparse_block
180
+
181
+
182
@cute.jit
def produce_block_sparse_loads(
    blocksparse_tensors: BlockSparseTensors,
    batch_idx,
    head_idx,
    m_block,
    kv_producer_state,
    load_Q,
    load_K,
    load_V,
    pipeline_k,
    pipeline_v,
    use_tma_q: cutlass.Constexpr,
    tma_q_bytes: cutlass.Constexpr,
    intra_wg_overlap: cutlass.Constexpr,
    qhead_per_kvhead: cutlass.Constexpr[int] = 1,
    q_subtile_factor: cutlass.Constexpr[int] = 1,
):
    """Iterate over the mask and full block lists for a single tile.

    The masked (partial) list may leave the last V load pending when intra-warp-group
    overlap is enabled. The first full block must consume that pending V while
    issuing its own K load on the next pipeline stage.

    In the intra-wg-overlap path, the last masked block leaves its V copy in flight
    while we advance the producer state to start the next full K. Either the full list
    overlaps that pending V load, or, if no full blocks exist, we explicitly drain it.

    Args:
        qhead_per_kvhead: Pack-GQA factor. When > 1, m_block is in packed space and
            must be converted to unpacked for sparse tensor indexing.
    """

    mask_block_cnt, mask_block_idx, full_block_cnt, full_block_idx = blocksparse_tensors

    m_block_sparse = sparse_tensor_m_block(m_block, qhead_per_kvhead, q_subtile_factor)

    # Per-(batch, head, m_block) counters and reverse-ordered block lists.
    curr_mask_block_cnt = mask_block_cnt[batch_idx, head_idx, m_block_sparse]
    curr_mask_block_idx = mask_block_idx[batch_idx, head_idx, m_block_sparse, None]

    if const_expr(full_block_cnt is not None):
        curr_full_block_cnt = full_block_cnt[batch_idx, head_idx, m_block_sparse]
        curr_full_block_idx = full_block_idx[batch_idx, head_idx, m_block_sparse, None]
    else:
        # No separate full-block table: treat everything as masked blocks.
        curr_full_block_cnt = Int32(0)
        curr_full_block_idx = None

    mask_empty = curr_mask_block_cnt == 0
    full_empty = curr_full_block_cnt == 0

    if mask_empty:
        # No masked blocks: the full list owns the initial Q+K load.
        kv_producer_state = load_block_list(
            curr_full_block_idx,
            curr_full_block_cnt,
            load_q_with_first=True,
            first_block_preloaded=False,
            kv_producer_state=kv_producer_state,
            load_Q=load_Q,
            load_K=load_K,
            load_V=load_V,
            pipeline_k=pipeline_k,
            pipeline_v=pipeline_v,
            use_tma_q=use_tma_q,
            tma_q_bytes=tma_q_bytes,
            intra_wg_overlap=intra_wg_overlap,
        )

        # Overlap leaves the last V load pending; drain it here.
        if const_expr(intra_wg_overlap) and curr_full_block_cnt > 0:
            kv_producer_state = finish_overlap_v_load(
                curr_full_block_idx,
                curr_full_block_cnt,
                load_V,
                pipeline_v,
                kv_producer_state,
            )
    else:
        # Masked blocks present: load Q together with the first masked K so consumers can
        # start immediately. When overlap is disabled this fully drains the list.
        kv_producer_state = load_block_list(
            curr_mask_block_idx,
            curr_mask_block_cnt,
            load_q_with_first=True,
            first_block_preloaded=False,
            kv_producer_state=kv_producer_state,
            load_Q=load_Q,
            load_K=load_K,
            load_V=load_V,
            pipeline_k=pipeline_k,
            pipeline_v=pipeline_v,
            use_tma_q=use_tma_q,
            tma_q_bytes=tma_q_bytes,
            intra_wg_overlap=intra_wg_overlap,
        )

        if full_empty:
            # No full list to absorb the pending masked V: drain it explicitly.
            if const_expr(intra_wg_overlap):
                kv_producer_state = finish_overlap_v_load(
                    curr_mask_block_idx,
                    curr_mask_block_cnt,
                    load_V,
                    pipeline_v,
                    kv_producer_state,
                )
        else:
            if const_expr(intra_wg_overlap):
                # Bridge the masked list to the full list by overlapping the pending masked V
                # with the first full K load.
                n_block_mask_last = curr_mask_block_idx[0]
                n_block_full_first = curr_full_block_idx[curr_full_block_cnt - 1]
                kv_producer_state_prev = kv_producer_state.clone()
                kv_producer_state.advance()
                pipeline_k.producer_acquire(kv_producer_state)
                load_K(src_idx=n_block_full_first, producer_state=kv_producer_state)
                pipeline_v.producer_acquire(kv_producer_state_prev)
                load_V(src_idx=n_block_mask_last, producer_state=kv_producer_state_prev)

                # First full K already issued above, so first_block_preloaded=True.
                kv_producer_state = load_block_list(
                    curr_full_block_idx,
                    curr_full_block_cnt,
                    load_q_with_first=False,
                    first_block_preloaded=True,
                    kv_producer_state=kv_producer_state,
                    load_Q=load_Q,
                    load_K=load_K,
                    load_V=load_V,
                    pipeline_k=pipeline_k,
                    pipeline_v=pipeline_v,
                    use_tma_q=use_tma_q,
                    tma_q_bytes=tma_q_bytes,
                    intra_wg_overlap=intra_wg_overlap,
                )

                kv_producer_state = finish_overlap_v_load(
                    curr_full_block_idx,
                    curr_full_block_cnt,
                    load_V,
                    pipeline_v,
                    kv_producer_state,
                )
            else:
                # Non-overlap path with both lists: run the full list normally (skipping the Q
                # reload because the masked list already issued it).
                kv_producer_state = load_block_list(
                    curr_full_block_idx,
                    curr_full_block_cnt,
                    load_q_with_first=False,
                    first_block_preloaded=False,
                    kv_producer_state=kv_producer_state,
                    load_Q=load_Q,
                    load_K=load_K,
                    load_V=load_V,
                    pipeline_k=pipeline_k,
                    pipeline_v=pipeline_v,
                    use_tma_q=use_tma_q,
                    tma_q_bytes=tma_q_bytes,
                    intra_wg_overlap=intra_wg_overlap,
                )

    return kv_producer_state
342
+
343
+
344
@cute.jit
def consume_block_sparse_loads(
    blocksparse_tensors: BlockSparseTensors,
    batch_idx,
    head_idx,
    m_block,
    seqlen,
    kv_consumer_state,
    mma_pv_fn,
    mma_one_n_block,
    process_first_half_block,
    process_last_half_block,
    mask_fn,
    score_mod_fn,
    O_should_accumulate,
    mask_mod,
    fastdiv_mods,
    intra_wg_overlap: cutlass.Constexpr,
    warp_scheduler_barrier_sync: Callable,
    warp_scheduler_barrier_arrive: Callable,
    qhead_per_kvhead: cutlass.Constexpr[int] = 1,
    q_subtile_factor: cutlass.Constexpr[int] = 1,
):
    """Consume the mask and full block lists for a single tile on the consumer side.

    Mirrors `produce_block_sparse_loads` so that the consumer pipeline uses
    the same sparse tensor indexing. Blocks are consumed in reverse list
    order, matching the producer. Masked blocks apply `mask_mod`; full
    blocks run with `mask_mod=None` since they are known to be unmasked.

    Args:
        qhead_per_kvhead: Pack-GQA factor. When > 1, m_block is in packed space and
            must be converted to unpacked for sparse tensor indexing.

    Returns:
        (kv_consumer_state, O_should_accumulate, processed_any)
    """

    mask_block_cnt, mask_block_idx, full_block_cnt, full_block_idx = blocksparse_tensors

    m_block_sparse = sparse_tensor_m_block(m_block, qhead_per_kvhead, q_subtile_factor)

    curr_mask_block_cnt = mask_block_cnt[batch_idx, head_idx, m_block_sparse]
    curr_mask_block_idx = mask_block_idx[batch_idx, head_idx, m_block_sparse, None]
    curr_full_block_cnt = full_block_cnt[batch_idx, head_idx, m_block_sparse]
    curr_full_block_idx = full_block_idx[batch_idx, head_idx, m_block_sparse, None]

    # Whether this tile had any work at all (caller may skip the epilogue path).
    processed_any = curr_mask_block_cnt + curr_full_block_cnt > 0

    if const_expr(not intra_wg_overlap):
        # Non-overlap schedule: bracket all MMA work for this tile between
        # warp_scheduler_barrier_sync() and warp_scheduler_barrier_arrive().
        if curr_mask_block_cnt > 0:
            # First masked block also applies the seqlen bound.
            mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1]
            warp_scheduler_barrier_sync()
            kv_consumer_state = mma_one_n_block(
                kv_consumer_state,
                n_block=mask_n_block,
                mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
                mask_fn=partial(
                    mask_fn,
                    mask_mod=mask_mod,
                    mask_seqlen=True,
                    fastdiv_mods=fastdiv_mods if cutlass.const_expr(mask_mod is not None) else None,
                ),
                is_first_n_block=True,
            )
            O_should_accumulate = True
            for i in cutlass.range(1, curr_mask_block_cnt):
                mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1 - i]
                kv_consumer_state = mma_one_n_block(
                    kv_consumer_state,
                    n_block=mask_n_block,
                    mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
                    mask_fn=partial(mask_fn, mask_mod=mask_mod, mask_seqlen=False),
                    is_first_n_block=False,
                )
                O_should_accumulate = True
            # No full list follows, so close the scheduler bracket here.
            if curr_full_block_cnt == 0:
                warp_scheduler_barrier_arrive()

        if curr_full_block_cnt > 0:
            full_n_block = curr_full_block_idx[curr_full_block_cnt - 1]
            if curr_mask_block_cnt == 0:
                # Full list opens the tile itself.
                warp_scheduler_barrier_sync()
                # NOTE(review): unlike the branch below, mask_mod is left at
                # mask_fn's default here rather than forced to None — confirm
                # this is intentional for the first full block.
                kv_consumer_state = mma_one_n_block(
                    kv_consumer_state,
                    n_block=full_n_block,
                    mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
                    mask_fn=partial(mask_fn, mask_seqlen=True),
                    is_first_n_block=True,
                )
                O_should_accumulate = True
                for i in cutlass.range(1, curr_full_block_cnt):
                    full_n_block = curr_full_block_idx[curr_full_block_cnt - 1 - i]
                    kv_consumer_state = mma_one_n_block(
                        kv_consumer_state,
                        n_block=full_n_block,
                        mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
                        mask_fn=partial(mask_fn, mask_seqlen=False),
                        is_first_n_block=False,
                    )
                    O_should_accumulate = True
            else:
                # Masked list already ran: full blocks skip mask_mod entirely.
                kv_consumer_state = mma_one_n_block(
                    kv_consumer_state,
                    n_block=full_n_block,
                    mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
                    mask_fn=partial(mask_fn, mask_mod=None, mask_seqlen=True),
                    is_first_n_block=False,
                )
                O_should_accumulate = True
                for i in cutlass.range(1, curr_full_block_cnt):
                    full_n_block = curr_full_block_idx[curr_full_block_cnt - 1 - i]
                    kv_consumer_state = mma_one_n_block(
                        kv_consumer_state,
                        n_block=full_n_block,
                        mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
                        mask_fn=partial(mask_fn, mask_mod=None, mask_seqlen=False),
                    is_first_n_block=False,
                    )
                    O_should_accumulate = True
            warp_scheduler_barrier_arrive()
    else:
        # Overlapped schedule: the first block is split into half-block
        # processing, and the tile is closed with process_last_half_block.
        if curr_mask_block_cnt > 0:
            mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1]
            kv_consumer_state = process_first_half_block(
                n_block=mask_n_block,
                seqlen=seqlen,
                kv_consumer_state=kv_consumer_state,
                mask_fn=partial(
                    mask_fn,
                    mask_mod=mask_mod,
                    mask_seqlen=True,
                    fastdiv_mods=fastdiv_mods if cutlass.const_expr(mask_mod is not None) else None,
                ),
                score_mod_fn=score_mod_fn,
                is_first_block=True,
            )
            for i in cutlass.range(1, curr_mask_block_cnt):
                mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1 - i]
                kv_consumer_state = mma_one_n_block(
                    kv_consumer_state,
                    n_block=mask_n_block,
                    seqlen=seqlen,
                    mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
                    mask_fn=partial(mask_fn, mask_mod=mask_mod, mask_seqlen=False),
                )
                O_should_accumulate = True

        if curr_full_block_cnt > 0:
            full_n_block = curr_full_block_idx[curr_full_block_cnt - 1]
            if curr_mask_block_cnt == 0:
                kv_consumer_state = process_first_half_block(
                    n_block=full_n_block,
                    seqlen=seqlen,
                    kv_consumer_state=kv_consumer_state,
                    mask_fn=partial(mask_fn, mask_mod=None, mask_seqlen=True),
                    score_mod_fn=score_mod_fn,
                    is_first_block=True,
                )
            else:
                kv_consumer_state = mma_one_n_block(
                    kv_consumer_state,
                    n_block=full_n_block,
                    seqlen=seqlen,
                    mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
                    mask_fn=partial(mask_fn, mask_mod=None, mask_seqlen=True),
                )
                O_should_accumulate = True
            for i in cutlass.range(1, curr_full_block_cnt):
                full_n_block = curr_full_block_idx[curr_full_block_cnt - 1 - i]
                kv_consumer_state = mma_one_n_block(
                    kv_consumer_state,
                    n_block=full_n_block,
                    seqlen=seqlen,
                    mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
                    mask_fn=partial(mask_fn, mask_mod=None, mask_seqlen=False),
                )
                O_should_accumulate = True

        if curr_mask_block_cnt + curr_full_block_cnt > 0:
            kv_consumer_state = process_last_half_block(
                kv_consumer_state=kv_consumer_state,
                zero_init=not O_should_accumulate,
            )
            O_should_accumulate = True

    return kv_consumer_state, O_should_accumulate, processed_any
526
+
527
+
528
@cute.jit
def load_block_list_sm100(
    block_indices: cute.Tensor,
    block_count,
    load_q_with_first: cutlass.Constexpr,
    q_stage: cutlass.Constexpr,
    kv_producer_state,
    load_Q,
    load_K,
    load_V,
    pipeline_kv,
):
    """SM100 version of load_block_list (no intra_wg_overlap, no extra_tx_count).

    K and V share one pipeline (pipeline_kv) and each occupies its own stage,
    so the producer state advances twice per block. Blocks are walked in
    reverse, like the SM90 variant.

    Returns:
        Updated kv_producer_state after processing the block list.
    """
    if block_count > 0:
        # First iteration: load Q alongside K if requested
        n_block_first = block_indices[block_count - 1]

        if const_expr(load_q_with_first):
            # SM100 loads Q0 and optionally Q1
            load_Q(block=0, stage=0)
            if const_expr(q_stage == 2):
                load_Q(block=1, stage=1)

        # SM100 doesn't use producer_acquire for pipeline_kv in load path
        # The pipeline barriers are handled inside load_KV
        load_K(block=n_block_first, producer_state=kv_producer_state, page_idx=None)
        kv_producer_state.advance()
        load_V(block=n_block_first, producer_state=kv_producer_state, page_idx=None)
        kv_producer_state.advance()

        # Remaining blocks
        for offset in cutlass.range(1, block_count):
            n_block = block_indices[block_count - 1 - offset]
            load_K(block=n_block, producer_state=kv_producer_state, page_idx=None)
            kv_producer_state.advance()
            load_V(block=n_block, producer_state=kv_producer_state, page_idx=None)
            kv_producer_state.advance()

    return kv_producer_state
567
+
568
+
569
# SM100-specific tile processor using SM100 helpers
@cute.jit
def produce_block_sparse_loads_sm100(
    blocksparse_tensors: BlockSparseTensors,
    batch_idx,
    head_idx,
    m_block,
    kv_producer_state,
    load_Q,
    load_K,
    load_V,
    pipeline_kv,
    q_stage: cutlass.Constexpr,
    q_producer_phase: Int32,
    qhead_per_kvhead: cutlass.Constexpr,
    q_subtile_factor: cutlass.Constexpr,
):
    """SM100 entry point for sparse block iteration.

    SM100 uses PipelineTmaUmma which doesn't support extra_tx_count, so we use
    simplified block processing that just calls producer_acquire without extras.

    Q is loaded with the first non-empty list only; q_producer_phase is
    flipped exactly when a Q load was actually issued (i.e. the tile had at
    least one block), so an entirely empty tile leaves the phase untouched.

    Args:
        m_block: which tile of m we are processing
        qhead_per_kvhead: Constexpr pack factor

    Returns:
        (kv_producer_state, q_producer_phase)
    """
    m_block_sparse = sparse_tensor_m_block(m_block, qhead_per_kvhead, q_subtile_factor)

    mask_block_cnt, mask_block_idx, full_block_cnt, full_block_idx = blocksparse_tensors

    curr_mask_block_cnt = mask_block_cnt[batch_idx, head_idx, m_block_sparse]
    curr_mask_block_idx = mask_block_idx[batch_idx, head_idx, m_block_sparse, None]

    if const_expr(full_block_cnt is not None):
        curr_full_block_cnt = full_block_cnt[batch_idx, head_idx, m_block_sparse]
        curr_full_block_idx = full_block_idx[batch_idx, head_idx, m_block_sparse, None]
    else:
        # No separate full-block table: only the masked list is used.
        curr_full_block_cnt = Int32(0)
        curr_full_block_idx = None

    mask_empty = curr_mask_block_cnt == 0
    full_empty = curr_full_block_cnt == 0

    q_phase_flipped = False

    if mask_empty:
        # No masked blocks: process full list with Q loading
        kv_producer_state = load_block_list_sm100(
            curr_full_block_idx,
            curr_full_block_cnt,
            load_q_with_first=True,
            q_stage=q_stage,
            kv_producer_state=kv_producer_state,
            load_Q=load_Q,
            load_K=load_K,
            load_V=load_V,
            pipeline_kv=pipeline_kv,
        )
        # Q was only issued if the full list was non-empty.
        q_phase_flipped = not full_empty
    else:
        # Process masked blocks with Q loading
        kv_producer_state = load_block_list_sm100(
            curr_mask_block_idx,
            curr_mask_block_cnt,
            load_q_with_first=True,
            q_stage=q_stage,
            kv_producer_state=kv_producer_state,
            load_Q=load_Q,
            load_K=load_K,
            load_V=load_V,
            pipeline_kv=pipeline_kv,
        )
        q_phase_flipped = True

        if not full_empty:
            # Process full blocks without Q loading
            kv_producer_state = load_block_list_sm100(
                curr_full_block_idx,
                curr_full_block_cnt,
                load_q_with_first=False,
                q_stage=q_stage,
                kv_producer_state=kv_producer_state,
                load_Q=load_Q,
                load_K=load_K,
                load_V=load_V,
                pipeline_kv=pipeline_kv,
            )

    if q_phase_flipped:
        q_producer_phase ^= 1

    return kv_producer_state, q_producer_phase
661
+
662
+
663
+ @cute.jit
664
+ def get_total_block_count(
665
+ blocksparse_tensors: BlockSparseTensors,
666
+ batch_idx,
667
+ head_idx,
668
+ m_block,
669
+ qhead_per_kvhead: cutlass.Constexpr,
670
+ q_subtile_factor: cutlass.Constexpr,
671
+ ):
672
+ m_block_sparse = sparse_tensor_m_block(m_block, qhead_per_kvhead, q_subtile_factor)
673
+
674
+ mask_block_cnt, mask_block_idx, full_block_cnt, full_block_idx = blocksparse_tensors
675
+ if const_expr(full_block_cnt is not None):
676
+ return (
677
+ mask_block_cnt[batch_idx, head_idx, m_block_sparse]
678
+ + full_block_cnt[batch_idx, head_idx, m_block_sparse]
679
+ )
680
+ else:
681
+ return mask_block_cnt[batch_idx, head_idx, m_block_sparse]
682
+
683
+
684
+ @cute.jit
685
+ def handle_block_sparse_empty_tile_correction_sm100(
686
+ tidx: Int32,
687
+ q_stage: cutlass.Constexpr,
688
+ m_block_size: cutlass.Constexpr,
689
+ qhead_per_kvhead,
690
+ pack_gqa: cutlass.Constexpr,
691
+ is_split_kv: cutlass.Constexpr,
692
+ learnable_sink,
693
+ mLSE,
694
+ seqlen,
695
+ m_block: Int32,
696
+ head_idx: Int32,
697
+ batch_idx: Int32,
698
+ split_idx: Int32,
699
+ sScale: cute.Tensor,
700
+ stats: list,
701
+ correction_epilogue: Callable,
702
+ thr_mma_pv: cute.core.ThrMma,
703
+ tOtO: cute.Tensor,
704
+ sO: cute.Tensor,
705
+ pipeline_sm_stats: cutlass.pipeline.PipelineAsync,
706
+ sm_stats_barrier: cutlass.pipeline.NamedBarrier,
707
+ pipeline_o_epi: cutlass.pipeline.PipelineAsync,
708
+ sm_stats_consumer_phase: Int32,
709
+ o_corr_consumer_phase: Int32,
710
+ corr_epi_producer_phase: Int32,
711
+ softmax_scale_log2: Float32,
712
+ mO_cur: Optional[cute.Tensor] = None,
713
+ gO: Optional[cute.Tensor] = None,
714
+ gmem_tiled_copy_O: Optional[cute.TiledCopy] = None,
715
+ ):
716
+ """Handle SM100 forward block-sparse tiles with no active KV blocks.
717
+
718
+ This path is taken when `total_block_cnt == 0`. The softmax warp-group still
719
+ arrives `mbar_softmax_corr_full` (synthetic "no work") so the correction
720
+ warp-group can:
721
+
722
+ - seed fully-masked-row stats (row_sum=1; row_max=-inf when tracked) for LSE
723
+ - run `correction_epilogue` with `scale=0` so the output tile is written as zeros
724
+ (independent of any prior tmem contents)
725
+ - wait on `mbar_softmax_corr_full` and arrive `mbar_softmax_corr_empty`
726
+ (and `mbar_corr_epi_*` when applicable) so phases stay aligned across tiles
727
+
728
+ This helper intentionally does not touch `mbar_P_full_*` since no P is produced.
729
+ See NOTE [SM100 block-sparse empty tiles: mbarrier contract].
730
+ """
731
+ LOG2_E = Float32(math.log2(math.e))
732
+ warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx()) % 4
733
+
734
+ for stage in cutlass.range_constexpr(q_stage):
735
+ row_sum_value = Float32(1.0)
736
+ row_max_value = (
737
+ -Float32.inf if const_expr(mLSE is not None or learnable_sink is not None) else None
738
+ )
739
+ if const_expr(learnable_sink is not None):
740
+ sink_val = -Float32.inf
741
+ if const_expr(not pack_gqa):
742
+ sink_val = Float32(learnable_sink[head_idx])
743
+ elif tidx < m_block_size:
744
+ q_head_idx = (
745
+ (q_stage * m_block + stage) * m_block_size + tidx
746
+ ) % qhead_per_kvhead + head_idx * qhead_per_kvhead
747
+ sink_val = Float32(learnable_sink[q_head_idx])
748
+ if sink_val != -Float32.inf and (const_expr(not is_split_kv) or split_idx == 0):
749
+ if row_max_value == -Float32.inf:
750
+ row_max_value = sink_val * (LOG2_E / softmax_scale_log2)
751
+ row_sum_value = Float32(1.0)
752
+ else:
753
+ row_sum_value = row_sum_value + cute.math.exp2(
754
+ sink_val * LOG2_E - row_max_value * softmax_scale_log2, fastmath=True
755
+ )
756
+ if tidx < m_block_size:
757
+ scale_row_idx = tidx + stage * m_block_size
758
+ sScale[scale_row_idx] = row_sum_value
759
+ if const_expr(mLSE is not None or learnable_sink is not None):
760
+ sScale[scale_row_idx + q_stage * m_block_size] = row_max_value
761
+ acc_flag = row_sum_value == Float32(0.0) or row_sum_value != row_sum_value
762
+ stats[stage] = (row_sum_value, row_max_value, acc_flag)
763
+
764
+ # See NOTE [SM100 block-sparse empty tiles: mbarrier contract].
765
+ # pipeline_sm_stats.consumer_wait_w_index_phase(stage, sm_stats_consumer_phase)
766
+ sm_stats_barrier.arrive_and_wait_w_index(index=stage * 4 + warp_idx)
767
+ pipeline_sm_stats.consumer_release_w_index(stage)
768
+
769
+ if const_expr(gmem_tiled_copy_O is None):
770
+ pipeline_o_epi.producer_acquire_w_index_phase(stage, corr_epi_producer_phase)
771
+ correction_epilogue(
772
+ thr_mma_pv,
773
+ tOtO[None, None, None, stage],
774
+ tidx,
775
+ stage,
776
+ m_block,
777
+ seqlen.seqlen_q,
778
+ Float32(0.0), # zero scale ensures empty tile writes zeros into staged outputs
779
+ sO[None, None, stage],
780
+ mO_cur,
781
+ gO[None, None, stage],
782
+ gmem_tiled_copy_O,
783
+ )
784
+ if const_expr(gmem_tiled_copy_O is None):
785
+ pipeline_o_epi.producer_commit_w_index(stage)
786
+
787
+ sm_stats_consumer_phase ^= 1
788
+ corr_epi_producer_phase ^= 1
789
+
790
+ return (
791
+ sm_stats_consumer_phase,
792
+ o_corr_consumer_phase,
793
+ corr_epi_producer_phase,
794
+ )
795
+
796
+
797
+ @cute.jit
798
+ def softmax_block_sparse_sm100(
799
+ blocksparse_tensors: BlockSparseTensors,
800
+ batch_idx,
801
+ head_idx,
802
+ m_block,
803
+ softmax_step: Callable,
804
+ mask_fn: Callable,
805
+ mask_fn_none: Callable,
806
+ mma_si_consumer_phase: Int32,
807
+ si_corr_producer_phase: Int32,
808
+ s0_s1_sequence_phase: Int32,
809
+ pipeline_sm_stats: cutlass.pipeline.PipelineAsync,
810
+ sm_stats_barrier: cutlass.pipeline.NamedBarrier,
811
+ q_stage: cutlass.Constexpr,
812
+ stage_idx: Int32,
813
+ check_m_boundary: bool,
814
+ qhead_per_kvhead: cutlass.Constexpr,
815
+ q_subtile_factor: cutlass.Constexpr[int] = 1,
816
+ ):
817
+ warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx()) % 4
818
+ m_block_sparse = sparse_tensor_m_block(m_block, qhead_per_kvhead, q_subtile_factor)
819
+
820
+ mask_block_cnt, mask_block_idx, full_block_cnt, full_block_idx = blocksparse_tensors
821
+
822
+ curr_mask_block_cnt = mask_block_cnt[batch_idx, head_idx, m_block_sparse]
823
+ curr_mask_block_idx = mask_block_idx[batch_idx, head_idx, m_block_sparse, None]
824
+
825
+ if const_expr(full_block_cnt is not None):
826
+ curr_full_block_cnt = full_block_cnt[batch_idx, head_idx, m_block_sparse]
827
+ curr_full_block_idx = full_block_idx[batch_idx, head_idx, m_block_sparse, None]
828
+ else:
829
+ curr_full_block_cnt = Int32(0)
830
+ curr_full_block_idx = None
831
+
832
+ total_block_cnt = curr_mask_block_cnt + curr_full_block_cnt
833
+
834
+ if total_block_cnt == 0:
835
+ # See NOTE [SM100 block-sparse empty tiles: mbarrier contract].
836
+ # pipeline_sm_stats.producer_commit_w_index(stage_idx)
837
+ sm_stats_barrier.arrive_w_index(index=stage_idx * 4 + warp_idx)
838
+ else:
839
+ if curr_mask_block_cnt > 0:
840
+ mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1]
841
+ (
842
+ mma_si_consumer_phase,
843
+ si_corr_producer_phase,
844
+ s0_s1_sequence_phase,
845
+ ) = softmax_step(
846
+ mma_si_consumer_phase,
847
+ si_corr_producer_phase,
848
+ s0_s1_sequence_phase,
849
+ mask_n_block,
850
+ is_first=True,
851
+ mask_fn=partial(mask_fn, mask_seqlen=True, check_q_boundary=check_m_boundary),
852
+ )
853
+ for i in cutlass.range(1, curr_mask_block_cnt):
854
+ mask_n_block = curr_mask_block_idx[curr_mask_block_cnt - 1 - i]
855
+ (
856
+ mma_si_consumer_phase,
857
+ si_corr_producer_phase,
858
+ s0_s1_sequence_phase,
859
+ ) = softmax_step(
860
+ mma_si_consumer_phase,
861
+ si_corr_producer_phase,
862
+ s0_s1_sequence_phase,
863
+ mask_n_block,
864
+ mask_fn=partial(mask_fn, mask_seqlen=False, check_q_boundary=check_m_boundary),
865
+ )
866
+
867
+ if curr_full_block_cnt > 0:
868
+ full_n_block = curr_full_block_idx[curr_full_block_cnt - 1]
869
+ if curr_mask_block_cnt == 0:
870
+ (
871
+ mma_si_consumer_phase,
872
+ si_corr_producer_phase,
873
+ s0_s1_sequence_phase,
874
+ ) = softmax_step(
875
+ mma_si_consumer_phase,
876
+ si_corr_producer_phase,
877
+ s0_s1_sequence_phase,
878
+ full_n_block,
879
+ is_first=True,
880
+ mask_fn=partial(
881
+ mask_fn_none, mask_seqlen=True, check_q_boundary=check_m_boundary
882
+ ),
883
+ )
884
+ else:
885
+ (
886
+ mma_si_consumer_phase,
887
+ si_corr_producer_phase,
888
+ s0_s1_sequence_phase,
889
+ ) = softmax_step(
890
+ mma_si_consumer_phase,
891
+ si_corr_producer_phase,
892
+ s0_s1_sequence_phase,
893
+ full_n_block,
894
+ is_first=False,
895
+ mask_fn=partial(
896
+ mask_fn_none, mask_seqlen=False, check_q_boundary=check_m_boundary
897
+ ),
898
+ )
899
+ for i in cutlass.range(1, curr_full_block_cnt):
900
+ full_n_block = curr_full_block_idx[curr_full_block_cnt - 1 - i]
901
+ (
902
+ mma_si_consumer_phase,
903
+ si_corr_producer_phase,
904
+ s0_s1_sequence_phase,
905
+ ) = softmax_step(
906
+ mma_si_consumer_phase,
907
+ si_corr_producer_phase,
908
+ s0_s1_sequence_phase,
909
+ full_n_block,
910
+ mask_fn=partial(
911
+ mask_fn_none, mask_seqlen=False, check_q_boundary=check_m_boundary
912
+ ),
913
+ )
914
+
915
+ return (
916
+ mma_si_consumer_phase,
917
+ si_corr_producer_phase,
918
+ s0_s1_sequence_phase,
919
+ total_block_cnt == 0,
920
+ )
921
+
922
+
923
+ # =============================================================================
924
+ # Backward-specific block-sparse helpers (SM100)
925
+ # =============================================================================
926
+ #
927
+ # In backward, iteration is transposed compared to forward:
928
+ # - Forward: outer loop over m_blocks (Q tiles), inner loop over n_blocks (KV tiles)
929
+ # - Backward: outer loop over n_blocks (KV tiles), inner loop over m_blocks (Q tiles)
930
+ #
931
+ # The backward block-sparse tensors use "Q direction" indexing:
932
+ # - q_block_cnt[batch, head, n_block] → count of m_blocks to process for this KV tile
933
+ # - q_block_idx[batch, head, n_block, :] → indices of m_blocks to process
934
+ #
935
+
936
+
937
+ @cute.jit
938
+ def get_total_q_block_count_bwd(
939
+ blocksparse_tensors: BlockSparseTensors,
940
+ batch_idx,
941
+ head_idx,
942
+ n_block,
943
+ subtile_factor: cutlass.Constexpr = 1,
944
+ m_block_max: int = 0,
945
+ ):
946
+ """Count total tile iterations for given n_block (KV tile) in backward."""
947
+ q_block_cnt, _, full_block_cnt, _ = blocksparse_tensors
948
+ total = q_block_cnt[batch_idx, head_idx, n_block]
949
+ if const_expr(full_block_cnt is not None):
950
+ total = total + full_block_cnt[batch_idx, head_idx, n_block]
951
+ return total * subtile_factor
952
+
953
+
954
+ @cute.jit
955
+ def produce_block_sparse_q_loads_bwd_sm100(
956
+ blocksparse_tensors: BlockSparseTensors,
957
+ batch_idx,
958
+ head_idx,
959
+ n_block,
960
+ # Pipeline states (will be returned after advancing)
961
+ producer_state_Q_LSE,
962
+ producer_state_dO_dPsum,
963
+ # Pipelines
964
+ pipeline_Q,
965
+ pipeline_LSE,
966
+ pipeline_dO,
967
+ pipeline_dPsum,
968
+ # Load functions
969
+ load_K,
970
+ load_V,
971
+ load_Q,
972
+ load_dO,
973
+ copy_stats,
974
+ # Global tensors for LSE/dPsum
975
+ gLSE,
976
+ sLSE,
977
+ gdPsum,
978
+ sdPsum,
979
+ # TMA copy bytes for extra_tx_count
980
+ tma_copy_bytes_K,
981
+ tma_copy_bytes_V,
982
+ # Flags for which loads to perform
983
+ should_load_Q: cutlass.Constexpr,
984
+ should_load_dO: cutlass.Constexpr,
985
+ # Subtiling factor and bounds
986
+ subtile_factor: cutlass.Constexpr = 1,
987
+ m_block_max: int = 0,
988
+ ):
989
+ """SM100 backward block sparse loading with subtiling.
990
+
991
+ Returns updated (producer_state_Q_LSE, producer_state_dO_dPsum).
992
+ First iteration loads K/V alongside Q/dO; subsequent iterations load only Q/dO.
993
+ """
994
+ (
995
+ curr_q_cnt,
996
+ curr_q_idx,
997
+ curr_full_cnt,
998
+ curr_full_idx,
999
+ loop_count,
1000
+ ) = get_block_sparse_iteration_info_bwd(
1001
+ blocksparse_tensors, batch_idx, head_idx, n_block, subtile_factor, m_block_max
1002
+ )
1003
+
1004
+ for iter_idx in cutlass.range(loop_count, unroll=1):
1005
+ m_block, _ = get_m_block_from_iter_bwd(
1006
+ iter_idx,
1007
+ curr_q_cnt,
1008
+ curr_q_idx,
1009
+ curr_full_cnt,
1010
+ curr_full_idx,
1011
+ subtile_factor,
1012
+ m_block_max,
1013
+ )
1014
+ m_block_safe = m_block
1015
+ if m_block_max > 0:
1016
+ m_block_safe = cutlass.min(m_block, m_block_max - 1)
1017
+
1018
+ if iter_idx == 0:
1019
+ # First block: load K/V alongside Q/dO
1020
+ if const_expr(should_load_Q):
1021
+ pipeline_Q.producer_acquire(producer_state_Q_LSE, extra_tx_count=tma_copy_bytes_K)
1022
+ load_K(tma_bar_ptr=pipeline_Q.producer_get_barrier(producer_state_Q_LSE))
1023
+ load_Q(m_block_safe, producer_state=producer_state_Q_LSE)
1024
+ pipeline_Q.producer_commit(producer_state_Q_LSE)
1025
+ pipeline_LSE.producer_acquire(producer_state_Q_LSE)
1026
+ with cute.arch.elect_one():
1027
+ copy_stats(
1028
+ gLSE[None, m_block_safe],
1029
+ sLSE[None, producer_state_Q_LSE.index],
1030
+ mbar_ptr=pipeline_LSE.producer_get_barrier(producer_state_Q_LSE),
1031
+ )
1032
+ producer_state_Q_LSE.advance()
1033
+ if const_expr(should_load_dO):
1034
+ pipeline_dO.producer_acquire(
1035
+ producer_state_dO_dPsum, extra_tx_count=tma_copy_bytes_V
1036
+ )
1037
+ load_V(tma_bar_ptr=pipeline_dO.producer_get_barrier(producer_state_dO_dPsum))
1038
+ load_dO(m_block_safe, producer_state=producer_state_dO_dPsum)
1039
+ pipeline_dO.producer_commit(producer_state_dO_dPsum)
1040
+ pipeline_dPsum.producer_acquire(producer_state_dO_dPsum)
1041
+ with cute.arch.elect_one():
1042
+ copy_stats(
1043
+ gdPsum[None, m_block_safe],
1044
+ sdPsum[None, producer_state_dO_dPsum.index],
1045
+ mbar_ptr=pipeline_dPsum.producer_get_barrier(producer_state_dO_dPsum),
1046
+ )
1047
+ producer_state_dO_dPsum.advance()
1048
+ else:
1049
+ # Subsequent blocks: just load Q/dO (K/V already loaded)
1050
+ if const_expr(should_load_Q):
1051
+ pipeline_Q.producer_acquire(producer_state_Q_LSE)
1052
+ load_Q(m_block_safe, producer_state=producer_state_Q_LSE)
1053
+ pipeline_Q.producer_commit(producer_state_Q_LSE)
1054
+ pipeline_LSE.producer_acquire(producer_state_Q_LSE)
1055
+ with cute.arch.elect_one():
1056
+ copy_stats(
1057
+ gLSE[None, m_block_safe],
1058
+ sLSE[None, producer_state_Q_LSE.index],
1059
+ mbar_ptr=pipeline_LSE.producer_get_barrier(producer_state_Q_LSE),
1060
+ )
1061
+ producer_state_Q_LSE.advance()
1062
+ if const_expr(should_load_dO):
1063
+ pipeline_dO.producer_acquire(producer_state_dO_dPsum)
1064
+ load_dO(m_block_safe, producer_state=producer_state_dO_dPsum)
1065
+ pipeline_dO.producer_commit(producer_state_dO_dPsum)
1066
+ pipeline_dPsum.producer_acquire(producer_state_dO_dPsum)
1067
+ with cute.arch.elect_one():
1068
+ copy_stats(
1069
+ gdPsum[None, m_block_safe],
1070
+ sdPsum[None, producer_state_dO_dPsum.index],
1071
+ mbar_ptr=pipeline_dPsum.producer_get_barrier(producer_state_dO_dPsum),
1072
+ )
1073
+ producer_state_dO_dPsum.advance()
1074
+
1075
+ return producer_state_Q_LSE, producer_state_dO_dPsum
1076
+
1077
+
1078
+ @cute.jit
1079
+ def get_block_sparse_iteration_info_bwd(
1080
+ blocksparse_tensors: BlockSparseTensors,
1081
+ batch_idx,
1082
+ head_idx,
1083
+ n_block,
1084
+ subtile_factor: cutlass.Constexpr = 1,
1085
+ m_block_max: int = 0,
1086
+ ):
1087
+ """Extract block-sparse iteration info for backward pass.
1088
+
1089
+ Returns (curr_q_cnt, curr_q_idx, curr_full_cnt, curr_full_idx, total_count).
1090
+ """
1091
+ q_cnt, q_idx, full_cnt, full_idx = blocksparse_tensors
1092
+ curr_q_cnt = q_cnt[batch_idx, head_idx, n_block]
1093
+ curr_q_idx = q_idx[batch_idx, head_idx, n_block, None]
1094
+
1095
+ if const_expr(full_cnt is not None):
1096
+ curr_full_cnt = full_cnt[batch_idx, head_idx, n_block]
1097
+ curr_full_idx = full_idx[batch_idx, head_idx, n_block, None]
1098
+ else:
1099
+ curr_full_cnt = Int32(0)
1100
+ curr_full_idx = None
1101
+
1102
+ sparse_block_count = curr_q_cnt
1103
+ if const_expr(full_cnt is not None):
1104
+ sparse_block_count = sparse_block_count + curr_full_cnt
1105
+ total_count = sparse_block_count * subtile_factor
1106
+
1107
+ return curr_q_cnt, curr_q_idx, curr_full_cnt, curr_full_idx, total_count
1108
+
1109
+
1110
+ @cute.jit
1111
+ def get_m_block_from_iter_bwd(
1112
+ iter_idx,
1113
+ curr_q_cnt,
1114
+ curr_q_idx: cute.Tensor,
1115
+ curr_full_cnt,
1116
+ curr_full_idx: Optional[cute.Tensor],
1117
+ subtile_factor: cutlass.Constexpr = 1,
1118
+ m_block_max: int = 0,
1119
+ ):
1120
+ """Derive m_block index and is_full_block flag from iteration index.
1121
+
1122
+ Returns (m_block, is_full_block):
1123
+ - m_block: The actual Q-tile block index
1124
+ - is_full_block: True if this is a full block (no mask_mod needed)
1125
+ """
1126
+ sparse_iter_idx = iter_idx // subtile_factor
1127
+ subtile_offset = iter_idx % subtile_factor
1128
+
1129
+ sparse_m_block = Int32(0)
1130
+ is_full_block = False
1131
+ if const_expr(curr_full_idx is not None):
1132
+ if sparse_iter_idx < curr_q_cnt:
1133
+ sparse_m_block = curr_q_idx[sparse_iter_idx]
1134
+ else:
1135
+ sparse_m_block = curr_full_idx[sparse_iter_idx - curr_q_cnt]
1136
+ is_full_block = True
1137
+ else:
1138
+ sparse_m_block = curr_q_idx[sparse_iter_idx]
1139
+
1140
+ return sparse_m_block * subtile_factor + subtile_offset, is_full_block
1141
+
1142
+
1143
+ @cute.jit
1144
+ def _load_q_do_block_sm90(
1145
+ m_block,
1146
+ producer_state_Q,
1147
+ producer_state_dO,
1148
+ pipeline_Q,
1149
+ pipeline_dO,
1150
+ load_K,
1151
+ load_V,
1152
+ load_Q,
1153
+ load_dO,
1154
+ load_LSE,
1155
+ load_dPsum,
1156
+ tma_copy_bytes_K,
1157
+ tma_copy_bytes_V,
1158
+ Q_stage_eq_dO_stage: cutlass.Constexpr,
1159
+ load_kv: bool,
1160
+ ):
1161
+ """Load one Q/dO block, optionally loading K/V on first iteration."""
1162
+ if load_kv:
1163
+ pipeline_Q.producer_acquire(producer_state_Q, extra_tx_count=tma_copy_bytes_K)
1164
+ load_K(tma_bar_ptr=pipeline_Q.producer_get_barrier(producer_state_Q))
1165
+ else:
1166
+ pipeline_Q.producer_acquire(producer_state_Q)
1167
+ load_Q(m_block, producer_state=producer_state_Q)
1168
+ load_LSE(m_block, producer_state=producer_state_Q)
1169
+
1170
+ producer_state_dO_cur = (
1171
+ producer_state_dO if const_expr(not Q_stage_eq_dO_stage) else producer_state_Q
1172
+ )
1173
+ if load_kv:
1174
+ pipeline_dO.producer_acquire(producer_state_dO_cur, extra_tx_count=tma_copy_bytes_V)
1175
+ load_V(tma_bar_ptr=pipeline_dO.producer_get_barrier(producer_state_dO_cur))
1176
+ else:
1177
+ pipeline_dO.producer_acquire(producer_state_dO_cur)
1178
+ load_dO(m_block, producer_state=producer_state_dO_cur)
1179
+ load_dPsum(m_block, producer_state=producer_state_dO_cur)
1180
+
1181
+ producer_state_Q.advance()
1182
+ producer_state_dO.advance()
1183
+ return producer_state_Q, producer_state_dO
1184
+
1185
+
1186
+ @cute.jit
1187
+ def produce_block_sparse_q_loads_bwd_sm90(
1188
+ blocksparse_tensors: BlockSparseTensors,
1189
+ batch_idx,
1190
+ head_idx,
1191
+ n_block,
1192
+ producer_state_Q,
1193
+ producer_state_dO,
1194
+ pipeline_Q,
1195
+ pipeline_dO,
1196
+ load_K,
1197
+ load_V,
1198
+ load_Q,
1199
+ load_dO,
1200
+ load_LSE,
1201
+ load_dPsum,
1202
+ tma_copy_bytes_K,
1203
+ tma_copy_bytes_V,
1204
+ Q_stage_eq_dO_stage: cutlass.Constexpr,
1205
+ subtile_factor: cutlass.Constexpr,
1206
+ m_block_max: int,
1207
+ ):
1208
+ """SM90 backward block sparse loading with separate partial/full loops.
1209
+
1210
+ K/V are loaded with the first valid block. Iterates partial blocks first,
1211
+ then full blocks, matching consumer order.
1212
+
1213
+ Returns updated (producer_state_Q, producer_state_dO).
1214
+ """
1215
+ q_cnt, q_idx, full_cnt, full_idx = blocksparse_tensors
1216
+ curr_q_cnt = q_cnt[batch_idx, head_idx, n_block]
1217
+ curr_q_idx = q_idx[batch_idx, head_idx, n_block, None]
1218
+
1219
+ if const_expr(full_cnt is not None):
1220
+ curr_full_cnt = full_cnt[batch_idx, head_idx, n_block]
1221
+ curr_full_idx = full_idx[batch_idx, head_idx, n_block, None]
1222
+ else:
1223
+ curr_full_cnt = Int32(0)
1224
+ curr_full_idx = None
1225
+
1226
+ kv_loaded = False
1227
+
1228
+ for iter_idx in cutlass.range(curr_q_cnt * subtile_factor, unroll=1):
1229
+ sparse_idx = iter_idx // subtile_factor
1230
+ subtile_offset = iter_idx % subtile_factor
1231
+ m_block = curr_q_idx[sparse_idx] * subtile_factor + subtile_offset
1232
+
1233
+ if m_block < m_block_max:
1234
+ producer_state_Q, producer_state_dO = _load_q_do_block_sm90(
1235
+ m_block,
1236
+ producer_state_Q,
1237
+ producer_state_dO,
1238
+ pipeline_Q,
1239
+ pipeline_dO,
1240
+ load_K,
1241
+ load_V,
1242
+ load_Q,
1243
+ load_dO,
1244
+ load_LSE,
1245
+ load_dPsum,
1246
+ tma_copy_bytes_K,
1247
+ tma_copy_bytes_V,
1248
+ Q_stage_eq_dO_stage,
1249
+ load_kv=not kv_loaded,
1250
+ )
1251
+ kv_loaded = True
1252
+
1253
+ if const_expr(full_cnt is not None):
1254
+ for iter_idx in cutlass.range(curr_full_cnt * subtile_factor, unroll=1):
1255
+ sparse_idx = iter_idx // subtile_factor
1256
+ subtile_offset = iter_idx % subtile_factor
1257
+ m_block = curr_full_idx[sparse_idx] * subtile_factor + subtile_offset
1258
+
1259
+ if m_block < m_block_max:
1260
+ producer_state_Q, producer_state_dO = _load_q_do_block_sm90(
1261
+ m_block,
1262
+ producer_state_Q,
1263
+ producer_state_dO,
1264
+ pipeline_Q,
1265
+ pipeline_dO,
1266
+ load_K,
1267
+ load_V,
1268
+ load_Q,
1269
+ load_dO,
1270
+ load_LSE,
1271
+ load_dPsum,
1272
+ tma_copy_bytes_K,
1273
+ tma_copy_bytes_V,
1274
+ Q_stage_eq_dO_stage,
1275
+ load_kv=not kv_loaded,
1276
+ )
1277
+ kv_loaded = True
1278
+
1279
+ return producer_state_Q, producer_state_dO
1280
+
1281
+
1282
+ @cute.jit
1283
+ def consume_block_sparse_mma_bwd_sm90(
1284
+ blocksparse_tensors: BlockSparseTensors,
1285
+ batch_idx,
1286
+ head_idx,
1287
+ n_block,
1288
+ consumer_state_Q,
1289
+ consumer_state_dO,
1290
+ mma_one_m_block_fn,
1291
+ mask,
1292
+ mask_mod,
1293
+ is_causal: cutlass.Constexpr,
1294
+ is_local: cutlass.Constexpr,
1295
+ thr_mma_SdP,
1296
+ score_mod_fn=None,
1297
+ score_mod_bwd_fn=None,
1298
+ subtile_factor: cutlass.Constexpr = 1,
1299
+ m_block_max: int = 0,
1300
+ aux_tensors=None,
1301
+ fastdiv_mods=(None, None),
1302
+ ):
1303
+ """SM90 backward block sparse MMA consumption with separate partial/full loops.
1304
+
1305
+ Partial blocks are processed first (with mask_mod applied), then full blocks
1306
+ (without mask_mod). This ensures mask_mod is only applied where needed.
1307
+
1308
+ Returns updated (consumer_state_Q, consumer_state_dO).
1309
+ """
1310
+ q_cnt, q_idx, full_cnt, full_idx = blocksparse_tensors
1311
+ curr_q_cnt = q_cnt[batch_idx, head_idx, n_block]
1312
+ curr_q_idx = q_idx[batch_idx, head_idx, n_block, None]
1313
+
1314
+ if const_expr(full_cnt is not None):
1315
+ curr_full_cnt = full_cnt[batch_idx, head_idx, n_block]
1316
+ curr_full_idx = full_idx[batch_idx, head_idx, n_block, None]
1317
+ else:
1318
+ curr_full_cnt = Int32(0)
1319
+ curr_full_idx = None
1320
+
1321
+ dKV_accumulate = False
1322
+
1323
+ mask_fn_partial = partial(
1324
+ mask.apply_mask,
1325
+ batch_idx=batch_idx,
1326
+ head_idx=head_idx,
1327
+ n_block=n_block,
1328
+ thr_mma=thr_mma_SdP,
1329
+ mask_seqlen=True,
1330
+ mask_causal=is_causal,
1331
+ mask_local=is_local,
1332
+ mask_mod=mask_mod,
1333
+ aux_tensors=aux_tensors,
1334
+ fastdiv_mods=fastdiv_mods,
1335
+ )
1336
+
1337
+ mask_fn_full = partial(
1338
+ mask.apply_mask,
1339
+ batch_idx=batch_idx,
1340
+ head_idx=head_idx,
1341
+ n_block=n_block,
1342
+ thr_mma=thr_mma_SdP,
1343
+ mask_seqlen=True,
1344
+ mask_causal=is_causal,
1345
+ mask_local=is_local,
1346
+ aux_tensors=aux_tensors,
1347
+ fastdiv_mods=fastdiv_mods,
1348
+ )
1349
+
1350
+ for iter_idx in cutlass.range(curr_q_cnt * subtile_factor, unroll=1):
1351
+ sparse_idx = iter_idx // subtile_factor
1352
+ subtile_offset = iter_idx % subtile_factor
1353
+ m_block = curr_q_idx[sparse_idx] * subtile_factor + subtile_offset
1354
+
1355
+ if m_block < m_block_max:
1356
+ consumer_state_Q, consumer_state_dO = mma_one_m_block_fn(
1357
+ m_block,
1358
+ consumer_state_Q,
1359
+ consumer_state_dO,
1360
+ mask_fn=mask_fn_partial,
1361
+ score_mod_fn=score_mod_fn,
1362
+ score_mod_bwd_fn=score_mod_bwd_fn,
1363
+ dKV_accumulate=dKV_accumulate,
1364
+ )
1365
+ dKV_accumulate = True
1366
+
1367
+ if const_expr(full_cnt is not None):
1368
+ for iter_idx in cutlass.range(curr_full_cnt * subtile_factor, unroll=1):
1369
+ sparse_idx = iter_idx // subtile_factor
1370
+ subtile_offset = iter_idx % subtile_factor
1371
+ m_block = curr_full_idx[sparse_idx] * subtile_factor + subtile_offset
1372
+
1373
+ if m_block < m_block_max:
1374
+ consumer_state_Q, consumer_state_dO = mma_one_m_block_fn(
1375
+ m_block,
1376
+ consumer_state_Q,
1377
+ consumer_state_dO,
1378
+ mask_fn=mask_fn_full,
1379
+ score_mod_fn=score_mod_fn,
1380
+ score_mod_bwd_fn=score_mod_bwd_fn,
1381
+ dKV_accumulate=dKV_accumulate,
1382
+ )
1383
+ dKV_accumulate = True
1384
+
1385
+ return consumer_state_Q, consumer_state_dO
1386
+
1387
+
1388
+ @cute.jit
1389
+ def _store_one_dQaccum_sm90(
1390
+ m_block,
1391
+ sdQaccum: cute.Tensor,
1392
+ gdQaccum: cute.Tensor,
1393
+ num_mma_warp_groups: cutlass.Constexpr,
1394
+ num_threads_per_warp_group: cutlass.Constexpr,
1395
+ tma_copy_bytes_dQ,
1396
+ ):
1397
+ """Store dQaccum for a single m_block."""
1398
+ for warp_group_idx in cutlass.range_constexpr(num_mma_warp_groups):
1399
+ cute.arch.cp_async_bulk_wait_group(num_mma_warp_groups - 1 - warp_group_idx, read=True)
1400
+ cute.arch.barrier_arrive(
1401
+ barrier_id=int(NamedBarrierBwd.dQEmptyWG0) + warp_group_idx,
1402
+ number_of_threads=num_threads_per_warp_group + cute.arch.WARP_SIZE,
1403
+ )
1404
+ for warp_group_idx in cutlass.range_constexpr(num_mma_warp_groups):
1405
+ cute.arch.barrier(
1406
+ barrier_id=int(NamedBarrierBwd.dQFullWG0) + warp_group_idx,
1407
+ number_of_threads=num_threads_per_warp_group + cute.arch.WARP_SIZE,
1408
+ )
1409
+ with cute.arch.elect_one():
1410
+ copy_utils.cpasync_reduce_bulk_add_f32(
1411
+ sdQaccum[None, warp_group_idx].iterator,
1412
+ gdQaccum[None, warp_group_idx, m_block].iterator,
1413
+ tma_copy_bytes_dQ,
1414
+ )
1415
+ cute.arch.cp_async_bulk_commit_group()
1416
+
1417
+
1418
+ @cute.jit
1419
+ def dQaccum_store_block_sparse_bwd_sm90(
1420
+ blocksparse_tensors: BlockSparseTensors,
1421
+ batch_idx,
1422
+ head_idx,
1423
+ n_block,
1424
+ sdQaccum: cute.Tensor,
1425
+ gdQaccum: cute.Tensor,
1426
+ subtile_factor: cutlass.Constexpr,
1427
+ m_block_max: int,
1428
+ num_mma_warp_groups: cutlass.Constexpr,
1429
+ num_threads_per_warp_group: cutlass.Constexpr,
1430
+ tma_copy_bytes_dQ,
1431
+ ):
1432
+ """SM90 backward block sparse dQaccum store with separate partial/full loops.
1433
+
1434
+ Iterates partial blocks first, then full blocks, matching producer/consumer order.
1435
+ """
1436
+ q_cnt, q_idx, full_cnt, full_idx = blocksparse_tensors
1437
+ curr_q_cnt = q_cnt[batch_idx, head_idx, n_block]
1438
+ curr_q_idx = q_idx[batch_idx, head_idx, n_block, None]
1439
+
1440
+ if const_expr(full_cnt is not None):
1441
+ curr_full_cnt = full_cnt[batch_idx, head_idx, n_block]
1442
+ curr_full_idx = full_idx[batch_idx, head_idx, n_block, None]
1443
+ else:
1444
+ curr_full_cnt = Int32(0)
1445
+ curr_full_idx = None
1446
+
1447
+ for iter_idx in cutlass.range(curr_q_cnt * subtile_factor, unroll=1):
1448
+ sparse_idx = iter_idx // subtile_factor
1449
+ subtile_offset = iter_idx % subtile_factor
1450
+ m_block = curr_q_idx[sparse_idx] * subtile_factor + subtile_offset
1451
+
1452
+ if m_block < m_block_max:
1453
+ _store_one_dQaccum_sm90(
1454
+ m_block,
1455
+ sdQaccum,
1456
+ gdQaccum,
1457
+ num_mma_warp_groups,
1458
+ num_threads_per_warp_group,
1459
+ tma_copy_bytes_dQ,
1460
+ )
1461
+
1462
+ if const_expr(full_cnt is not None):
1463
+ for iter_idx in cutlass.range(curr_full_cnt * subtile_factor, unroll=1):
1464
+ sparse_idx = iter_idx // subtile_factor
1465
+ subtile_offset = iter_idx % subtile_factor
1466
+ m_block = curr_full_idx[sparse_idx] * subtile_factor + subtile_offset
1467
+
1468
+ if m_block < m_block_max:
1469
+ _store_one_dQaccum_sm90(
1470
+ m_block,
1471
+ sdQaccum,
1472
+ gdQaccum,
1473
+ num_mma_warp_groups,
1474
+ num_threads_per_warp_group,
1475
+ tma_copy_bytes_dQ,
1476
+ )
build/torch-cuda/block_sparsity.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Block-sparsity utilities for FlexAttention
3
+ """
4
+
5
+ from typing import Callable, NamedTuple, Tuple
6
+
7
+ import cutlass.cute as cute
8
+ import torch
9
+
10
+ from .cute_dsl_utils import get_broadcast_dims, to_cute_tensor
11
+
12
+
13
def ceildiv(a: int, b: int) -> int:
    """Round the integer quotient a / b toward positive infinity."""
    numerator = a + b - 1
    return numerator // b
15
+
16
+
17
class BlockSparseTensors(NamedTuple):
    """Device-side block-sparsity metadata as CuTe tensors.

    The ``full_*`` pair is optional: when None, only the masked block lists
    are used by the kernels.
    """

    mask_block_cnt: cute.Tensor  # per (batch, head, block): number of masked blocks
    mask_block_idx: cute.Tensor  # per (batch, head, block): indices of masked blocks
    full_block_cnt: cute.Tensor | None  # optional count of fully-unmasked blocks
    full_block_idx: cute.Tensor | None  # optional indices of fully-unmasked blocks

    def __new_from_mlir_values__(self, values):
        # Reconstruction hook used by the CuTe DSL when rebuilding this tuple
        # from MLIR values; pad the optional full_* slots when only the masked
        # pair was materialized.
        if len(values) == 2:
            values = (*values, None, None)
        return BlockSparseTensors(*values)
27
+
28
+
29
class BlockSparseTensorsTorch(NamedTuple):
    """Host-side (torch) block-sparsity metadata before conversion to CuTe."""

    mask_block_cnt: torch.Tensor  # counts of masked blocks
    mask_block_idx: torch.Tensor  # indices of masked blocks
    full_block_cnt: torch.Tensor | None = None  # optional counts of fully-unmasked blocks
    full_block_idx: torch.Tensor | None = None  # optional indices of fully-unmasked blocks
    block_size: tuple[int, int] | None = None  # (q, kv) block granularity of the metadata
35
+
36
+
37
def _expand_sparsity_tensor(
    tensor: torch.Tensor,
    expected_shape: Tuple[int, ...],
    tensor_name: str,
    context: str | None,
    hint: str | Callable[[], str] | None,
) -> torch.Tensor:
    """Broadcast *tensor* to ``expected_shape`` (size-1 dims only), or raise.

    Tensors already at the expected shape are returned unchanged; otherwise
    every dimension must either match or be 1 (broadcastable).
    """
    if tensor.shape == expected_shape:
        # Nothing to do: already the target shape.
        return tensor
    broadcastable = all(
        cur == tgt or cur == 1 for cur, tgt in zip(tensor.shape, expected_shape)
    )
    if broadcastable:
        return tensor.expand(*expected_shape)
    context_clause = f" ({context})" if context else ""
    resolved_hint = hint() if callable(hint) else hint
    hint_clause = f" Hint: {resolved_hint}" if resolved_hint else ""
    raise ValueError(
        f"{tensor_name}{context_clause} with shape {tensor.shape} cannot be expanded to expected shape {expected_shape}."
        f"{hint_clause}"
    )
58
+
59
+
60
def _check_and_expand_block(
    name: str,
    cnt: torch.Tensor | None,
    idx: torch.Tensor | None,
    expected_count_shape: Tuple[int, int, int],
    expected_index_shape: Tuple[int, int, int, int],
    context: str | None,
    hint: str | Callable[[], str] | None,
) -> Tuple[torch.Tensor | None, torch.Tensor | None]:
    """Validate a (count, index) sparsity-tensor pair and broadcast each.

    Returns (None, None) when the pair is absent; raises ValueError on any
    dtype/device/pairing violation.
    """
    # A count without its index list (or vice versa) is malformed input.
    if (cnt is None) != (idx is None):
        raise ValueError(
            f"{name}_block_cnt and {name}_block_idx must both be provided or both be None"
        )
    if cnt is None or idx is None:
        return None, None
    if cnt.dtype != torch.int32 or idx.dtype != torch.int32:
        raise ValueError(f"{name}_block tensors must have dtype torch.int32")
    if cnt.device != idx.device:
        raise ValueError(f"{name}_block_cnt and {name}_block_idx must be on the same device")
    if not cnt.is_cuda or not idx.is_cuda:
        raise ValueError(f"{name}_block tensors must live on CUDA")
    return (
        _expand_sparsity_tensor(cnt, expected_count_shape, f"{name}_block_cnt", context, hint),
        _expand_sparsity_tensor(idx, expected_index_shape, f"{name}_block_idx", context, hint),
    )
88
+
89
+
90
def get_block_sparse_expected_shapes(
    batch_size: int,
    num_head: int,
    seqlen_q: int,
    seqlen_k: int,
    m_block_size: int,
    n_block_size: int,
    q_stage: int,
) -> Tuple[Tuple[int, int, int], Tuple[int, int, int, int]]:
    """Return (expected_count_shape, expected_index_shape) for block sparse normalization."""
    # Forward kernels consume q_stage m-tiles per sparse row, so the effective
    # sparse Q block is q_stage * m_block_size.
    n_m_blocks = ceildiv(seqlen_q, q_stage * m_block_size)
    n_n_blocks = ceildiv(seqlen_k, n_block_size)
    counts = (batch_size, num_head, n_m_blocks)
    return counts, counts + (n_n_blocks,)
106
+
107
+
108
def infer_block_sparse_expected_shapes(
    tensors: BlockSparseTensorsTorch,
    *,
    batch_size: int,
    num_head: int,
    seqlen_q: int,
    seqlen_k: int,
    m_block_size: int,
    n_block_size: int,
    q_stage: int,
    context: str,
    sparse_block_size_q: int | None = None,
    sparse_block_size_kv: int | None = None,
) -> Tuple[Tuple[int, int, int], Tuple[int, int, int, int], int]:
    """Infer shapes and scaling for block-sparse tensors.

    Expectations:
    - mask_block_cnt is (B, H, M) and mask_block_idx is (B, H, M, N).
    - Batch/head dims may be 1 for broadcast, or match the requested sizes.
    - sparse_block_size_kv must match tile_n.
    - sparse_block_size_q must be a multiple of q_stage * tile_m.
    - If sparse_block_size_q is omitted and seqlen_q/num_m_blocks is ambiguous,
      the caller must provide block_size to disambiguate. TODO will make this required in a future PR.

    Returns:
        (expected_count_shape, expected_index_shape, q_subtile_factor), where
        q_subtile_factor = sparse_block_size_q // (q_stage * m_block_size).
    """
    # Smallest legal sparse Q block: the kernel consumes q_stage m-tiles at a time.
    base_m_block = q_stage * m_block_size
    base_n_block = n_block_size
    if sparse_block_size_kv is None:
        sparse_block_size_kv = base_n_block
    if sparse_block_size_kv != base_n_block:
        raise ValueError(f"Block sparse tensors{context} require BLOCK_SIZE_KV={base_n_block}.")
    if tensors.mask_block_idx is None:
        raise ValueError("mask_block_cnt and mask_block_idx must be provided for block sparsity.")
    num_m_blocks = tensors.mask_block_idx.shape[2]

    if sparse_block_size_q is None:
        # Recover the Q block size from seqlen_q and the tensor's m-block
        # count. Any block size in [min_block_size, max_block_size] yields
        # num_m_blocks blocks; when that interval is wider than one value the
        # choice is ambiguous and the caller must pass it explicitly.
        min_block_size = ceildiv(seqlen_q, num_m_blocks)
        if num_m_blocks == 1:
            max_block_size = seqlen_q
        else:
            max_block_size = (seqlen_q - 1) // (num_m_blocks - 1)
        if max_block_size != min_block_size and base_m_block != 1:
            raise ValueError(
                f"Block sparse tensors{context} require explicit sparse_block_size[0] "
                f"to disambiguate block size for seqlen_q={seqlen_q} and num_m_blocks={num_m_blocks}."
            )
        sparse_block_size_q = min_block_size

    if sparse_block_size_q % base_m_block != 0:
        raise ValueError(
            f"Block sparse tensors{context} have block size {sparse_block_size_q}, "
            f"which must be a multiple of {base_m_block}."
        )

    expected_m_blocks = ceildiv(seqlen_q, sparse_block_size_q)
    expected_n_blocks = ceildiv(seqlen_k, sparse_block_size_kv)
    # Number of kernel m-tiles covered by one sparse Q block.
    q_subtile_factor = sparse_block_size_q // base_m_block
    expected_count_shape = (batch_size, num_head, expected_m_blocks)
    expected_index_shape = (batch_size, num_head, expected_m_blocks, expected_n_blocks)

    mask_block_cnt = tensors.mask_block_cnt
    mask_block_idx = tensors.mask_block_idx
    if mask_block_cnt is None or mask_block_idx is None:
        raise ValueError("mask_block_cnt and mask_block_idx must be provided for block sparsity.")
    if mask_block_cnt.ndim != 3 or mask_block_idx.ndim != 4:
        raise ValueError(
            f"Block sparse tensors{context} must have shapes (B, H, M) and (B, H, M, N)."
        )
    # Batch/head dims must match exactly or be 1 (broadcastable later).
    for dim_name, cur, tgt in (
        ("batch", mask_block_cnt.shape[0], expected_count_shape[0]),
        ("head", mask_block_cnt.shape[1], expected_count_shape[1]),
    ):
        if cur != tgt and cur != 1:
            raise ValueError(f"Block sparse tensors{context} {dim_name} dim must be {tgt} or 1.")
    for dim_name, cur, tgt in (
        ("batch", mask_block_idx.shape[0], expected_index_shape[0]),
        ("head", mask_block_idx.shape[1], expected_index_shape[1]),
    ):
        if cur != tgt and cur != 1:
            raise ValueError(f"Block sparse tensors{context} {dim_name} dim must be {tgt} or 1.")
    if mask_block_cnt.shape[2] != mask_block_idx.shape[2]:
        raise ValueError(f"Block sparse tensors{context} must share the same m-block dimension.")
    if mask_block_idx.shape[3] != expected_n_blocks:
        raise ValueError(
            f"Block sparse tensors{context} n-block dimension must be {expected_n_blocks}."
        )
    if expected_m_blocks != num_m_blocks:
        raise ValueError(
            f"Block sparse tensors{context} m-block dimension {num_m_blocks} does not match "
            f"sparse_block_size_q={sparse_block_size_q}. "
            f"Set BlockSparseTensorsTorch.block_size to match the BlockMask BLOCK_SIZE."
        )
    return expected_count_shape, expected_index_shape, q_subtile_factor
200
+
201
+
202
def get_block_sparse_expected_shapes_bwd(
    batch_size: int,
    num_head: int,
    seqlen_q: int,
    seqlen_k: int,
    m_block_size: int,
    n_block_size: int,
    subtile_factor: int,
) -> Tuple[Tuple[int, int, int], Tuple[int, int, int, int]]:
    """Return (expected_count_shape, expected_index_shape) for backward block sparse normalization.

    The backward pass walks the mask transposed relative to forward: counts
    are indexed per N-block and the index tensor lists M-blocks per N-block.
    The effective sparse Q block is subtile_factor * m_block_size.
    """
    q_block = subtile_factor * m_block_size
    n_m_blocks = ceildiv(seqlen_q, q_block)
    n_n_blocks = ceildiv(seqlen_k, n_block_size)
    counts = (batch_size, num_head, n_n_blocks)
    return counts, counts + (n_m_blocks,)
223
+
224
+
225
def normalize_block_sparse_tensors(
    tensors: BlockSparseTensorsTorch,
    *,
    expected_count_shape: Tuple[int, int, int],
    expected_index_shape: Tuple[int, int, int, int],
    context: str | None = None,
    hint: str | Callable[[], str] | None = None,
) -> BlockSparseTensorsTorch:
    """Validate and broadcast all block-sparse tensors to their expected shapes.

    The mask pair is mandatory; the full pair is optional but, when present,
    must share the mask pair's device.
    """
    if tensors.mask_block_cnt is None or tensors.mask_block_idx is None:
        raise ValueError("mask_block_cnt and mask_block_idx must be provided for block sparsity.")

    norm_mask_cnt, norm_mask_idx = _check_and_expand_block(
        "mask",
        tensors.mask_block_cnt,
        tensors.mask_block_idx,
        expected_count_shape,
        expected_index_shape,
        context,
        hint,
    )
    # Unreachable given the guard above; kept for type narrowing.
    if norm_mask_cnt is None or norm_mask_idx is None:
        raise ValueError("mask_block_cnt and mask_block_idx must be provided for block sparsity.")

    norm_full_cnt, norm_full_idx = _check_and_expand_block(
        "full",
        tensors.full_block_cnt,
        tensors.full_block_idx,
        expected_count_shape,
        expected_index_shape,
        context,
        hint,
    )
    if norm_full_cnt is not None and norm_mask_cnt.device != norm_full_cnt.device:
        raise ValueError("All block sparse tensors must be on the same device")

    return BlockSparseTensorsTorch(
        mask_block_cnt=norm_mask_cnt,
        mask_block_idx=norm_mask_idx,
        full_block_cnt=norm_full_cnt,
        full_block_idx=norm_full_idx,
        block_size=tensors.block_size,
    )
267
+
268
+
269
def is_block_sparsity_enabled(tensors: BlockSparseTensorsTorch) -> bool:
    """True when at least one count tensor (mask or full) is present."""
    return tensors.full_block_cnt is not None or tensors.mask_block_cnt is not None
271
+
272
+
273
def get_block_sparse_broadcast_pattern(
    tensors: BlockSparseTensorsTorch,
) -> Tuple[Tuple[bool, ...], ...] | None:
    """Return broadcast pattern for block sparse tensors by checking actual strides.

    One entry per tensor, in (mask_cnt, mask_idx, full_cnt, full_idx) order;
    each entry is a tuple of bools marking stride-0 dims (None for absent
    tensors). Used in compile keys: CuTe's mark_layout_dynamic() keeps
    stride=0 static, so a changed broadcast pattern must trigger recompile.
    Tensors are expected to be already expanded/normalized.

    Returns None when block sparsity is not enabled.
    """
    if not is_block_sparsity_enabled(tensors):
        return None

    ordered = (
        tensors.mask_block_cnt,
        tensors.mask_block_idx,
        tensors.full_block_cnt,
        tensors.full_block_idx,
    )
    return tuple(None if t is None else get_broadcast_dims(t) for t in ordered)
303
+
304
+
305
def normalize_block_sparse_config(
    tensors: BlockSparseTensorsTorch,
    *,
    batch_size: int,
    num_head: int,
    seqlen_q: int,
    seqlen_k: int,
    block_size: tuple[int, int],
    q_stage: int,
) -> tuple[BlockSparseTensorsTorch, Tuple[Tuple[bool, ...], ...] | None, int]:
    """Normalize forward-pass block-sparse tensors against the kernel tile config.

    Returns (normalized tensors, broadcast pattern for the compile key,
    q subtile factor).
    """
    tile_m, tile_n = block_size
    declared = tensors.block_size
    if declared is None:
        # Default to the kernel's effective tile sizes.
        sparse_q, sparse_kv = q_stage * tile_m, tile_n
    else:
        sparse_q, sparse_kv = declared
    if sparse_kv != tile_n:
        raise ValueError(
            f"Block sparsity requires sparse_block_size[1]={tile_n} to match tile_n."
        )
    count_shape, index_shape, q_subtile_factor = infer_block_sparse_expected_shapes(
        tensors,
        batch_size=batch_size,
        num_head=num_head,
        seqlen_q=seqlen_q,
        seqlen_k=seqlen_k,
        m_block_size=tile_m,
        n_block_size=tile_n,
        q_stage=q_stage,
        context="forward",
        sparse_block_size_q=sparse_q,
        sparse_block_size_kv=sparse_kv,
    )
    normalized = normalize_block_sparse_tensors(
        tensors,
        expected_count_shape=count_shape,
        expected_index_shape=index_shape,
    )
    pattern = get_block_sparse_broadcast_pattern(normalized)
    return normalized, pattern, q_subtile_factor
349
+
350
+
351
def normalize_block_sparse_config_bwd(
    tensors: BlockSparseTensorsTorch,
    *,
    batch_size: int,
    num_head: int,
    seqlen_q: int,
    seqlen_k: int,
    block_size: tuple[int, int],
    subtile_factor: int,
) -> tuple[BlockSparseTensorsTorch, Tuple[Tuple[bool, ...], ...] | None]:
    """Normalize backward-pass (Q-direction) block-sparse tensors.

    Returns (normalized tensors, broadcast pattern for the compile key).
    """
    tile_m, tile_n = block_size
    expected_q_block = subtile_factor * tile_m
    declared = tensors.block_size
    if declared is None:
        sparse_q, sparse_kv = expected_q_block, tile_n
    else:
        sparse_q, sparse_kv = declared
    if sparse_q != expected_q_block:
        raise ValueError(
            f"Block sparsity expects sparse_block_size_q={expected_q_block} "
            f"for subtile_factor={subtile_factor}."
        )
    if sparse_kv != tile_n:
        raise ValueError(
            f"Block sparsity expects sparse_block_size[1]={tile_n} to match tile_n."
        )
    count_shape, index_shape = get_block_sparse_expected_shapes_bwd(
        batch_size,
        num_head,
        seqlen_q,
        seqlen_k,
        tile_m,
        tile_n,
        subtile_factor,
    )
    normalized = normalize_block_sparse_tensors(
        tensors,
        expected_count_shape=count_shape,
        expected_index_shape=index_shape,
        context="_flash_attn_bwd",
        hint=lambda: (
            f"Backward expects Q-direction block-sparse tensors (q_mask_cnt/q_mask_idx, "
            f"and optionally full_q_cnt/full_q_idx). Regenerate the backward BlockMask with "
            f"BLOCK_SIZE=({expected_q_block}, {tile_n})."
        ),
    )
    return normalized, get_block_sparse_broadcast_pattern(normalized)
396
+
397
+
398
def to_cute_block_sparse_tensors(
    tensors: BlockSparseTensorsTorch, enable_tvm_ffi: bool = True
) -> BlockSparseTensors | None:
    """Convert torch block sparsity tensors to CuTe tensors, optionally for tvm ffi"""
    if not is_block_sparsity_enabled(tensors):
        return None

    def _convert(t: torch.Tensor) -> cute.Tensor:
        # assumed_align=4 matches int32; last dim is the contiguous one.
        return to_cute_tensor(t, assumed_align=4, leading_dim=-1, enable_tvm_ffi=enable_tvm_ffi)

    mask_cnt, mask_idx, full_cnt, full_idx, *_ = tensors
    return BlockSparseTensors(
        _convert(mask_cnt),
        _convert(mask_idx),
        _convert(full_cnt) if full_cnt is not None else None,
        _convert(full_idx) if full_idx is not None else None,
    )
435
+
436
+
437
def fast_sampling(mask_mod):
    """Convenience decorator to mark mask_mod as safe for 5-point fast sampling"""
    setattr(mask_mod, "use_fast_sampling", True)
    return mask_mod
build/torch-cuda/cache_utils.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Manage Ahead-of-Time (AOT) compiled kernels
2
+ import fcntl
3
+ import hashlib
4
+ import logging
5
+ import os
6
+ import pickle
7
+ import sys
8
+ import tempfile
9
+ import time
10
+ from distutils.ccompiler import CCompiler, new_compiler
11
+ from functools import lru_cache
12
+ from getpass import getuser
13
+ from pathlib import Path
14
+ from typing import Hashable, TypeAlias
15
+
16
+ import cutlass
17
+ import cutlass.cute as cute
18
+ import tvm_ffi
19
+ from cutlass.cutlass_dsl import JitCompiledFunction
20
+
21
# A compile key is an arbitrary hashable tuple identifying one kernel specialization.
CompileKeyType: TypeAlias = tuple[Hashable, ...]
# Cached entries are either freshly JIT-compiled functions or AOT .so entry
# points reloaded through tvm_ffi.
CallableFunction: TypeAlias = JitCompiledFunction | tvm_ffi.Function

logger = logging.getLogger(__name__)
# NOTE(review): attaching a handler and level to a library logger overrides the
# application's logging configuration; consider leaving this to callers.
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.WARNING)


# Enable cache via `FLASH_ATTENTION_CUTE_DSL_CACHE_ENABLED=1`
CUTE_DSL_CACHE_ENABLED: bool = os.getenv("FLASH_ATTENTION_CUTE_DSL_CACHE_ENABLED", "0") == "1"


# Customize cache dir via `FLASH_ATTENTION_CUTE_DSL_CACHE_DIR`, default is
# `/tmp/${USER}/flash_attention_cute_dsl_cache``
CUTE_DSL_CACHE_DIR: str | None = os.getenv("FLASH_ATTENTION_CUTE_DSL_CACHE_DIR", None)
36
+
37
+
38
def get_cache_path() -> Path:
    """Resolve (and create) the on-disk cache directory.

    Honors FLASH_ATTENTION_CUTE_DSL_CACHE_DIR when set; otherwise defaults to
    <system tmp>/<user>/flash_attention_cute_dsl_cache.
    """
    override = CUTE_DSL_CACHE_DIR
    if override is not None:
        cache_dir = Path(override)
    else:
        cache_dir = Path(tempfile.gettempdir()) / getuser() / "flash_attention_cute_dsl_cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir
45
+
46
+
47
@lru_cache(maxsize=1)
def _compute_source_fingerprint() -> str:
    """
    Hash all CuTe Python sources plus runtime ABI stamps into a short fingerprint.

    The fingerprint changes whenever:
    - Any .py file under flash_attn/cute is added, removed, renamed, or modified.
    - The Python minor version changes (e.g. 3.13 -> 3.14).
    - The cutlass or tvm_ffi package version changes.

    Computed once per process and cached.
    """
    package_root = Path(__file__).resolve().parent
    digest = hashlib.sha256()

    # ABI stamps: interpreter minor version plus key dependency versions.
    digest.update(f"py{sys.version_info.major}.{sys.version_info.minor}".encode())
    digest.update(f"cutlass={cutlass.__version__}".encode())
    digest.update(f"tvm_ffi={tvm_ffi.__version__}".encode())

    # Every .py source, in stable sorted order: relative path, length, bytes.
    # Length-prefixing keeps the byte stream unambiguous across files.
    for source_file in sorted(package_root.rglob("*.py")):
        digest.update(source_file.relative_to(package_root).as_posix().encode())
        payload = source_file.read_bytes()
        digest.update(len(payload).to_bytes(8, "little"))
        digest.update(payload)

    return digest.hexdigest()
73
+
74
+
75
class FileLock:
    """Context manager for advisory file locks using fcntl.flock.

    Supports exclusive (write) and shared (read) locks.
    Always blocks with polling until the lock is acquired or timeout is reached.

    Usage:
        with FileLock(lock_path, exclusive=True, timeout=15, label="abc"):
            # do work under lock
    """

    # Seconds between non-blocking acquisition attempts.
    _POLL_INTERVAL_SECONDS = 0.1

    def __init__(
        self,
        lock_path: Path,
        exclusive: bool,
        timeout: float = 15,
        label: str = "",
    ):
        """
        Args:
            lock_path: Path to the lock file on disk.
            exclusive: True for exclusive (write) lock, False for shared (read) lock.
            timeout: Max seconds to wait for lock acquisition before raising RuntimeError.
            label: Optional human-readable label for error messages.
        """
        self.lock_path: Path = lock_path
        self.exclusive: bool = exclusive
        self.timeout: float = timeout
        self.label: str = label
        # File descriptor of the held lock; -1 means "not holding a lock".
        # (Previously the timeout path assigned None while __exit__ checked
        # `is not None`, so a never-entered lock would flock(-1); use a single
        # -1 sentinel consistently.)
        self._fd: int = -1

    @property
    def _lock_label(self) -> str:
        kind = "exclusive" if self.exclusive else "shared"
        return f"{kind} {self.label}" if self.label else kind

    def __enter__(self) -> "FileLock":
        open_flags = (
            os.O_WRONLY | os.O_CREAT if self.exclusive else os.O_RDONLY | os.O_CREAT
        )
        lock_type = fcntl.LOCK_EX if self.exclusive else fcntl.LOCK_SH

        fd = os.open(str(self.lock_path), open_flags)

        # Always make at least one attempt, even for timeout <= 0 (the old
        # `while monotonic() < deadline` loop could skip the attempt entirely).
        deadline = time.monotonic() + self.timeout
        while True:
            try:
                # Non-blocking attempt; poll so the timeout stays enforceable.
                fcntl.flock(fd, lock_type | fcntl.LOCK_NB)
                break
            except OSError:
                if time.monotonic() >= deadline:
                    os.close(fd)
                    raise RuntimeError(
                        f"Timed out after {self.timeout}s waiting for "
                        f"{self._lock_label} lock: {self.lock_path}"
                    )
                time.sleep(self._POLL_INTERVAL_SECONDS)

        # Only publish the fd once the lock is actually held.
        self._fd = fd
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        if self._fd >= 0:
            fcntl.flock(self._fd, fcntl.LOCK_UN)
            os.close(self._fd)
            self._fd = -1
143
+
144
+
145
class JITCache:
    """
    In-memory cache for compiled functions.

    Maps compile keys to compiled callables via a plain dict; subclasses
    (e.g. the persistent cache) call these base methods directly, so the
    `cache` attribute is part of the internal contract.
    """

    def __init__(self):
        # key -> compiled function
        self.cache: dict[CompileKeyType, CallableFunction] = {}

    def __setitem__(self, key: CompileKeyType, fn: JitCompiledFunction) -> None:
        self.cache[key] = fn

    def __getitem__(self, key: CompileKeyType) -> CallableFunction:
        # Raises KeyError for unknown keys, like a dict.
        return self.cache[key]

    def __contains__(self, key: CompileKeyType) -> bool:
        return key in self.cache

    def clear(self) -> None:
        """
        Clear in-memory cache of compiled functions
        """
        self.cache.clear()
167
+
168
+
169
class JITPersistentCache(JITCache):
    """
    In-memory cache for compiled functions, which is also backed by persistent storage.
    Use cutedsl ahead-of-time (AOT) compilation, only supporting enable_tvm_ffi=True

    Disk layout: one <sha256(key)>.so per entry plus a sibling <hash>.lock file
    used for cross-process coordination (shared lock for reads, exclusive for
    writes).
    """

    # Symbol name the compiled function is exported under in each .so.
    EXPORT_FUNCTION_PREFIX = "func"
    # Max seconds to wait on a per-entry file lock.
    LOCK_TIMEOUT_SECONDS = 15

    # Lazily created, process-wide compiler used to link .o -> .so.
    # NOTE(review): distutils was removed from the stdlib in Python 3.12 —
    # confirm the `distutils.ccompiler` import at the top of this file still
    # resolves (e.g. via setuptools) on the supported interpreters.
    _compiler: CCompiler | None = None

    def __init__(self, cache_path: Path):
        super().__init__()
        cache_path.mkdir(parents=True, exist_ok=True)
        self.cache_path: Path = cache_path

    def __setitem__(self, key: CompileKeyType, fn: JitCompiledFunction) -> None:
        # Populate memory first, then best-effort persist to disk.
        JITCache.__setitem__(self, key, fn)
        self._try_export_to_storage(key, fn)

    def __getitem__(self, key: CompileKeyType) -> CallableFunction:
        # Use __contains__ to try populating in-memory cache with persistent storage
        self.__contains__(key)
        return JITCache.__getitem__(self, key)

    def __contains__(self, key: CompileKeyType) -> bool:
        # Checks in-memory cache first, then tries loading from storage.
        # When returning True, guarantees the in-memory cache is populated.
        if JITCache.__contains__(self, key):
            return True
        return self._try_load_from_storage(key)

    def _try_load_from_storage(self, key: CompileKeyType) -> bool:
        """
        Try to load a function from persistent storage into in-memory cache.
        Returns True if loaded successfully, False if not found on disk.
        Holds a shared lock during loading to prevent concurrent writes.
        """
        sha256_hex = self._key_to_hash(key)
        so_path = self.cache_path / f"{sha256_hex}.so"
        with FileLock(
            self._lock_path(sha256_hex),
            exclusive=False,
            timeout=self.LOCK_TIMEOUT_SECONDS,
            label=sha256_hex,
        ):
            if so_path.exists():
                logger.debug(
                    "Loading compiled function from disk: %s", so_path
                )
                m = cute.runtime.load_module(
                    str(so_path), enable_tvm_ffi=True
                )
                fn = getattr(m, self.EXPORT_FUNCTION_PREFIX)
                # Call the base setter directly so loading does not re-export.
                JITCache.__setitem__(self, key, fn)
                return True
            else:
                logger.debug(
                    "Cache miss on disk for key hash %s", sha256_hex
                )
                return False

    def _try_export_to_storage(
        self, key: CompileKeyType, fn: JitCompiledFunction
    ) -> None:
        """Export a compiled function to persistent storage under exclusive lock."""
        sha256_hex = self._key_to_hash(key)
        with FileLock(
            self._lock_path(sha256_hex),
            exclusive=True,
            timeout=self.LOCK_TIMEOUT_SECONDS,
            label=sha256_hex,
        ):
            so_path = self.cache_path / f"{sha256_hex}.so"
            if so_path.exists():
                # Another process already exported.
                logger.debug(
                    "Skipping export, already on disk: %s", so_path
                )
                return
            obj_path = self.cache_path / f"{sha256_hex}.o"
            logger.debug(
                "Exporting compiled function to disk: %s", so_path
            )
            fn.export_to_c(
                object_file_path=str(obj_path),
                function_name=self.EXPORT_FUNCTION_PREFIX,
            )
            # TODO: as of cutedsl 4.4.0, `export_to_c` only supports exporting
            # "relocatable" .o files. But tvm_ffi expects "shared library" .so
            # files. Link ourselves to workaround.
            if JITPersistentCache._compiler is None:
                JITPersistentCache._compiler = new_compiler()
            JITPersistentCache._compiler.link_shared_object(
                [str(obj_path)], str(so_path)
            )
            # The intermediate .o is no longer needed once linked.
            obj_path.unlink()
            logger.debug(
                "Successfully exported compiled function to disk: %s", so_path
            )

    def _key_to_hash(self, key: CompileKeyType) -> str:
        # NOTE(review): pickled key bytes must be stable across processes for
        # cross-process cache hits; unpicklable key parts will raise here.
        return hashlib.sha256(pickle.dumps(key)).hexdigest()

    def _lock_path(self, sha256_hex: str) -> Path:
        return self.cache_path / f"{sha256_hex}.lock"

    def clear(self) -> None:
        """
        Not only clear the in-memory cache. Also purge persistent compilation cache.
        """
        logger.debug(
            "Clearing persistent cache at %s", self.cache_path
        )
        super().clear()
        # Removes .so and .lock files; assumes the cache dir contains only
        # files (unlink would fail on subdirectories).
        for child in self.cache_path.iterdir():
            child.unlink()
287
+
288
def get_jit_cache(name: str | None = None) -> JITCache:
    """
    JIT cache factory.
    `name` is an optional identifier to create subdirectories to manage cache.

    When persistent caching is enabled, artifacts are namespaced under a
    source fingerprint directory so that code or dependency changes
    automatically invalidate stale entries.
    """
    if not CUTE_DSL_CACHE_ENABLED:
        logger.debug("Persistent cache disabled, using in-memory JIT cache")
        return JITCache()
    path = get_cache_path() / _compute_source_fingerprint()
    if name:
        path = path / name
    logger.debug(
        "Creating persistent JIT cache at %s", path
    )
    return JITPersistentCache(path)
build/torch-cuda/compute_block_sparsity.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+ from typing import Callable, Optional, Tuple
3
+
4
+ import cutlass
5
+ import cutlass.cute as cute
6
+ import torch
7
+ from cutlass import Boolean, Int8, Int32, const_expr
8
+
9
+ from .block_sparsity import (
10
+ BlockSparseTensors,
11
+ BlockSparseTensorsTorch,
12
+ to_cute_block_sparse_tensors,
13
+ )
14
+ from .utils import hash_callable, scalar_to_ssa, ssa_to_scalar
15
+ from .seqlen_info import SeqlenInfoQK
16
+
17
+
18
+ class BlockSparsityKernel:
19
+ """Block sparsity kernel for FlexAttention.
20
+
21
+ This kernel computes `mask_mod` for every token of each block
22
+ to determine if an n block is full, masked, or neither.
23
+
24
+ Writes block counts and indices to a BlockSparseTensors object.
25
+
26
+ When use_fast_sampling=True, uses 5-point sampling (4 corners + center)
27
+ which is much faster but only suitable for masks where this is sufficient.
28
+
29
+ TODO:
30
+ - optimize mask_mod evaluation
31
+ - varlen support
32
+ - transposed tensors for bwd pass
33
+ """
34
+
35
+ def __init__(
36
+ self,
37
+ mask_mod: Callable,
38
+ tile_mn: Tuple[int, int],
39
+ compute_full_blocks: bool = True,
40
+ use_aux_tensors: bool = False,
41
+ use_fast_sampling: bool = False,
42
+ ):
43
+ self.mask_mod = mask_mod
44
+ self.tile_mn = tile_mn
45
+ self.compute_full_blocks = compute_full_blocks
46
+ self.use_aux_tensors = use_aux_tensors
47
+ self.use_fast_sampling = use_fast_sampling
48
+
49
    @cute.jit
    def __call__(
        self,
        blocksparse_tensors: BlockSparseTensors,
        seqlen_q: Int32,
        seqlen_k: Int32,
        aux_tensors: Optional[list] = None,
    ):
        """Launch the block-classification kernel.

        Writes per-m-block counts/indices into ``blocksparse_tensors`` in
        place; grid is (num_m_blocks, num_heads, batch_size).
        """
        self.mask_cnt, self.mask_idx, self.full_cnt, self.full_idx = blocksparse_tensors

        if const_expr(self.compute_full_blocks):
            assert self.full_cnt is not None and self.full_idx is not None, (
                "full block tensors must be provided when computing full blocks"
            )

        # mask_idx is (B, H, M, N); the grid covers (M, H, B).
        batch_size, num_heads, num_m_blocks, num_n_blocks = self.mask_idx.shape
        # launch 1 CTA per m block
        grid = [num_m_blocks, num_heads, batch_size]

        if const_expr(self.use_fast_sampling):
            # Fast path: exactly one thread per sample point (4 corners + center).
            num_threads = 5
            self.num_warps = 1
        else:
            # Full path: one thread per row of the m-tile; warps rounded up.
            num_threads = self.tile_mn[0]
            self.num_warps = (num_threads + 32 - 1) // 32

        self.kernel(
            self.mask_cnt,
            self.mask_idx,
            self.full_cnt,
            self.full_idx,
            num_n_blocks,
            seqlen_q,
            seqlen_k,
            aux_tensors,
        ).launch(grid=grid, block=[num_threads, 1, 1])
85
+
86
+ @cute.kernel
87
+ def kernel(
88
+ self,
89
+ mask_cnt: cute.Tensor,
90
+ mask_idx: cute.Tensor,
91
+ full_cnt: cute.Tensor,
92
+ full_idx: cute.Tensor,
93
+ num_n_blocks: Int32,
94
+ seqlen_q: Int32,
95
+ seqlen_k: Int32,
96
+ aux_tensors: Optional[list] = None,
97
+ ):
98
+ tidx, _, _ = cute.arch.thread_idx()
99
+ warp_idx = cute.arch.warp_idx()
100
+ lane_id = cute.arch.lane_idx()
101
+ m_block, head_idx, batch_idx = cute.arch.block_idx()
102
+
103
+ ssa = partial(scalar_to_ssa, dtype=Int32)
104
+
105
+ seqlen = SeqlenInfoQK.create(
106
+ batch_idx,
107
+ seqlen_q,
108
+ seqlen_k,
109
+ mCuSeqlensQ=None,
110
+ mCuSeqlensK=None,
111
+ mSeqUsedQ=None,
112
+ mSeqUsedK=None,
113
+ )
114
+
115
+ @cute.struct
116
+ class SharedStorage:
117
+ reduction_buffer_smem: cute.struct.Align[
118
+ cute.struct.MemRange[cutlass.Int8, 2 * self.num_warps], 1024
119
+ ]
120
+
121
+ smem = cutlass.utils.SmemAllocator()
122
+ storage = smem.allocate(SharedStorage, 16)
123
+
124
+ reduction_buffer = storage.reduction_buffer_smem.get_tensor(
125
+ cute.make_layout((self.num_warps, 2))
126
+ )
127
+
128
+ num_mask_blocks = Int32(0)
129
+ num_full_blocks = Int32(0)
130
+
131
+ for n_block in cutlass.range(num_n_blocks, unroll_full=True):
132
+ m_base = m_block * self.tile_mn[0]
133
+ n_base = n_block * self.tile_mn[1]
134
+
135
+ if const_expr(self.use_fast_sampling):
136
+ # Fast path: 5-point sampling (4 corners + center)
137
+ # Clamps OOB indices to nearest in bounds.
138
+ thread_result = Boolean(False)
139
+ thread_is_valid = Boolean(False)
140
+ q_idx = Int32(0)
141
+ kv_idx = Int32(0)
142
+
143
+ if tidx == 0:
144
+ # Top-left corner (0, 0); always in bounds
145
+ q_idx = m_base
146
+ kv_idx = n_base
147
+ elif tidx == 1:
148
+ # Top-right corner
149
+ q_idx = m_base
150
+ kv_idx = cutlass.min(n_base + self.tile_mn[1] - 1, seqlen_k - 1)
151
+ elif tidx == 2:
152
+ # Bottom-left corner
153
+ q_idx = cutlass.min(m_base + self.tile_mn[0] - 1, seqlen_q - 1)
154
+ kv_idx = n_base
155
+ elif tidx == 3:
156
+ # Bottom-right corner
157
+ q_idx = cutlass.min(m_base + self.tile_mn[0] - 1, seqlen_q - 1)
158
+ kv_idx = cutlass.min(n_base + self.tile_mn[1] - 1, seqlen_k - 1)
159
+ elif tidx == 4:
160
+ # Center point
161
+ q_idx = m_base + (cutlass.min(seqlen_q - m_base, self.tile_mn[0])) // 2
162
+ kv_idx = n_base + (cutlass.min(seqlen_k - n_base, self.tile_mn[1])) // 2
163
+ else:
164
+ thread_is_valid = Boolean(False)
165
+
166
+ # Check bounds and determine if this thread has a valid index pair
167
+ if tidx < 5 and q_idx < seqlen_q and kv_idx < seqlen_k:
168
+ thread_is_valid = Boolean(True)
169
+ q_idx_ssa = ssa(q_idx)
170
+ kv_idx_ssa = ssa(kv_idx)
171
+ thread_result = ssa_to_scalar(
172
+ self.mask_mod(
173
+ ssa(batch_idx),
174
+ ssa(head_idx),
175
+ q_idx_ssa,
176
+ kv_idx_ssa,
177
+ seqlen,
178
+ aux_tensors,
179
+ )
180
+ )
181
+ else:
182
+ thread_is_valid = Boolean(False)
183
+
184
+ # Use vote_any_sync to see if any valid thread found unmasked or masked
185
+ # Only count results from threads that checked valid indices
186
+ has_unmasked = cute.arch.vote_any_sync(thread_result & thread_is_valid)
187
+ has_masked = cute.arch.vote_any_sync((Boolean(not thread_result)) & thread_is_valid)
188
+
189
+ else:
190
+ # Full path: check all elements in the block
191
+ # Track if this thread's row has any masked or unmasked elements
192
+ thread_has_unmasked = Boolean(False)
193
+ thread_has_masked = Boolean(False)
194
+ thread_is_valid = Boolean(False)
195
+
196
+ # Each thread handles 1 row
197
+ q_idx = m_base + tidx
198
+ kv_idx = Int32(0)
199
+ if tidx < self.tile_mn[0] and q_idx < seqlen_q:
200
+ thread_is_valid = Boolean(True)
201
+ q_idx_ssa = ssa(q_idx)
202
+
203
+ # Loop over all columns in this row
204
+ for c in cutlass.range(self.tile_mn[1], unroll_full=True):
205
+ kv_idx = n_base + c
206
+ kv_idx_ssa = ssa(kv_idx)
207
+
208
+ # Only check elements within valid sequence bounds
209
+ if kv_idx < seqlen_k:
210
+ # Direct scalar call
211
+ mask_val = ssa_to_scalar(
212
+ self.mask_mod(
213
+ ssa(batch_idx),
214
+ ssa(head_idx),
215
+ q_idx_ssa,
216
+ kv_idx_ssa,
217
+ seqlen,
218
+ aux_tensors,
219
+ )
220
+ )
221
+
222
+ # Update tracking flags
223
+ if mask_val:
224
+ thread_has_unmasked = Boolean(True)
225
+ else:
226
+ thread_has_masked = Boolean(True)
227
+
228
+ # Block-level reduction to combine results across all threads
229
+ # Only count votes from threads that checked valid indices
230
+ warp_has_unmasked_mask = cute.arch.vote_any_sync(
231
+ thread_has_unmasked & thread_is_valid
232
+ )
233
+ warp_has_masked_mask = cute.arch.vote_any_sync(thread_has_masked & thread_is_valid)
234
+
235
+ # lane 0 writes the ballot mask to shared memory
236
+ lane_id = tidx % 32
237
+ if lane_id == 0:
238
+ # Store as Int8
239
+ reduction_buffer[warp_idx, 0] = Int8(1) if warp_has_unmasked_mask else Int8(0)
240
+ reduction_buffer[warp_idx, 1] = Int8(1) if warp_has_masked_mask else Int8(0)
241
+
242
+ cute.arch.sync_threads()
243
+
244
+ # Thread 0 ORs all warp results together
245
+ has_unmasked = Boolean(False)
246
+ has_masked = Boolean(False)
247
+ if tidx == 0:
248
+ for w in cutlass.range(self.num_warps):
249
+ if reduction_buffer[w, 0]:
250
+ has_unmasked = Boolean(True)
251
+ if reduction_buffer[w, 1]:
252
+ has_masked = Boolean(True)
253
+
254
+ # Only thread 0 updates the output arrays (common to both paths)
255
+ if tidx == 0:
256
+ # Block classification based on what we found:
257
+ # - If has_masked and has_unmasked: partial block (needs masking)
258
+ # - If only has_unmasked: full block (no masking needed)
259
+ # - If only has_masked: skip this block entirely
260
+ is_partial = Boolean(has_masked and has_unmasked)
261
+ is_full = Boolean(has_unmasked and (not has_masked))
262
+
263
+ if is_partial:
264
+ mask_idx[batch_idx, head_idx, m_block, num_mask_blocks] = n_block
265
+ num_mask_blocks += 1
266
+ elif is_full and const_expr(self.compute_full_blocks):
267
+ full_idx[batch_idx, head_idx, m_block, num_full_blocks] = n_block
268
+ num_full_blocks += 1
269
+
270
+ # Only thread 0 writes back the counts
271
+ if tidx == 0:
272
+ mask_cnt[batch_idx, head_idx, m_block] = num_mask_blocks
273
+ if const_expr(self.compute_full_blocks):
274
+ full_cnt[batch_idx, head_idx, m_block] = num_full_blocks
275
+
276
+
277
def compute_block_sparsity(
    tile_m,
    tile_n,
    batch_size,
    num_heads,
    seqlen_q,
    seqlen_k,
    mask_mod: Callable,
    aux_tensors: Optional[list],  # list[cute.Tensor]
    device,
    compute_full_blocks: bool = True,
    use_fast_sampling: bool = False,
) -> Tuple[BlockSparseTensors, BlockSparseTensorsTorch]:
    """Compute the block-sparsity structure induced by ``mask_mod``.

    Args:
        tile_m: Tile size along the m (query) dimension.
        tile_n: Tile size along the n (key) dimension.
        batch_size: Batch size.
        num_heads: Number of attention heads.
        seqlen_q: Query sequence length.
        seqlen_k: Key sequence length.
        mask_mod: Mask callable evaluated per (b, h, q_idx, kv_idx).
        aux_tensors: Optional auxiliary tensors forwarded to ``mask_mod``.
        device: Torch device the index/count tensors are allocated on.
        compute_full_blocks: When False only partially-masked blocks are recorded.
        use_fast_sampling: Use 5-point (corners + center) sampling; faster, but
            only valid for masks where that check is sufficient.

    Returns:
        A ``(BlockSparseTensors, BlockSparseTensorsTorch)`` pair.
    """
    # A mask_mod may opt itself into 5-point sampling via an attribute.
    use_fast_sampling = getattr(mask_mod, "use_fast_sampling", use_fast_sampling)

    m_blocks = (seqlen_q + tile_m - 1) // tile_m
    n_blocks = (seqlen_k + tile_n - 1) // tile_n

    def _zeros(shape):
        # All sparsity bookkeeping tensors are int32 and zero-initialized.
        return torch.zeros(shape, device=device, dtype=torch.int32)

    cnt_shape = (batch_size, num_heads, m_blocks)
    idx_shape = (batch_size, num_heads, m_blocks, n_blocks)

    blocksparse_tensors_torch = BlockSparseTensorsTorch(
        mask_block_cnt=_zeros(cnt_shape),
        mask_block_idx=_zeros(idx_shape),
        full_block_cnt=_zeros(cnt_shape) if compute_full_blocks else None,
        full_block_idx=_zeros(idx_shape) if compute_full_blocks else None,
        block_size=(tile_m, tile_n),
    )

    blocksparse_tensors = to_cute_block_sparse_tensors(
        blocksparse_tensors_torch, enable_tvm_ffi=True
    )

    # Everything that changes codegen participates in the compile key.
    compile_key = (
        tile_m,
        tile_n,
        hash_callable(mask_mod),
        compute_full_blocks,
        aux_tensors is not None,
        use_fast_sampling,
    )
    cache = compute_block_sparsity.compile_cache
    if compile_key not in cache:
        kernel = BlockSparsityKernel(
            mask_mod,
            tile_mn=(tile_m, tile_n),
            compute_full_blocks=compute_full_blocks,
            use_aux_tensors=aux_tensors is not None,
            use_fast_sampling=use_fast_sampling,
        )
        cache[compile_key] = cute.compile(
            kernel, blocksparse_tensors, seqlen_q, seqlen_k, aux_tensors,
            options="--enable-tvm-ffi",
        )

    # Launch: only the first 4 fields (cnt/idx tensors) are kernel arguments.
    cache[compile_key](
        blocksparse_tensors_torch[:4],
        seqlen_q,
        seqlen_k,
        aux_tensors,
    )

    return blocksparse_tensors, blocksparse_tensors_torch


compute_block_sparsity.compile_cache = {}
build/torch-cuda/copy_utils.py ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
2
+
3
+ import math
4
+ from typing import Optional, Type, Callable
5
+
6
+ import cutlass
7
+ import cutlass.cute as cute
8
+ from cutlass import Float32, Int32, const_expr
9
+ from cutlass.cute.nvgpu import cpasync
10
+ import cutlass.utils.blackwell_helpers as sm100_utils
11
+ from cutlass.cutlass_dsl import T, dsl_user_op
12
+ from cutlass._mlir.dialects import llvm
13
+ import cutlass.pipeline
14
+
15
+
16
@dsl_user_op
def cvt_copy(
    atom: cute.CopyAtom,
    src: cute.Tensor,
    dst: cute.Tensor,
    *,
    pred: Optional[cute.Tensor] = None,
    loc=None,
    ip=None,
    **kwargs,
) -> None:
    """Copy a register-resident ``src`` into ``dst``, converting dtype first when needed.

    ``src`` must live in register memory; if its element type differs from
    ``dst``'s, the values are converted through a temporary fragment before
    the copy is issued.
    """
    assert isinstance(src.iterator, cute.Pointer) and src.memspace == cute.AddressSpace.rmem
    needs_conversion = const_expr(src.element_type != dst.element_type)
    if needs_conversion:
        converted = cute.make_fragment_like(src, dst.element_type, loc=loc, ip=ip)
        converted.store(src.load().to(dst.element_type))
        src = converted
    cute.copy(atom, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
33
+
34
+
35
@dsl_user_op
def load_s2r(src: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
    """Load ``src`` (e.g. shared memory) into a fresh register fragment and return it."""
    frag = cute.make_fragment_like(src, src.element_type, loc=loc, ip=ip)
    cute.autovec_copy(src, frag, loc=loc, ip=ip)
    return frag
40
+
41
+
42
@dsl_user_op
def get_copy_atom(
    dtype: Type[cutlass.Numeric], num_copy_elems: int, is_async: bool = False, *, loc=None, ip=None
) -> cute.CopyAtom:
    """Build a copy atom for ``dtype``, vectorized up to (at most) 128 bits per copy.

    When ``is_async`` is True a cp.async G2S op is used; otherwise a universal copy.
    """
    bits_per_copy = const_expr(min(128, num_copy_elems * dtype.width))
    op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
    return cute.make_copy_atom(op, dtype, num_bits_per_copy=bits_per_copy)
49
+
50
+
51
@dsl_user_op
def make_tmem_copy(
    tmem_copy_atom: cute.CopyAtom, num_wg: int = 1, *, loc=None, ip=None
) -> cute.CopyAtom:
    """Tile a TMEM copy atom across ``num_wg`` warpgroups (128 threads each).

    Only atoms with 32 data paths of 32 bits are supported.
    """
    dp, bits, rep, _ = sm100_utils.get_tmem_copy_properties(tmem_copy_atom)
    assert dp == 32
    assert bits == 32
    tiler_mn = (cute.make_layout((128 * rep * num_wg // 32, 32), stride=(32, 1)),)
    layout_tv = cute.make_layout(
        ((32, 4, num_wg), (rep, 32)),
        stride=((0, 1, 4 * rep), (4, 4 * rep * num_wg)),
    )
    return cute.make_tiled_copy(tmem_copy_atom, layout_tv, tiler_mn)
63
+
64
+
65
@dsl_user_op
def copy(
    src: cute.Tensor,
    dst: cute.Tensor,
    *,
    pred: Optional[cute.Tensor] = None,
    num_copy_elems: int = 1,
    is_async: bool = False,
    loc=None,
    ip=None,
    **kwargs,
) -> None:
    """Copy ``src`` to ``dst`` with an atom derived from the source dtype and vector width."""
    atom = get_copy_atom(src.element_type, num_copy_elems, is_async)
    cute.copy(atom, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
79
+
80
+
81
def tiled_copy_1d(
    dtype: Type[cutlass.Numeric], num_threads: int, num_copy_elems: int = 1, is_async: bool = False
) -> cute.TiledCopy:
    """Build a 1-D tiled copy where each thread moves ``num_copy_elems`` contiguous elements."""
    op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
    atom = cute.make_copy_atom(op, dtype, num_bits_per_copy=num_copy_elems * dtype.width)
    return cute.make_tiled_copy_tv(
        atom,
        cute.make_layout(num_threads),
        cute.make_layout(num_copy_elems),
    )
90
+
91
+
92
def tiled_copy_2d(
    dtype: Type[cutlass.Numeric], major_mode_size: int, num_threads: int, is_async: bool = False
) -> cute.TiledCopy:
    """Build a 2-D tiled copy over a row-major tile of width ``major_mode_size``.

    The per-copy width is the largest vector (up to 128 bits) that divides the row.
    """
    num_copy_bits = math.gcd(major_mode_size, 128 // dtype.width) * dtype.width
    copy_elems = num_copy_bits // dtype.width
    op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
    atom = cute.make_copy_atom(op, dtype, num_bits_per_copy=num_copy_bits)
    threads_per_row = major_mode_size // copy_elems
    assert num_threads % threads_per_row == 0
    thr_layout = cute.make_ordered_layout(
        (num_threads // threads_per_row, threads_per_row),
        order=(1, 0),
    )
    val_layout = cute.make_layout((1, copy_elems))
    return cute.make_tiled_copy_tv(atom, thr_layout, val_layout)
107
+
108
+
109
@dsl_user_op
def atomic_add_fp32x4(
    a: Float32, b: Float32, c: Float32, d: Float32, gmem_ptr: cute.Pointer, *, loc=None, ip=None
) -> None:
    """Issue a vectorized red.global.add.v4.f32 of (a, b, c, d) at ``gmem_ptr``."""
    addr = gmem_ptr.toint(loc=loc, ip=ip).ir_value()
    operands = [
        addr,
        Float32(a).ir_value(loc=loc, ip=ip),
        Float32(b).ir_value(loc=loc, ip=ip),
        Float32(c).ir_value(loc=loc, ip=ip),
        Float32(d).ir_value(loc=loc, ip=ip),
    ]
    # Pack the four scalars into a .v4 register then reduce-add in one instruction.
    llvm.inline_asm(
        None,
        operands,
        "{\n\t"
        ".reg .v4 .f32 abcd;\n\t"
        "mov.f32 abcd.x, $1;\n\t"
        "mov.f32 abcd.y, $2;\n\t"
        "mov.f32 abcd.z, $3;\n\t"
        "mov.f32 abcd.w, $4;\n\t"
        "red.global.add.v4.f32 [$0], abcd;\n\t"
        "}\n",
        "l,f,f,f,f",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
145
+
146
+
147
@dsl_user_op
def set_block_rank(
    smem_ptr: cute.Pointer, peer_cta_rank_in_cluster: Int32, *, loc=None, ip=None
) -> Int32:
    """Map the given smem pointer to the address at another CTA rank in the cluster."""
    local_addr = smem_ptr.toint(loc=loc, ip=ip).ir_value()
    mapped = llvm.inline_asm(
        T.i32(),
        [local_addr, peer_cta_rank_in_cluster.ir_value()],
        "mapa.shared::cluster.u32 $0, $1, $2;",
        "=r,r,r",
        has_side_effects=False,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
    return Int32(mapped)
164
+
165
+
166
@dsl_user_op
def store_shared_remote_fp32x4(
    a: Float32,
    b: Float32,
    c: Float32,
    d: Float32,
    smem_ptr: cute.Pointer,
    mbar_ptr: cute.Pointer,
    peer_cta_rank_in_cluster: Int32,
    *,
    loc=None,
    ip=None,
) -> None:
    """st.async four f32 values into a peer CTA's smem, signaling its mbarrier."""
    # Both the destination and the barrier live in the peer CTA's shared memory.
    remote_smem = set_block_rank(
        smem_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
    ).ir_value()
    remote_mbar = set_block_rank(
        mbar_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
    ).ir_value()
    operands = [
        remote_smem,
        remote_mbar,
        Float32(a).ir_value(loc=loc, ip=ip),
        Float32(b).ir_value(loc=loc, ip=ip),
        Float32(c).ir_value(loc=loc, ip=ip),
        Float32(d).ir_value(loc=loc, ip=ip),
    ]
    llvm.inline_asm(
        None,
        operands,
        "{\n\t"
        ".reg .v4 .f32 abcd;\n\t"
        "mov.f32 abcd.x, $2;\n\t"
        "mov.f32 abcd.y, $3;\n\t"
        "mov.f32 abcd.z, $4;\n\t"
        "mov.f32 abcd.w, $5;\n\t"
        "st.async.shared::cluster.mbarrier::complete_tx::bytes.v4.f32 [$0], abcd, [$1];\n\t"
        "}\n",
        "r,r,f,f,f,f",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
208
+
209
+
210
@dsl_user_op
def cpasync_bulk_s2cluster(
    smem_src_ptr: cute.Pointer,
    smem_dst_ptr: cute.Pointer,
    mbar_ptr: cute.Pointer,
    size: int | Int32,
    peer_cta_rank_in_cluster: Int32,
    *,
    loc=None,
    ip=None,
):
    """Bulk-copy ``size`` bytes of local smem into a peer CTA's smem via cp.async.bulk.

    The destination and the completion mbarrier are remapped into the peer
    CTA's address space before issuing the copy.
    """
    src_addr = smem_src_ptr.toint(loc=loc, ip=ip).ir_value()
    dst_addr = set_block_rank(
        smem_dst_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
    ).ir_value()
    mbar_addr = set_block_rank(mbar_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip).ir_value()
    llvm.inline_asm(
        None,
        [dst_addr, src_addr, mbar_addr, Int32(size).ir_value(loc=loc, ip=ip)],
        "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [$0], [$1], $3, [$2];",
        "r,r,r,r",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
240
+
241
+
242
@dsl_user_op
def cpasync_bulk_g2s(
    gmem_ptr: cute.Pointer,
    smem_ptr: cute.Pointer,
    tma_bar_ptr: cute.Pointer,
    size: int | Int32,
    *,
    loc=None,
    ip=None,
):
    """Bulk-copy ``size`` bytes from global to shared memory, completing on a TMA barrier."""
    gmem_addr = gmem_ptr.toint(loc=loc, ip=ip).ir_value()
    smem_addr = smem_ptr.toint(loc=loc, ip=ip).ir_value()
    mbar_addr = tma_bar_ptr.toint(loc=loc, ip=ip).ir_value()
    llvm.inline_asm(
        None,
        [gmem_addr, smem_addr, mbar_addr, Int32(size).ir_value()],
        "cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [$1], [$0], $3, [$2];",
        "l,r,r,r",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
264
+
265
+
266
@dsl_user_op
def cpasync_reduce_bulk_add_f32(
    smem_ptr: cute.Pointer,
    gmem_ptr: cute.Pointer,
    store_bytes: int | Int32,
    *,
    loc=None,
    ip=None,
):
    """Bulk reduce-add ``store_bytes`` of f32 data from shared memory into global memory."""
    smem_addr = smem_ptr.toint(loc=loc, ip=ip).ir_value()
    llvm.inline_asm(
        None,
        [gmem_ptr.llvm_ptr, smem_addr, Int32(store_bytes).ir_value()],
        "cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [$0], [$1], $2;",
        "l,r,r",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
289
+
290
+
291
def cpasync_bulk_get_copy_fn(
    src_tensor: cute.Tensor,
    dst_tensor: cute.Tensor,
    single_stage: bool = False,
    **kwargs,
) -> Callable:
    """Return a closure issuing bulk G2S copies over (optionally staged) tensors.

    With staging, the last mode of each tensor indexes the stage and the
    returned closure takes (src_idx, dst_idx); with ``single_stage`` the whole
    tensors are copied and the closure takes no indices.
    """
    drop_last = 0 if single_stage else 1
    src_rank = const_expr(cute.rank(src_tensor) - drop_last)
    dst_rank = const_expr(cute.rank(dst_tensor) - drop_last)
    # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
    src = cute.group_modes(src_tensor, 0, src_rank)
    dst = cute.group_modes(dst_tensor, 0, dst_rank)

    def copy_bulk(src_idx, dst_idx, **new_kwargs):
        # Bytes in one stage: all modes except the stage mode.
        nbytes = const_expr(cute.size(src.shape[:-1]) * src.element_type.width // 8)
        cpasync_bulk_g2s(
            src[None, src_idx].iterator,
            dst[None, dst_idx].iterator,
            size=nbytes,
            **new_kwargs,
            **kwargs,
        )

    def copy_bulk_single_stage(**new_kwargs):
        nbytes = const_expr(cute.size(src.shape) * src.element_type.width // 8)
        cpasync_bulk_g2s(src.iterator, dst.iterator, size=nbytes, **new_kwargs, **kwargs)

    return copy_bulk if const_expr(not single_stage) else copy_bulk_single_stage
322
+
323
+
324
def tma_get_copy_fn(
    atom: cute.CopyAtom,
    cta_coord: cute.Coord,
    cta_layout: cute.Layout,
    src_tensor: cute.Tensor,
    dst_tensor: cute.Tensor,
    filter_zeros: bool = False,
    single_stage: bool = False,
    **kwargs,
) -> Callable:
    """Partition src/dst for TMA and return (copy_fn, smem_part, gmem_part).

    Direction is inferred from which side lives in shared memory. When staged
    (``single_stage`` False), the returned closure takes (src_idx, dst_idx).
    """
    src_is_smem = const_expr(
        isinstance(src_tensor.iterator, cute.Pointer)
        and src_tensor.memspace == cute.AddressSpace.smem
    )
    if src_is_smem:
        smem_tensor, gmem_tensor = src_tensor, dst_tensor
    else:
        smem_tensor, gmem_tensor = dst_tensor, src_tensor
    drop_last = 0 if single_stage else 1
    smem_rank = const_expr(cute.rank(smem_tensor) - drop_last)
    gmem_rank = const_expr(cute.rank(gmem_tensor) - drop_last)
    # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
    s, g = cpasync.tma_partition(
        atom,
        cta_coord,
        cta_layout,
        cute.group_modes(smem_tensor, 0, smem_rank),
        cute.group_modes(gmem_tensor, 0, gmem_rank),
    )
    if const_expr(filter_zeros):
        s = cute.filter_zeros(s)
        g = cute.filter_zeros(g)
    src, dst = (s, g) if src_is_smem else (g, s)

    def copy_tma(src_idx, dst_idx, **new_kwargs):
        cute.copy(atom, src[None, src_idx], dst[None, dst_idx], **new_kwargs, **kwargs)

    def copy_tma_single_stage(**new_kwargs):
        cute.copy(atom, src, dst, **new_kwargs, **kwargs)

    fn = copy_tma if const_expr(not single_stage) else copy_tma_single_stage
    return fn, s, g
361
+
362
+
363
def tma_producer_copy_fn(copy: Callable, pipeline: cutlass.pipeline.PipelineAsync):
    """Bind a staged copy closure to a producer pipeline.

    The returned closure resolves the destination stage and TMA barrier from
    the producer's pipeline state.
    """

    def copy_fn(src_idx, producer_state: cutlass.pipeline.PipelineState, **new_kwargs):
        stage = producer_state.index
        barrier = pipeline.producer_get_barrier(producer_state)
        copy(src_idx=src_idx, dst_idx=stage, tma_bar_ptr=barrier, **new_kwargs)

    return copy_fn
build/torch-cuda/cute_dsl_ptxas.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ System ptxas replacement for CUTLASS DSL.
3
+ Environment variables:
4
+ CUTE_DSL_PTXAS_PATH - Path to ptxas (e.g., /usr/local/cuda/bin/ptxas)
5
+ CUTE_DSL_PTXAS_VERBOSE - Set to 1 for verbose output
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import re
11
+ import ctypes
12
+ import subprocess
13
+ from pathlib import Path
14
+
15
+ import cutlass
16
+
17
+
18
+ CUTE_DSL_PTXAS_PATH = os.environ.get("CUTE_DSL_PTXAS_PATH", None)
19
+ VERBOSE = os.environ.get("CUTE_DSL_PTXAS_VERBOSE", "0") == "1"
20
+
21
+ _original_load_cuda_library = None
22
+ _user_wanted_ptx = False # True if user originally set CUTE_DSL_KEEP_PTX=1
23
+
24
+
25
def _log(msg):
    """Emit a diagnostic line to stderr when CUTE_DSL_PTXAS_VERBOSE is enabled."""
    if not VERBOSE:
        return
    print(f"[ptxas] {msg}", file=sys.stderr)
28
+
29
+
30
+ def _get_ptx(compiled_func) -> tuple[str, Path] | None:
31
+ """Find and read PTX file, stripping null bytes."""
32
+ func_name = getattr(compiled_func, "function_name", None)
33
+ if not func_name:
34
+ return None
35
+
36
+ dump_dir = os.environ.get("CUTE_DSL_DUMP_DIR", Path.cwd())
37
+ for ptx_path in Path(dump_dir).glob(f"*{func_name}*.ptx"):
38
+ content = ptx_path.read_text().rstrip("\x00")
39
+ if ".entry " in content and content.rstrip().endswith("}"):
40
+ _log(f"Found PTX: {ptx_path}")
41
+ return content, ptx_path
42
+ return None
43
+
44
+
45
def _compile_ptx(ptx_path: Path, ptx_content: str) -> bytes:
    """Compile PTX to cubin using system ptxas.

    Raises:
        RuntimeError: if ptxas exits non-zero.
    """
    # Target architecture comes from the PTX itself; default if absent.
    match = re.search(r"\.target\s+(sm_\d+[a-z]?)", ptx_content)
    arch = match.group(1) if match else "sm_90a"

    # Persist the NUL-stripped content if the on-disk copy differs.
    if ptx_path.read_text() != ptx_content:
        ptx_path.write_text(ptx_content)

    cubin_tmp = ptx_path.with_suffix(".cubin.tmp")
    try:
        assert CUTE_DSL_PTXAS_PATH is not None
        proc = subprocess.run(
            [CUTE_DSL_PTXAS_PATH, f"-arch={arch}", "-O3", "-o", str(cubin_tmp), str(ptx_path)],
            capture_output=True,
            text=True,
        )
        if proc.returncode != 0:
            raise RuntimeError(f"ptxas failed: {proc.stderr}")

        cubin_data = cubin_tmp.read_bytes()
        _log(f"Compiled {ptx_path.name} -> {len(cubin_data)} bytes ({arch})")

        # Optionally keep the cubin next to the PTX.
        if os.environ.get("CUTE_DSL_KEEP_CUBIN", "0") == "1":
            cubin_out = ptx_path.with_suffix(".cubin")
            cubin_out.write_bytes(cubin_data)
            _log(f"Saved: {cubin_out}")

        return cubin_data
    finally:
        # The temp cubin is never left behind, even on failure.
        cubin_tmp.unlink(missing_ok=True)
79
+
80
+
81
def _patched_load_cuda_library(self):
    """Replacement for _load_cuda_library that uses system ptxas.

    Falls back to the original (embedded-ptxas) loader whenever any step of the
    PTX-locate / compile / load pipeline fails.
    """
    found = _get_ptx(self)
    if not found:
        _log("PTX not found, falling back to embedded ptxas")
        return _original_load_cuda_library(self)
    ptx_content, ptx_path = found

    try:
        cubin = _compile_ptx(ptx_path, ptx_content)
    except Exception as e:
        _log(f"Compilation failed ({e}), falling back to embedded ptxas")
        return _original_load_cuda_library(self)

    # Load the cubin through the CUDA runtime library API.
    import cuda.bindings.runtime as cuda_runtime

    err, library = cuda_runtime.cudaLibraryLoadData(cubin, None, None, 0, None, None, 0)
    if err != cuda_runtime.cudaError_t.cudaSuccess:
        _log(f"cudaLibraryLoadData failed ({err}), falling back to embedded ptxas")
        return _original_load_cuda_library(self)

    # Register the kernels on every device via the DSL's loader trampoline,
    # which takes an array of three pointers: library handle, device id, error.
    _, cuda_load_to_device = self._get_cuda_init_and_load()
    lib_ptr = ctypes.c_void_p(int(library))
    dev_id = ctypes.c_int32(0)
    err_val = ctypes.c_int32(0)
    args = (ctypes.c_void_p * 3)(
        ctypes.cast(ctypes.pointer(lib_ptr), ctypes.c_void_p),
        ctypes.cast(ctypes.pointer(dev_id), ctypes.c_void_p),
        ctypes.cast(ctypes.pointer(err_val), ctypes.c_void_p),
    )

    for dev in range(self.num_devices):
        dev_id.value = dev
        cuda_load_to_device(args)
        if err_val.value != 0:
            _log("cuda_load_to_device failed, falling back to embedded ptxas")
            return _original_load_cuda_library(self)

    _log(f"Loaded kernel from {ptx_path.name}")

    # Remove the PTX unless the user explicitly asked to keep it.
    if not _user_wanted_ptx:
        ptx_path.unlink(missing_ok=True)

    return [cuda_runtime.cudaLibrary_t(lib_ptr.value)]
130
+
131
+
132
def patch():
    """Install system ptxas hook. Call before importing cutlass.

    Raises:
        RuntimeError: if CUTE_DSL_PTXAS_PATH does not point at an executable file.
    """
    global _original_load_cuda_library, _user_wanted_ptx

    assert CUTE_DSL_PTXAS_PATH is not None
    ptxas_usable = os.path.isfile(CUTE_DSL_PTXAS_PATH) and os.access(
        CUTE_DSL_PTXAS_PATH, os.X_OK
    )
    if not ptxas_usable:
        raise RuntimeError(f"ptxas not found: {CUTE_DSL_PTXAS_PATH}")

    # Remember whether the user wanted the PTX kept, independent of our hook.
    _user_wanted_ptx = os.environ.get("CUTE_DSL_KEEP_PTX", "0") == "1"
    assert os.environ.get("CUTE_DSL_KEEP_PTX", "0") == "1", (
        "Require CUTE_DSL_KEEP_PTX=1 to use system's ptxas"
    )

    cls = cutlass.cutlass_dsl.cuda_jit_executor.CudaDialectJitCompiledFunction
    _original_load_cuda_library = cls._load_cuda_library
    cls._load_cuda_library = _patched_load_cuda_library
    _log("Patch applied")
build/torch-cuda/cute_dsl_utils.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ import os
4
+ import pathlib
5
+ from typing import Tuple
6
+ from functools import partial, lru_cache
7
+ from dataclasses import dataclass, fields
8
+
9
+ import torch
10
+
11
+ try:
12
+ from triton.tools.disasm import extract
13
+ except ImportError:
14
+ extract = None
15
+
16
+ import cutlass
17
+ import cutlass.cute as cute
18
+ from cutlass.base_dsl.typing import JitArgument
19
+ from cutlass.cutlass_dsl import NumericMeta
20
+ from cutlass.cute.runtime import from_dlpack
21
+
22
+ StaticTypes = (cutlass.Constexpr, NumericMeta, int, bool, str, float, type(None))
23
+
24
+
25
+ load_cubin_module_data_og = cutlass.base_dsl.runtime.cuda.load_cubin_module_data
26
+ cute_compile_og = cute.compile
27
+
28
+
29
+ torch2cute_dtype_map = {
30
+ torch.float16: cutlass.Float16,
31
+ torch.bfloat16: cutlass.BFloat16,
32
+ torch.float32: cutlass.Float32,
33
+ }
34
+
35
+
36
@lru_cache
def get_max_active_clusters(cluster_size):
    """Memoized query of the max active clusters for ``cluster_size``."""
    hw_info = cutlass.utils.HardwareInfo()
    return hw_info.get_max_active_clusters(cluster_size=cluster_size)
39
+
40
+
41
@lru_cache
def get_device_capacity(device: torch.device = None) -> Tuple[int, int]:
    """Memoized CUDA compute capability (major, minor) for ``device``."""
    capability = torch.cuda.get_device_capability(device)
    return capability
44
+
45
+
46
@dataclass
class ArgumentsBase(JitArgument):
    """Dataclass base implementing the JitArgument protocol.

    Fields with compile-time-static types (constexpr, numeric metatypes,
    plain Python scalars) are excluded from the runtime ABI; the remaining
    fields delegate to their own JitArgument hooks.
    """

    def _dynamic_values(self):
        # Field values that participate in the runtime ABI, in field order.
        return [
            value
            for value in (getattr(self, f.name) for f in fields(self))
            if not isinstance(value, StaticTypes)
        ]

    def __c_pointers__(self):
        ptrs = []
        for value in self._dynamic_values():
            if hasattr(value, "__c_pointers__"):
                ptrs.extend(value.__c_pointers__())
        return ptrs

    def __get_mlir_types__(self):
        # Also records, per dynamic field, how many MLIR values it contributes
        # so __new_from_mlir_values__ can slice the flat value list back apart.
        types = []
        self._values_pos = []
        for value in self._dynamic_values():
            if hasattr(value, "__get_mlir_types__"):
                value_types = value.__get_mlir_types__()
                types.extend(value_types)
                self._values_pos.append(len(value_types))
            else:
                self._values_pos.append(0)
        return types

    def __new_from_mlir_values__(self, values):
        static_fields = {}
        dynamic_fields = {}
        for f in fields(self):
            value = getattr(self, f.name)
            target = static_fields if isinstance(value, StaticTypes) else dynamic_fields
            target[f.name] = value
        for (name, value), count in zip(dynamic_fields.items(), self._values_pos):
            dynamic_fields[name] = cutlass.new_from_mlir_values(value, values[:count])
            values = values[count:]
        return self.__class__(**dynamic_fields, **static_fields)
80
+
81
+
82
def load_cubin_module_data_patched(cubin_data, filepath):
    """Persist the cubin bytes to ``filepath``, then load via the original hook."""
    out_path = pathlib.Path(filepath)
    out_path.write_bytes(cubin_data)
    return load_cubin_module_data_og(cubin_data)
85
+
86
+
87
def cute_compile_patched(*args, **kwargs):
    """A patched version of cute.compile that dump the SASS to a file if CUTE_CUBIN_PATH is set."""
    cubin_path = os.getenv("CUTE_CUBIN_PATH", None)
    dumping = cubin_path is not None
    if dumping:
        # Temporarily swap in the loader that also writes the cubin to disk.
        cutlass.base_dsl.runtime.cuda.load_cubin_module_data = partial(
            load_cubin_module_data_patched, filepath=cubin_path
        )
    output = cute_compile_og(*args, **kwargs)
    if dumping:
        cutlass.base_dsl.runtime.cuda.load_cubin_module_data = load_cubin_module_data_og
        if extract is not None:
            # Disassemble the dumped cubin to annotated SASS alongside it.
            sass = extract(cubin_path, None)
            pathlib.Path(cubin_path).with_suffix(".annotated.sass").write_text(sass)
    return output
101
+
102
+
103
def assume_strides_aligned(t):
    """Assume all strides except the last are divisible by 128 bits.

    Python int strides (e.g., stride=0 from GQA expand) are kept as-is
    since they're static and don't need alignment assumptions.
    """
    divby = 128 // t.element_type.width
    leading = []
    for s in t.stride[:-1]:
        leading.append(s if isinstance(s, int) else cute.assume(s, divby=divby))
    return (*leading, t.stride[-1])
112
+
113
+
114
def assume_tensor_aligned(t):
    """Rebuild a tensor with 128-bit aligned stride assumptions. Passes through None."""
    if t is None:
        return None
    aligned_layout = cute.make_layout(t.shape, stride=assume_strides_aligned(t))
    return cute.make_tensor(t.iterator, aligned_layout)
119
+
120
+
121
def to_cute_tensor(t, assumed_align=16, leading_dim=-1, fully_dynamic=False, enable_tvm_ffi=True):
    """Convert torch tensor to cute tensor for TVM FFI. leading_dim=-1 defaults to t.ndim-1."""
    cute_t = from_dlpack(t.detach(), assumed_align=assumed_align, enable_tvm_ffi=enable_tvm_ffi)
    if fully_dynamic:
        return cute_t.mark_layout_dynamic()
    dim = t.ndim - 1 if leading_dim == -1 else leading_dim
    return cute_t.mark_layout_dynamic(leading_dim=dim)
129
+
130
+
131
def to_cute_aux_tensor(t, enable_tvm_ffi=True):
    """Convert torch tensor to cute tensor for TVM FFI, tailored to FlexAttention aux tensors.

    This allows the user to specify alignment and leading dimension for aux tensors used in
    custom score_mod callables, via optional ``__assumed_align__`` / ``__leading_dim__``
    attributes on the tensor. When ``__leading_dim__`` is absent the layout is marked
    fully dynamic.
    """
    # Fix: these may legitimately be None when the attribute is absent, so the
    # annotations are Optional (the previous bare `int` annotations were wrong).
    assumed_align: int | None = getattr(t, "__assumed_align__", None)
    leading_dim: int | None = getattr(t, "__leading_dim__", None)
    fully_dynamic: bool = leading_dim is None
    # NOTE(review): when __assumed_align__ is absent, None is forwarded and
    # overrides to_cute_tensor's default of 16 — presumably from_dlpack treats
    # None as "infer alignment"; confirm against the cutlass DSL docs.
    return to_cute_tensor(
        t,
        assumed_align=assumed_align,
        leading_dim=leading_dim,
        fully_dynamic=fully_dynamic,
        enable_tvm_ffi=enable_tvm_ffi,
    )
147
+
148
+
149
def get_aux_tensor_metadata(aux_tensors):
    """Per-aux-tensor static metadata: (assumed_align, leading_dim, has_leading_dim).

    Missing attributes fall back to 0 / -1; the third element records whether
    ``__leading_dim__`` was explicitly set.
    """

    def _meta(t):
        return (
            getattr(t, "__assumed_align__", 0),
            getattr(t, "__leading_dim__", -1),
            hasattr(t, "__leading_dim__"),
        )

    return tuple(_meta(t) for t in aux_tensors)
158
+
159
+
160
def get_broadcast_dims(tensor: torch.Tensor) -> Tuple[bool, ...]:
    """Return tuple of bools indicating which dims have stride=0 (broadcast).

    This is useful for compile keys since CuTe's mark_layout_dynamic() keeps
    stride=0 as static, meaning kernels compiled with different broadcast
    patterns are not interchangeable.
    """
    return tuple(stride == 0 for stride in tensor.stride())
build/torch-cuda/fast_math.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ import cutlass
4
+ import cutlass.cute as cute
5
+ from cutlass import Int32
6
+
7
+
8
@cute.jit
def clz(x: Int32) -> Int32:
    """Count leading zeros of a 32-bit value.

    Returns the index (from the most-significant bit) of the highest set
    bit, i.e. 32 when no bit of ``x`` is set.
    """
    # Reference implementation — not usable because early exit (`return`
    # inside the loop) is not supported by the DSL yet:
    # for i in cutlass.range_constexpr(32):
    #     if (1 << (31 - i)) & x:
    #         return Int32(i)
    # return Int32(32)
    res = Int32(32)
    done = False
    # Scan from MSB to LSB; `done` emulates the early exit so that only the
    # first (most significant) set bit records its index.
    for i in cutlass.range(32):
        if ((1 << (31 - i)) & x) and not done:
            res = Int32(i)
            done = True
    return res
build/torch-cuda/flash_attn4/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
+ def _import_from_path(file_path: Path) -> ModuleType:
9
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
10
+ # it would also be used for other imports. So, we make a module name that
11
+ # depends on the path for it to be unique using the hex-encoded hash of
12
+ # the path.
13
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
+ module_name = path_hash
15
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
16
+ if spec is None:
17
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
+ module = importlib.util.module_from_spec(spec)
19
+ if module is None:
20
+ raise ImportError(f"Cannot load module {module_name} from spec")
21
+ sys.modules[module_name] = module
22
+ spec.loader.exec_module(module) # type: ignore
23
+ return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch-cuda/flash_bwd.py ADDED
@@ -0,0 +1,1264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
2
+ # A reimplementation of https://github.com/Dao-AILab/flash-attention/blob/main/hopper/mainloop_bwd_sm80.hpp
3
+ # from Cutlass C++ to Cute-DSL.
4
+ import math
5
+ from types import SimpleNamespace
6
+ from typing import Type, Callable, Optional
7
+ from functools import partial
8
+
9
+ import cuda.bindings.driver as cuda
10
+
11
+ import cutlass
12
+ import cutlass.cute as cute
13
+ from cutlass.cute.nvgpu import cpasync, warp
14
+ from cutlass import Float32, Int32
15
+ import cutlass.utils as utils_basic
16
+
17
+ from .quack import layout_utils
18
+ from . import ampere_helpers as sm80_utils
19
+ from .cute_dsl_utils import assume_tensor_aligned
20
+ from . import utils
21
+ from .mask import AttentionMask
22
+ from .seqlen_info import SeqlenInfoQK
23
+ from .quack.cute_dsl_utils import ParamsBase
24
+ from .tile_scheduler import SingleTileScheduler, SingleTileVarlenScheduler, TileSchedulerArguments
25
+
26
+
27
+ class FlashAttentionBackwardSm80:
28
    def __init__(
        self,
        dtype: Type[cutlass.Numeric],
        head_dim: int,
        head_dim_v: Optional[int] = None,
        qhead_per_kvhead: int = 1,
        m_block_size: int = 64,
        n_block_size: int = 128,
        num_stages_Q: int = 2,
        num_stages_dO: int = 2,
        num_threads: int = 256,
        pack_gqa: bool = False,
        is_causal: bool = False,
        SdP_swapAB: bool = False,
        dKV_swapAB: bool = False,
        dQ_swapAB: bool = False,
        AtomLayoutMSdP: int = 1,
        AtomLayoutNdKV: int = 8,
        AtomLayoutMdQ: int = 1,
        V_in_regs: bool = False,
    ):
        """Initializes the configuration for the SM80 flash-attention backward kernel.

        All contiguous dimensions must be at least 16 bytes aligned which indicates the head dimension
        should be a multiple of 8.

        :param dtype: element type of Q/K/V/dO (Float16 or BFloat16)
        :param head_dim: head dimension of Q/K
        :type head_dim: int
        :param head_dim_v: head dimension of V/dO; defaults to ``head_dim`` when None
        :param qhead_per_kvhead: number of query heads per KV head (GQA ratio)
        :param m_block_size: m block size (tile over the query sequence)
        :type m_block_size: int
        :param n_block_size: n block size (tile over the key sequence)
        :type n_block_size: int
        :param num_stages_Q: pipeline stages for the Q smem tile
        :param num_stages_dO: pipeline stages for the dO smem tile
        :param num_threads: number of threads per CTA
        :type num_threads: int
        :param is_causal: is causal
        :param SdP_swapAB: swap A/B operands of the S/dP matmuls
        :param dKV_swapAB: swap A/B operands of the dK/dV matmuls
        :param dQ_swapAB: swap A/B operands of the dQ matmul
        :param AtomLayoutMSdP: MMA atom layout (M direction) for S/dP
        :param AtomLayoutNdKV: MMA atom layout (N direction) for dK/dV
        :param AtomLayoutMdQ: MMA atom layout (M direction) for dQ
        :param V_in_regs: keep V in registers; also enables sharing Q/V smem
        """
        self.dtype = dtype
        # Pad head dims up to a multiple of 32 (hdim_multiple_of) for the smem/k block size.
        hdim_multiple_of = 32
        self.head_dim_padded = int(math.ceil(head_dim / hdim_multiple_of) * hdim_multiple_of)
        head_dim_v = head_dim_v if head_dim_v is not None else head_dim
        self.same_hdim_kv = head_dim == head_dim_v
        self.head_dim_v_padded = int(math.ceil(head_dim_v / hdim_multiple_of) * hdim_multiple_of)
        # Can save registers (and hence be faster) if we don't have to check hdim predication
        self.check_hdim_oob = head_dim != self.head_dim_padded
        self.check_hdim_v_oob = head_dim_v != self.head_dim_v_padded
        self.qhead_per_kvhead = qhead_per_kvhead
        self.m_block_size = m_block_size
        self.n_block_size = n_block_size
        self.num_threads = num_threads
        self.pack_gqa = pack_gqa
        self.is_causal = is_causal
        self.num_stages_Q = num_stages_Q
        self.num_stages_dO = num_stages_dO
        self.SdP_swapAB = SdP_swapAB
        self.dKV_swapAB = dKV_swapAB
        self.dQ_swapAB = dQ_swapAB
        self.AtomLayoutMSdP = AtomLayoutMSdP
        self.AtomLayoutNdKV = AtomLayoutNdKV
        self.AtomLayoutMdQ = AtomLayoutMdQ
        num_mma_warps = self.num_threads // cute.arch.WARP_SIZE
        # dK/dV MMA can consume P/dS straight from registers only for this
        # specific combination of atom layouts and swap flags.
        self.Mma_dKV_is_RS = AtomLayoutMSdP == 1 and AtomLayoutNdKV == num_mma_warps and SdP_swapAB and not dKV_swapAB
        self.V_in_regs = V_in_regs
        # When V lives in registers its smem tile can be reused for Q.
        self.share_QV_smem = V_in_regs
92
+
93
+ @staticmethod
94
+ def can_implement(
95
+ dtype, head_dim, head_dim_v, m_block_size, n_block_size, num_stages_Q, num_stages_dO,
96
+ num_threads, is_causal,
97
+ V_in_regs=False
98
+ ) -> bool:
99
+ """Check if the kernel can be implemented with the given parameters.
100
+
101
+ :param dtype: data type
102
+ :type dtype: cutlass.Numeric
103
+ :param head_dim: head dimension
104
+ :type head_dim: int
105
+ :param m_block_size: m block size
106
+ :type m_block_size: int
107
+ :param n_block_size: n block size
108
+ :type n_block_size: int
109
+ :param num_threads: number of threads
110
+ :type num_threads: int
111
+ :param is_causal: is causal
112
+ :type is_causal: bool
113
+
114
+ :return: True if the kernel can be implemented, False otherwise
115
+ :rtype: bool
116
+ """
117
+ if dtype not in [cutlass.Float16, cutlass.BFloat16]:
118
+ return False
119
+ if head_dim % 8 != 0:
120
+ return False
121
+ if head_dim_v % 8 != 0:
122
+ return False
123
+ if n_block_size % 16 != 0:
124
+ return False
125
+ if num_threads % 32 != 0:
126
+ return False
127
+ # Check if block size setting is out of shared memory capacity
128
+ # Shared memory usage: Q tile + (K tile + V tile) where K and V use the same tile size
129
+ smem_usage_Q = m_block_size * head_dim * num_stages_Q * 2
130
+ smem_usage_dO = m_block_size * head_dim_v * num_stages_dO * 2
131
+ smem_usage_K = n_block_size * head_dim * 2
132
+ smem_usage_V = n_block_size * head_dim_v * 2
133
+ smem_usage_QV = (smem_usage_Q + smem_usage_V) if not V_in_regs else max(smem_usage_Q, smem_usage_V)
134
+ smem_usage = smem_usage_QV + smem_usage_dO + smem_usage_K
135
+ smem_capacity = utils_basic.get_smem_capacity_in_bytes("sm_80")
136
+ if smem_usage > smem_capacity:
137
+ return False
138
+ return True
139
+
140
    def _check_type(
        self,
        mQ_type: Type[cutlass.Numeric],
        mK_type: Type[cutlass.Numeric],
        mV_type: Type[cutlass.Numeric],
        mdO_type: Type[cutlass.Numeric],
        mLSE_type: Type[cutlass.Numeric],
        mdPsum_type: Type[cutlass.Numeric],
        mdQaccum_type: Type[cutlass.Numeric],
        mdK_type: Type[cutlass.Numeric],
        mdV_type: Type[cutlass.Numeric],
        mCuSeqlensQ_type: Type[cutlass.Numeric] | None,
        mCuSeqlensK_type: Type[cutlass.Numeric] | None,
        mSeqUsedQ_type: Type[cutlass.Numeric] | None,
        mSeqUsedK_type: Type[cutlass.Numeric] | None,
    ):
        """Validate element types of all operand tensors; raises TypeError on mismatch.

        All checks are wrapped in `cutlass.const_expr` so they run at
        compile/trace time rather than on device.
        """
        if cutlass.const_expr(not (mQ_type == mK_type == mV_type == mdO_type)):
            raise TypeError("All tensors must have the same data type")
        if cutlass.const_expr(self.qhead_per_kvhead == 1):
            if cutlass.const_expr(not (mdK_type == mdV_type == mQ_type)):
                raise TypeError("mdK and mdV tensors must have the same data type as mQ")
        else:
            # With GQA, dK/dV are fp32 accumulation buffers rather than outputs in mQ's dtype.
            if cutlass.const_expr(not (mdK_type == mdV_type == cutlass.Float32)):
                raise TypeError("mdKaccum and mdVaccum tensors must have the data type Float32")
        if cutlass.const_expr(not mQ_type in [cutlass.Float16, cutlass.BFloat16]):
            raise TypeError("Only Float16 or BFloat16 is supported")
        if cutlass.const_expr(not mLSE_type in [cutlass.Float32]):
            raise TypeError("LSE tensor must be Float32")
        if cutlass.const_expr(not mdPsum_type in [cutlass.Float32]):
            raise TypeError("dPsum tensor must be Float32")
        if cutlass.const_expr(not mdQaccum_type in [cutlass.Float32]):
            raise TypeError("dQaccum tensor must be Float32")
        # Varlen metadata tensors are optional; when present they must be Int32.
        if cutlass.const_expr(mCuSeqlensQ_type not in [None, cutlass.Int32]):
            raise TypeError("cuSeqlensQ tensor must be Int32")
        if cutlass.const_expr(mCuSeqlensK_type not in [None, cutlass.Int32]):
            raise TypeError("cuSeqlensK tensor must be Int32")
        if cutlass.const_expr(mSeqUsedQ_type not in [None, cutlass.Int32]):
            raise TypeError("SeqUsedQ tensor must be Int32")
        if cutlass.const_expr(mSeqUsedK_type not in [None, cutlass.Int32]):
            raise TypeError("SeqUsedK tensor must be Int32")
        assert mQ_type == self.dtype
181
+
182
    def _setup_attributes(self):
        """Derive smem layouts and gmem tiled-copy atoms from the configuration.

        Must be called after ``self.varlen_q`` is set (done in ``__call__``),
        since the LSE/dPsum copy atoms depend on it.
        """
        # ///////////////////////////////////////////////////////////////////////////////
        # Shared memory layout: Q/K/V
        # ///////////////////////////////////////////////////////////////////////////////
        sQ_layout_atom = sm80_utils.get_smem_layout_atom(self.dtype, self.head_dim_padded)
        self.sQ_layout = cute.tile_to_shape(
            sQ_layout_atom, (self.m_block_size, self.head_dim_padded, self.num_stages_Q), (0, 1, 2),
        )
        # K uses the same layout atom as Q (same head_dim_padded), just single-stage.
        sK_layout_atom = sQ_layout_atom
        self.sK_layout = cute.tile_to_shape(
            sK_layout_atom, (self.n_block_size, self.head_dim_padded), (0, 1),
        )
        sV_layout_atom = sm80_utils.get_smem_layout_atom(self.dtype, self.head_dim_v_padded)
        self.sV_layout = cute.tile_to_shape(
            sV_layout_atom, (self.n_block_size, self.head_dim_v_padded), (0, 1),
        )
        # dO shares V's atom (head_dim_v) and is multi-stage like Q.
        sdO_layout_atom = sV_layout_atom
        self.sdO_layout = cute.tile_to_shape(
            sdO_layout_atom, (self.m_block_size, self.head_dim_v_padded, self.num_stages_dO), (0, 1, 2),
        )
        # TODO: do we set swizzle to be 3 here explicitly?
        sPdS_layout_atom = sm80_utils.get_smem_layout_atom(self.dtype, self.n_block_size)
        self.sPdS_layout = cute.tile_to_shape(
            sPdS_layout_atom, (self.m_block_size, self.n_block_size), (0, 1),
        )
        # We set stride to be multiple of 64 so that if ShuffleLSE, even if threads read from sLSE but out of bounds,
        # it's still a valid smem address.
        self.sLSE_layout = cute.make_layout(
            (self.m_block_size, self.num_stages_Q),
            stride=(1, cute.round_up(self.m_block_size, 64)),
        )
        # Broadcast views of sLSE over the n dimension (stride 0) for feeding the MMA.
        sLSEMma_layout = cute.make_layout(
            (self.m_block_size, self.n_block_size, self.num_stages_Q),
            stride=(1, 0, cute.round_up(self.m_block_size, 64)),
        )
        sLSEMma_layout_transposed = cute.make_layout(
            (self.n_block_size, self.m_block_size, self.num_stages_Q),
            stride=(0, 1, cute.round_up(self.m_block_size, 64)),
        )
        self.sLSEMma_layout = sLSEMma_layout if not self.SdP_swapAB else sLSEMma_layout_transposed

        # ///////////////////////////////////////////////////////////////////////////////
        # GMEM Tiled copy:
        # ///////////////////////////////////////////////////////////////////////////////
        # Thread layouts for copies: 128-bit vectorized accesses.
        universal_copy_bits = 128
        async_copy_elems = universal_copy_bits // self.dtype.width
        # atom_async_copy: async (cp.async) copy atom for QKV load
        atom_async_copy = cute.make_copy_atom(
            cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL),
            self.dtype,
            num_bits_per_copy=universal_copy_bits,
        )
        # atom_universal_copy: universal copy atom for O store
        atom_universal_copy = cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(), self.dtype, num_bits_per_copy=universal_copy_bits,
        )
        # tQK_layout: thread layout for QK load
        tQK_shape_dim_1 = sQ_layout_atom.outer.shape[1] // async_copy_elems
        assert self.num_threads % tQK_shape_dim_1 == 0, "num_threads must be divisible by tQK_shape_dim_1"
        tQK_layout = cute.make_ordered_layout(
            (self.num_threads // tQK_shape_dim_1, tQK_shape_dim_1), order=(1, 0),
        )
        # Do we need to check if we overshot kBlockM when we load Q?
        self.is_even_m_smem_q = self.m_block_size % tQK_layout.shape[0] == 0
        # Do we need to check if we overshot kBlockN when we load K?
        self.is_even_n_smem_k = self.n_block_size % tQK_layout.shape[0] == 0
        tVdO_shape_dim_1 = sV_layout_atom.outer.shape[1] // async_copy_elems
        assert self.num_threads % tVdO_shape_dim_1 == 0, "num_threads must be divisible by tVdO_shape_dim_1"
        tVdO_layout = cute.make_ordered_layout(
            (self.num_threads // tVdO_shape_dim_1, tVdO_shape_dim_1), order=(1, 0),
        )
        # Do we need to check if we overshot kBlockN when we load V?
        self.is_even_n_smem_v = self.n_block_size % tVdO_layout.shape[0] == 0
        self.is_even_m_smem_do = self.m_block_size % tVdO_layout.shape[0] == 0

        # Value layouts for copies
        vQKVdO_layout = cute.make_layout((1, async_copy_elems))

        # gmem_tiled_copy_QK: tiled copy for QK load
        self.gmem_tiled_copy_QK = cute.make_tiled_copy_tv(atom_async_copy, tQK_layout, vQKVdO_layout)
        self.gmem_tiled_copy_VdO = cute.make_tiled_copy_tv(atom_async_copy, tVdO_layout, vQKVdO_layout)
        self.gmem_tiled_copy_dK = cute.make_tiled_copy_tv(atom_universal_copy, tQK_layout, vQKVdO_layout)
        self.gmem_tiled_copy_dV = cute.make_tiled_copy_tv(atom_universal_copy, tVdO_layout, vQKVdO_layout)
        # NOTE(review): this assignment is immediately overwritten in both
        # branches below; it appears redundant.
        async_copy_elems_accum = universal_copy_bits // cutlass.Float32.width

        # I think we wouldn't require this with smarter padding
        if cutlass.const_expr(not self.varlen_q):
            # Fixed seqlen: vectorized 128-bit async loads for LSE/dPsum.
            async_copy_elems_accum = universal_copy_bits // cutlass.Float32.width
            atom_async_copy_accum = cute.make_copy_atom(
                cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL),
                cutlass.Float32,
                num_bits_per_copy=universal_copy_bits,
            )
        else:
            # Varlen: fall back to scalar copies (offsets may not be 128-bit aligned).
            async_copy_elems_accum = 1
            atom_async_copy_accum = cute.make_copy_atom(
                cute.nvgpu.CopyUniversalOp(),
                cutlass.Float32,
                num_bits_per_copy=cutlass.Float32.width,
            )
        self.gmem_tiled_copy_LSE = cute.make_tiled_copy_tv(
            atom_async_copy_accum,
            cute.make_layout(self.num_threads),
            cute.make_layout(async_copy_elems_accum),
        )
        self.gmem_tiled_copy_dQaccum = cute.make_tiled_copy_tv(
            cute.make_copy_atom(
                cute.nvgpu.CopyUniversalOp(), cutlass.Float32, num_bits_per_copy=cutlass.Float32.width
            ),
            cute.make_layout(self.num_threads),
            cute.make_layout(1)
        )
        if cutlass.const_expr(self.qhead_per_kvhead > 1):
            # GQA: dK/dV are fp32 accumulation buffers, written with the scalar fp32 copy.
            self.gmem_tiled_copy_dK = self.gmem_tiled_copy_dQaccum
            self.gmem_tiled_copy_dV = self.gmem_tiled_copy_dQaccum
298
+
299
    def _get_tiled_mma(self):
        """Build the three tiled MMAs: S/dP, dK/dV, and dQ.

        Each uses the 16x8x16 fp16/bf16 MMA atom with fp32 accumulation; the
        atom layout for each is transposed when the corresponding swapAB flag
        is set.
        """
        num_mma_warps = self.num_threads // 32
        AtomLayoutSdP = (self.AtomLayoutMSdP, num_mma_warps // self.AtomLayoutMSdP, 1) if cutlass.const_expr(not self.SdP_swapAB) else (num_mma_warps // self.AtomLayoutMSdP, self.AtomLayoutMSdP, 1)
        tiled_mma_sdp = cute.make_tiled_mma(
            warp.MmaF16BF16Op(self.dtype, cutlass.Float32, (16, 8, 16)),
            AtomLayoutSdP,
            permutation_mnk=(AtomLayoutSdP[0] * 16, AtomLayoutSdP[1] * 16, 16),
        )
        AtomLayoutdKV = (self.AtomLayoutNdKV, num_mma_warps // self.AtomLayoutNdKV, 1) if cutlass.const_expr(not self.dKV_swapAB) else (num_mma_warps // self.AtomLayoutNdKV, self.AtomLayoutNdKV, 1)
        tiled_mma_dkv = cute.make_tiled_mma(
            warp.MmaF16BF16Op(self.dtype, cutlass.Float32, (16, 8, 16)),
            AtomLayoutdKV,
            permutation_mnk=(AtomLayoutdKV[0] * 16, AtomLayoutdKV[1] * 16, 16),
        )
        AtomLayoutdQ = (self.AtomLayoutMdQ, num_mma_warps // self.AtomLayoutMdQ, 1) if cutlass.const_expr(not self.dQ_swapAB) else (num_mma_warps // self.AtomLayoutMdQ, self.AtomLayoutMdQ, 1)
        tiled_mma_dq = cute.make_tiled_mma(
            warp.MmaF16BF16Op(self.dtype, cutlass.Float32, (16, 8, 16)),
            AtomLayoutdQ,
            permutation_mnk=(AtomLayoutdQ[0] * 16, AtomLayoutdQ[1] * 16, 16),
        )
        return tiled_mma_sdp, tiled_mma_dkv, tiled_mma_dq
320
+
321
    def _get_shared_storage_cls(self):
        """Build the cute.struct describing the kernel's shared-memory layout.

        Returns the variant where Q and V have separate smem tiles, or the
        variant where they share one tile when ``self.share_QV_smem`` is set.
        """
        sQ_struct, sK_struct, sV_struct, sdO_struct = [
            cute.struct.Align[cute.struct.MemRange[self.dtype, cute.cosize(layout)], 1024]
            for layout in (self.sQ_layout, self.sK_layout, self.sV_layout, self.sdO_layout)
        ]
        # Shared Q/V tile must be large enough for whichever of the two is bigger.
        cosize_sQV = max(cute.cosize(self.sQ_layout), cute.cosize(self.sV_layout))
        sQV_struct = cute.struct.Align[cute.struct.MemRange[self.dtype, cosize_sQV], 1024]
        # LSE and dPsum use the same layout (fp32, 128-byte aligned).
        sLSE_struct, sdPsum_struct = [
            cute.struct.Align[cute.struct.MemRange[cutlass.Float32, cute.cosize(layout)], 128]
            for layout in (self.sLSE_layout, self.sLSE_layout)
        ]
        sP_struct, sdS_struct = [
            cute.struct.Align[cute.struct.MemRange[self.dtype, cute.cosize(layout)], 128]
            for layout in (self.sPdS_layout, self.sPdS_layout)
        ]

        @cute.struct
        class SharedStorageSeparateQV:
            sK: sK_struct
            sV: sV_struct
            sQ: sQ_struct
            sdO: sdO_struct
            sLSE: sLSE_struct
            sdPsum: sdPsum_struct
            sP: sP_struct
            sdS: sdS_struct
        # TODO: the case where there's no sP

        @cute.struct
        class SharedStorageSharedQV:
            sK: sK_struct
            sV: sV_struct
            sQ: sQV_struct
            sdO: sdO_struct
            sLSE: sLSE_struct
            sdPsum: sdPsum_struct
            sP: sP_struct
            sdS: sdS_struct

        return SharedStorageSeparateQV if cutlass.const_expr(not self.share_QV_smem) else SharedStorageSharedQV
361
+
362
    @cute.jit
    def __call__(
        self,
        mQ: cute.Tensor,
        mK: cute.Tensor,
        mV: cute.Tensor,
        mdO: cute.Tensor,
        mLSE: cute.Tensor,
        mdPsum: cute.Tensor,
        mdQaccum: cute.Tensor,
        mdK: cute.Tensor,
        mdV: cute.Tensor,
        softmax_scale: cutlass.Float32,
        stream: cuda.CUstream,
        mCuSeqlensQ: Optional[cute.Tensor] = None,
        mCuSeqlensK: Optional[cute.Tensor] = None,
        mSeqUsedQ: Optional[cute.Tensor] = None,
        mSeqUsedK: Optional[cute.Tensor] = None,
        softcap: Float32 | float | None = None,
        window_size_left: Int32 | int | None = None,
        window_size_right: Int32 | int | None = None,
        mdQ_semaphore: Optional[cute.Tensor] = None,
    ):
        """Validate inputs, build the tile schedule, and launch the backward kernel.

        NOTE(review): ``softcap``, ``window_size_left`` and ``window_size_right``
        are accepted but not used anywhere in this body — presumably reserved
        for future support; confirm against callers.
        """
        assert mdQ_semaphore is None, "semaphore not supported yet"
        # Get the data type and check if it is fp16 or bf16
        self._check_type(*(t.element_type if t is not None else None
                           for t in (mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV, mCuSeqlensQ, mCuSeqlensK, mSeqUsedQ, mSeqUsedK)))
        mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV = [
            assume_tensor_aligned(t) for t in (mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV)
        ]
        # varlen_q must be set before _setup_attributes: the LSE copy atoms depend on it.
        self.varlen_q = (mCuSeqlensQ is not None)
        self._setup_attributes()
        SharedStorage = self._get_shared_storage_cls()
        tiled_mma_sdp, tiled_mma_dkv, tiled_mma_dq = self._get_tiled_mma()

        # Varlen tensors are (total_tokens, head, ...), fixed-len are (batch, seq, head, ...).
        num_head = mQ.shape[1] if cutlass.const_expr(mCuSeqlensQ is not None) else mQ.shape[2]

        if cutlass.const_expr(mCuSeqlensK is not None):
            TileScheduler = SingleTileVarlenScheduler
            num_batch = mCuSeqlensK.shape[0] - 1
        else:
            TileScheduler = SingleTileScheduler
            num_batch = mK.shape[0]

        # Uses seqlen k, etc. since main bwd kernel's blocks are over n
        tile_sched_args = TileSchedulerArguments(
            num_block=cute.ceil_div(mK.shape[1], self.n_block_size),
            num_head=num_head,
            num_batch=num_batch,
            num_splits=1,
            seqlen_k=0,
            headdim=mK.shape[2],
            headdim_v=mV.shape[2],
            total_q=mK.shape[0],
            tile_shape_mn=(self.n_block_size, self.m_block_size),
            qhead_per_kvhead_packgqa=self.qhead_per_kvhead if cutlass.const_expr(self.pack_gqa) else 1,
            mCuSeqlensQ=mCuSeqlensK,
            mSeqUsedQ=mSeqUsedK,
        )

        tile_sched_params = TileScheduler.to_underlying_arguments(tile_sched_args)
        grid_dim = TileScheduler.get_grid_shape(tile_sched_params)

        # Pre-scale for exp2-based softmax: scale * log2(e).
        softmax_scale_log2 = softmax_scale * math.log2(math.e)
        self.kernel(
            mQ,
            mK,
            mV,
            mdO,
            mLSE,
            mdPsum,
            mdQaccum,
            mdK,
            mdV,
            mCuSeqlensQ,
            mCuSeqlensK,
            mSeqUsedQ,
            mSeqUsedK,
            softmax_scale,
            softmax_scale_log2,
            self.sQ_layout,
            self.sK_layout,
            self.sV_layout,
            self.sdO_layout,
            self.sPdS_layout,
            self.sLSE_layout,
            self.sLSEMma_layout,
            self.gmem_tiled_copy_QK,
            self.gmem_tiled_copy_VdO,
            self.gmem_tiled_copy_dK,
            self.gmem_tiled_copy_dV,
            self.gmem_tiled_copy_LSE,
            self.gmem_tiled_copy_dQaccum,
            tiled_mma_sdp,
            tiled_mma_dkv,
            tiled_mma_dq,
            SharedStorage,
            tile_sched_params,
            TileScheduler,
        ).launch(
            grid=grid_dim,
            block=[self.num_threads, 1, 1],
            smem=SharedStorage.size_in_bytes(),
            stream=stream,
        )
467
+
468
+ @cute.kernel
469
+ def kernel(
470
+ self,
471
+ mQ: cute.Tensor,
472
+ mK: cute.Tensor,
473
+ mV: cute.Tensor,
474
+ mdO: cute.Tensor,
475
+ mLSE: cute.Tensor,
476
+ mdPsum: cute.Tensor,
477
+ mdQaccum: cute.Tensor,
478
+ mdK: cute.Tensor,
479
+ mdV: cute.Tensor,
480
+ mCuSeqlensQ: Optional[cute.Tensor],
481
+ mCuSeqlensK: Optional[cute.Tensor],
482
+ mSeqUsedQ: Optional[cute.Tensor],
483
+ mSeqUsedK: Optional[cute.Tensor],
484
+ softmax_scale: cutlass.Float32,
485
+ softmax_scale_log2: cutlass.Float32,
486
+ sQ_layout: cute.ComposedLayout,
487
+ sK_layout: cute.ComposedLayout,
488
+ sV_layout: cute.ComposedLayout,
489
+ sdO_layout: cute.ComposedLayout,
490
+ sPdS_layout: cute.ComposedLayout,
491
+ sLSE_layout: cute.Layout,
492
+ sLSEMma_layout: cute.Layout,
493
+ gmem_tiled_copy_QK: cute.TiledCopy,
494
+ gmem_tiled_copy_VdO: cute.TiledCopy,
495
+ gmem_tiled_copy_dK: cute.TiledCopy,
496
+ gmem_tiled_copy_dV: cute.TiledCopy,
497
+ gmem_tiled_copy_LSE: cute.TiledCopy,
498
+ gmem_tiled_copy_dQaccum: cute.TiledCopy,
499
+ tiled_mma_sdp: cute.TiledMma,
500
+ tiled_mma_dkv: cute.TiledMma,
501
+ tiled_mma_dq: cute.TiledMma,
502
+ SharedStorage: cutlass.Constexpr,
503
+ tile_sched_params: ParamsBase,
504
+ TileScheduler: cutlass.Constexpr[Callable],
505
+ ):
506
+ # Thread index, block index
507
+ tidx, _, _ = cute.arch.thread_idx()
508
+
509
+ tile_scheduler = TileScheduler.create(tile_sched_params)
510
+ work_tile = tile_scheduler.initial_work_tile_info()
511
+
512
+ n_block, head_idx, batch_idx, _ = work_tile.tile_idx
513
+
514
+ if work_tile.is_valid_tile:
515
+ seqlen = SeqlenInfoQK.create(batch_idx, mQ.shape[1], mK.shape[1], mCuSeqlensQ=mCuSeqlensQ, mCuSeqlensK=mCuSeqlensK, mSeqUsedQ=mSeqUsedQ, mSeqUsedK=mSeqUsedK)
516
+
517
+ m_block_max = cute.ceil_div(seqlen.seqlen_q, self.m_block_size)
518
+ m_block_min = 0
519
+ if cutlass.const_expr(self.is_causal):
520
+ m_block_min = max(
521
+ (n_block * self.n_block_size + seqlen.seqlen_q - seqlen.seqlen_k) // self.m_block_size,
522
+ m_block_min,
523
+ )
524
+ # TODO: return early if m_block_max == 0
525
+
526
+ # ///////////////////////////////////////////////////////////////////////////////
527
+ # Get the appropriate tiles for this thread block.
528
+ # ///////////////////////////////////////////////////////////////////////////////
529
+ blkQ_shape = (self.m_block_size, self.head_dim_padded)
530
+ blkK_shape = (self.n_block_size, self.head_dim_padded)
531
+ blkV_shape = (self.n_block_size, self.head_dim_v_padded)
532
+ blkdO_shape = (self.m_block_size, self.head_dim_v_padded)
533
+
534
+ if cutlass.const_expr(not seqlen.has_cu_seqlens_q):
535
+ mQ_cur = mQ[batch_idx, None, head_idx, None]
536
+ mLSE_cur = mLSE[batch_idx, head_idx, None]
537
+ mdO_cur = mdO[batch_idx, None, head_idx, None]
538
+ mdPsum_cur = mdPsum[batch_idx, head_idx, None]
539
+ mdQaccum_cur = mdQaccum[batch_idx, head_idx, None]
540
+ else:
541
+ padded_offset_q = seqlen.offset_q + batch_idx * self.m_block_size
542
+ mQ_cur = cute.domain_offset((seqlen.offset_q, 0), mQ[None, head_idx, None])
543
+ mLSE_cur = cute.domain_offset((padded_offset_q,), mLSE[head_idx, None])
544
+ mdO_cur = cute.domain_offset((seqlen.offset_q, 0), mdO[None, head_idx, None])
545
+ mdPsum_cur = cute.domain_offset((padded_offset_q,), mdPsum[head_idx, None])
546
+ mdQaccum_cur = cute.domain_offset((padded_offset_q * self.head_dim_padded,), mdQaccum[head_idx, None])
547
+ head_idx_kv = head_idx // self.qhead_per_kvhead if cutlass.const_expr(not self.pack_gqa) else head_idx
548
+
549
+ if cutlass.const_expr(not seqlen.has_cu_seqlens_k):
550
+ mK_cur, mV_cur = [t[batch_idx, None, head_idx_kv, None] for t in (mK, mV)]
551
+ else:
552
+ mK_cur, mV_cur = [cute.domain_offset((seqlen.offset_k, 0), t[None, head_idx_kv, None]) for t in (mK, mV)]
553
+
554
+ # (m_block_size, head_dim, m_block)
555
+ gQ = cute.local_tile(mQ_cur, blkQ_shape, (None, 0))
556
+ # (n_block_size, head_dim)
557
+ gK = cute.local_tile(mK_cur, blkK_shape, (n_block, 0))
558
+ # (n_block_size, head_dim_v)
559
+ gV = cute.local_tile(mV_cur, blkV_shape, (n_block, 0))
560
+ # (m_block_size, head_dim_v, m_block)
561
+ gdO = cute.local_tile(mdO_cur, blkdO_shape, (None, 0))
562
+ gLSE = cute.local_tile(mLSE_cur, (self.m_block_size,), (None,))
563
+ gdPsum = cute.local_tile(mdPsum_cur, (self.m_block_size,), (None,))
564
+ gdQaccum = cute.local_tile(mdQaccum_cur, (self.m_block_size * self.head_dim_padded,), (None,))
565
+
566
+ # ///////////////////////////////////////////////////////////////////////////////
567
+ # Get shared memory buffer
568
+ # ///////////////////////////////////////////////////////////////////////////////
569
+ smem = cutlass.utils.SmemAllocator()
570
+ storage = smem.allocate(SharedStorage)
571
+ sQ = storage.sQ.get_tensor(sQ_layout)
572
+ sK = storage.sK.get_tensor(sK_layout)
573
+ if cutlass.const_expr(not self.share_QV_smem):
574
+ sV = storage.sV.get_tensor(sV_layout)
575
+ else:
576
+ sV = cute.make_tensor(cute.recast_ptr(sQ.iterator, dtype=self.dtype), sV_layout)
577
+ sdO = storage.sdO.get_tensor(sdO_layout)
578
+ sP = storage.sP.get_tensor(sPdS_layout)
579
+ sdS = storage.sdS.get_tensor(sPdS_layout)
580
+ sLSE = storage.sLSE.get_tensor(sLSE_layout)
581
+ sdPsum = storage.sdPsum.get_tensor(sLSE_layout)
582
+ sLSEMma = storage.sLSE.get_tensor(sLSEMma_layout)
583
+ sdPsumMma = storage.sdPsum.get_tensor(sLSEMma_layout)
584
+
585
+ # Transpose view of tensors for tiled mma
586
+ sQt, sdOt, sKt, sPt, sdSt = [layout_utils.transpose_view(t) for t in (sQ, sdO, sK, sP, sdS)]
587
+
588
+ gmem_thr_copy_QK = gmem_tiled_copy_QK.get_slice(tidx)
589
+ gmem_thr_copy_VdO = gmem_tiled_copy_VdO.get_slice(tidx)
590
+ gmem_thr_copy_lse = gmem_tiled_copy_LSE.get_slice(tidx)
591
+ gmem_thr_copy_dQaccum = gmem_tiled_copy_dQaccum.get_slice(tidx)
592
+ # (CPY_Atom, CPY_M, CPY_K, m_block)
593
+ tQgQ = gmem_thr_copy_QK.partition_S(gQ)
594
+ tQsQ = gmem_thr_copy_QK.partition_D(sQ)
595
+ # (CPY_Atom, CPY_N, CPY_K)
596
+ tKgK = gmem_thr_copy_QK.partition_S(gK)
597
+ tKsK = gmem_thr_copy_QK.partition_D(sK)
598
+ # (CPY_Atom, CPY_N, CPY_K)
599
+ tVgV = gmem_thr_copy_VdO.partition_S(gV)
600
+ tVsV = gmem_thr_copy_VdO.partition_D(sV)
601
+ # (CPY_Atom, CPY_M, CPY_K, m_block)
602
+ tdOgdO = gmem_thr_copy_VdO.partition_S(gdO)
603
+ tdOsdO = gmem_thr_copy_VdO.partition_D(sdO)
604
+ tLSEgLSE = gmem_thr_copy_lse.partition_S(gLSE)
605
+ tLSEsLSE = gmem_thr_copy_lse.partition_D(sLSE)
606
+ tLSEgdPsum = gmem_thr_copy_lse.partition_S(gdPsum)
607
+ tLSEsdPsum = gmem_thr_copy_lse.partition_D(sdPsum)
608
+ tdQgdQaccum = gmem_thr_copy_dQaccum.partition_S(gdQaccum)
609
+
610
+ # ///////////////////////////////////////////////////////////////////////////////
611
+ # Tile MMA compute thread partitions and allocate accumulators
612
+ # ///////////////////////////////////////////////////////////////////////////////
613
+ thr_mma_sdp = tiled_mma_sdp.get_slice(tidx)
614
+ thr_mma_dkv = tiled_mma_dkv.get_slice(tidx)
615
+ thr_mma_dq = tiled_mma_dq.get_slice(tidx)
616
+ acc_shape_dK = thr_mma_dkv.partition_shape_C((self.n_block_size, self.head_dim_padded))
617
+ acc_shape_dV = thr_mma_dkv.partition_shape_C((self.n_block_size, self.head_dim_v_padded))
618
+ acc_dK = cute.make_fragment(acc_shape_dK, cutlass.Float32)
619
+ acc_dV = cute.make_fragment(acc_shape_dV, cutlass.Float32)
620
+ acc_dK.fill(0.0)
621
+ acc_dV.fill(0.0)
622
+
623
+ tSrQ = utils.mma_make_fragment_A(sQ[None, None, 0], thr_mma_sdp, swapAB=self.SdP_swapAB)
624
+ tSrK = utils.mma_make_fragment_B(sK, thr_mma_sdp, swapAB=self.SdP_swapAB)
625
+ tdPrdO = utils.mma_make_fragment_A(sdO[None, None, 0], thr_mma_sdp, swapAB=self.SdP_swapAB)
626
+ tdPrV = utils.mma_make_fragment_B(sV, thr_mma_sdp, swapAB=self.SdP_swapAB)
627
+ tdVrP = utils.mma_make_fragment_A(sPt, thr_mma_dkv, swapAB=self.dKV_swapAB)
628
+ tdVrdO = utils.mma_make_fragment_B(sdOt[None, None, 0], thr_mma_dkv, swapAB=self.dKV_swapAB)
629
+ tdKrdS = utils.mma_make_fragment_A(sdSt, thr_mma_dkv, swapAB=self.dKV_swapAB)
630
+ tdKrQ = utils.mma_make_fragment_B(sQt[None, None, 0], thr_mma_dkv, swapAB=self.dKV_swapAB)
631
+ tdQrdS = utils.mma_make_fragment_A(sdS, thr_mma_dq, swapAB=self.dQ_swapAB)
632
+ tdQrK = utils.mma_make_fragment_B(sKt, thr_mma_dq, swapAB=self.dQ_swapAB)
633
+
634
+ LSEslice = (None, 0, None) if cutlass.const_expr(not self.SdP_swapAB) else (0, None, None)
635
+ tSsLSEMma = layout_utils.reshape_acc_to_mn(thr_mma_sdp.partition_C(sLSEMma))[LSEslice]
636
+ tSsdPsumMma = layout_utils.reshape_acc_to_mn(thr_mma_sdp.partition_C(sdPsumMma))[LSEslice]
637
+
638
+ # ///////////////////////////////////////////////////////////////////////////////
639
+ # Smem copy atom tiling
640
+ # ///////////////////////////////////////////////////////////////////////////////
641
+ smem_copy_atom = cute.make_copy_atom(
642
+ warp.LdMatrix8x8x16bOp(transpose=False, num_matrices=4), self.dtype,
643
+ )
644
+ smem_copy_atom_transposed = cute.make_copy_atom(
645
+ warp.LdMatrix8x8x16bOp(transpose=True, num_matrices=4), self.dtype,
646
+ )
647
+ smem_thr_copy_QdO = utils.make_tiled_copy_A(
648
+ smem_copy_atom, tiled_mma_sdp, swapAB=self.SdP_swapAB
649
+ ).get_slice(tidx)
650
+ smem_thr_copy_KV = utils.make_tiled_copy_B(
651
+ smem_copy_atom, tiled_mma_sdp, swapAB=self.SdP_swapAB
652
+ ).get_slice(tidx)
653
+ # TODO: should this be smem_copy_atom_transposed?
654
+ smem_thr_copy_PdSt = utils.make_tiled_copy_A(
655
+ smem_copy_atom_transposed, tiled_mma_dkv, swapAB=self.dKV_swapAB
656
+ ).get_slice(tidx)
657
+ smem_thr_copy_QdOt = utils.make_tiled_copy_B(
658
+ smem_copy_atom_transposed, tiled_mma_dkv, swapAB=self.dKV_swapAB
659
+ ).get_slice(tidx)
660
+ smem_thr_copy_dS = utils.make_tiled_copy_A(
661
+ smem_copy_atom, tiled_mma_dq, swapAB=self.dQ_swapAB
662
+ ).get_slice(tidx)
663
+ smem_thr_copy_Kt = utils.make_tiled_copy_B(
664
+ smem_copy_atom_transposed, tiled_mma_dq, swapAB=self.dQ_swapAB
665
+ ).get_slice(tidx)
666
+ # TODO: what's the number of bits? What if SdP_swapAB
667
+ r2s_thr_copy_PdS = cute.make_tiled_copy_C(
668
+ cute.make_copy_atom(
669
+ cute.nvgpu.CopyUniversalOp(), self.dtype, num_bits_per_copy=2 * self.dtype.width
670
+ ),
671
+ tiled_mma_sdp,
672
+ ).get_slice(tidx)
673
+
674
+ tSsQ = smem_thr_copy_QdO.partition_S(sQ)
675
+ tdPsdO = smem_thr_copy_QdO.partition_S(sdO)
676
+ tSsK = smem_thr_copy_KV.partition_S(sK)
677
+ tdPsV = smem_thr_copy_KV.partition_S(sV)
678
+ tdVsPt = smem_thr_copy_PdSt.partition_S(sPt)
679
+ tdKsdSt = smem_thr_copy_PdSt.partition_S(sdSt)
680
+ tdVsdOt = smem_thr_copy_QdOt.partition_S(sdOt)
681
+ tdKsQt = smem_thr_copy_QdOt.partition_S(sQt)
682
+ tdQsdS = smem_thr_copy_dS.partition_S(sdS)
683
+ tdQsKt = smem_thr_copy_Kt.partition_S(sKt)
684
+ tPsP = r2s_thr_copy_PdS.partition_D(sP)
685
+ tdSsdS = r2s_thr_copy_PdS.partition_D(sdS)
686
+
687
+ # ///////////////////////////////////////////////////////////////////////////////
688
+ # Predicate: Mark indices that need to copy when problem_shape isn't a multiple
689
+ # of tile_shape
690
+ # ///////////////////////////////////////////////////////////////////////////////
691
+ # Construct identity layout for KV
692
+ cQ = cute.make_identity_tensor((self.m_block_size, self.head_dim_padded))
693
+ tQcQ = gmem_thr_copy_QK.partition_S(cQ)
694
+ t0QcQ = gmem_thr_copy_QK.get_slice(0).partition_S(cQ)
695
+ if cutlass.const_expr(self.head_dim_padded == self.head_dim_v_padded):
696
+ tdOcdO = tQcQ
697
+ t0dOcdO = t0QcQ
698
+ else:
699
+ cdO = cute.make_identity_tensor((self.m_block_size, self.head_dim_v_padded))
700
+ tdOcdO = gmem_thr_copy_VdO.partition_S(cdO)
701
+ t0dOcdO = gmem_thr_copy_VdO.get_slice(0).partition_S(cdO)
702
+ cLSE = cute.make_identity_tensor((self.m_block_size,))
703
+ tLSEcLSE = gmem_thr_copy_lse.partition_S(cLSE)
704
+
705
+ # Allocate predicate tensors for m and n, here we only allocate the tile of k, and
706
+ # use "if" on the mn dimension.
707
+ # This is to reduce register pressure and gets 2-3% performance gain.
708
+
709
+ d_head = mQ.shape[cute.rank(mQ) - 1]
710
+ d_head_v = mdO.shape[cute.rank(mdO) - 1]
711
+
712
+ tQpQ = utils.predicate_k(tQcQ, limit=d_head)
713
+ if cutlass.const_expr(self.same_hdim_kv):
714
+ tdOpdO = tQpQ
715
+ else:
716
+ tdOpdO = utils.predicate_k(tdOcdO, limit=d_head_v)
717
+
718
+ # group parameters for compute_one_m_block
719
+ mma_params = SimpleNamespace(
720
+ thr_mma_sdp=thr_mma_sdp, thr_mma_dkv=thr_mma_dkv, thr_mma_dq=thr_mma_dq,
721
+ tSrQ=tSrQ, tSrK=tSrK, tdPrdO=tdPrdO, tdPrV=tdPrV,
722
+ tdVrP=tdVrP, tdVrdO=tdVrdO, tdKrdS=tdKrdS, tdKrQ=tdKrQ,
723
+ tdQrdS=tdQrdS, tdQrK=tdQrK,
724
+ acc_dK=acc_dK, acc_dV=acc_dV,
725
+ )
726
+ smem_copy_params = SimpleNamespace(
727
+ smem_thr_copy_QdO=smem_thr_copy_QdO,
728
+ smem_thr_copy_KV=smem_thr_copy_KV,
729
+ smem_thr_copy_PdSt=smem_thr_copy_PdSt,
730
+ smem_thr_copy_QdOt=smem_thr_copy_QdOt,
731
+ smem_thr_copy_dS=smem_thr_copy_dS,
732
+ smem_thr_copy_Kt=smem_thr_copy_Kt,
733
+ r2s_thr_copy_PdS=r2s_thr_copy_PdS,
734
+ tSsQ=tSsQ, tSsK=tSsK, tdPsdO=tdPsdO, tdPsV=tdPsV,
735
+ tSsLSEMma=tSsLSEMma, tSsdPsumMma=tSsdPsumMma,
736
+ tPsP=tPsP, tdSsdS=tdSsdS,
737
+ tdVsPt=tdVsPt, tdVsdOt=tdVsdOt, tdKsdSt=tdKsdSt, tdKsQt=tdKsQt,
738
+ tdQsdS=tdQsdS, tdQsKt=tdQsKt,
739
+ )
740
+ gmem_copy_params = SimpleNamespace(
741
+ gmem_thr_copy_dQaccum=gmem_thr_copy_dQaccum, tdQgdQaccum=tdQgdQaccum
742
+ )
743
+ load_Q_LSE = partial(
744
+ self.load_Q_LSE, gmem_tiled_copy_QK, gmem_tiled_copy_LSE,
745
+ tQgQ, tQsQ, tQcQ, t0QcQ, tQpQ,
746
+ tLSEgLSE, tLSEsLSE, tLSEcLSE, seqlen=seqlen.seqlen_q
747
+ )
748
+ load_dO_dPsum = partial(
749
+ self.load_dO_dPsum, gmem_tiled_copy_VdO, gmem_tiled_copy_LSE,
750
+ tdOgdO, tdOsdO, tdOcdO, t0dOcdO, tdOpdO,
751
+ tLSEgdPsum, tLSEsdPsum, tLSEcLSE, seqlen=seqlen.seqlen_q
752
+ )
753
+ compute_one_m_block = partial(
754
+ self.compute_one_m_block, mma_params=mma_params,
755
+ smem_copy_params=smem_copy_params, gmem_copy_params=gmem_copy_params,
756
+ load_Q_LSE=load_Q_LSE, load_dO_dPsum=load_dO_dPsum,
757
+ m_block_max=m_block_max,
758
+ softmax_scale_log2=softmax_scale_log2,
759
+ )
760
+
761
+ # ///////////////////////////////////////////////////////////////////////////////
762
+ # Prologue
763
+ # ///////////////////////////////////////////////////////////////////////////////
764
+ # Start async loads of the last mn-tile, where we take care of the mn residue
765
+ self.load_V(gmem_thr_copy_VdO, tVgV, tVsV, n_block, seqlen=seqlen.seqlen_k,
766
+ headdim=d_head_v)
767
+ if cutlass.const_expr(self.V_in_regs):
768
+ cute.arch.cp_async_commit_group()
769
+ self.load_K(gmem_thr_copy_QK, tKgK, tKsK, n_block, seqlen=seqlen.seqlen_k,
770
+ headdim=d_head)
771
+ cute.arch.cp_async_commit_group()
772
+
773
+ if cutlass.const_expr(self.V_in_regs):
774
+ cute.arch.cp_async_wait_group(1)
775
+ cute.arch.barrier()
776
+ tdPrV_copy_view = smem_thr_copy_KV.retile(tdPrV)
777
+ cute.copy(smem_thr_copy_KV, tdPsV, tdPrV_copy_view)
778
+ # Sync to avoid loading Q to smem_q, which overlaps with smem_v
779
+ cute.arch.barrier()
780
+
781
+ m_block = m_block_min
782
+ assert self.num_stages_Q >= self.num_stages_dO
783
+ for stage in cutlass.range_constexpr(self.num_stages_Q):
784
+ if cutlass.const_expr(self.num_stages_Q == 1 or stage < self.num_stages_Q - 1):
785
+ if stage == 0 or m_block + stage < m_block_max:
786
+ load_Q_LSE(m_block + stage, smem_pipe_write_q=stage)
787
+ cute.arch.cp_async_commit_group()
788
+ if cutlass.const_expr(stage < self.num_stages_dO):
789
+ if stage == 0 or m_block + stage < m_block_max:
790
+ load_dO_dPsum(m_block + stage, smem_pipe_write_q=stage)
791
+ cute.arch.cp_async_commit_group()
792
+
793
+ # ///////////////////////////////////////////////////////////////////////////////
794
+ # Mainloop
795
+ # ///////////////////////////////////////////////////////////////////////////////
796
+ # Start processing of the first n-block.
797
+ mask = AttentionMask(self.m_block_size, self.n_block_size, seqlen.seqlen_q, seqlen.seqlen_k)
798
+ mask_fn = partial(
799
+ mask.apply_mask, n_block=n_block, thr_mma=thr_mma_sdp,
800
+ mask_seqlen=True, mask_causal=self.is_causal
801
+ )
802
+ smem_pipe_read_q = cutlass.Int32(0)
803
+ smem_pipe_read_do = cutlass.Int32(0)
804
+ smem_pipe_write_q = cutlass.Int32(self.num_stages_Q - 1)
805
+ smem_pipe_write_do = cutlass.Int32(0)
806
+ for m_tile in cutlass.range(m_block_min, m_block_max, unroll=1):
807
+ compute_one_m_block(
808
+ m_tile, smem_pipe_read_q, smem_pipe_read_do, smem_pipe_write_q, smem_pipe_write_do,
809
+ mask_fn=mask_fn,
810
+ )
811
+ smem_pipe_read_q = self.advance_pipeline(smem_pipe_read_q, self.num_stages_Q)
812
+ smem_pipe_read_do = self.advance_pipeline(smem_pipe_read_do, self.num_stages_dO)
813
+ smem_pipe_write_q = self.advance_pipeline(smem_pipe_write_q, self.num_stages_Q)
814
+ smem_pipe_write_do = self.advance_pipeline(smem_pipe_write_do, self.num_stages_dO)
815
+
816
+ # ///////////////////////////////////////////////////////////////////////////////
817
+ # Epilogue
818
+ # ///////////////////////////////////////////////////////////////////////////////
819
+ # If GQA, we scale dK in the postprocessing kernel instead
820
+ if cutlass.const_expr(self.qhead_per_kvhead == 1):
821
+ acc_dK.store(acc_dK.load() * softmax_scale)
822
+ # reuse sK and sV data iterator
823
+ sdK = cute.make_tensor(sK.iterator, sK_layout)
824
+ sdV = cute.make_tensor(sV.iterator, sV_layout)
825
+ self.epilogue(
826
+ acc_dK, acc_dV, mdK, mdV, sdK, sdV,
827
+ gmem_tiled_copy_dK, gmem_tiled_copy_dV, tiled_mma_dkv,
828
+ tidx, n_block, head_idx, batch_idx, seqlen, d_head, d_head_v
829
+ )
830
+
831
+ @cute.jit
832
+ def compute_one_m_block(
833
+ self,
834
+ m_block: cutlass.Int32,
835
+ smem_pipe_read_q: cutlass.Int32,
836
+ smem_pipe_read_do: cutlass.Int32,
837
+ smem_pipe_write_q: cutlass.Int32,
838
+ smem_pipe_write_do: cutlass.Int32,
839
+ mma_params: SimpleNamespace,
840
+ smem_copy_params: SimpleNamespace,
841
+ gmem_copy_params: SimpleNamespace,
842
+ load_Q_LSE: Callable,
843
+ load_dO_dPsum: Callable,
844
+ m_block_max: cutlass.Int32,
845
+ softmax_scale_log2: cutlass.Float32,
846
+ mask_fn: Optional[Callable] = None,
847
+ ):
848
+ def load_Q_next():
849
+ m_block_next = m_block + (self.num_stages_Q - 1 if cutlass.const_expr(self.num_stages_Q > 1) else 1)
850
+ if m_block_next < m_block_max:
851
+ load_Q_LSE(m_block_next, smem_pipe_write_q)
852
+ cute.arch.cp_async_commit_group()
853
+
854
+ def load_dO_next():
855
+ if m_block + self.num_stages_dO < m_block_max:
856
+ load_dO_dPsum(m_block + self.num_stages_dO, smem_pipe_write_do)
857
+ cute.arch.cp_async_commit_group()
858
+
859
+ # MMA S
860
+ acc_shape_SdP = mma_params.thr_mma_sdp.partition_shape_C(
861
+ (self.m_block_size, self.n_block_size) if cutlass.const_expr(not self.SdP_swapAB) else (self.n_block_size, self.m_block_size)
862
+ )
863
+ acc_S = cute.make_fragment(acc_shape_SdP, cutlass.Float32)
864
+ acc_S.fill(0.0)
865
+ cute.arch.cp_async_wait_group(1 if cutlass.const_expr(self.num_stages_Q > 1) else 0)
866
+ cute.arch.barrier()
867
+ sm80_utils.gemm(
868
+ mma_params.thr_mma_sdp, acc_S, mma_params.tSrQ, mma_params.tSrK,
869
+ smem_copy_params.tSsQ[None, None, None, smem_pipe_read_q if cutlass.const_expr(self.num_stages_Q > 1) else 0],
870
+ smem_copy_params.tSsK,
871
+ smem_copy_params.smem_thr_copy_QdO, smem_copy_params.smem_thr_copy_KV,
872
+ swap_AB=self.SdP_swapAB,
873
+ )
874
+ tLSErLSE = cute.make_fragment_like(smem_copy_params.tSsLSEMma[None, 0])
875
+ cute.autovec_copy(
876
+ smem_copy_params.tSsLSEMma[None, smem_pipe_read_q if cutlass.const_expr(self.num_stages_Q > 1) else 0], tLSErLSE
877
+ )
878
+ if cutlass.const_expr(mask_fn is not None):
879
+ mask_fn(acc_S, m_block=m_block)
880
+ acc_S_mn = layout_utils.reshape_acc_to_mn(acc_S)
881
+ bidx = 0
882
+ # if cute.arch.thread_idx()[0] == 0 and cute.arch.block_idx()[0] == bidx: cute.print_tensor(acc_S_mn)
883
+ # if cute.arch.thread_idx()[0] == 0 and cute.arch.block_idx()[0] == 1: cute.print_tensor(tLSErLSE)
884
+ assert cute.size(acc_S_mn, mode=[0]) == cute.size(tLSErLSE)
885
+ for r in cutlass.range(cute.size(acc_S_mn, mode=[0]), unroll_full=True):
886
+ acc_S_mn[r, None].store(cute.math.exp2(acc_S_mn[r, None].load() * softmax_scale_log2 - tLSErLSE[r], fastmath=True))
887
+ # if cute.arch.thread_idx()[0] == 0 and cute.arch.block_idx()[0] == bidx: cute.print_tensor(acc_S_mn)
888
+
889
+ # MMA dP
890
+ acc_dP = cute.make_fragment(acc_shape_SdP, cutlass.Float32)
891
+ acc_dP.fill(0.0)
892
+ cute.arch.cp_async_wait_group(1 if cutlass.const_expr(self.num_stages_dO > 1) else 0)
893
+ cute.arch.barrier()
894
+ sm80_utils.gemm(
895
+ mma_params.thr_mma_sdp, acc_dP, mma_params.tdPrdO, mma_params.tdPrV,
896
+ smem_copy_params.tdPsdO[None, None, None, smem_pipe_read_do if cutlass.const_expr(self.num_stages_dO > 1) else 0],
897
+ smem_copy_params.tdPsV,
898
+ smem_copy_params.smem_thr_copy_QdO, smem_copy_params.smem_thr_copy_KV,
899
+ hook_fn=load_Q_next if cutlass.const_expr(self.num_stages_Q > 1) else None,
900
+ swap_AB=self.SdP_swapAB,
901
+ )
902
+ tLSErdPsum = cute.make_fragment_like(smem_copy_params.tSsdPsumMma[None, 0])
903
+ cute.autovec_copy(
904
+ smem_copy_params.tSsdPsumMma[None, smem_pipe_read_do if cutlass.const_expr(self.num_stages_dO > 1) else 0], tLSErdPsum
905
+ )
906
+ acc_dP_mn = layout_utils.reshape_acc_to_mn(acc_dP)
907
+ # if cute.arch.thread_idx()[0] == 0 and cute.arch.block_idx()[0] == bidx: cute.print_tensor(acc_dP_mn)
908
+ assert cute.size(acc_dP_mn, mode=[0]) == cute.size(tLSErdPsum)
909
+ for r in cutlass.range(cute.size(acc_dP_mn, mode=[0]), unroll_full=True):
910
+ acc_dP_mn[r, None].store(acc_S_mn[r, None].load() * (acc_dP_mn[r, None].load() - tLSErdPsum[r]))
911
+ # if cute.arch.thread_idx()[0] == 0 and cute.arch.block_idx()[0] == bidx: cute.print_tensor(acc_dP_mn)
912
+ rP = cute.make_fragment_like(acc_S, self.dtype)
913
+ rP.store(acc_S.load().to(self.dtype))
914
+ if cutlass.const_expr(not self.Mma_dKV_is_RS):
915
+ tPrP = smem_copy_params.r2s_thr_copy_PdS.retile(rP) # ((Atom,AtomNum), MMA_N, MMA_N)
916
+ cute.copy(smem_copy_params.r2s_thr_copy_PdS, tPrP, smem_copy_params.tPsP)
917
+ rdS = cute.make_fragment_like(acc_dP, self.dtype)
918
+ rdS.store(acc_dP.load().to(self.dtype))
919
+ if cutlass.const_expr(not self.Mma_dKV_is_RS):
920
+ cute.arch.barrier() # Make sure P is written
921
+ # For hdim 64, It's faster to write to smem_dS first before the dV gemm
922
+ if cutlass.const_expr(not self.Mma_dKV_is_RS):
923
+ tdSrdS = smem_copy_params.r2s_thr_copy_PdS.retile(rdS)
924
+ cute.copy(smem_copy_params.r2s_thr_copy_PdS, tdSrdS, smem_copy_params.tdSsdS)
925
+ if cutlass.const_expr(self.Mma_dKV_is_RS):
926
+ tdVrP = layout_utils.reshape_acc_to_frgA(rP)
927
+ else:
928
+ tdVrP = mma_params.tdVrP
929
+
930
+ # MMA dK
931
+ sm80_utils.gemm(
932
+ mma_params.thr_mma_dkv, mma_params.acc_dV, tdVrP, mma_params.tdVrdO,
933
+ smem_copy_params.tdVsPt,
934
+ smem_copy_params.tdVsdOt[None, None, None, smem_pipe_read_do if cutlass.const_expr(self.num_stages_dO > 1) else 0],
935
+ smem_copy_params.smem_thr_copy_PdSt, smem_copy_params.smem_thr_copy_QdOt,
936
+ A_in_regs=self.Mma_dKV_is_RS,
937
+ swap_AB=self.dKV_swapAB,
938
+ )
939
+ # if cute.arch.thread_idx()[0] == 0 and cute.arch.block_idx()[0] == bidx: cute.print_tensor(mma_params.acc_dV)
940
+ cute.arch.barrier() # Make sure dS is written
941
+
942
+ # MMA dQ
943
+ def dQ_mma(hook_fn):
944
+ acc_shape_dQ = mma_params.thr_mma_dq.partition_shape_C(
945
+ (self.m_block_size, self.head_dim_padded) if cutlass.const_expr(not self.dQ_swapAB) else (self.head_dim_padded, self.m_block_size)
946
+ )
947
+ acc_dQ = cute.make_fragment(acc_shape_dQ, cutlass.Float32)
948
+ acc_dQ.fill(0.0)
949
+ sm80_utils.gemm(
950
+ mma_params.thr_mma_dq, acc_dQ, mma_params.tdQrdS, mma_params.tdQrK,
951
+ smem_copy_params.tdQsdS, smem_copy_params.tdQsKt,
952
+ smem_copy_params.smem_thr_copy_dS, smem_copy_params.smem_thr_copy_Kt,
953
+ swap_AB=self.dQ_swapAB,
954
+ hook_fn=hook_fn
955
+ )
956
+ # ((1, 1), num_elements)
957
+ acc_dQ_atomic = gmem_copy_params.gmem_thr_copy_dQaccum.retile(acc_dQ)
958
+ tdQgdQaccum_atomic = gmem_copy_params.tdQgdQaccum[None, None, m_block]
959
+ assert cute.size(acc_dQ_atomic) == cute.size(tdQgdQaccum_atomic)
960
+ for i in cutlass.range(cute.size(acc_dQ_atomic), unroll_full=True):
961
+ utils.atomic_add_fp32(acc_dQ_atomic[i], utils.elem_pointer(tdQgdQaccum_atomic, i))
962
+ # utils.atomic_add_fp32(acc_dQ[i], tdQgdQaccum_atomic.iterator + i * tdQgdQaccum_atomic.stride[1])
963
+ # if cute.arch.thread_idx()[0] == 64 and cute.arch.block_idx()[0] == bidx: cute.print_tensor(acc_dQ)
964
+
965
+ # If num_stages_Q == 1, we want to do Mma_dK first so we can start loading Q for the next iteration
966
+ if cutlass.const_expr(self.num_stages_Q > 1):
967
+ dQ_mma(load_dO_next)
968
+
969
+ # MMA dK
970
+ if cutlass.const_expr(self.Mma_dKV_is_RS):
971
+ tdVrP = layout_utils.reshape_acc_to_frgA(rdS)
972
+ else:
973
+ tdKrdS = mma_params.tdKrdS
974
+ sm80_utils.gemm(
975
+ mma_params.thr_mma_dkv, mma_params.acc_dK, tdKrdS, mma_params.tdKrQ,
976
+ smem_copy_params.tdKsdSt,
977
+ smem_copy_params.tdKsQt[None, None, None, smem_pipe_read_q if cutlass.const_expr(self.num_stages_Q > 1) else 0],
978
+ smem_copy_params.smem_thr_copy_PdSt, smem_copy_params.smem_thr_copy_QdOt,
979
+ A_in_regs=self.Mma_dKV_is_RS,
980
+ swap_AB=self.dKV_swapAB,
981
+ hook_fn=load_dO_next if cutlass.const_expr(self.num_stages_Q == 1) else None,
982
+ )
983
+ # if cute.arch.thread_idx()[0] == 0: cute.print_tensor(mma_params.acc_dK)
984
+ if cutlass.const_expr(self.num_stages_Q == 1):
985
+ cute.arch.barrier()
986
+ dQ_mma(load_Q_next)
987
+
988
    @cute.jit
    def epilogue(
        self,
        acc_dK: cute.Tensor,
        acc_dV: cute.Tensor,
        mdK: cute.Tensor,
        mdV: cute.Tensor,
        sdK: cute.Tensor,
        sdV: cute.Tensor,
        gmem_tiled_copy_dK: cute.TiledCopy,
        gmem_tiled_copy_dV: cute.TiledCopy,
        tiled_mma: cute.TiledMma,
        tidx: cutlass.Int32,
        n_block: cutlass.Int32,
        num_head: cutlass.Int32,
        batch_size: cutlass.Int32,
        seqlen: SeqlenInfoQK,
        d_head: cutlass.Int32,
        d_head_v: cutlass.Int32
    ):
        """Write the accumulated dK/dV of this n-block back to global memory.

        Two paths:
        - qhead_per_kvhead == 1 (MHA): convert the fp32 accumulators to the
          output dtype, stage them through smem (sdK/sdV, which alias sK/sV),
          reload with wide vectorized reads, and write to gmem with seqlen
          and head-dim predication.
        - qhead_per_kvhead > 1 (GQA): atomically add the fp32 accumulators
          straight into gmem dK/dV accumulation buffers (no smem staging).

        NOTE(review): the ``batch_size`` parameter is used as the batch
        *index* below; the name appears to be a misnomer.
        """
        # Convert fp32 accumulators to the output dtype in registers.
        rdV = cute.make_fragment_like(acc_dV, self.dtype)
        rdV.store(acc_dV.load().to(self.dtype))
        rdK = cute.make_fragment_like(acc_dK, self.dtype)
        rdK.store(acc_dK.load().to(self.dtype))
        gmem_thr_copy_dK = gmem_tiled_copy_dK.get_slice(tidx)
        gmem_thr_copy_dV = gmem_tiled_copy_dV.get_slice(tidx)

        batch_idx = batch_size
        # Map the query head to its KV head unless heads are packed (GQA packing).
        head_idx_kv = num_head // self.qhead_per_kvhead if cutlass.const_expr(not self.pack_gqa) else num_head

        if cutlass.const_expr(self.qhead_per_kvhead == 1):
            # Make sure all threads have finished reading K and V, otherwise we get racy dQ
            # because smem_q could be changed.
            cute.arch.barrier()
            # smem copy atom for dKV
            smem_copy_atom_dKV = cute.make_copy_atom(
                cute.nvgpu.CopyUniversalOp(), self.dtype, num_bits_per_copy=2 * self.dtype.width
            )
            smem_thr_copy_dKV = cute.make_tiled_copy_C(smem_copy_atom_dKV, tiled_mma).get_slice(tidx)
            taccdVrdV = smem_thr_copy_dKV.retile(rdV)
            taccdKrdK = smem_thr_copy_dKV.retile(rdK)
            taccdVsdV = smem_thr_copy_dKV.partition_D(sdV)
            taccdKsdK = smem_thr_copy_dKV.partition_D(sdK)
            # copy acc O from rmem to smem with the smem copy atom
            cute.copy(smem_copy_atom_dKV, taccdVrdV, taccdVsdV)
            cute.copy(smem_copy_atom_dKV, taccdKrdK, taccdKsdK)

            # Slice the global dK/dV tensors down to (seqlen_k, head_dim) for
            # this batch/head; varlen layouts are addressed via domain_offset.
            if cutlass.const_expr(not seqlen.has_cu_seqlens_k):
                mdK_cur, mdV_cur = [t[batch_idx, None, head_idx_kv, None] for t in (mdK, mdV)]
            else:
                mdK_cur, mdV_cur = [cute.domain_offset((seqlen.offset_k, 0), t[None, head_idx_kv, None]) for t in (mdK, mdV)]

            blkdK_shape = (self.n_block_size, self.head_dim_padded)
            blkdV_shape = (self.n_block_size, self.head_dim_v_padded)
            gdK = cute.local_tile(mdK_cur, blkdK_shape, (n_block, 0))
            gdV = cute.local_tile(mdV_cur, blkdV_shape, (n_block, 0))
            tdKsdK = gmem_thr_copy_dK.partition_S(sdK)
            tdKgdK = gmem_thr_copy_dK.partition_D(gdK)
            tdVsdV = gmem_thr_copy_dV.partition_S(sdV)
            tdVgdV = gmem_thr_copy_dV.partition_D(gdV)
            tdKrdK = cute.make_fragment_like(tdKgdK, self.dtype)
            tdVrdV = cute.make_fragment_like(tdVgdV, self.dtype)
            # sync so all smem stores above are visible before reloading.
            cute.arch.barrier()
            # load acc dK and dV from smem to rmem for wider vectorization
            # Need to check OOB when reading from smem if kBlockN isn't evenly tiled
            # TODO
            cute.autovec_copy(tdKsdK, tdKrdK)
            cute.autovec_copy(tdVsdV, tdVrdV)

            # Identity tensors give per-thread (row, col) coordinates for
            # seqlen / head-dim predication on the gmem stores.
            cdK = cute.make_identity_tensor((self.n_block_size, self.head_dim_padded))
            tdKcdK = gmem_thr_copy_dK.partition_S(cdK)
            t0dKcdK = gmem_tiled_copy_dK.get_slice(0).partition_S(cdK)
            if cutlass.const_expr(self.head_dim_padded == self.head_dim_v_padded):
                # Same padded head dims: dV can reuse dK's coordinates.
                tdVcdV = tdKcdK
                t0dVcdV = t0dKcdK
            else:
                cdV = cute.make_identity_tensor((self.n_block_size, self.head_dim_v_padded))
                tdVcdV = gmem_thr_copy_dV.partition_S(cdV)
                t0dVcdV = gmem_tiled_copy_dV.get_slice(0).partition_S(cdV)
            tdKpdK = utils.predicate_k(tdKcdK, limit=d_head)
            if cutlass.const_expr(self.same_hdim_kv):
                tdVpdV = tdKpdK
            else:
                tdVpdV = utils.predicate_k(tdVcdV, limit=d_head_v)
            # copy acc dK and acc_dV from rmem to gmem; the row guard uses
            # thread-0 coordinates (compile-time known) minus this thread's
            # offset, as elsewhere in this file.
            for rest_m in cutlass.range_constexpr(cute.size(tdKrdK.shape[1])):
                if t0dKcdK[0, rest_m, 0][0] < seqlen.seqlen_k - n_block * self.n_block_size - tdKcdK[0][0]:
                    cute.copy(
                        gmem_tiled_copy_dK,
                        tdKrdK[None, rest_m, None],
                        tdKgdK[None, rest_m, None],
                        pred=tdKpdK[None, rest_m, None] if cutlass.const_expr(self.check_hdim_oob) else None,
                    )
            for rest_m in cutlass.range_constexpr(cute.size(tdVrdV.shape[1])):
                if t0dVcdV[0, rest_m, 0][0] < seqlen.seqlen_k - n_block * self.n_block_size - tdVcdV[0][0]:
                    cute.copy(
                        gmem_tiled_copy_dV,
                        tdVrdV[None, rest_m, None],
                        tdVgdV[None, rest_m, None],
                        pred=tdVpdV[None, rest_m, None] if cutlass.const_expr(self.check_hdim_v_oob) else None,
                    )

        else:  # qhead_per_kvhead > 1, do atomic add
            # For Sm90, we need to sync to avoid racy writes to smem_q
            # For Sm80, we don't need to sync since we're not touching smem
            # NOTE(review): head_idx_kv is recomputed here; it is identical to
            # the value computed above this if/else.
            head_idx_kv = num_head // self.qhead_per_kvhead if cutlass.const_expr(not self.pack_gqa) else num_head

            if cutlass.const_expr(not seqlen.has_cu_seqlens_k):
                mdK_cur, mdV_cur = [t[batch_idx, head_idx_kv, None] for t in (mdK, mdV)]
            else:
                # Varlen accumulation buffers are flat per head; offsets are in
                # elements, hence the multiplication by the padded head dims.
                padded_offset_k = seqlen.offset_k + batch_idx * self.n_block_size
                mdK_cur = cute.domain_offset((padded_offset_k * self.head_dim_padded,), mdK[head_idx_kv, None])
                mdV_cur = cute.domain_offset((padded_offset_k * self.head_dim_v_padded,), mdV[head_idx_kv, None])

            gdV = cute.local_tile(mdV_cur, (self.n_block_size * self.head_dim_v_padded,), (n_block,))
            gdK = cute.local_tile(mdK_cur, (self.n_block_size * self.head_dim_padded,), (n_block,))
            tdVgdVaccum = gmem_thr_copy_dV.partition_S(gdV)
            tdKgdKaccum = gmem_thr_copy_dK.partition_S(gdK)
            acc_dV_atomic = gmem_thr_copy_dV.retile(acc_dV)
            acc_dK_atomic = gmem_thr_copy_dK.retile(acc_dK)
            assert cute.size(acc_dV_atomic) == cute.size(tdVgdVaccum)
            assert cute.size(acc_dK_atomic) == cute.size(tdKgdKaccum)
            # fp32 atomic adds into the shared accumulation buffers; multiple
            # query heads mapping to one KV head may update the same locations.
            for i in cutlass.range(cute.size(acc_dV_atomic), unroll_full=True):
                utils.atomic_add_fp32(acc_dV_atomic[i], utils.elem_pointer(tdVgdVaccum, i))
            for i in cutlass.range(cute.size(acc_dK_atomic), unroll_full=True):
                utils.atomic_add_fp32(acc_dK_atomic[i], utils.elem_pointer(tdKgdKaccum, i))
1117
+ @cute.jit
1118
+ def advance_pipeline(self, pipeline_index, num_stages: cutlass.Constexpr):
1119
+ return pipeline_index + 1 if pipeline_index < num_stages - 1 else 0
1120
+
1121
    @cute.jit
    def load_K(
        self,
        gmem_thr_copy: cute.TiledCopy,
        tKgK: cute.Tensor,
        tKsK: cute.Tensor,
        block: cutlass.Int32,
        seqlen: cutlass.Int32,
        headdim: cutlass.Int32,
    ):
        """Issue predicated cp.async copies of one K n-block from gmem to smem.

        Rows beyond ``seqlen`` and columns beyond ``headdim`` are masked out
        via per-element predicates so out-of-bounds gmem is never touched.

        :param gmem_thr_copy: this thread's slice of the gmem->smem tiled copy
        :param tKgK: thread-partitioned gmem source tile
        :param tKsK: thread-partitioned smem destination tile
        :param block: n-block index (used to offset the seqlen bound)
        :param seqlen: total key sequence length
        :param headdim: actual (unpadded) head dimension
        """
        # Identity tensor gives each thread its (row, col) coordinates within the tile.
        cK = cute.make_identity_tensor((self.n_block_size, self.head_dim_padded))
        tKcK = gmem_thr_copy.partition_S(cK)
        t0KcK = gmem_thr_copy.get_slice(0).partition_S(cK)
        # Column (head-dim) predicate, shared across all n iterations.
        tKpK = utils.predicate_k(tKcK, limit=headdim)
        for n in cutlass.range_constexpr(cute.size(tKsK.shape[1])):
            # If kBlockN doesn't evenly divide the tiled copy, only the last `n` needs to be checked
            if self.is_even_n_smem_k or n < cute.size(tKsK.shape[1]) - 1 or tKcK[0, n, 0][0] < self.n_block_size:
                # Instead of using tKcK, we using t0KcK and subtract the offset from the limit
                # (seqlen - block * kBlockN). This is because the entries of t0KcK are known at compile time.
                predicate_n = t0KcK[0, n, 0][0] < seqlen - block * self.n_block_size - tKcK[0][0]
                predicate = cute.make_fragment_like(tKpK[None, 0, None])
                for k in cutlass.range_constexpr(cute.size(predicate.shape[1])):
                    for i in cutlass.range_constexpr(cute.size(predicate.shape[0])):
                        # Combine head-dim and seqlen predicates per element.
                        predicate[i, k] = (tKpK[i, n, k] if cutlass.const_expr(self.check_hdim_oob) else True) and predicate_n
                cute.copy(
                    gmem_thr_copy, tKgK[None, n, None], tKsK[None, n, None], pred=predicate,
                )
        # We need to clear the sK smem tiles since we'll use sKt for mma_dq
1150
    @cute.jit
    def load_V(
        self,
        gmem_thr_copy: cute.TiledCopy,
        tVgV: cute.Tensor,
        tVsV: cute.Tensor,
        block: cutlass.Int32,
        seqlen: cutlass.Int32,
        headdim: cutlass.Int32,
    ):
        """Issue predicated cp.async copies of one V n-block from gmem to smem.

        Mirrors :meth:`load_K` but uses the (possibly different) padded V head
        dimension. Rows beyond ``seqlen`` and columns beyond ``headdim`` are
        masked out via per-element predicates.

        :param gmem_thr_copy: this thread's slice of the gmem->smem tiled copy
        :param tVgV: thread-partitioned gmem source tile
        :param tVsV: thread-partitioned smem destination tile
        :param block: n-block index (used to offset the seqlen bound)
        :param seqlen: total key/value sequence length
        :param headdim: actual (unpadded) V head dimension
        """
        # Identity tensor gives each thread its (row, col) coordinates within the tile.
        cV = cute.make_identity_tensor((self.n_block_size, self.head_dim_v_padded))
        tVcV = gmem_thr_copy.partition_S(cV)
        t0VcV = gmem_thr_copy.get_slice(0).partition_S(cV)
        # Column (head-dim) predicate, shared across all n iterations.
        tVpV = utils.predicate_k(tVcV, limit=headdim)
        for n in cutlass.range_constexpr(cute.size(tVsV.shape[1])):
            # If kBlockN doesn't evenly divide the tiled copy, only the last `n` needs to be checked
            if self.is_even_n_smem_v or n < cute.size(tVsV.shape[1]) - 1 or tVcV[0, n, 0][0] < self.n_block_size:
                # Instead of using tVcV, we using t0VcV and subtract the offset from the limit
                # (seqlen - block * kBlockN). This is because the entries of t0VcV are known at compile time.
                predicate_n = t0VcV[0, n, 0][0] < seqlen - block * self.n_block_size - tVcV[0][0]
                predicate = cute.make_fragment_like(tVpV[None, 0, None])
                for k in cutlass.range_constexpr(cute.size(predicate.shape[1])):
                    for i in cutlass.range_constexpr(cute.size(predicate.shape[0])):
                        # Combine head-dim and seqlen predicates per element.
                        predicate[i, k] = (tVpV[i, n, k] if cutlass.const_expr(self.check_hdim_oob) else True) and predicate_n
                cute.copy(
                    gmem_thr_copy, tVgV[None, n, None], tVsV[None, n, None], pred=predicate,
                )
1178
+ @cute.jit
1179
+ def load_Q_LSE(
1180
+ self,
1181
+ gmem_tiled_copy_Q: cute.TiledCopy,
1182
+ gmem_tiled_copy_LSE: cute.TiledCopy,
1183
+ tQgQ: cute.Tensor,
1184
+ tQsQ: cute.Tensor,
1185
+ tQcQ: cute.Tensor,
1186
+ t0QcQ: cute.Tensor,
1187
+ tQpQ: cute.Tensor,
1188
+ tLSEgLSE: cute.Tensor,
1189
+ tLSEsLSE: cute.Tensor,
1190
+ tLSEcLSE: cute.Tensor,
1191
+ block: cutlass.Int32,
1192
+ smem_pipe_write_q: cutlass.Int32,
1193
+ seqlen: cutlass.Int32,
1194
+ ):
1195
+ for m in cutlass.range_constexpr(cute.size(tQsQ.shape[1])):
1196
+ # If kBlockM doesn't evenly divide the tiled copy, only the last `m` needs to be checked
1197
+ if self.is_even_m_smem_q or m < cute.size(tQsQ.shape[1]) - 1 or tQcQ[0, m, 0][0] < self.m_block_size:
1198
+ # Instead of using tQcQ, we using t0QcQ and subtract the offset from the limit
1199
+ # (seqlen - block * kBlockM). This is because the entries of t0QcQ are known at compile time.
1200
+ predicate_m = t0QcQ[0, m, 0][0] < seqlen - block * self.m_block_size - tQcQ[0][0]
1201
+ predicate = cute.make_fragment_like(tQpQ[None, 0, None])
1202
+ for k in cutlass.range_constexpr(cute.size(predicate.shape[1])):
1203
+ for i in cutlass.range_constexpr(cute.size(predicate.shape[0])):
1204
+ predicate[i, k] = (tQpQ[i, m, k] if cutlass.const_expr(self.check_hdim_oob) else True) and predicate_m
1205
+ cute.copy(
1206
+ gmem_tiled_copy_Q,
1207
+ tQgQ[None, m, None, block],
1208
+ tQsQ[None, m, None, smem_pipe_write_q if cutlass.const_expr(self.num_stages_Q) > 1 else 0],
1209
+ pred=predicate,
1210
+ )
1211
+ # We need to clear the sQ smem tiles since we'll use sQt for mma_dK
1212
+ # We made sure LSE length is padded so we read `kBlockM` elements so that all
1213
+ # elements in sLSE are filled. Without this we might have uninitialized sLSE values.
1214
+ for m in cutlass.range_constexpr(cute.size(tLSEsLSE.shape[1])):
1215
+ if tLSEcLSE[0, m][0] < self.m_block_size:
1216
+ cute.copy(
1217
+ gmem_tiled_copy_LSE,
1218
+ tLSEgLSE[None, m, block],
1219
+ tLSEsLSE[None, m, smem_pipe_write_q if cutlass.const_expr(self.num_stages_Q > 1) else 0],
1220
+ )
1221
+
1222
    @cute.jit
    def load_dO_dPsum(
        self,
        gmem_tiled_copy_dO: cute.TiledCopy,
        gmem_tiled_copy_dPsum: cute.TiledCopy,
        tdOgdO: cute.Tensor,
        tdOsdO: cute.Tensor,
        tdOcdO: cute.Tensor,
        t0dOcdO: cute.Tensor,
        tdOpdO: cute.Tensor,
        tdPsumgdPsum: cute.Tensor,
        tdPsumsdPsum: cute.Tensor,
        tdPsumcdPsum: cute.Tensor,
        block: cutlass.Int32,
        smem_pipe_write_q: cutlass.Int32,
        seqlen: cutlass.Int32,
    ):
        """Issue predicated cp.async copies of one dO m-block and its dPsum
        rows from gmem into the dO smem pipeline.

        Mirrors :meth:`load_Q_LSE` for the dO/dPsum pair.

        NOTE(review): despite its name, ``smem_pipe_write_q`` indexes the dO
        pipeline here (bounded by num_stages_dO).
        NOTE(review): the predicate uses ``self.check_hdim_oob``; since dO has
        the V head dim, ``check_hdim_v_oob`` may be intended — confirm.
        """
        for m in cutlass.range_constexpr(cute.size(tdOsdO.shape[1])):
            # If kBlockM doesn't evenly divide the tiled copy, only the last `m` needs to be checked
            if self.is_even_m_smem_do or m < cute.size(tdOsdO.shape[1]) - 1 or tdOcdO[0, m, 0][0] < self.m_block_size:
                # Instead of using tdOcdO, we using t0dOcdO and subtract the offset from the limit
                # (seqlen - block * kBlockM). This is because the entries of t0dOcdO are known at compile time.
                predicate_m = t0dOcdO[0, m, 0][0] < seqlen - block * self.m_block_size - tdOcdO[0][0]
                predicate = cute.make_fragment_like(tdOpdO[None, 0, None])
                for k in cutlass.range_constexpr(cute.size(predicate.shape[1])):
                    for i in cutlass.range_constexpr(cute.size(predicate.shape[0])):
                        # Combine head-dim and seqlen predicates per element.
                        predicate[i, k] = (tdOpdO[i, m, k] if cutlass.const_expr(self.check_hdim_oob) else True) and predicate_m
                cute.copy(
                    gmem_tiled_copy_dO,
                    tdOgdO[None, m, None, block],
                    tdOsdO[None, m, None, smem_pipe_write_q if cutlass.const_expr(self.num_stages_dO > 1) else 0],
                    pred=predicate,
                )
        # dPsum rows: padded so we can read a full `kBlockM` worth of elements,
        # avoiding uninitialized smem values.
        for m in cutlass.range_constexpr(cute.size(tdPsumgdPsum.shape[1])):
            if tdPsumcdPsum[0, m][0] < self.m_block_size:
                cute.copy(
                    gmem_tiled_copy_dPsum,
                    tdPsumgdPsum[None, m, block],
                    tdPsumsdPsum[None, m, smem_pipe_write_q if cutlass.const_expr(self.num_stages_dO > 1) else 0],
                )
build/torch-cuda/flash_bwd_postprocess.py ADDED
@@ -0,0 +1,585 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
2
+ # A reimplementation of https://github.com/Dao-AILab/flash-attention/blob/main/hopper/flash_bwd_postprocess_kernel.h
3
+ # from Cutlass C++ to Cute-DSL.
4
+ import math
5
+ from typing import Callable, Optional, Type, Literal
6
+
7
+ import cuda.bindings.driver as cuda
8
+
9
+ import cutlass
10
+ import cutlass.cute as cute
11
+ import cutlass.utils.hopper_helpers as sm90_utils_basic
12
+ import cutlass.utils.blackwell_helpers as sm100_utils_basic
13
+ from cutlass.cute.nvgpu import cpasync, warp, warpgroup
14
+ from cutlass import Float32, const_expr
15
+ from cutlass.utils import LayoutEnum
16
+
17
+ from .quack import copy_utils
18
+ from .quack import layout_utils
19
+ from .quack import sm90_utils
20
+
21
+ from . import utils
22
+ from .cute_dsl_utils import assume_tensor_aligned
23
+ from . import ampere_helpers as sm80_utils
24
+ from .seqlen_info import SeqlenInfoQK
25
+ import cutlass.cute.nvgpu.tcgen05 as tcgen05
26
+ from .quack.cute_dsl_utils import ParamsBase
27
+ from .tile_scheduler import (
28
+ SingleTileScheduler,
29
+ SingleTileVarlenScheduler,
30
+ TileSchedulerArguments,
31
+ )
32
+
33
+
34
+ class FlashAttentionBackwardPostprocess:
35
+ def __init__(
36
+ self,
37
+ dtype: Type[cutlass.Numeric],
38
+ head_dim: int,
39
+ arch: Literal[80, 90, 100],
40
+ tile_m: int = 128,
41
+ num_threads: int = 256,
42
+ AtomLayoutMdQ: int = 1,
43
+ dQ_swapAB: bool = False,
44
+ use_2cta_instrs: bool = False,
45
+ cluster_size: int = 1, # for varlen offsets
46
+ ):
47
+ """
48
+ :param head_dim: head dimension
49
+ :type head_dim: int
50
+ :param tile_m: m block size
51
+ :type tile_m: int
52
+ """
53
+ self.dtype = dtype
54
+ self.tile_m = tile_m
55
+ assert arch // 10 in [8, 9, 10, 11], (
56
+ "Only Ampere (8.x), Hopper (9.x), and Blackwell (10.x, 11.x) are supported"
57
+ )
58
+ self.arch = arch
59
+ # padding head_dim to a multiple of 32 as k_block_size
60
+ hdim_multiple_of = 32
61
+ self.tile_hdim = int(math.ceil(head_dim / hdim_multiple_of) * hdim_multiple_of)
62
+ self.check_hdim_oob = head_dim != self.tile_hdim
63
+ self.num_threads = num_threads
64
+ self.AtomLayoutMdQ = AtomLayoutMdQ
65
+ self.dQ_swapAB = dQ_swapAB
66
+ self.use_2cta_instrs = use_2cta_instrs and arch == 100 and head_dim != 64
67
+ self.cluster_size = cluster_size
68
+
69
+ @staticmethod
70
+ def can_implement(dtype, head_dim, tile_m, num_threads) -> bool:
71
+ """Check if the kernel can be implemented with the given parameters.
72
+
73
+ :param dtype: data type
74
+ :type dtype: cutlass.Numeric
75
+ :param head_dim: head dimension
76
+ :type head_dim: int
77
+ :param tile_m: m block size
78
+ :type tile_m: int
79
+
80
+ :return: True if the kernel can be implemented, False otherwise
81
+ :rtype: bool
82
+ """
83
+ if dtype not in [cutlass.Float16, cutlass.BFloat16]:
84
+ return False
85
+ if head_dim % 8 != 0:
86
+ return False
87
+ if num_threads % 32 != 0:
88
+ return False
89
+ return True
90
+
91
+ def _get_tiled_mma(self):
92
+ if const_expr(self.arch == 80):
93
+ num_mma_warps = self.num_threads // 32
94
+ atom_layout_dQ = (
95
+ (self.AtomLayoutMdQ, num_mma_warps // self.AtomLayoutMdQ, 1)
96
+ if const_expr(not self.dQ_swapAB)
97
+ else (num_mma_warps // self.AtomLayoutMdQ, self.AtomLayoutMdQ, 1)
98
+ )
99
+ tiled_mma = cute.make_tiled_mma(
100
+ warp.MmaF16BF16Op(self.dtype, Float32, (16, 8, 16)),
101
+ atom_layout_dQ,
102
+ permutation_mnk=(atom_layout_dQ[0] * 16, atom_layout_dQ[1] * 16, 16),
103
+ )
104
+ elif const_expr(self.arch == 90):
105
+ num_mma_warp_groups = self.num_threads // 128
106
+ atom_layout_dQ = (self.AtomLayoutMdQ, num_mma_warp_groups // self.AtomLayoutMdQ)
107
+ tiler_mn_dQ = (self.tile_m // atom_layout_dQ[0], self.tile_hdim // atom_layout_dQ[1])
108
+ tiled_mma = sm90_utils_basic.make_trivial_tiled_mma(
109
+ self.dtype,
110
+ self.dtype,
111
+ warpgroup.OperandMajorMode.K, # These don't matter, we only care about the accum
112
+ warpgroup.OperandMajorMode.K,
113
+ Float32,
114
+ atom_layout_mnk=(atom_layout_dQ if not self.dQ_swapAB else atom_layout_dQ[::-1])
115
+ + (1,),
116
+ tiler_mn=tiler_mn_dQ if not self.dQ_swapAB else tiler_mn_dQ[::-1],
117
+ )
118
+ else:
119
+ cta_group = tcgen05.CtaGroup.ONE
120
+ tiled_mma = sm100_utils_basic.make_trivial_tiled_mma(
121
+ self.dtype,
122
+ tcgen05.OperandMajorMode.MN, # dS_major_mode
123
+ tcgen05.OperandMajorMode.MN, # Kt_major_mode
124
+ Float32,
125
+ cta_group,
126
+ (self.tile_m, self.tile_hdim),
127
+ )
128
+ if const_expr(self.arch in [80, 90]):
129
+ assert self.num_threads == tiled_mma.size
130
+ return tiled_mma
131
+
132
+ def _setup_attributes(self):
133
+ # ///////////////////////////////////////////////////////////////////////////////
134
+ # GMEM Tiled copy:
135
+ # ///////////////////////////////////////////////////////////////////////////////
136
+ # Thread layouts for copies
137
+ universal_copy_bits = 128
138
+ async_copy_elems_accum = universal_copy_bits // Float32.width
139
+ atom_async_copy_accum = cute.make_copy_atom(
140
+ cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL),
141
+ Float32,
142
+ num_bits_per_copy=universal_copy_bits,
143
+ )
144
+ # We don't do bound checking for the gmem -> smem load so we just assert here.
145
+ assert (self.tile_m * self.tile_hdim // async_copy_elems_accum) % self.num_threads == 0
146
+ self.g2s_tiled_copy_dQaccum = cute.make_tiled_copy_tv(
147
+ atom_async_copy_accum,
148
+ cute.make_layout(self.num_threads),
149
+ cute.make_layout(async_copy_elems_accum),
150
+ )
151
+ num_s2r_copy_elems = 1 if const_expr(self.arch == 80) else 4
152
+ if const_expr(self.arch == 80):
153
+ self.s2r_tiled_copy_dQaccum = copy_utils.tiled_copy_1d(
154
+ Float32, self.num_threads, num_s2r_copy_elems
155
+ )
156
+ self.sdQaccum_layout = cute.make_layout(self.tile_m * self.tile_hdim)
157
+ elif const_expr(self.arch == 90):
158
+ num_threads_per_warp_group = 128
159
+ num_mma_warp_groups = self.num_threads // 128
160
+ self.s2r_tiled_copy_dQaccum = cute.make_tiled_copy_tv(
161
+ cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Float32, num_bits_per_copy=128),
162
+ cute.make_layout((num_threads_per_warp_group, num_mma_warp_groups)), # thr_layout
163
+ cute.make_layout(128 // Float32.width), # val_layout
164
+ )
165
+ self.sdQaccum_layout = cute.make_layout(
166
+ (self.tile_m * self.tile_hdim // num_mma_warp_groups, num_mma_warp_groups)
167
+ )
168
+ else:
169
+ self.dQ_reduce_ncol = 32
170
+ dQaccum_reduce_stage = self.tile_hdim // self.dQ_reduce_ncol
171
+ assert self.num_threads == 128 # TODO: currently hard-coded
172
+ self.s2r_tiled_copy_dQaccum = copy_utils.tiled_copy_1d(
173
+ Float32, self.num_threads, num_s2r_copy_elems
174
+ )
175
+ self.sdQaccum_layout = cute.make_layout(
176
+ (self.tile_m * self.tile_hdim // dQaccum_reduce_stage, dQaccum_reduce_stage)
177
+ )
178
+
179
+ num_copy_elems = 128 // self.dtype.width
180
+ threads_per_row = math.gcd(128, self.tile_hdim) // num_copy_elems
181
+ self.gmem_tiled_copy_dQ = copy_utils.tiled_copy_2d(
182
+ self.dtype, threads_per_row, self.num_threads, num_copy_elems
183
+ )
184
+ # ///////////////////////////////////////////////////////////////////////////////
185
+ # Shared memory layout: dQ
186
+ # ///////////////////////////////////////////////////////////////////////////////
187
+ # We can't just use kHeadDim here. E.g. if MMA shape is 64 x 96 but split across 2 WGs,
188
+ # then setting kBlockKSmem to 32 will cause "Static shape_div failure".
189
+ # We want to treat it as 64 x 48, so kBlockKSmem should be 16.
190
+ mma_shape_n = self.tiled_mma.get_tile_size(1)
191
+ if const_expr(self.arch == 80):
192
+ sdQ_layout_atom = sm80_utils.get_smem_layout_atom(self.dtype, mma_shape_n)
193
+ self.sdQ_layout = cute.tile_to_shape(
194
+ sdQ_layout_atom, (self.tile_m, self.tile_hdim), (0, 1)
195
+ )
196
+ elif const_expr(self.arch == 90):
197
+ self.sdQ_layout = sm90_utils.make_smem_layout(
198
+ self.dtype, LayoutEnum.ROW_MAJOR, (self.tile_m, self.tile_hdim)
199
+ )
200
+ else:
201
+ # TODO: this is hard-coded for hdim 128
202
+ self.sdQ_layout = sm100_utils_basic.make_smem_layout_epi(
203
+ self.dtype, LayoutEnum.ROW_MAJOR, (self.tile_m, self.tile_hdim), 1
204
+ )
205
+
206
+ @cute.jit
207
+ def __call__(
208
+ self,
209
+ mdQaccum: cute.Tensor,
210
+ mdQ: cute.Tensor,
211
+ scale: cutlass.Float32,
212
+ mCuSeqlensQ: Optional[cute.Tensor],
213
+ mSeqUsedQ: Optional[cute.Tensor],
214
+ stream: cuda.CUstream,
215
+ ):
216
+ # Get the data type and check if it is fp16 or bf16
217
+ if const_expr(mdQ.element_type not in [cutlass.Float16, cutlass.BFloat16]):
218
+ raise TypeError("Only Float16 or BFloat16 is supported")
219
+ if const_expr(mdQaccum is not None):
220
+ if const_expr(mdQaccum.element_type not in [cutlass.Float32]):
221
+ raise TypeError("dQaccum tensor must be Float32")
222
+
223
+ mdQaccum, mdQ = [assume_tensor_aligned(t) for t in (mdQaccum, mdQ)]
224
+
225
+ self.tiled_mma = self._get_tiled_mma()
226
+ self._setup_attributes()
227
+
228
+ smem_size = max(
229
+ cute.size_in_bytes(cutlass.Float32, self.sdQaccum_layout),
230
+ cute.size_in_bytes(self.dtype, self.sdQ_layout),
231
+ )
232
+
233
+ if const_expr(mCuSeqlensQ is not None):
234
+ TileScheduler = SingleTileVarlenScheduler
235
+ num_head = mdQ.shape[1]
236
+ num_batch = mCuSeqlensQ.shape[0] - 1
237
+ num_block = cute.ceil_div(mdQ.shape[0], self.tile_m)
238
+ else:
239
+ TileScheduler = SingleTileScheduler
240
+ num_head = mdQ.shape[2]
241
+ num_batch = mdQ.shape[0]
242
+ num_block = cute.ceil_div(mdQ.shape[1], self.tile_m)
243
+
244
+ tile_sched_args = TileSchedulerArguments(
245
+ num_block=num_block,
246
+ num_head=num_head,
247
+ num_batch=num_batch,
248
+ num_splits=1,
249
+ seqlen_k=0,
250
+ headdim=mdQ.shape[2],
251
+ headdim_v=0,
252
+ total_q=mdQ.shape[0],
253
+ tile_shape_mn=(self.tile_m, 1),
254
+ mCuSeqlensQ=mCuSeqlensQ,
255
+ mSeqUsedQ=mSeqUsedQ,
256
+ )
257
+
258
+ tile_sched_params = TileScheduler.to_underlying_arguments(tile_sched_args)
259
+ grid_dim = TileScheduler.get_grid_shape(tile_sched_params)
260
+
261
+ # grid_dim: (m_block, num_head, batch_size)
262
+ self.kernel(
263
+ mdQaccum,
264
+ mdQ,
265
+ mCuSeqlensQ,
266
+ mSeqUsedQ,
267
+ scale,
268
+ self.tiled_mma,
269
+ self.dQ_swapAB,
270
+ self.sdQaccum_layout,
271
+ self.sdQ_layout,
272
+ self.g2s_tiled_copy_dQaccum,
273
+ self.s2r_tiled_copy_dQaccum,
274
+ self.gmem_tiled_copy_dQ,
275
+ tile_sched_params,
276
+ TileScheduler,
277
+ ).launch(
278
+ grid=grid_dim,
279
+ block=[self.num_threads, 1, 1],
280
+ smem=smem_size,
281
+ stream=stream,
282
+ )
283
+
284
+ @cute.kernel
285
+ def kernel(
286
+ self,
287
+ mdQaccum: cute.Tensor,
288
+ mdQ: cute.Tensor,
289
+ mCuSeqlensQ: Optional[cute.Tensor],
290
+ mSeqUsedQ: Optional[cute.Tensor],
291
+ scale: cutlass.Float32,
292
+ tiled_mma: cute.TiledMma,
293
+ dQ_swapAB: cutlass.Constexpr,
294
+ sdQaccum_layout: cute.Layout,
295
+ sdQ_layout: cute.ComposedLayout,
296
+ g2s_tiled_copy_dQaccum: cute.TiledCopy,
297
+ s2r_tiled_copy_dQaccum: cute.TiledCopy,
298
+ gmem_tiled_copy_dQ: cute.TiledCopy,
299
+ tile_sched_params: ParamsBase,
300
+ TileScheduler: cutlass.Constexpr[Callable],
301
+ ):
302
+ # ///////////////////////////////////////////////////////////////////////////////
303
+ # Get shared memory buffer
304
+ # ///////////////////////////////////////////////////////////////////////////////
305
+ smem = cutlass.utils.SmemAllocator()
306
+ sdQaccum = smem.allocate_tensor(cutlass.Float32, sdQaccum_layout, byte_alignment=1024)
307
+ sdQaccum_flat = cute.make_tensor(sdQaccum.iterator, cute.make_layout(cute.size(sdQaccum)))
308
+ if const_expr(self.arch in [80, 90]):
309
+ sdQ = cute.make_tensor(cute.recast_ptr(sdQaccum.iterator, dtype=self.dtype), sdQ_layout)
310
+ else:
311
+ # extra stage dimension
312
+ sdQ = cute.make_tensor(
313
+ cute.recast_ptr(sdQaccum.iterator, sdQ_layout.inner, dtype=self.dtype),
314
+ sdQ_layout.outer,
315
+ )[None, None, 0]
316
+ sdQt = layout_utils.transpose_view(sdQ)
317
+
318
+ # Thread index, block index
319
+ tidx, _, _ = cute.arch.thread_idx()
320
+
321
+ tile_scheduler = TileScheduler.create(tile_sched_params)
322
+ work_tile = tile_scheduler.initial_work_tile_info()
323
+
324
+ m_block, head_idx, batch_idx, _ = work_tile.tile_idx
325
+
326
+ if work_tile.is_valid_tile:
327
+ # ///////////////////////////////////////////////////////////////////////////////
328
+ # Get the appropriate tiles for this thread block.
329
+ # ///////////////////////////////////////////////////////////////////////////////
330
+
331
+ seqlen = SeqlenInfoQK.create(
332
+ batch_idx,
333
+ mdQ.shape[1],
334
+ 0,
335
+ mCuSeqlensQ=mCuSeqlensQ,
336
+ mCuSeqlensK=None,
337
+ mSeqUsedQ=mSeqUsedQ,
338
+ mSeqUsedK=None,
339
+ tile_m=self.tile_m * self.cluster_size,
340
+ )
341
+ if const_expr(not seqlen.has_cu_seqlens_q):
342
+ mdQ_cur = mdQ[batch_idx, None, head_idx, None]
343
+ mdQaccum_cur = mdQaccum[batch_idx, head_idx, None]
344
+ head_dim = mdQ.shape[3]
345
+ else:
346
+ if cutlass.const_expr(self.arch >= 90):
347
+ padded_offset_q = seqlen.padded_offset_q
348
+ else:
349
+ padded_offset_q = seqlen.offset_q + batch_idx * self.tile_m
350
+ mdQ_cur = cute.domain_offset((seqlen.offset_q, 0), mdQ[None, head_idx, None])
351
+ mdQaccum_cur = cute.domain_offset(
352
+ (padded_offset_q * self.tile_hdim,), mdQaccum[head_idx, None]
353
+ )
354
+ head_dim = mdQ.shape[2]
355
+
356
+ # HACK: Compiler doesn't seem to recognize that padding
357
+ # by padded_offset_q * self.tile_hdim keeps alignment
358
+ # since statically divisible by 4
359
+
360
+ mdQaccum_cur_ptr = cute.make_ptr(
361
+ dtype=mdQaccum_cur.element_type,
362
+ value=mdQaccum_cur.iterator.toint(),
363
+ mem_space=mdQaccum_cur.iterator.memspace,
364
+ assumed_align=mdQaccum.iterator.alignment,
365
+ )
366
+ mdQaccum_cur = cute.make_tensor(mdQaccum_cur_ptr, mdQaccum_cur.layout)
367
+
368
+ gdQaccum = cute.local_tile(mdQaccum_cur, (self.tile_m * self.tile_hdim,), (m_block,))
369
+ gdQ = cute.local_tile(mdQ_cur, (self.tile_m, self.tile_hdim), (m_block, 0))
370
+
371
+ seqlen_q = seqlen.seqlen_q
372
+ seqlen_q_rounded = cute.round_up(seqlen_q, self.tile_m)
373
+
374
+ if const_expr(self.arch == 100 and self.use_2cta_instrs):
375
+ # 2-CTA: remap dQaccum layout into TMEM view before writing sdQ
376
+ num_reduce_threads = self.num_threads
377
+ thr_mma_dsk = tiled_mma.get_slice(tidx)
378
+ dQacc_shape = thr_mma_dsk.partition_shape_C((self.tile_m, self.tile_hdim))
379
+ tdQtdQ = thr_mma_dsk.make_fragment_C(dQacc_shape)
380
+ tdQtdQ = cute.make_tensor(tdQtdQ.iterator, tdQtdQ.layout)
381
+
382
+ tmem_load_atom = cute.make_copy_atom(
383
+ tcgen05.copy.Ld32x32bOp(tcgen05.copy.Repetition(self.dQ_reduce_ncol)), Float32
384
+ )
385
+ tiled_tmem_ld = tcgen05.make_tmem_copy(tmem_load_atom, tdQtdQ)
386
+ thr_tmem_ld = tiled_tmem_ld.get_slice(tidx)
387
+
388
+ cdQ = cute.make_identity_tensor((self.tile_m, self.tile_hdim))
389
+ tdQcdQ = thr_mma_dsk.partition_C(cdQ)
390
+ tdQcdQ_tensor = cute.make_tensor(tdQcdQ.iterator, tdQcdQ.layout)
391
+ tdQrdQ = thr_tmem_ld.partition_D(tdQcdQ_tensor)
392
+
393
+ tiled_copy_accum = s2r_tiled_copy_dQaccum
394
+ g2s_thr_copy = tiled_copy_accum.get_slice(tidx)
395
+
396
+ # S -> R
397
+ tdQrdQ_fp32 = cute.make_fragment(tdQrdQ.shape, cutlass.Float32)
398
+ tdQrdQ_s2r = cute.make_tensor(tdQrdQ_fp32.iterator, tdQrdQ_fp32.shape)
399
+
400
+ smem_copy_atom = sm100_utils_basic.get_smem_store_op(
401
+ LayoutEnum.ROW_MAJOR, self.dtype, cutlass.Float32, tiled_tmem_ld
402
+ )
403
+ r2s_tiled_copy = cute.make_tiled_copy(
404
+ smem_copy_atom,
405
+ layout_tv=tiled_tmem_ld.layout_dst_tv_tiled,
406
+ tiler_mn=tiled_tmem_ld.tiler_mn,
407
+ )
408
+ tdQsdQ_r2s = thr_tmem_ld.partition_D(thr_mma_dsk.partition_C(sdQ))
409
+ tdQrdQ_r2s = cute.make_fragment(tdQsdQ_r2s.shape, self.dtype)
410
+
411
+ num_stages = cute.size(tdQrdQ_fp32, mode=[1])
412
+ stage_stride = self.dQ_reduce_ncol
413
+ row_groups = 2
414
+ assert num_stages % row_groups == 0
415
+ assert num_reduce_threads % row_groups == 0
416
+ stage_groups = num_stages // row_groups
417
+ threads_per_row_group = num_reduce_threads // row_groups
418
+ stage_loads = tuple((row_group, row_group) for row_group in range(row_groups))
419
+ stage_iters = tuple(
420
+ (row_group, row_group * threads_per_row_group)
421
+ for row_group in range(row_groups)
422
+ )
423
+ s2r_lane = tidx % threads_per_row_group
424
+ s2r_buf = tidx // threads_per_row_group
425
+
426
+ gdQaccum_layout_g2s = cute.make_layout(
427
+ shape=(self.tile_m * self.dQ_reduce_ncol, 1), stride=(1, 0)
428
+ )
429
+ sdQaccum_g2s = g2s_thr_copy.partition_D(sdQaccum)
430
+
431
+ # G -> S
432
+ for stage_group in cutlass.range_constexpr(stage_groups):
433
+ for stage_offset, smem_buf in stage_loads:
434
+ stage_idx = stage_group + stage_offset * stage_groups
435
+ gdQaccum_stage = cute.local_tile(
436
+ gdQaccum,
437
+ (self.tile_m * self.dQ_reduce_ncol,),
438
+ (stage_idx,),
439
+ )
440
+ gdQaccum_stage_g2s = cute.make_tensor(
441
+ gdQaccum_stage.iterator,
442
+ gdQaccum_layout_g2s,
443
+ )
444
+ tdQgdQ = g2s_thr_copy.partition_S(gdQaccum_stage_g2s)
445
+ cute.copy(
446
+ g2s_thr_copy,
447
+ tdQgdQ[None, None, 0],
448
+ sdQaccum_g2s[None, None, smem_buf],
449
+ )
450
+
451
+ cute.arch.fence_view_async_shared()
452
+ cute.arch.barrier(barrier_id=6, number_of_threads=num_reduce_threads)
453
+
454
+ # S -> R
455
+ for stage_offset, lane_offset in stage_iters:
456
+ stage_idx = stage_group + stage_offset * stage_groups
457
+ s2r_src_tidx = s2r_lane + lane_offset
458
+ s2r_thr_copy = tiled_copy_accum.get_slice(s2r_src_tidx)
459
+ sdQaccum_src = s2r_thr_copy.partition_S(sdQaccum)[None, None, s2r_buf]
460
+
461
+ tdQrdQ_s2r_cpy = tdQrdQ_s2r[None, stage_idx, None, None]
462
+ tdQrdQ_r2s_cpy = cute.make_tensor(
463
+ tdQrdQ_s2r_cpy.iterator, cute.make_layout(sdQaccum_src.shape)
464
+ )
465
+ cute.copy(s2r_thr_copy, sdQaccum_src, tdQrdQ_r2s_cpy)
466
+ cute.arch.fence_view_async_shared()
467
+ cute.arch.barrier(barrier_id=7, number_of_threads=num_reduce_threads)
468
+
469
+ # R -> S
470
+ stage_lo = stage_idx % stage_stride
471
+ stage_hi = stage_idx // stage_stride
472
+ tdQrdQ_r2s_cpy = cute.make_tensor(
473
+ cute.recast_ptr(tdQrdQ_r2s_cpy.iterator),
474
+ tdQrdQ_r2s[((None, 0), (stage_lo, stage_hi), 0, 0)].shape,
475
+ )
476
+ dQ_vec = tdQrdQ_r2s_cpy.load() * scale
477
+ tdQrdQ_r2s[((None, 0), (stage_lo, stage_hi), 0, 0)].store(
478
+ dQ_vec.to(self.dtype)
479
+ )
480
+
481
+ # R -> S
482
+ cute.copy(
483
+ r2s_tiled_copy,
484
+ tdQrdQ_r2s[None, None, None, 0],
485
+ tdQsdQ_r2s[None, None, None, 0],
486
+ )
487
+ cute.arch.fence_view_async_shared()
488
+ cute.arch.barrier(barrier_id=8, number_of_threads=num_reduce_threads)
489
+ else:
490
+ # Step 1: load dQaccum from gmem to smem
491
+ g2s_thr_copy_dQaccum = g2s_tiled_copy_dQaccum.get_slice(tidx)
492
+ tdQgdQaccum = g2s_thr_copy_dQaccum.partition_S(gdQaccum)
493
+ tdQsdQaccumg2s = g2s_thr_copy_dQaccum.partition_D(sdQaccum_flat)
494
+ cute.copy(g2s_tiled_copy_dQaccum, tdQgdQaccum, tdQsdQaccumg2s)
495
+ cute.arch.cp_async_commit_group()
496
+ cute.arch.cp_async_wait_group(0)
497
+ cute.arch.barrier()
498
+
499
+ # Step 2: load dQ from smem to rmem
500
+ s2r_thr_copy_dQaccum = s2r_tiled_copy_dQaccum.get_slice(tidx)
501
+ tdQsdQaccum = s2r_thr_copy_dQaccum.partition_S(sdQaccum)
502
+ tile_shape = (self.tile_m, self.tile_hdim)
503
+ acc = None
504
+ tiled_copy_t2r = None
505
+ if const_expr(self.arch in [80, 90]):
506
+ acc_shape = tiled_mma.partition_shape_C(
507
+ tile_shape if const_expr(not dQ_swapAB) else tile_shape[::-1]
508
+ )
509
+ acc = cute.make_fragment(acc_shape, cutlass.Float32)
510
+ assert cute.size(acc) == cute.size(tdQsdQaccum)
511
+ else:
512
+ thr_mma = tiled_mma.get_slice(0) # 1-CTA
513
+ dQacc_shape = tiled_mma.partition_shape_C((self.tile_m, self.tile_hdim))
514
+ tdQtdQ = tiled_mma.make_fragment_C(dQacc_shape)
515
+ tdQcdQ = thr_mma.partition_C(
516
+ cute.make_identity_tensor((self.tile_m, self.tile_hdim))
517
+ )
518
+ tmem_load_atom = cute.make_copy_atom(
519
+ tcgen05.copy.Ld32x32bOp(tcgen05.copy.Repetition(self.dQ_reduce_ncol)),
520
+ Float32,
521
+ )
522
+ tiled_copy_t2r = tcgen05.make_tmem_copy(tmem_load_atom, tdQtdQ)
523
+ thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
524
+ tdQrdQ_t2r_shape = thr_copy_t2r.partition_D(tdQcdQ).shape
525
+ acc = cute.make_fragment(tdQrdQ_t2r_shape, Float32)
526
+ tdQrdQaccum = cute.make_tensor(acc.iterator, cute.make_layout(tdQsdQaccum.shape))
527
+ cute.autovec_copy(tdQsdQaccum, tdQrdQaccum)
528
+ # Convert tdQrdQaccum from fp32 to fp16/bf16
529
+ rdQ = cute.make_fragment_like(acc, self.dtype)
530
+ rdQ.store((acc.load() * scale).to(self.dtype))
531
+
532
+ # Step 3: Copy dQ from register to smem
533
+ cute.arch.barrier() # make sure all threads have finished loading dQaccum
534
+ if const_expr(self.arch in [80, 90]):
535
+ copy_atom_r2s_dQ = utils.get_smem_store_atom(
536
+ self.arch, self.dtype, transpose=self.dQ_swapAB
537
+ )
538
+ tiled_copy_r2s_dQ = cute.make_tiled_copy_C(copy_atom_r2s_dQ, tiled_mma)
539
+ else:
540
+ # copy_atom_r2s_dQ = sm100_utils_basic.get_smem_store_op(
541
+ # LayoutEnum.ROW_MAJOR, self.dtype, Float32, tiled_copy_t2r,
542
+ # )
543
+ # tiled_copy_r2s_dQ = cute.make_tiled_copy_D(copy_atom_r2s_dQ, tiled_copy_t2r)
544
+ thr_layout_r2s_dQ = cute.make_layout((self.num_threads, 1)) # 128 threads
545
+ val_layout_r2s_dQ = cute.make_layout((1, 128 // self.dtype.width))
546
+ copy_atom_r2s_dQ = cute.make_copy_atom(
547
+ cute.nvgpu.CopyUniversalOp(),
548
+ self.dtype,
549
+ num_bits_per_copy=128,
550
+ )
551
+ tiled_copy_r2s_dQ = cute.make_tiled_copy_tv(
552
+ copy_atom_r2s_dQ, thr_layout_r2s_dQ, val_layout_r2s_dQ
553
+ )
554
+ thr_copy_r2s_dQ = tiled_copy_r2s_dQ.get_slice(tidx)
555
+ cdQ = cute.make_identity_tensor((self.tile_m, self.tile_hdim))
556
+ if const_expr(self.arch in [80, 90]):
557
+ taccdQrdQ = thr_copy_r2s_dQ.retile(rdQ)
558
+ else:
559
+ taccdQcdQ_shape = thr_copy_r2s_dQ.partition_S(cdQ).shape
560
+ taccdQrdQ = cute.make_tensor(rdQ.iterator, taccdQcdQ_shape)
561
+ taccdQsdQ = thr_copy_r2s_dQ.partition_D(
562
+ sdQ if const_expr(not self.dQ_swapAB) else sdQt
563
+ )
564
+ cute.copy(thr_copy_r2s_dQ, taccdQrdQ, taccdQsdQ)
565
+
566
+ # Step 4: Copy dQ from smem to register to prepare for coalesced write to gmem
567
+ cute.arch.barrier() # make sure all smem stores are done
568
+ gmem_thr_copy_dQ = gmem_tiled_copy_dQ.get_slice(tidx)
569
+ tdQgdQ = gmem_thr_copy_dQ.partition_S(gdQ)
570
+ tdQsdQ = gmem_thr_copy_dQ.partition_D(sdQ)
571
+ tdQrdQ = cute.make_fragment_like(tdQsdQ, self.dtype)
572
+ # TODO: check OOB when reading from smem if kBlockM isn't evenly tiled
573
+ cute.autovec_copy(tdQsdQ, tdQrdQ)
574
+
575
+ # Step 5: Copy dQ from register to gmem
576
+ tdQcdQ = gmem_thr_copy_dQ.partition_S(cdQ)
577
+ tdQpdQ = utils.predicate_k(tdQcdQ, limit=head_dim)
578
+ for rest_m in cutlass.range(cute.size(tdQrdQ.shape[1]), unroll_full=True):
579
+ if tdQcdQ[0, rest_m, 0][0] < seqlen_q - m_block * self.tile_m:
580
+ cute.copy(
581
+ gmem_tiled_copy_dQ,
582
+ tdQrdQ[None, rest_m, None],
583
+ tdQgdQ[None, rest_m, None],
584
+ pred=tdQpdQ[None, rest_m, None],
585
+ )
build/torch-cuda/flash_bwd_preprocess.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
2
+ # A reimplementation of https://github.com/Dao-AILab/flash-attention/blob/main/hopper/flash_bwd_preprocess_kernel.h
3
+ # from Cutlass C++ to Cute-DSL.
4
+ import math
5
+ import operator
6
+ from typing import Callable, Type, Optional, Literal
7
+
8
+ import cuda.bindings.driver as cuda
9
+
10
+ import cutlass
11
+ import cutlass.cute as cute
12
+ from cutlass import Float32
13
+
14
+ from .quack import copy_utils
15
+
16
+ from . import utils
17
+ from .cute_dsl_utils import assume_tensor_aligned
18
+ from .seqlen_info import SeqlenInfoQK
19
+ from .quack.cute_dsl_utils import ParamsBase
20
+ from .tile_scheduler import (
21
+ SingleTileScheduler,
22
+ SingleTileVarlenScheduler,
23
+ TileSchedulerArguments,
24
+ )
25
+
26
+
27
+ class FlashAttentionBackwardPreprocess:
28
+ def __init__(
29
+ self,
30
+ dtype: Type[cutlass.Numeric],
31
+ head_dim: int,
32
+ head_dim_v: int,
33
+ arch: Literal[80, 90, 100],
34
+ m_block_size: int = 128,
35
+ num_threads: int = 128,
36
+ ):
37
+ """
38
+ All contiguous dimensions must be at least 16 bytes aligned which indicates the head dimension
39
+ should be a multiple of 8.
40
+
41
+ :param head_dim: head dimension
42
+ :type head_dim: int
43
+ :param m_block_size: m block size
44
+ :type m_block_size: int
45
+ :param num_threads: number of threads
46
+ :type num_threads: int
47
+ """
48
+ self.dtype = dtype
49
+ self.m_block_size = m_block_size
50
+ self.arch = arch
51
+ # padding head_dim to a multiple of 32 as k_block_size
52
+ hdim_multiple_of = 32
53
+ self.head_dim_padded = int(math.ceil(head_dim / hdim_multiple_of) * hdim_multiple_of)
54
+ self.head_dim_v_padded = int(math.ceil(head_dim_v / hdim_multiple_of) * hdim_multiple_of)
55
+ self.check_hdim_v_oob = head_dim_v != self.head_dim_v_padded
56
+ self.num_threads = num_threads
57
+
58
+ @staticmethod
59
+ def can_implement(dtype, head_dim, m_block_size, num_threads) -> bool:
60
+ """Check if the kernel can be implemented with the given parameters.
61
+
62
+ :param dtype: data type
63
+ :type dtype: cutlass.Numeric
64
+ :param head_dim: head dimension
65
+ :type head_dim: int
66
+ :param m_block_size: m block size
67
+ :type m_block_size: int
68
+ :param num_threads: number of threads
69
+ :type num_threads: int
70
+
71
+ :return: True if the kernel can be implemented, False otherwise
72
+ :rtype: bool
73
+ """
74
+ if dtype not in [cutlass.Float16, cutlass.BFloat16]:
75
+ return False
76
+ if head_dim % 8 != 0:
77
+ return False
78
+ if num_threads % 32 != 0:
79
+ return False
80
+ if num_threads < m_block_size: # For multiplying lse with log2
81
+ return False
82
+ return True
83
+
84
+ def _setup_attributes(self):
85
+ # ///////////////////////////////////////////////////////////////////////////////
86
+ # GMEM Tiled copy:
87
+ # ///////////////////////////////////////////////////////////////////////////////
88
+ # Thread layouts for copies
89
+ # We want kBlockKGmem to be a power of 2 so that when we do the summing,
90
+ # it's just between threads in the same warp
91
+ gmem_k_block_size = (
92
+ 128
93
+ if self.head_dim_v_padded % 128 == 0
94
+ else (
95
+ 64
96
+ if self.head_dim_v_padded % 64 == 0
97
+ else (32 if self.head_dim_v_padded % 32 == 0 else 16)
98
+ )
99
+ )
100
+ num_copy_elems = 128 // self.dtype.width
101
+ threads_per_row = gmem_k_block_size // num_copy_elems
102
+ self.gmem_tiled_copy_O = copy_utils.tiled_copy_2d(
103
+ self.dtype, threads_per_row, self.num_threads, num_copy_elems
104
+ )
105
+ universal_copy_bits = 128
106
+ num_copy_elems_dQaccum = universal_copy_bits // Float32.width
107
+ assert (
108
+ self.m_block_size * self.head_dim_padded // num_copy_elems_dQaccum
109
+ ) % self.num_threads == 0
110
+ self.gmem_tiled_copy_dQaccum = copy_utils.tiled_copy_1d(
111
+ Float32, self.num_threads, num_copy_elems_dQaccum
112
+ )
113
+
114
+ @cute.jit
115
+ def __call__(
116
+ self,
117
+ mO: cute.Tensor,
118
+ mdO: cute.Tensor,
119
+ mdPsum: cute.Tensor,
120
+ mLSE: Optional[cute.Tensor],
121
+ mLSElog2: Optional[cute.Tensor],
122
+ mdQaccum: Optional[cute.Tensor],
123
+ mCuSeqlensQ: Optional[cute.Tensor],
124
+ mSeqUsedQ: Optional[cute.Tensor],
125
+ stream: cuda.CUstream,
126
+ ):
127
+ # Get the data type and check if it is fp16 or bf16
128
+ if cutlass.const_expr(not (mO.element_type == mdO.element_type)):
129
+ raise TypeError("All tensors must have the same data type")
130
+ if cutlass.const_expr(mO.element_type not in [cutlass.Float16, cutlass.BFloat16]):
131
+ raise TypeError("Only Float16 or BFloat16 is supported")
132
+ if cutlass.const_expr(mdPsum.element_type not in [Float32]):
133
+ raise TypeError("dPsum tensor must be Float32")
134
+ if cutlass.const_expr(mdQaccum is not None):
135
+ if cutlass.const_expr(mdQaccum.element_type not in [Float32]):
136
+ raise TypeError("dQaccum tensor must be Float32")
137
+ if cutlass.const_expr(mLSE is not None):
138
+ assert mLSElog2 is not None, "If mLSE is provided, mLSElog2 must also be provided"
139
+ if cutlass.const_expr(mLSE.element_type not in [Float32]):
140
+ raise TypeError("LSE tensor must be Float32")
141
+ if cutlass.const_expr(mLSElog2.element_type not in [Float32]):
142
+ raise TypeError("LSElog2 tensor must be Float32")
143
+
144
+ mO, mdO, mdQaccum = [assume_tensor_aligned(t) for t in (mO, mdO, mdQaccum)]
145
+
146
+ self._setup_attributes()
147
+
148
+ if cutlass.const_expr(mCuSeqlensQ is not None):
149
+ TileScheduler = SingleTileVarlenScheduler
150
+ num_head = mO.shape[1]
151
+ num_batch = mCuSeqlensQ.shape[0] - 1
152
+ else:
153
+ TileScheduler = SingleTileScheduler
154
+ num_head = mO.shape[2]
155
+ num_batch = mO.shape[0]
156
+
157
+ tile_sched_args = TileSchedulerArguments(
158
+ num_block=cute.ceil_div(mO.shape[1], self.m_block_size),
159
+ num_head=num_head,
160
+ num_batch=num_batch,
161
+ num_splits=1,
162
+ seqlen_k=0,
163
+ headdim=0,
164
+ headdim_v=mO.shape[2],
165
+ total_q=mO.shape[0],
166
+ tile_shape_mn=(self.m_block_size, 1),
167
+ mCuSeqlensQ=mCuSeqlensQ,
168
+ mSeqUsedQ=mSeqUsedQ,
169
+ )
170
+
171
+ tile_sched_params = TileScheduler.to_underlying_arguments(tile_sched_args)
172
+ grid_dim = TileScheduler.get_grid_shape(tile_sched_params)
173
+
174
+ self.kernel(
175
+ mO,
176
+ mdO,
177
+ mdPsum,
178
+ mLSE,
179
+ mLSElog2,
180
+ mdQaccum,
181
+ mCuSeqlensQ,
182
+ mSeqUsedQ,
183
+ self.gmem_tiled_copy_O,
184
+ self.gmem_tiled_copy_dQaccum,
185
+ tile_sched_params,
186
+ TileScheduler,
187
+ ).launch(
188
+ grid=grid_dim,
189
+ block=[self.num_threads, 1, 1],
190
+ stream=stream,
191
+ )
192
+
193
    @cute.kernel
    def kernel(
        self,
        mO: cute.Tensor,
        mdO: cute.Tensor,
        mdPsum: cute.Tensor,
        mLSE: Optional[cute.Tensor],
        mLSElog2: Optional[cute.Tensor],
        mdQaccum: Optional[cute.Tensor],
        mCuSeqlensQ: Optional[cute.Tensor],
        mSeqUsedQ: Optional[cute.Tensor],
        gmem_tiled_copy_O: cute.TiledCopy,
        gmem_tiled_copy_dQaccum: cute.TiledCopy,
        tile_sched_params: ParamsBase,
        TileScheduler: cutlass.Constexpr[Callable],
    ):
        """Backward-pass preprocess kernel.

        For each (m_block, head, batch) tile this kernel:
          * computes dPsum = rowsum(O * dO) and writes it to ``mdPsum``;
          * zero-initializes the ``mdQaccum`` accumulator (if provided);
          * rescales LSE by log2(e) into ``mLSElog2`` (if ``mLSE`` provided).

        ``mO``/``mdO`` are indexed as 4-D (batch, seq, head, hdim) in the
        fixed-length case and 3-D (total_seq, head, hdim) in the varlen
        (cu_seqlens) case — see the two indexing branches below.
        """
        # Thread index, block index
        tidx, _, _ = cute.arch.thread_idx()

        tile_scheduler = TileScheduler.create(tile_sched_params)
        work_tile = tile_scheduler.initial_work_tile_info()
        m_block, head_idx, batch_idx, _ = work_tile.tile_idx

        if work_tile.is_valid_tile:
            # ///////////////////////////////////////////////////////////////////////////////
            # Get the appropriate tiles for this thread block.
            # ///////////////////////////////////////////////////////////////////////////////
            seqlen = SeqlenInfoQK.create(
                batch_idx,
                mO.shape[1],
                0,
                mCuSeqlensQ=mCuSeqlensQ,
                mCuSeqlensK=None,
                mSeqUsedQ=mSeqUsedQ,
                mSeqUsedK=None,
            )

            if cutlass.const_expr(not seqlen.has_cu_seqlens_q):
                # Fixed-length batch: slice out this (batch, head) plane.
                mO_cur = mO[batch_idx, None, head_idx, None]
                mdO_cur = mdO[batch_idx, None, head_idx, None]
                mdPsum_cur = mdPsum[batch_idx, head_idx, None]
                headdim_v = mO.shape[3]
            else:
                # Varlen: tensors are packed along the sequence dim; offset by
                # this batch's starting position from cu_seqlens.
                mO_cur = cute.domain_offset((seqlen.offset_q, 0), mO[None, head_idx, None])
                mdO_cur = cute.domain_offset((seqlen.offset_q, 0), mdO[None, head_idx, None])

                # dPsum/LSElog2/dQaccum are stored with per-batch padding to a
                # multiple of m_block_size; round the packed offset accordingly.
                padded_offset_q = seqlen.offset_q + batch_idx * self.m_block_size
                if cutlass.const_expr(self.arch >= 90):
                    padded_offset_q = padded_offset_q // self.m_block_size * self.m_block_size
                mdPsum_cur = cute.domain_offset((padded_offset_q,), mdPsum[head_idx, None])
                headdim_v = mO.shape[2]

            blkOdO_shape = (self.m_block_size, self.head_dim_v_padded)
            # (m_block_size, head_dim_v)
            gO = cute.local_tile(mO_cur, blkOdO_shape, (m_block, 0))
            gdO = cute.local_tile(mdO_cur, blkOdO_shape, (m_block, 0))

            gmem_thr_copy_O = gmem_tiled_copy_O.get_slice(tidx)
            # (CPY_Atom, CPY_M, CPY_K)
            tOgO = gmem_thr_copy_O.partition_S(gO)
            tOgdO = gmem_thr_copy_O.partition_S(gdO)

            # ///////////////////////////////////////////////////////////////////////////////
            # Predicate: Mark indices that need to copy when problem_shape isn't a multiple
            # of tile_shape
            # ///////////////////////////////////////////////////////////////////////////////
            # Construct identity layout for KV
            cO = cute.make_identity_tensor((self.m_block_size, self.head_dim_v_padded))
            tOcO = gmem_thr_copy_O.partition_S(cO)
            t0OcO = gmem_thr_copy_O.get_slice(0).partition_S(cO)
            tOpO = utils.predicate_k(tOcO, limit=headdim_v)
            tOpdO = utils.predicate_k(tOcO, limit=headdim_v)

            seqlen_q = seqlen.seqlen_q
            seqlen_q_rounded = cute.round_up(seqlen_q, self.m_block_size)

            if cutlass.const_expr(mLSE is not None):
                if cutlass.const_expr(not seqlen.has_cu_seqlens_q):
                    mLSE_cur = mLSE[batch_idx, head_idx, None]
                else:
                    mLSE_cur = cute.domain_offset((seqlen.offset_q,), mLSE[head_idx, None])

                gLSE = cute.local_tile(mLSE_cur, (self.m_block_size,), (m_block,))
                # Threads mapping past the end of the sequence keep +inf.
                # NOTE(review): the sentinel here is +inf but the write below
                # zeroes on lse != -Float32.inf — confirm which infinity marks
                # an invalid/padded row.
                lse = Float32.inf
                if tidx < seqlen_q - m_block * self.m_block_size:
                    lse = gLSE[tidx]

            tOrO = cute.make_fragment_like(tOgO)
            tOrdO = cute.make_fragment_like(tOgdO)
            assert cute.size(tOgO, mode=[0]) == cute.size(tOgdO, mode=[0])
            assert cute.size(tOgO, mode=[1]) == cute.size(tOgdO, mode=[1])
            assert cute.size(tOgO, mode=[2]) == cute.size(tOgdO, mode=[2])
            for m in cutlass.range(cute.size(tOrO.shape[1]), unroll_full=True):
                # Instead of using tOcO, we using t0OcO and subtract the offset from the limit
                # (seqlen_q - m_block * kBlockM). This is because the entries of t0OcO are known at compile time.
                if t0OcO[0, m, 0][0] < seqlen_q - m_block * self.m_block_size - tOcO[0][0]:
                    cute.copy(
                        gmem_thr_copy_O,
                        tOgO[None, m, None],
                        tOrO[None, m, None],
                        pred=tOpO[None, m, None]
                        if cutlass.const_expr(self.check_hdim_v_oob)
                        else None,
                    )
                    cute.copy(
                        gmem_thr_copy_O,
                        tOgdO[None, m, None],
                        tOrdO[None, m, None],
                        pred=tOpdO[None, m, None]
                        if cutlass.const_expr(self.check_hdim_v_oob)
                        else None,
                    )
            # Sum across the "k" dimension
            dpsum = (tOrO.load().to(Float32) * tOrdO.load().to(Float32)).reduce(
                cute.ReductionOp.ADD, init_val=0.0, reduction_profile=(0, None, 1)
            )
            # Finish the row reduction across the threads that share a row.
            threads_per_row = gmem_tiled_copy_O.layout_src_tv_tiled[0].shape[0]
            assert cute.arch.WARP_SIZE % threads_per_row == 0
            dpsum = utils.warp_reduce(dpsum, operator.add, width=threads_per_row)
            dP_sum = cute.make_fragment(cute.size(tOrO, mode=[1]), Float32)
            dP_sum.store(dpsum)

            # Write dPsum from rmem -> gmem
            gdPsum = cute.local_tile(mdPsum_cur, (self.m_block_size,), (m_block,))
            # Only the thread corresponding to column 0 writes out the dPsum to gmem
            if tOcO[0, 0, 0][1] == 0:
                for m in cutlass.range(cute.size(dP_sum), unroll_full=True):
                    row = tOcO[0, m, 0][0]
                    gdPsum[row] = dP_sum[m] if row < seqlen_q - m_block * self.m_block_size else 0.0

            # Clear dQaccum
            if cutlass.const_expr(mdQaccum is not None):
                if cutlass.const_expr(not seqlen.has_cu_seqlens_q):
                    mdQaccum_cur = mdQaccum[batch_idx, head_idx, None]
                else:
                    mdQaccum_cur = cute.domain_offset(
                        (padded_offset_q * self.head_dim_padded,), mdQaccum[head_idx, None]
                    )

                    # HACK: Compiler doesn't seem to recognize that padding
                    # by padded_offset_q * self.head_dim_padded keeps alignment
                    # since statically divisible by 4

                    mdQaccum_cur_ptr = cute.make_ptr(
                        dtype=mdQaccum_cur.element_type,
                        value=mdQaccum_cur.iterator.toint(),
                        mem_space=mdQaccum_cur.iterator.memspace,
                        assumed_align=mdQaccum.iterator.alignment,
                    )
                    mdQaccum_cur = cute.make_tensor(mdQaccum_cur_ptr, mdQaccum_cur.layout)

                blkdQaccum_shape = (self.m_block_size * self.head_dim_padded,)
                gdQaccum = cute.local_tile(mdQaccum_cur, blkdQaccum_shape, (m_block,))
                gmem_thr_copy_dQaccum = gmem_tiled_copy_dQaccum.get_slice(tidx)
                tdQgdQaccum = gmem_thr_copy_dQaccum.partition_S(gdQaccum)
                # Zero-fill the accumulator tile so the backward pass can
                # atomically/additively accumulate into it.
                zero = cute.make_fragment_like(tdQgdQaccum)
                zero.fill(0.0)
                cute.copy(gmem_tiled_copy_dQaccum, zero, tdQgdQaccum)

            if cutlass.const_expr(mLSE is not None):
                if cutlass.const_expr(not seqlen.has_cu_seqlens_q):
                    mLSElog2_cur = mLSElog2[batch_idx, head_idx, None]
                else:
                    mLSElog2_cur = cute.domain_offset((padded_offset_q,), mLSElog2[head_idx, None])

                gLSElog2 = cute.local_tile(mLSElog2_cur, (self.m_block_size,), (m_block,))
                # Pre-scale LSE by log2(e) so the backward kernel can use exp2.
                LOG2_E = math.log2(math.e)
                if tidx < seqlen_q_rounded - m_block * self.m_block_size:
                    gLSElog2[tidx] = lse * LOG2_E if lse != -Float32.inf else 0.0
build/torch-cuda/flash_bwd_sm100.py ADDED
The diff for this file is too large to render. See raw diff
 
build/torch-cuda/flash_bwd_sm90.py ADDED
@@ -0,0 +1,1591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Callable, Optional, Type
3
+ from functools import partial
4
+
5
+ import cuda.bindings.driver as cuda
6
+
7
+ import cutlass
8
+ import cutlass.cute as cute
9
+ import cutlass.utils.hopper_helpers as sm90_utils_basic
10
+ from cutlass.cute.nvgpu import cpasync, warpgroup
11
+ from cutlass.cute import FastDivmodDivisor
12
+ from cutlass import Float32, Int32, Boolean, const_expr
13
+ from cutlass.utils import LayoutEnum
14
+
15
+ from .quack import copy_utils
16
+ from .quack import layout_utils
17
+ from .quack import sm90_utils
18
+ from .quack.sm90_utils import gemm_zero_init, gemm_w_idx
19
+
20
+ from .cute_dsl_utils import assume_tensor_aligned
21
+ from . import utils
22
+ from .mask import AttentionMask
23
+ from .seqlen_info import SeqlenInfoQK
24
+ from .block_info import BlockInfo
25
+ from . import pipeline
26
+ from .quack.cute_dsl_utils import ParamsBase
27
+ from .tile_scheduler import TileSchedulerArguments, SingleTileScheduler
28
+ from .named_barrier import NamedBarrierBwd
29
+ from .softmax import apply_score_mod_inner, apply_score_mod_bwd_inner
30
+ from .block_sparsity import BlockSparseTensors
31
+ from .block_sparse_utils import (
32
+ get_total_q_block_count_bwd,
33
+ produce_block_sparse_q_loads_bwd_sm90,
34
+ consume_block_sparse_mma_bwd_sm90,
35
+ dQaccum_store_block_sparse_bwd_sm90,
36
+ )
37
+
38
+
39
+ class FlashAttentionBackwardSm90:
40
+ arch = 90
41
+
42
    def __init__(
        self,
        dtype: Type[cutlass.Numeric],
        head_dim: int,
        head_dim_v: Optional[int] = None,
        qhead_per_kvhead: int = 1,
        is_causal: bool = False,
        tile_m: int = 64,
        tile_n: int = 128,
        Q_stage: int = 2,
        dO_stage: int = 2,
        PdS_stage: int = 2,
        SdP_swapAB: bool = False,
        dKV_swapAB: bool = False,
        dQ_swapAB: bool = False,
        AtomLayoutMSdP: int = 1,
        AtomLayoutNdKV: int = 2,
        AtomLayoutMdQ: int = 1,
        num_threads: int = 384,
        V_in_regs: bool = False,
        score_mod: cutlass.Constexpr | None = None,
        score_mod_bwd: cutlass.Constexpr | None = None,
        mask_mod: cutlass.Constexpr | None = None,
        has_aux_tensors: cutlass.Constexpr = False,
        subtile_factor: cutlass.Constexpr[int] = 1,
    ):
        """Configure the Sm90 (Hopper) FlashAttention backward kernel.

        Args:
            dtype: Element type of Q/K/V/dO (Float16 or BFloat16).
            head_dim: QK head dimension; padded up to a multiple of 16.
            head_dim_v: V head dimension; defaults to ``head_dim``.
            qhead_per_kvhead: GQA ratio; 1 means standard MHA.
            is_causal: Apply a causal mask.
            tile_m / tile_n: CTA tile sizes along seqlen_q / seqlen_k.
            Q_stage / dO_stage / PdS_stage: smem pipeline depths.
            SdP_swapAB / dKV_swapAB / dQ_swapAB: swap the A/B operands of the
                corresponding GEMMs (transposed MMA layouts).
            AtomLayoutMSdP / AtomLayoutNdKV / AtomLayoutMdQ: warp-group atom
                tilings for the three GEMM groups.
            num_threads: Total CTA threads (producer warp group + MMA groups).
            V_in_regs: Keep V operand in registers.
            score_mod / score_mod_bwd / mask_mod: optional compile-time
                score/mask modification callbacks (flex-attention style).
            has_aux_tensors: Whether aux tensors are passed to score mods.
            subtile_factor: Compile-time subtiling factor.
        """
        self.dtype = dtype
        # padding head_dim to a multiple of 16 as k_block_size
        hdim_multiple_of = 16
        self.tile_hdim = int(math.ceil(head_dim / hdim_multiple_of) * hdim_multiple_of)
        head_dim_v = head_dim_v if head_dim_v is not None else head_dim
        self.same_hdim_kv = head_dim == head_dim_v
        self.tile_hdimv = int(math.ceil(head_dim_v / hdim_multiple_of) * hdim_multiple_of)
        # Can save registers (and hence be faster) if we don't have to check hdim predication
        self.check_hdim_oob = head_dim != self.tile_hdim
        self.check_hdim_v_oob = head_dim_v != self.tile_hdimv
        self.qhead_per_kvhead = qhead_per_kvhead
        self.is_causal = is_causal
        # Local (sliding-window) attention is not wired up in this class yet.
        self.is_local = False
        self.tile_m = tile_m
        self.tile_n = tile_n
        self.num_threads = num_threads
        self.Q_stage = Q_stage
        self.dO_stage = dO_stage
        self.PdS_stage = PdS_stage
        # dO/PdS stages must either be unstaged (1) or match the Q pipeline.
        assert self.dO_stage in [1, self.Q_stage]
        assert self.PdS_stage in [1, self.Q_stage]
        self.SdP_swapAB = SdP_swapAB
        self.dKV_swapAB = dKV_swapAB
        self.dQ_swapAB = dQ_swapAB
        self.AtomLayoutMSdP = AtomLayoutMSdP
        self.AtomLayoutNdKV = AtomLayoutNdKV
        self.AtomLayoutMdQ = AtomLayoutMdQ
        # One warp group (128 threads) is the TMA producer; the rest do MMA.
        self.num_mma_warp_groups = (self.num_threads // 128) - 1
        # dK/dV MMA can read its A operand straight from registers only for
        # this specific layout combination.
        self.mma_dkv_is_rs = (
            AtomLayoutMSdP == 1
            and AtomLayoutNdKV == self.num_mma_warp_groups
            and SdP_swapAB
            and not dKV_swapAB
        )
        self.V_in_regs = V_in_regs
        if qhead_per_kvhead > 1:
            assert self.same_hdim_kv, "GQA backward requires head_dim == head_dim_v"
            assert self.num_mma_warp_groups == 2, "GQA backward assumes 2 warp groups"
        # These are tuned for speed
        # Do we keep the LSE and dPsum in each thread, or split them across 8 threads that share
        # them and then shuffle to get the value whenever we need? This can reduce register
        # pressure when SdP_swapAB, where each thread needs to keep statistics for (kBlockM / 4)
        # rows. If !SdP_swapAB, each thread only needs to keep statistics for 2 rows.
        # TODO: impl these for hdim 64
        self.shuffle_LSE = self.SdP_swapAB and self.tile_hdim <= 64
        self.shuffle_dPsum = self.SdP_swapAB and self.tile_hdim <= 64

        self.buffer_align_bytes = 1024

        self.score_mod = score_mod
        self.score_mod_bwd = score_mod_bwd
        self.mask_mod = mask_mod
        self.has_aux_tensors = has_aux_tensors
        self.subtile_factor = subtile_factor
        # Aux tensors force scalar (per-element) score-mod application.
        if cutlass.const_expr(has_aux_tensors):
            self.vec_size: cutlass.Constexpr = 1
        else:
            self.vec_size: cutlass.Constexpr = 4
        self.qk_acc_dtype = Float32
127
+
128
+ @staticmethod
129
+ def can_implement(
130
+ dtype,
131
+ head_dim,
132
+ head_dim_v,
133
+ tile_m,
134
+ tile_n,
135
+ Q_stage,
136
+ num_threads,
137
+ V_in_regs=False,
138
+ ) -> bool:
139
+ if dtype not in [cutlass.Float16, cutlass.BFloat16]:
140
+ return False
141
+ if head_dim % 8 != 0:
142
+ return False
143
+ if head_dim_v % 8 != 0:
144
+ return False
145
+ if tile_n % 16 != 0:
146
+ return False
147
+ if num_threads % 32 != 0:
148
+ return False
149
+ if (tile_m * 2) % num_threads != 0:
150
+ return False
151
+ return True
152
+
153
    def _check_type(
        self,
        mQ_type: Type[cutlass.Numeric],
        mK_type: Type[cutlass.Numeric],
        mV_type: Type[cutlass.Numeric],
        mdO_type: Type[cutlass.Numeric],
        mLSE_type: Type[cutlass.Numeric],
        mdPsum_type: Type[cutlass.Numeric],
        mdQaccum_type: Type[cutlass.Numeric],
        mdK_type: Type[cutlass.Numeric],
        mdV_type: Type[cutlass.Numeric],
    ):
        """Validate the element types of all kernel operands.

        Raises TypeError (at trace/compile time, via const_expr) when the
        inputs are not fp16/bf16, the fp32 statistics/accumulator tensors are
        not Float32, or the dK/dV outputs don't match the expected dtype
        (input dtype for MHA, Float32 accumulators for GQA).
        """
        # Get the data type and check if it is fp16 or bf16
        if const_expr(not (mQ_type == mK_type == mV_type == mdO_type)):
            raise TypeError("All tensors must have the same data type")
        if const_expr(mQ_type not in [cutlass.Float16, cutlass.BFloat16]):
            raise TypeError("Only Float16 or BFloat16 is supported")
        if const_expr(mLSE_type not in [Float32]):
            raise TypeError("LSE tensor must be Float32")
        if const_expr(mdPsum_type not in [Float32]):
            raise TypeError("dPsum tensor must be Float32")
        if const_expr(mdQaccum_type not in [Float32]):
            raise TypeError("dQaccum tensor must be Float32")
        if const_expr(self.qhead_per_kvhead == 1):
            if const_expr(not (mdK_type == mdV_type == mQ_type)):
                raise TypeError("mdK and mdV tensors must have the same data type as mQ")
        else:
            # GQA: dK/dV are fp32 accumulators shared across query heads.
            if const_expr(not (mdK_type == mdV_type == Float32)):
                raise TypeError("mdKaccum and mdVaccum tensors must have the data type Float32")
        assert mQ_type == self.dtype
183
+
184
    def _setup_attributes(self):
        """Build the shared-memory layouts and the dQaccum R->S tiled copy.

        Must run after ``num_threads_per_warp_group`` / ``num_mma_warp_groups``
        are set (they are, in ``__call__``) since the copy layouts depend on
        them.
        """
        # Swizzled smem layouts for Q, K, V, dO and the P/dS scratch tiles.
        self.sQ_layout, self.sK_layout, self.sV_layout, self.sdO_layout, self.sPdS_layout = [
            sm90_utils.make_smem_layout(self.dtype, LayoutEnum.ROW_MAJOR, shape, stage)
            for shape, stage in [
                ((self.tile_m, self.tile_hdim), self.Q_stage),
                ((self.tile_n, self.tile_hdim), None),
                ((self.tile_n, self.tile_hdimv), None),
                ((self.tile_m, self.tile_hdimv), self.dO_stage),
                ((self.tile_m, self.tile_n), self.PdS_stage),
            ]
        ]
        # Flat fp32 dQ accumulator, split evenly across the MMA warp groups.
        self.sdQaccum_layout = cute.make_layout(
            (self.tile_m * self.tile_hdim // self.num_mma_warp_groups, self.num_mma_warp_groups)
        )
        # dQaccum R->S
        self.r2s_tiled_copy_dQaccum = cute.make_tiled_copy_tv(
            # 128-bit universal copies of Float32 elements.
            cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Float32, num_bits_per_copy=128),
            # thr_layout
            cute.make_layout((self.num_threads_per_warp_group, self.num_mma_warp_groups)),
            cute.make_layout(128 // Float32.width),  # val_layout
        )
        # dKVaccum for GQA epilogue - reuses sV+sK memory recast as f32
        # TODO: assert that sVaccum and sKaccum don't overflow smem
207
+
208
    def _get_tiled_mma(self):
        """Construct the four warp-group tiled MMAs used by the backward pass.

        Returns:
            (tiled_mma_SdP, tiled_mma_dK, tiled_mma_dV, tiled_mma_dQ) — the
            MMAs for S/dP, dK, dV and dQ respectively. Each honors its
            corresponding ``*_swapAB`` flag by reversing the atom layout and
            tiler (i.e. computing the transposed product).
        """
        # S = Q @ K.T, dP = dO @ V.T
        atom_layout_SdP = (self.AtomLayoutMSdP, self.num_mma_warp_groups // self.AtomLayoutMSdP)
        tiler_mn_SdP = (self.tile_m // atom_layout_SdP[0], self.tile_n // atom_layout_SdP[1])
        tiled_mma_SdP = sm90_utils_basic.make_trivial_tiled_mma(
            self.dtype,
            self.dtype,
            warpgroup.OperandMajorMode.K,
            warpgroup.OperandMajorMode.K,
            Float32,
            atom_layout_mnk=(atom_layout_SdP if not self.SdP_swapAB else atom_layout_SdP[::-1])
            + (1,),
            tiler_mn=tiler_mn_SdP if not self.SdP_swapAB else tiler_mn_SdP[::-1],
        )
        # dV = P.T @ dO, dK = dS.T @ Q
        atom_layout_dKV = (self.AtomLayoutNdKV, self.num_mma_warp_groups // self.AtomLayoutNdKV)
        tiler_mn_dK = (self.tile_n // atom_layout_dKV[0], self.tile_hdim // atom_layout_dKV[1])
        tiler_mn_dV = (self.tile_n // atom_layout_dKV[0], self.tile_hdimv // atom_layout_dKV[1])
        tiled_mma_dK, tiled_mma_dV = [
            sm90_utils_basic.make_trivial_tiled_mma(
                self.dtype,
                self.dtype,
                # A operand comes from registers (K-major) when mma_dkv_is_rs.
                warpgroup.OperandMajorMode.MN
                if not self.mma_dkv_is_rs
                else warpgroup.OperandMajorMode.K,
                warpgroup.OperandMajorMode.MN,
                Float32,
                atom_layout_mnk=(atom_layout_dKV if not self.dKV_swapAB else atom_layout_dKV[::-1])
                + (1,),
                tiler_mn=tiler_mn_d if not self.dKV_swapAB else tiler_mn_d[::-1],
                a_source=warpgroup.OperandSource.RMEM
                if self.mma_dkv_is_rs
                else warpgroup.OperandSource.SMEM,
            )
            for tiler_mn_d in (tiler_mn_dK, tiler_mn_dV)
        ]
        # dQ = dS @ K
        atom_layout_dQ = (self.AtomLayoutMdQ, self.num_mma_warp_groups // self.AtomLayoutMdQ)
        tiler_mn_dQ = (self.tile_m // atom_layout_dQ[0], self.tile_hdim // atom_layout_dQ[1])
        tiled_mma_dQ = sm90_utils_basic.make_trivial_tiled_mma(
            self.dtype,
            self.dtype,
            warpgroup.OperandMajorMode.K if not self.dQ_swapAB else warpgroup.OperandMajorMode.MN,
            warpgroup.OperandMajorMode.MN if not self.dQ_swapAB else warpgroup.OperandMajorMode.K,
            Float32,
            atom_layout_mnk=(atom_layout_dQ if not self.dQ_swapAB else atom_layout_dQ[::-1]) + (1,),
            tiler_mn=tiler_mn_dQ if not self.dQ_swapAB else tiler_mn_dQ[::-1],
        )
        return tiled_mma_SdP, tiled_mma_dK, tiled_mma_dV, tiled_mma_dQ
257
+
258
    def _get_shared_storage_cls(self):
        """Define the CTA shared-memory layout as a cute.struct.

        The struct groups the TMA mbarriers, the LSE/dPsum staging buffers,
        the Q/K/V/dO operand tiles, the P/dS scratch tiles and the dQ
        accumulator, with alignments suitable for TMA/MMA access.
        """
        # 1024B-aligned operand buffers.
        sQ_struct, sK_struct, sV_struct, sdO_struct, sdQaccum_struct = [
            cute.struct.Align[cute.struct.MemRange[t, cute.cosize(layout)], self.buffer_align_bytes]
            for (layout, t) in [
                (self.sQ_layout, self.dtype),
                (self.sK_layout, self.dtype),
                (self.sV_layout, self.dtype),
                (self.sdO_layout, self.dtype),
                (self.sdQaccum_layout, Float32),
            ]
        ]

        cosize_sdS = cute.cosize(self.sPdS_layout)
        # sP is elided entirely when dK/dV MMA reads P from registers.
        cosize_sP = cute.cosize(self.sPdS_layout) if const_expr(not self.mma_dkv_is_rs) else 0
        # LSE/dPsum rows are padded up to 64 per stage.
        sLSE_struct = cute.struct.Align[
            cute.struct.MemRange[Float32, cute.round_up(self.tile_m, 64) * self.Q_stage], 128
        ]
        sdPsum_struct = cute.struct.Align[
            cute.struct.MemRange[Float32, cute.round_up(self.tile_m, 64) * self.dO_stage], 128
        ]

        @cute.struct
        class SharedStorageQKV:
            # Full/empty mbarrier pairs for the Q and dO TMA pipelines.
            mbar_ptr_Q: cute.struct.MemRange[cutlass.Int64, self.Q_stage * 2]
            mbar_ptr_dO: cute.struct.MemRange[cutlass.Int64, self.dO_stage * 2]
            sLSE: sLSE_struct
            sdPsum: sdPsum_struct
            sQ: sQ_struct
            sV: sV_struct
            sK: sK_struct
            sdO: sdO_struct
            sP: cute.struct.Align[cute.struct.MemRange[self.dtype, cosize_sP], 1024]
            sdS: cute.struct.Align[cute.struct.MemRange[self.dtype, cosize_sdS], 1024]
            sdQaccum: sdQaccum_struct

        return SharedStorageQKV
294
+
295
    @cute.jit
    def __call__(
        self,
        mQ: cute.Tensor,
        mK: cute.Tensor,
        mV: cute.Tensor,
        mdO: cute.Tensor,
        mLSE: cute.Tensor,
        mdPsum: cute.Tensor,
        mdQaccum: cute.Tensor,
        mdK: cute.Tensor,
        mdV: cute.Tensor,
        softmax_scale: Float32,
        stream: cuda.CUstream,
        mCuSeqlensQ: Optional[cute.Tensor] = None,
        mCuSeqlensK: Optional[cute.Tensor] = None,
        mSeqUsedQ: Optional[cute.Tensor] = None,
        mSeqUsedK: Optional[cute.Tensor] = None,
        softcap: Float32 | float | None = None,
        window_size_left: Int32 | int | None = None,
        window_size_right: Int32 | int | None = None,
        mdQ_semaphore: Optional[cute.Tensor] = None,
        mdK_semaphore: Optional[cute.Tensor] = None,
        mdV_semaphore: Optional[cute.Tensor] = None,
        aux_tensors: Optional[list] = None,
        blocksparse_tensors: Optional[BlockSparseTensors] = None,
    ):
        """Host-side entry: validate, set up layouts/TMA, and launch the kernel.

        Input tensors are expected in (batch, seq, head, hdim) order and are
        transposed below to the kernel's (seq, hdim, head, batch) order.
        ``mdK``/``mdV`` are fp16/bf16 outputs for MHA, or fp32 accumulators
        with a flattened (b, n, s*h) layout for GQA. LSE/dPsum/dQaccum are
        fp32 statistics laid out (b, n, s). Varlen (cu_seqlens/seq_used) and
        determinism semaphores are not supported by this Sm90 path.
        """
        assert mdQ_semaphore is None and mdK_semaphore is None and mdV_semaphore is None, (
            "determinism not supported yet for Sm90"
        )

        # Trace-time dtype validation of all operands.
        self._check_type(
            *(
                t.element_type if t is not None else None
                for t in (mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV)
            )
        )

        mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV = [
            assume_tensor_aligned(t) for t in (mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV)
        ]

        layout_transpose = [1, 3, 2, 0]  # (b, s, n, h) --> (s, h, n, b)
        mQ, mK, mV, mdO = [layout_utils.select(t, layout_transpose) for t in (mQ, mK, mV, mdO)]
        if const_expr(self.qhead_per_kvhead == 1):
            mdK, mdV = [layout_utils.select(t, layout_transpose) for t in (mdK, mdV)]
        else:
            accum_transpose = [2, 1, 0]  # (b, n, s*h) -> (s*h, n, b)
            mdK, mdV = [layout_utils.select(t, accum_transpose) for t in (mdK, mdV)]
        LSE_dPsum_dQaccum_transpose = [2, 1, 0]  # (b, n, s) -> (s, n, b)
        mLSE, mdPsum, mdQaccum = [
            layout_utils.select(t, LSE_dPsum_dQaccum_transpose) for t in (mLSE, mdPsum, mdQaccum)
        ]

        tiled_mma_SdP, tiled_mma_dK, tiled_mma_dV, tiled_mma_dQ = self._get_tiled_mma()

        # One extra warp group (128 threads) acts as the TMA producer.
        self.num_mma_threads = tiled_mma_SdP.size
        assert self.num_mma_threads + 128 == self.num_threads

        self.num_threads_per_warp_group = 128
        self.num_producer_threads = 32

        # Register rebalancing between producer and MMA warp groups.
        self.num_mma_regs = 240
        self.num_producer_regs = 24
        # self.num_mma_regs = 232
        # self.num_producer_regs = 40

        self._setup_attributes()
        SharedStorage = self._get_shared_storage_cls()

        # Per-operand TMA transaction byte counts (single stage of each tile).
        self.tma_copy_bytes = {
            name: cute.size_in_bytes(mX.element_type, cute.select(layout, mode=[0, 1]))
            for name, mX, layout in [
                ("Q", mQ, self.sQ_layout),
                ("K", mK, self.sK_layout),
                ("V", mV, self.sV_layout),
                ("dO", mdO, self.sdO_layout),
            ]
        }
        self.tma_copy_bytes["LSE"] = self.tile_m * Float32.width // 8
        self.tma_copy_bytes["dPsum"] = self.tile_m * Float32.width // 8
        self.tma_copy_bytes["dQ"] = (
            self.tile_m * self.tile_hdim * Float32.width // 8 // self.num_mma_warp_groups
        )
        self.tma_copy_bytes["dKacc"] = self.tile_n * self.tile_hdim * Float32.width // 8
        self.tma_copy_bytes["dVacc"] = self.tile_n * self.tile_hdimv * Float32.width // 8

        # G2S TMA atoms for the inputs.
        tma_atom_Q, tma_tensor_Q = cpasync.make_tiled_tma_atom(
            cpasync.CopyBulkTensorTileG2SOp(),
            mQ,
            cute.select(self.sQ_layout, mode=[0, 1]),
            (self.tile_m, self.tile_hdim),
        )
        tma_atom_K, tma_tensor_K = cpasync.make_tiled_tma_atom(
            cpasync.CopyBulkTensorTileG2SOp(),
            mK,
            cute.select(self.sK_layout, mode=[0, 1]),
            (self.tile_n, self.tile_hdim),
        )
        tma_atom_V, tma_tensor_V = cpasync.make_tiled_tma_atom(
            cpasync.CopyBulkTensorTileG2SOp(),
            mV,
            cute.select(self.sV_layout, mode=[0, 1]),
            (self.tile_n, self.tile_hdimv),
        )
        tma_atom_dO, tma_tensor_dO = cpasync.make_tiled_tma_atom(
            cpasync.CopyBulkTensorTileG2SOp(),
            mdO,
            cute.select(self.sdO_layout, mode=[0, 1]),
            (self.tile_m, self.tile_hdimv),
        )
        if const_expr(self.qhead_per_kvhead == 1):
            # S2G TMA atoms for the dK/dV outputs (MHA path only; GQA stores
            # through fp32 accumulators instead).
            tma_atom_dK, tma_tensor_dK = cpasync.make_tiled_tma_atom(
                cpasync.CopyBulkTensorTileS2GOp(),
                mdK,
                cute.select(self.sK_layout, mode=[0, 1]),
                (self.tile_n, self.tile_hdim),
            )
            tma_atom_dV, tma_tensor_dV = cpasync.make_tiled_tma_atom(
                cpasync.CopyBulkTensorTileS2GOp(),
                mdV,
                cute.select(self.sV_layout, mode=[0, 1]),
                (self.tile_n, self.tile_hdimv),
            )
        else:
            tma_atom_dK = tma_atom_dV = tma_tensor_dK = tma_tensor_dV = None

        # One CTA per n_block (seqlen_k tile) x head x batch.
        TileScheduler = SingleTileScheduler
        tile_sched_args = TileSchedulerArguments(
            cute.ceil_div(cute.size(mK.shape[0]), self.tile_n),
            cute.size(mQ.shape[2]),
            cute.size(mQ.shape[3]),
            1,  # num_splits
            cute.size(mK.shape[0]),
            mQ.shape[1],
            mV.shape[1],
            total_q=cute.size(mQ.shape[0]) * cute.size(mQ.shape[3]),
            tile_shape_mn=(self.tile_m, self.tile_n),
            mCuSeqlensQ=None,
            mSeqUsedQ=None,
            qhead_per_kvhead_packgqa=1,
            element_size=self.dtype.width // 8,
            is_persistent=False,
            lpt=False,
        )

        tile_sched_params = TileScheduler.to_underlying_arguments(tile_sched_args)
        grid_dim = TileScheduler.get_grid_shape(tile_sched_params)

        # Pre-fold log2(e) into the scale so the kernel can use exp2.
        # With a score_mod the raw scale is applied inside the mod instead.
        LOG2_E = math.log2(math.e)
        if const_expr(self.score_mod is None):
            softmax_scale_log2 = softmax_scale * LOG2_E
        else:
            softmax_scale_log2 = LOG2_E

        # Fast division/modulo helpers for index math inside score mods.
        fastdiv_mods = None
        if const_expr(aux_tensors is not None):
            seqlen_q = cute.size(mQ.shape[0])
            seqlen_k = cute.size(mK.shape[0])
            seqlen_q_divmod = FastDivmodDivisor(seqlen_q)
            seqlen_k_divmod = FastDivmodDivisor(seqlen_k)
            fastdiv_mods = (seqlen_q_divmod, seqlen_k_divmod)

        qhead_per_kvhead_divmod = None
        if const_expr(self.qhead_per_kvhead > 1):
            qhead_per_kvhead_divmod = FastDivmodDivisor(self.qhead_per_kvhead)

        self.use_block_sparsity = cutlass.const_expr(blocksparse_tensors is not None)

        self.kernel(
            tma_tensor_Q,
            tma_tensor_K,
            tma_tensor_V,
            tma_tensor_dO,
            tma_tensor_dK if const_expr(self.qhead_per_kvhead == 1) else mdK,
            tma_tensor_dV if const_expr(self.qhead_per_kvhead == 1) else mdV,
            tma_atom_Q,
            tma_atom_K,
            tma_atom_V,
            tma_atom_dO,
            tma_atom_dK,
            tma_atom_dV,
            mLSE,
            mdPsum,
            mdQaccum,
            self.sQ_layout,
            self.sK_layout,
            self.sV_layout,
            self.sPdS_layout,
            self.sdO_layout,
            self.sdQaccum_layout,
            self.r2s_tiled_copy_dQaccum,
            tiled_mma_SdP,
            tiled_mma_dK,
            tiled_mma_dV,
            tiled_mma_dQ,
            softmax_scale_log2,
            softmax_scale,
            tile_sched_params,
            TileScheduler,
            SharedStorage,
            aux_tensors,
            fastdiv_mods,
            blocksparse_tensors,
            qhead_per_kvhead_divmod,
        ).launch(
            grid=grid_dim,
            block=[self.num_threads, 1, 1],
            stream=stream,
            min_blocks_per_mp=1,
        )
506
+
507
+ @cute.kernel
508
+ def kernel(
509
+ self,
510
+ mQ: cute.Tensor,
511
+ mK: cute.Tensor,
512
+ mV: cute.Tensor,
513
+ mdO: cute.Tensor,
514
+ mdK: cute.Tensor,
515
+ mdV: cute.Tensor,
516
+ tma_atom_Q: cute.CopyAtom,
517
+ tma_atom_K: cute.CopyAtom,
518
+ tma_atom_V: cute.CopyAtom,
519
+ tma_atom_dO: cute.CopyAtom,
520
+ tma_atom_dK: cute.CopyAtom,
521
+ tma_atom_dV: cute.CopyAtom,
522
+ mLSE: cute.Tensor,
523
+ mdPsum: cute.Tensor,
524
+ mdQaccum: cute.Tensor,
525
+ sQ_layout: cute.ComposedLayout,
526
+ sK_layout: cute.ComposedLayout,
527
+ sV_layout: cute.ComposedLayout,
528
+ sPdS_layout: cute.ComposedLayout,
529
+ sdO_layout: cute.ComposedLayout,
530
+ sdQaccum_layout: cute.Layout,
531
+ r2s_tiled_copy_dQaccum: cute.TiledCopy,
532
+ tiled_mma_SdP: cute.TiledMma,
533
+ tiled_mma_dK: cute.TiledMma,
534
+ tiled_mma_dV: cute.TiledMma,
535
+ tiled_mma_dQ: cute.TiledMma,
536
+ softmax_scale_log2,
537
+ softmax_scale,
538
+ tile_sched_params: ParamsBase,
539
+ TileScheduler: cutlass.Constexpr[Callable],
540
+ SharedStorage: cutlass.Constexpr[Callable],
541
+ aux_tensors: Optional[list] = None,
542
+ fastdiv_mods=(None, None),
543
+ blocksparse_tensors: Optional[BlockSparseTensors] = None,
544
+ qhead_per_kvhead_divmod: Optional[FastDivmodDivisor] = None,
545
+ ):
546
+ warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
547
+
548
+ # prefetch TMA descriptors
549
+ if warp_idx == 0:
550
+ cpasync.prefetch_descriptor(tma_atom_Q)
551
+ cpasync.prefetch_descriptor(tma_atom_K)
552
+ cpasync.prefetch_descriptor(tma_atom_V)
553
+ cpasync.prefetch_descriptor(tma_atom_dO)
554
+
555
+ smem = cutlass.utils.SmemAllocator()
556
+ storage = smem.allocate(SharedStorage)
557
+
558
+ pipeline_producer_group = cutlass.pipeline.CooperativeGroup(cutlass.pipeline.Agent.Thread)
559
+ pipeline_consumer_group = cutlass.pipeline.CooperativeGroup(
560
+ cutlass.pipeline.Agent.Thread, self.num_mma_threads // cute.arch.WARP_SIZE
561
+ )
562
+ pipeline_Q = pipeline.PipelineTmaAsync.create(
563
+ barrier_storage=storage.mbar_ptr_Q.data_ptr(),
564
+ num_stages=self.Q_stage,
565
+ producer_group=pipeline_producer_group,
566
+ consumer_group=pipeline_consumer_group,
567
+ tx_count=self.tma_copy_bytes["Q"] + self.tma_copy_bytes["LSE"],
568
+ defer_sync=True,
569
+ )
570
+ pipeline_dO = pipeline.PipelineTmaAsync.create(
571
+ barrier_storage=storage.mbar_ptr_dO.data_ptr(),
572
+ num_stages=self.dO_stage,
573
+ producer_group=pipeline_producer_group,
574
+ consumer_group=pipeline_consumer_group,
575
+ tx_count=self.tma_copy_bytes["dO"] + self.tma_copy_bytes["dPsum"],
576
+ defer_sync=False,
577
+ )
578
+
579
+ sQ = storage.sQ.get_tensor(sQ_layout.outer, swizzle=sQ_layout.inner)
580
+ sdO = storage.sdO.get_tensor(sdO_layout.outer, swizzle=sdO_layout.inner)
581
+ sK = storage.sK.get_tensor(sK_layout.outer, swizzle=sK_layout.inner)
582
+ sV = storage.sV.get_tensor(sV_layout.outer, swizzle=sV_layout.inner)
583
+ sP = None
584
+ if const_expr(not self.mma_dkv_is_rs):
585
+ sP = storage.sP.get_tensor(sPdS_layout.outer, swizzle=sPdS_layout.inner)
586
+ sdS = storage.sdS.get_tensor(sPdS_layout.outer, swizzle=sPdS_layout.inner)
587
+ sLSE = storage.sLSE.get_tensor(
588
+ cute.make_layout(
589
+ (self.tile_m, self.Q_stage),
590
+ stride=(1, cute.round_up(self.tile_m, 64)),
591
+ )
592
+ )
593
+ sdPsum = storage.sdPsum.get_tensor(
594
+ cute.make_layout(
595
+ (self.tile_m, self.dO_stage),
596
+ stride=(1, cute.round_up(self.tile_m, 64)),
597
+ )
598
+ )
599
+ sdQaccum = storage.sdQaccum.get_tensor(sdQaccum_layout)
600
+
601
+ block_info = BlockInfo(
602
+ self.tile_m,
603
+ self.tile_n,
604
+ self.is_causal,
605
+ self.is_local,
606
+ False, # is_split_kv
607
+ None,
608
+ None,
609
+ qhead_per_kvhead_packgqa=1,
610
+ )
611
+ SeqlenInfoCls = partial(
612
+ SeqlenInfoQK.create,
613
+ seqlen_q_static=mQ.shape[0],
614
+ seqlen_k_static=mK.shape[0],
615
+ mCuSeqlensQ=None,
616
+ mCuSeqlensK=None,
617
+ mSeqUsedQ=None,
618
+ mSeqUsedK=None,
619
+ )
620
+ AttentionMaskCls = partial(
621
+ AttentionMask,
622
+ self.tile_m,
623
+ self.tile_n,
624
+ window_size_left=None,
625
+ window_size_right=None,
626
+ swap_AB=self.SdP_swapAB,
627
+ )
628
+ TileSchedulerCls = partial(TileScheduler.create, tile_sched_params)
629
+
630
+ if warp_idx < 4:
631
+ cute.arch.setmaxregister_decrease(self.num_producer_regs)
632
+ if warp_idx == 0:
633
+ self.load(
634
+ mQ,
635
+ mK,
636
+ mV,
637
+ mdO,
638
+ mLSE,
639
+ mdPsum,
640
+ sQ,
641
+ sK,
642
+ sV,
643
+ sdO,
644
+ sLSE,
645
+ sdPsum,
646
+ tma_atom_Q,
647
+ tma_atom_K,
648
+ tma_atom_V,
649
+ tma_atom_dO,
650
+ pipeline_Q,
651
+ pipeline_dO,
652
+ block_info,
653
+ SeqlenInfoCls,
654
+ TileSchedulerCls,
655
+ blocksparse_tensors,
656
+ qhead_per_kvhead_divmod,
657
+ )
658
+ if warp_idx == 1:
659
+ self.dQaccum_store(
660
+ mdQaccum,
661
+ sdQaccum,
662
+ block_info,
663
+ TileSchedulerCls,
664
+ SeqlenInfoCls,
665
+ blocksparse_tensors,
666
+ )
667
+ else:
668
+ cute.arch.setmaxregister_increase(self.num_mma_regs)
669
+ tidx, _, _ = cute.arch.thread_idx()
670
+ tidx = tidx - 128
671
+ self.mma(
672
+ tiled_mma_SdP,
673
+ tiled_mma_dK,
674
+ tiled_mma_dV,
675
+ tiled_mma_dQ,
676
+ mdK,
677
+ mdV,
678
+ mdQaccum,
679
+ sQ,
680
+ sK,
681
+ sV,
682
+ sdO,
683
+ sP,
684
+ sdS,
685
+ sLSE,
686
+ sdPsum,
687
+ sdQaccum,
688
+ pipeline_Q,
689
+ pipeline_dO,
690
+ tidx,
691
+ tma_atom_dK,
692
+ tma_atom_dV,
693
+ r2s_tiled_copy_dQaccum,
694
+ softmax_scale_log2,
695
+ softmax_scale,
696
+ block_info,
697
+ SeqlenInfoCls,
698
+ AttentionMaskCls,
699
+ TileSchedulerCls,
700
+ aux_tensors,
701
+ fastdiv_mods,
702
+ blocksparse_tensors,
703
+ qhead_per_kvhead_divmod,
704
+ )
705
+
706
    @cute.jit
    def load(
        self,
        mQ: cute.Tensor,
        mK: cute.Tensor,
        mV: cute.Tensor,
        mdO: cute.Tensor,
        mLSE: cute.Tensor,
        mdPsum: cute.Tensor,
        sQ: cute.Tensor,
        sK: cute.Tensor,
        sV: cute.Tensor,
        sdO: cute.Tensor,
        sLSE: cute.Tensor,
        sdPsum: cute.Tensor,
        tma_atom_Q: cute.CopyAtom,
        tma_atom_K: cute.CopyAtom,
        tma_atom_V: cute.CopyAtom,
        tma_atom_dO: cute.CopyAtom,
        pipeline_Q: cutlass.pipeline.PipelineAsync,
        pipeline_dO: cutlass.pipeline.PipelineAsync,
        block_info: BlockInfo,
        SeqlenInfoCls: Callable,
        TileSchedulerCls: Callable,
        blocksparse_tensors: Optional[BlockSparseTensors] = None,
        qhead_per_kvhead_divmod: Optional[FastDivmodDivisor] = None,
    ):
        """Producer side of the backward kernel: issue async loads into shared memory.

        For each work tile (n_block, head, batch) handed out by the tile scheduler,
        loads K and V once (piggybacked on the first Q/dO pipeline stage via
        extra_tx_count), then streams Q, LSE, dO and dPsum per m_block through the
        multi-stage Q and dO pipelines. LSE rides the Q pipeline and dPsum rides the
        dO pipeline, so each barrier's tx_count covers both transfers.

        Only warp 0 of the producer warp group (warp_idx_in_wg == 0) does any work;
        the remaining producer warps fall through.
        """
        warp_idx_in_wg = cute.arch.make_warp_uniform(cute.arch.warp_idx()) % 4

        if warp_idx_in_wg == 0:
            producer_state_Q = cutlass.pipeline.make_pipeline_state(
                cutlass.pipeline.PipelineUserType.Producer, self.Q_stage
            )
            producer_state_dO = cutlass.pipeline.make_pipeline_state(
                cutlass.pipeline.PipelineUserType.Producer, self.dO_stage
            )
            tile_scheduler = TileSchedulerCls()
            work_tile = tile_scheduler.initial_work_tile_info()
            while work_tile.is_valid_tile:
                n_block, head_idx, batch_idx, _ = work_tile.tile_idx
                seqlen = SeqlenInfoCls(batch_idx)
                # GQA: map the query head onto its KV head via the fast-divmod divisor.
                head_idx_kv = (
                    head_idx
                    if const_expr(self.qhead_per_kvhead == 1)
                    else head_idx // qhead_per_kvhead_divmod
                )
                mK_cur = mK[None, None, head_idx_kv, batch_idx]
                gK = cute.local_tile(mK_cur, (self.tile_n, self.tile_hdim), (n_block, 0))
                mV_cur = mV[None, None, head_idx_kv, batch_idx]
                gV = cute.local_tile(mV_cur, (self.tile_n, self.tile_hdimv), (n_block, 0))

                # Q/dO/LSE/dPsum are tiled with a free m_block coordinate (None) so the
                # copy functions below can be indexed per m_block.
                mQ_cur = mQ[None, None, head_idx, batch_idx]
                gQ = cute.local_tile(mQ_cur, (self.tile_m, self.tile_hdim), (None, 0))
                mdO_cur = mdO[None, None, head_idx, batch_idx]
                gdO = cute.local_tile(mdO_cur, (self.tile_m, self.tile_hdimv), (None, 0))
                mLSE_cur = mLSE[None, head_idx, batch_idx]
                gLSE = cute.local_tile(mLSE_cur, (self.tile_m,), (None,))
                mdPsum_cur = mdPsum[None, head_idx, batch_idx]
                gdPsum = cute.local_tile(mdPsum_cur, (self.tile_m,), (None,))

                # K/V are single-stage (one tile per work item); Q/dO are pipelined.
                load_K, _, _ = copy_utils.tma_get_copy_fn(
                    tma_atom_K, 0, cute.make_layout(1), gK, sK, single_stage=True
                )
                load_V, _, _ = copy_utils.tma_get_copy_fn(
                    tma_atom_V, 0, cute.make_layout(1), gV, sV, single_stage=True
                )
                load_Q, _, _ = copy_utils.tma_get_copy_fn(
                    tma_atom_Q, 0, cute.make_layout(1), gQ, sQ
                )
                load_Q = copy_utils.tma_producer_copy_fn(load_Q, pipeline_Q)
                load_dO, _, _ = copy_utils.tma_get_copy_fn(
                    tma_atom_dO, 0, cute.make_layout(1), gdO, sdO
                )
                load_dO = copy_utils.tma_producer_copy_fn(load_dO, pipeline_dO)
                # LSE/dPsum use bulk cp.async but share the TMA pipelines' barriers.
                load_LSE = copy_utils.cpasync_bulk_get_copy_fn(gLSE, sLSE)
                load_LSE = copy_utils.tma_producer_copy_fn(load_LSE, pipeline_Q)
                load_dPsum = copy_utils.cpasync_bulk_get_copy_fn(gdPsum, sdPsum)
                load_dPsum = copy_utils.tma_producer_copy_fn(load_dPsum, pipeline_dO)

                m_block_min, m_block_max = block_info.get_m_block_min_max(seqlen, n_block)

                if const_expr(not self.use_block_sparsity):
                    total_m_block_cnt = m_block_max - m_block_min
                    # Without local attention the m-range is never empty, so the
                    # check can be folded away at compile time.
                    process_tile = const_expr(not self.is_local) or m_block_min < m_block_max
                else:
                    total_m_block_cnt = get_total_q_block_count_bwd(
                        blocksparse_tensors,
                        batch_idx,
                        head_idx,
                        n_block,
                        subtile_factor=self.subtile_factor,
                        m_block_max=m_block_max,
                    )
                    process_tile = total_m_block_cnt > Int32(0)

                if process_tile:
                    if const_expr(not self.use_block_sparsity):
                        first_m_block = m_block_min
                        # First Q stage also carries K: the acquire accounts for the
                        # extra K bytes so the barrier flips only once K+Q+LSE land.
                        pipeline_Q.producer_acquire(
                            producer_state_Q, extra_tx_count=self.tma_copy_bytes["K"]
                        )
                        load_K(tma_bar_ptr=pipeline_Q.producer_get_barrier(producer_state_Q))
                        load_Q(first_m_block, producer_state=producer_state_Q)
                        load_LSE(first_m_block, producer_state=producer_state_Q)
                        # When Q and dO pipelines have the same depth they share one
                        # state; otherwise dO tracks its own stage counter.
                        producer_state_dO_cur = (
                            producer_state_dO
                            if const_expr(self.Q_stage != self.dO_stage)
                            else producer_state_Q
                        )
                        # Likewise the first dO stage carries V.
                        pipeline_dO.producer_acquire(
                            producer_state_dO_cur, extra_tx_count=self.tma_copy_bytes["V"]
                        )
                        load_V(tma_bar_ptr=pipeline_dO.producer_get_barrier(producer_state_dO_cur))
                        load_dO(first_m_block, producer_state=producer_state_dO_cur)
                        load_dPsum(first_m_block, producer_state=producer_state_dO_cur)
                        producer_state_Q.advance()
                        producer_state_dO.advance()

                        # Remaining m_blocks: plain pipelined Q/LSE and dO/dPsum loads.
                        for m_block in cutlass.range(m_block_min + 1, m_block_max, unroll=1):
                            pipeline_Q.producer_acquire(producer_state_Q)
                            load_Q(m_block, producer_state=producer_state_Q)
                            load_LSE(m_block, producer_state=producer_state_Q)
                            producer_state_dO_cur = (
                                producer_state_dO
                                if const_expr(self.Q_stage != self.dO_stage)
                                else producer_state_Q
                            )
                            pipeline_dO.producer_acquire(producer_state_dO_cur)
                            load_dO(m_block, producer_state=producer_state_dO_cur)
                            load_dPsum(m_block, producer_state=producer_state_dO_cur)
                            producer_state_Q.advance()
                            producer_state_dO.advance()
                    else:
                        # Block-sparse path: helper walks only the active q-blocks and
                        # returns the advanced producer states.
                        producer_state_Q, producer_state_dO = produce_block_sparse_q_loads_bwd_sm90(
                            blocksparse_tensors,
                            batch_idx,
                            head_idx,
                            n_block,
                            producer_state_Q,
                            producer_state_dO,
                            pipeline_Q,
                            pipeline_dO,
                            load_K,
                            load_V,
                            load_Q,
                            load_dO,
                            load_LSE,
                            load_dPsum,
                            self.tma_copy_bytes["K"],
                            self.tma_copy_bytes["V"],
                            Q_stage_eq_dO_stage=(self.Q_stage == self.dO_stage),
                            subtile_factor=self.subtile_factor,
                            m_block_max=m_block_max,
                        )

                tile_scheduler.prefetch_next_work()
                tile_scheduler.advance_to_next_work()
                work_tile = tile_scheduler.get_current_work()
865
+ @cute.jit
866
+ def apply_score_mod(
867
+ self,
868
+ acc_S: cute.Tensor,
869
+ thr_mma_SdP: cute.core.ThrMma,
870
+ batch_idx,
871
+ head_idx,
872
+ m_block,
873
+ n_block,
874
+ softmax_scale,
875
+ seqlen_info: SeqlenInfoQK,
876
+ aux_tensors=None,
877
+ fastdiv_mods=(None, None),
878
+ ):
879
+ # [NOTE] SdP_swapAB: swapAB transposes the tile, so use (n, m) indexing
880
+ cS = cute.make_identity_tensor(
881
+ (self.tile_n, self.tile_m) if self.SdP_swapAB else (self.tile_m, self.tile_n)
882
+ )
883
+ cS = cute.domain_offset(
884
+ (n_block * self.tile_n, m_block * self.tile_m)
885
+ if self.SdP_swapAB
886
+ else (m_block * self.tile_m, n_block * self.tile_n),
887
+ cS,
888
+ )
889
+ tScS = thr_mma_SdP.partition_C(cS)
890
+
891
+ apply_score_mod_inner(
892
+ acc_S,
893
+ tScS,
894
+ self.score_mod,
895
+ batch_idx,
896
+ head_idx,
897
+ softmax_scale,
898
+ self.vec_size,
899
+ self.qk_acc_dtype,
900
+ aux_tensors,
901
+ fastdiv_mods,
902
+ seqlen_info,
903
+ constant_q_idx=None,
904
+ qhead_per_kvhead=self.qhead_per_kvhead,
905
+ transpose_indices=self.SdP_swapAB,
906
+ )
907
+
908
+ @cute.jit
909
+ def apply_score_mod_bwd(
910
+ self,
911
+ grad_tensor: cute.Tensor,
912
+ score_tensor: cute.Tensor,
913
+ thr_mma_SdP: cute.core.ThrMma,
914
+ batch_idx,
915
+ head_idx,
916
+ m_block,
917
+ n_block,
918
+ softmax_scale,
919
+ seqlen_info: SeqlenInfoQK,
920
+ aux_tensors=None,
921
+ fastdiv_mods=(None, None),
922
+ ):
923
+ cS = cute.make_identity_tensor(
924
+ (self.tile_n, self.tile_m) if self.SdP_swapAB else (self.tile_m, self.tile_n)
925
+ )
926
+ cS = cute.domain_offset(
927
+ (n_block * self.tile_n, m_block * self.tile_m)
928
+ if self.SdP_swapAB
929
+ else (m_block * self.tile_m, n_block * self.tile_n),
930
+ cS,
931
+ )
932
+ tScS = thr_mma_SdP.partition_C(cS)
933
+
934
+ apply_score_mod_bwd_inner(
935
+ grad_tensor,
936
+ score_tensor,
937
+ tScS,
938
+ self.score_mod_bwd,
939
+ batch_idx,
940
+ head_idx,
941
+ softmax_scale,
942
+ self.vec_size,
943
+ self.qk_acc_dtype,
944
+ aux_tensors,
945
+ fastdiv_mods,
946
+ seqlen_info,
947
+ constant_q_idx=None,
948
+ qhead_per_kvhead=self.qhead_per_kvhead,
949
+ transpose_indices=self.SdP_swapAB,
950
+ )
951
+
952
    @cute.jit
    def mma(
        self,
        tiled_mma_SdP: cute.TiledMma,
        tiled_mma_dK: cute.TiledMma,
        tiled_mma_dV: cute.TiledMma,
        tiled_mma_dQ: cute.TiledMma,
        mdK: cute.Tensor,
        mdV: cute.Tensor,
        mdQaccum: cute.Tensor,
        sQ: cute.Tensor,
        sK: cute.Tensor,
        sV: cute.Tensor,
        sdO: cute.Tensor,
        sP: Optional[cute.Tensor],
        sdS: cute.Tensor,
        sLSE: cute.Tensor,
        sdPsum: cute.Tensor,
        sdQaccum: cute.Tensor,
        pipeline_Q: cutlass.pipeline.PipelineAsync,
        pipeline_dO: cutlass.pipeline.PipelineAsync,
        tidx: Int32,
        tma_atom_dK: cute.CopyAtom,
        tma_atom_dV: cute.CopyAtom,
        r2s_tiled_copy_dQaccum: cute.TiledCopy,
        softmax_scale_log2: Float32,
        softmax_scale: Float32,
        block_info: BlockInfo,
        SeqlenInfoCls: Callable,
        AttentionMaskCls: Callable,
        TileSchedulerCls: Callable,
        aux_tensors: Optional[list] = None,
        fastdiv_mods=(None, None),
        blocksparse_tensors: Optional[BlockSparseTensors] = None,
        qhead_per_kvhead_divmod: Optional[FastDivmodDivisor] = None,
    ):
        """Consumer warp-group main loop of the SM90 backward pass.

        Sets up fragment partitions and partial GEMM closures for the five matmuls
        (S = Q @ K^T, dP = dO @ V^T, dV += P^T @ dO, dK += dS^T @ Q, dQ = dS @ K),
        then iterates over work tiles: for each (n_block, head, batch) it runs
        mma_one_m_block over the m_block range (dense or block-sparse), applies the
        softmax_scale to dK (non-GQA path), and writes dK/dV via epilogue_dKV.
        """
        warp_group_idx = cute.arch.make_warp_uniform(tidx // self.num_threads_per_warp_group)
        warp_group_thread_layout = cute.make_layout(
            self.num_mma_warp_groups, stride=self.num_threads_per_warp_group
        )
        # Per-thread slice for accumulator/pointwise work; per-warp-group slices for WGMMA.
        thr_mma_SdP = tiled_mma_SdP.get_slice(tidx)
        wg_mma_SdP = tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx))
        wg_mma_dK = tiled_mma_dK.get_slice(warp_group_thread_layout(warp_group_idx))
        wg_mma_dV = tiled_mma_dV.get_slice(warp_group_thread_layout(warp_group_idx))
        wg_mma_dQ = tiled_mma_dQ.get_slice(warp_group_thread_layout(warp_group_idx))
        # S = Q @ K.T
        shape_mnk_S = (self.tile_m, self.tile_n, self.tile_hdim)
        _, tSrQ, tSrK = sm90_utils.partition_fragment_ABC(
            wg_mma_SdP, shape_mnk_S, sQ, sK, swap_AB=self.SdP_swapAB
        )
        mma_qk_fn = partial(
            gemm_zero_init, tiled_mma_SdP, shape_mnk_S[:2], tSrQ, tSrK, swap_AB=self.SdP_swapAB
        )
        # dP = dO @ V.T
        shape_mnk_dP = (self.tile_m, self.tile_n, self.tile_hdimv)
        _, tdPrdO, tdPrV = sm90_utils.partition_fragment_ABC(
            wg_mma_SdP, shape_mnk_dP, sdO, sV, swap_AB=self.SdP_swapAB
        )
        mma_dov_fn = partial(
            gemm_zero_init, tiled_mma_SdP, shape_mnk_dP[:2], tdPrdO, tdPrV, swap_AB=self.SdP_swapAB
        )
        # dV += P.T @ dO  (transposed smem views feed the WGMMA operands)
        sPt = layout_utils.transpose_view(sP) if sP is not None else None
        sdOt = layout_utils.transpose_view(sdO)
        shape_mnk_dV = (self.tile_n, self.tile_hdimv, self.tile_m)
        acc_dV, tdVrPt, tdVrdOt = sm90_utils.partition_fragment_ABC(
            wg_mma_dV, shape_mnk_dV, sPt, sdOt, swap_AB=self.dKV_swapAB
        )
        # mma_dkv_is_rs: A operand comes from registers (rs) instead of smem (ss).
        if const_expr(not self.mma_dkv_is_rs):
            mma_pdo_fn = partial(
                gemm_w_idx, tiled_mma_dV, acc_dV, tdVrPt, tdVrdOt, swap_AB=self.dKV_swapAB
            )
        else:
            mma_pdo_fn = partial(gemm_w_idx, tiled_mma_dV, acc_dV, tCrB=tdVrdOt)
        # dK += dS.T @ Q
        sdSt = layout_utils.transpose_view(sdS)
        sQt = layout_utils.transpose_view(sQ)
        shape_mnk_dK = (self.tile_n, self.tile_hdim, self.tile_m)
        acc_dK, tdKrdSt, tdKrQt = sm90_utils.partition_fragment_ABC(
            wg_mma_dK, shape_mnk_dK, sdSt, sQt, swap_AB=self.dKV_swapAB
        )
        if const_expr(not self.mma_dkv_is_rs):
            mma_dsq_fn = partial(
                gemm_w_idx, tiled_mma_dK, acc_dK, tdKrdSt, tdKrQt, swap_AB=self.dKV_swapAB
            )
        else:
            mma_dsq_fn = partial(gemm_w_idx, tiled_mma_dK, acc_dK, tCrB=tdKrQt)
        # dQ = dS @ K
        sKt = layout_utils.transpose_view(sK)
        shape_mnk_dQ = (self.tile_m, self.tile_hdim, self.tile_n)
        _, tdQrdS, tdQrKt = sm90_utils.partition_fragment_ABC(
            wg_mma_dQ, shape_mnk_dQ, sdS, sKt, swap_AB=self.dQ_swapAB
        )
        mma_dsk_fn = partial(
            gemm_zero_init, tiled_mma_dQ, shape_mnk_dQ[:2], tdQrdS, tdQrKt, swap_AB=self.dQ_swapAB
        )

        # Smem copy atom tiling: register->smem stores for P and dS.
        copy_P_r2s = None
        if const_expr(sP is not None):
            sP_cpy = sP if const_expr(not self.SdP_swapAB) else sPt
            copy_P_r2s, _, _ = copy_utils.get_smem_store_C(
                tiled_mma_SdP, sP_cpy, tidx, self.arch, transpose=self.SdP_swapAB
            )
        sdS_cpy = sdS if const_expr(not self.SdP_swapAB) else sdSt
        copy_dS_r2s, _, _ = copy_utils.get_smem_store_C(
            tiled_mma_SdP, sdS_cpy, tidx, self.arch, transpose=self.SdP_swapAB
        )

        # Per-thread views of the LSE / dPsum vectors, broadcast across the tile.
        tLSEsLSE = layout_utils.mma_partition_C_vec(
            sLSE, thr_mma_SdP, expand_shape=self.tile_n, is_colvec=not self.SdP_swapAB
        )
        tLSEsdPsum = layout_utils.mma_partition_C_vec(
            sdPsum, thr_mma_SdP, expand_shape=self.tile_n, is_colvec=not self.SdP_swapAB
        )

        smem_thr_copy_dQaccum = r2s_tiled_copy_dQaccum.get_slice(tidx)
        tdQsdQaccum = smem_thr_copy_dQaccum.partition_D(sdQaccum)

        # Barrier serializing P/dS smem reuse between the two consumer warp groups.
        PdS_barrier = cutlass.pipeline.NamedBarrier(
            barrier_id=int(NamedBarrierBwd.PdS), num_threads=self.num_mma_threads
        )
        score_mod_fn = partial(
            self.apply_score_mod,
            thr_mma_SdP=thr_mma_SdP,
            softmax_scale=softmax_scale,
            aux_tensors=aux_tensors,
            fastdiv_mods=fastdiv_mods,
        )
        score_mod_bwd_fn = partial(
            self.apply_score_mod_bwd,
            thr_mma_SdP=thr_mma_SdP,
            softmax_scale=softmax_scale,
            aux_tensors=aux_tensors,
            fastdiv_mods=fastdiv_mods,
        )

        # Bind everything that is invariant across m_blocks; the per-tile loop only
        # supplies m_block, pipeline states and the per-tile mask/score closures.
        mma_one_m_block_all = partial(
            self.mma_one_m_block,
            warp_group_idx=warp_group_idx,
            mma_qk_fn=mma_qk_fn,
            mma_dov_fn=mma_dov_fn,
            mma_pdo_fn=mma_pdo_fn,
            mma_dsq_fn=mma_dsq_fn,
            mma_dsk_fn=mma_dsk_fn,
            copy_P_r2s=copy_P_r2s,
            copy_dS_r2s=copy_dS_r2s,
            pipeline_Q=pipeline_Q,
            pipeline_dO=pipeline_dO,
            tLSEsLSE=tLSEsLSE,
            tLSEsdPsum=tLSEsdPsum,
            tdQsdQaccum=tdQsdQaccum,
            softmax_scale_log2=softmax_scale_log2,
            PdS_barrier=PdS_barrier,
            # acc_dV=acc_dV,
            # acc_dK=acc_dK,
        )

        consumer_state_Q = cutlass.pipeline.make_pipeline_state(
            cutlass.pipeline.PipelineUserType.Consumer, self.Q_stage
        )
        consumer_state_dO = cutlass.pipeline.make_pipeline_state(
            cutlass.pipeline.PipelineUserType.Consumer, self.dO_stage
        )
        tile_scheduler = TileSchedulerCls()
        work_tile = tile_scheduler.initial_work_tile_info()
        while work_tile.is_valid_tile:
            n_block, head_idx, batch_idx, _ = work_tile.tile_idx
            seqlen = SeqlenInfoCls(batch_idx)
            mask = AttentionMaskCls(seqlen)
            score_mod_fn_cur = partial(
                score_mod_fn,
                batch_idx=batch_idx,
                head_idx=head_idx,
                n_block=n_block,
                seqlen_info=seqlen,
            )
            score_mod_bwd_fn_cur = partial(
                score_mod_bwd_fn,
                batch_idx=batch_idx,
                head_idx=head_idx,
                n_block=n_block,
                seqlen_info=seqlen,
            )
            m_block_min, m_block_max = block_info.get_m_block_min_max(seqlen, n_block)

            # Mirrors the producer's process_tile decision so consumer and producer
            # agree on which tiles consume pipeline stages.
            if const_expr(not self.use_block_sparsity):
                process_tile = const_expr(not self.is_local) or m_block_min < m_block_max
            else:
                total_m_block_cnt = get_total_q_block_count_bwd(
                    blocksparse_tensors,
                    batch_idx,
                    head_idx,
                    n_block,
                    subtile_factor=self.subtile_factor,
                    m_block_max=m_block_max,
                )
                process_tile = total_m_block_cnt > Int32(0)

            if process_tile:
                if const_expr(not self.use_block_sparsity):
                    mask_fn = partial(
                        mask.apply_mask,
                        batch_idx=batch_idx,
                        head_idx=head_idx,
                        n_block=n_block,
                        thr_mma=thr_mma_SdP,
                        mask_seqlen=True,
                        mask_causal=self.is_causal,
                        mask_local=self.is_local,
                        mask_mod=self.mask_mod,
                        aux_tensors=aux_tensors,
                        fastdiv_mods=fastdiv_mods,
                    )
                    # First iteration zero-initializes dK/dV; later ones accumulate.
                    dKV_accumulate = False
                    for m_block in cutlass.range(m_block_min, m_block_max, unroll=1):
                        consumer_state_Q, consumer_state_dO = mma_one_m_block_all(
                            m_block,
                            consumer_state_Q,
                            consumer_state_dO,
                            mask_fn=mask_fn,
                            score_mod_fn=score_mod_fn_cur,
                            score_mod_bwd_fn=score_mod_bwd_fn_cur,
                            dKV_accumulate=dKV_accumulate,
                        )
                        dKV_accumulate = True
                else:
                    consumer_state_Q, consumer_state_dO = consume_block_sparse_mma_bwd_sm90(
                        blocksparse_tensors,
                        batch_idx,
                        head_idx,
                        n_block,
                        consumer_state_Q,
                        consumer_state_dO,
                        mma_one_m_block_all,
                        mask,
                        self.mask_mod,
                        is_causal=self.is_causal,
                        is_local=self.is_local,
                        thr_mma_SdP=thr_mma_SdP,
                        score_mod_fn=score_mod_fn_cur,
                        score_mod_bwd_fn=score_mod_bwd_fn_cur,
                        subtile_factor=self.subtile_factor,
                        m_block_max=m_block_max,
                        aux_tensors=aux_tensors,
                        fastdiv_mods=fastdiv_mods,
                    )

                # Non-GQA path folds softmax_scale into dK here; the GQA path defers
                # scaling (dK is written as an f32 accumulation in epilogue_dKV).
                if const_expr(self.qhead_per_kvhead == 1):
                    acc_dK.store(acc_dK.load() * softmax_scale)
                self.epilogue_dKV(
                    acc_dV,
                    mdV,
                    sV,
                    acc_dK,
                    mdK,
                    sK,
                    seqlen,
                    tma_atom_dK,
                    tma_atom_dV,
                    tiled_mma_dK,
                    tiled_mma_dV,
                    tidx,
                    n_block,
                    head_idx,
                    batch_idx,
                    qhead_per_kvhead_divmod,
                )
            else:
                # Block sparsity: KV tile with zero Q blocks produces no dK/dV; write zeros.
                if const_expr(self.use_block_sparsity):
                    acc_dK.fill(0.0)
                    acc_dV.fill(0.0)
                    self.epilogue_dKV(
                        acc_dV,
                        mdV,
                        sV,
                        acc_dK,
                        mdK,
                        sK,
                        seqlen,
                        tma_atom_dK,
                        tma_atom_dV,
                        tiled_mma_dK,
                        tiled_mma_dV,
                        tidx,
                        n_block,
                        head_idx,
                        batch_idx,
                        qhead_per_kvhead_divmod,
                    )

            tile_scheduler.advance_to_next_work()
            work_tile = tile_scheduler.get_current_work()

        # Drain any outstanding bulk-async stores issued by the store warp (warp 4).
        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
        if warp_idx == 4:
            cute.arch.cp_async_bulk_wait_group(0, read=True)
1251
    @cute.jit
    def mma_one_m_block(
        self,
        m_block: Int32,
        consumer_state_Q: cutlass.pipeline.PipelineState | pipeline.PipelineStateSimple,
        consumer_state_dO: cutlass.pipeline.PipelineState | pipeline.PipelineStateSimple,
        warp_group_idx: Int32,
        mma_qk_fn: Callable,
        mma_dov_fn: Callable,
        mma_pdo_fn: Callable,
        mma_dsq_fn: Callable,
        mma_dsk_fn: Callable,
        copy_P_r2s: Optional[Callable],
        copy_dS_r2s: Callable,
        pipeline_Q: cutlass.pipeline.PipelineAsync,
        pipeline_dO: cutlass.pipeline.PipelineAsync,
        tLSEsLSE: cute.Tensor,
        tLSEsdPsum: cute.Tensor,
        tdQsdQaccum: cute.Tensor,
        softmax_scale_log2: Float32,
        PdS_barrier: cutlass.pipeline.NamedBarrier,
        mask_fn: Optional[Callable] = None,
        score_mod_fn: Optional[Callable] = None,
        score_mod_bwd_fn: Optional[Callable] = None,
        dKV_accumulate: Boolean = True,
    ):
        """Process a single m_block: the five GEMMs plus the softmax pointwise work.

        Sequence: S = Q @ K^T and dP = dO @ V^T, P = exp2(S*scale_log2 - LSE),
        dS = P * (dP - dPsum), then dV += P^T @ dO, dQ = dS @ K (handed off to the
        dQaccum store warp via shared memory and named barriers), and dK += dS^T @ Q.
        Returns the advanced (consumer_state_Q, consumer_state_dO).
        """
        # NOTE(review): the producer in `load` selects the dedicated dO state when
        # Q_stage != dO_stage, but here the dedicated state is used when the stages
        # are EQUAL (in which case both states hold identical values anyway).
        # Looks inverted relative to the producer — confirm intent for Q_stage != dO_stage.
        consumer_state_dO_cur = (
            consumer_state_dO if const_expr(self.Q_stage == self.dO_stage) else consumer_state_Q
        )
        smem_idx_Q = consumer_state_Q.index
        smem_idx_dO = consumer_state_dO_cur.index if const_expr(self.dO_stage > 1) else 0
        smem_idx_PdS = smem_idx_Q if const_expr(self.PdS_stage > 1) else 0
        # (1) [GEMM 1] S = Q @ K^T
        pipeline_Q.consumer_wait(consumer_state_Q, pipeline_Q.consumer_try_wait(consumer_state_Q))
        acc_S = mma_qk_fn(A_idx=smem_idx_Q, wg_wait=-1)
        tLSErLSE = copy_utils.load_s2r(tLSEsLSE[None, smem_idx_Q])
        # (2) [GEMM 2] dP = dO @ V.T
        pipeline_dO.consumer_wait(
            consumer_state_dO_cur, pipeline_dO.consumer_try_wait(consumer_state_dO_cur)
        )
        # wg_wait=1 leaves GEMM 2 in flight while we do the pointwise work on S.
        acc_dP = mma_dov_fn(A_idx=smem_idx_Q, wg_wait=1)

        # score_mod_bwd needs the pre-modification scores; snapshot them first.
        if const_expr(self.score_mod_bwd is not None):
            acc_S_pre = cute.make_fragment_like(acc_S)
            cute.autovec_copy(acc_S, acc_S_pre)

        if const_expr(self.score_mod is not None):
            score_mod_fn(acc_S, m_block=m_block)

        # (3) [Pointwise 1] P = exp(S - LSE)
        if cutlass.const_expr(mask_fn is not None):
            mask_fn(acc_S, m_block=m_block)
        acc_S_mn = layout_utils.reshape_acc_to_mn(acc_S, transpose=self.SdP_swapAB)
        for r in cutlass.range_constexpr(cute.size(acc_S_mn, mode=[0])):
            for c in cutlass.range(cute.size(acc_S_mn, mode=[1]), unroll_full=True):
                # exp2 with the log2-scaled softmax scale avoids a per-element mul+exp.
                acc_S_mn[r, c] = cute.math.exp2(
                    acc_S_mn[r, c] * softmax_scale_log2 - tLSErLSE[r], fastmath=True
                )
        tLSErdPsum = copy_utils.load_s2r(tLSEsdPsum[None, smem_idx_dO])

        # Convert P from f32 -> f16
        tdVrP = utils.cvt_f16(layout_utils.reshape_acc_to_frgA(acc_S), self.dtype)
        # R2S for P
        if const_expr(not self.mma_dkv_is_rs):
            # sync to ensure P has already been used in the previous iteration before overwriting
            if const_expr(self.PdS_stage == 1):
                PdS_barrier.arrive_and_wait()
            copy_P_r2s(tdVrP, dst_idx=smem_idx_PdS)

        # (4) [Pointwise 2] dS = P*(dP-dPsum)
        # Wait for GEMM 2 (dP) to finish before reading acc_dP.
        warpgroup.wait_group(0)
        acc_dP_mn = layout_utils.reshape_acc_to_mn(acc_dP, transpose=self.SdP_swapAB)
        for r in cutlass.range_constexpr(cute.size(acc_dP_mn, mode=[0])):
            for c in cutlass.range(cute.size(acc_dP_mn, mode=[1]), unroll_full=True):
                acc_dP_mn[r, c] = acc_S_mn[r, c] * (acc_dP_mn[r, c] - tLSErdPsum[r])

        if const_expr(self.score_mod_bwd is not None):
            score_mod_bwd_fn(acc_dP, acc_S_pre, m_block=m_block)

        # Convert dS from f32 -> f16
        tdKrdS = utils.cvt_f16(layout_utils.reshape_acc_to_frgA(acc_dP), self.dtype)

        # If there's double buffering on dS, we don't need to sync here.
        # Otherwise we might have WG1 writing to dS before WG2 is done reading from it during MmadQ.
        # But because both WGs have to sync at the end of the loop and double buffering,
        # this race condition is not possible.
        # This sync is to ensure (1) P is written in case of !mma_dkv_is_rs and
        # (2) dS is already read by the Mma in the previous iteration in case of mma_dkv_is_rs.
        if const_expr(not self.mma_dkv_is_rs or (self.PdS_stage == 1 and self.mma_dkv_is_rs)):
            cute.arch.fence_view_async_shared()
            PdS_barrier.arrive_and_wait()

        # R2S for dS
        copy_dS_r2s(tdKrdS, dst_idx=smem_idx_PdS)

        # (5) [GEMM 3] dV += P.T @ dO
        if const_expr(not self.mma_dkv_is_rs):
            mma_pdo_fn(
                A_idx=smem_idx_PdS, B_idx=smem_idx_dO, zero_init=not dKV_accumulate, wg_wait=-1
            )
        else:
            mma_pdo_fn(tCrA=tdVrP, B_idx=smem_idx_dO, zero_init=not dKV_accumulate, wg_wait=-1)

        # smem fence to make sure sdS is written before it's read by WGMMA
        cute.arch.fence_view_async_shared()
        PdS_barrier.arrive_and_wait()
        # (6) [GEMM 4] dQ = dS @ K
        acc_dQ = mma_dsk_fn(A_idx=smem_idx_PdS, wg_wait=1)
        # if cute.arch.thread_idx()[0] == 128: cute.print_tensor(acc_dV)
        pipeline_dO.consumer_release(consumer_state_dO_cur)  # release dO as dV mma is done

        # (7) [GEMM 5] dK += dS.T @ Q
        if const_expr(not self.mma_dkv_is_rs):
            mma_dsq_fn(
                A_idx=smem_idx_PdS, B_idx=smem_idx_Q, zero_init=not dKV_accumulate, wg_wait=1
            )
        else:
            mma_dsq_fn(tCrA=tdKrdS, B_idx=smem_idx_Q, zero_init=not dKV_accumulate, wg_wait=1)
        # if cute.arch.thread_idx()[0] == 128: cute.print_tensor(acc_dQ)

        # Hand dQ to the dQaccum store warp: wait for its smem slot to be free
        # (dQEmpty), copy registers -> smem, then signal dQFull. Barriers are
        # per-warp-group (WG0/WG1 each pair with the store warp).
        cute.arch.barrier(
            barrier_id=int(NamedBarrierBwd.dQEmptyWG0) + warp_group_idx,
            number_of_threads=self.num_threads_per_warp_group + cute.arch.WARP_SIZE,
        )
        tdQrdQaccum_flat = cute.make_tensor(acc_dQ.iterator, cute.make_layout(tdQsdQaccum.shape))
        cute.autovec_copy(tdQrdQaccum_flat, tdQsdQaccum)
        cute.arch.fence_view_async_shared()
        cute.arch.barrier_arrive(
            barrier_id=int(NamedBarrierBwd.dQFullWG0) + warp_group_idx,
            number_of_threads=self.num_threads_per_warp_group + cute.arch.WARP_SIZE,
        )

        # Wait for GEMM 5 before releasing Q (dK mma reads sQ).
        warpgroup.wait_group(0)
        # if cute.arch.thread_idx()[0] == 128: cute.print_tensor(acc_dK)
        pipeline_Q.consumer_release(consumer_state_Q)
        # if cute.arch.thread_idx()[0] % 32 == 0: cute.printf("tidx = {}, m_block = {}, after pipeline_Q consumer release", cute.arch.thread_idx()[0], m_block)

        consumer_state_Q.advance()
        consumer_state_dO.advance()
        return consumer_state_Q, consumer_state_dO
1392
    @cute.jit
    def epilogue_dKV(
        self,
        acc_dV: cute.Tensor,
        mdV: cute.Tensor,
        sV: cute.Tensor,
        acc_dK: cute.Tensor,
        mdK: cute.Tensor,
        sK: cute.Tensor,
        seqlen: SeqlenInfoQK,
        tma_atom_dK: cute.CopyAtom,
        tma_atom_dV: cute.CopyAtom,
        tiled_mma_dK: cute.TiledMma,
        tiled_mma_dV: cute.TiledMma,
        tidx: Int32,
        n_block: Int32,
        head_idx: Int32,
        batch_idx: Int32,
        qhead_per_kvhead_divmod: Optional[FastDivmodDivisor] = None,
    ):
        """Write the dK/dV accumulators for one KV tile back to global memory.

        Non-GQA path (qhead_per_kvhead == 1): stage each accumulator through the
        (now-free) sV / sK smem buffers and issue TMA stores, warp 4 driving the
        bulk-async traffic. GQA path: recast sV's smem as an f32 staging buffer and
        use cp.async bulk *reduce-add* so multiple query heads accumulate into the
        shared per-KV-head dK/dV accumulators.
        """
        epi_barrier = cutlass.pipeline.NamedBarrier(
            barrier_id=int(NamedBarrierBwd.Epilogue), num_threads=self.num_mma_threads
        )
        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())

        if const_expr(self.qhead_per_kvhead == 1):
            mdV_cur = mdV[None, None, head_idx, batch_idx]
            mdK_cur = mdK[None, None, head_idx, batch_idx]
            gdK = cute.local_tile(mdK_cur, (self.tile_n, self.tile_hdim), (n_block, 0))
            gdV = cute.local_tile(mdV_cur, (self.tile_n, self.tile_hdimv), (n_block, 0))
            # Reuse sK/sV as staging for the outgoing dK/dV TMA stores.
            store_dK, _, _ = copy_utils.tma_get_copy_fn(
                tma_atom_dK, 0, cute.make_layout(1), sK, gdK, single_stage=True
            )
            store_dV, _, _ = copy_utils.tma_get_copy_fn(
                tma_atom_dV, 0, cute.make_layout(1), sV, gdV, single_stage=True
            )
            sdV = sV if const_expr(not self.dKV_swapAB) else layout_utils.transpose_view(sV)
            sdK = sK if const_expr(not self.dKV_swapAB) else layout_utils.transpose_view(sK)
            copy_dV_r2s, _, _ = copy_utils.get_smem_store_C(
                tiled_mma_dV, sdV, tidx, self.arch, transpose=self.dKV_swapAB
            )
            copy_dK_r2s, _, _ = copy_utils.get_smem_store_C(
                tiled_mma_dK, sdK, tidx, self.arch, transpose=self.dKV_swapAB
            )
            # Phase 1: dV. Wait until at most one prior bulk-store group is in
            # flight before overwriting sV, then stage and store.
            cute.arch.cp_async_bulk_wait_group(1, read=True)
            epi_barrier.arrive_and_wait()
            copy_dV_r2s(acc_dV, dst_idx=None)
            cute.arch.fence_view_async_shared()
            epi_barrier.arrive_and_wait()
            # Only warp 4 issues the TMA store / commit.
            if warp_idx == 4:
                store_dV()
                cute.arch.cp_async_bulk_commit_group()
            # Phase 2: dK, same structure as phase 1.
            cute.arch.cp_async_bulk_wait_group(1, read=True)
            epi_barrier.arrive_and_wait()
            copy_dK_r2s(acc_dK, dst_idx=None)
            cute.arch.fence_view_async_shared()
            epi_barrier.arrive_and_wait()
            if warp_idx == 4:
                store_dK()
                cute.arch.cp_async_bulk_commit_group()
        else:
            # GQA: dK/dV are flattened f32 accumulators, split across MMA warp groups.
            sdKaccum_shape0 = self.tile_n * self.tile_hdim // self.num_mma_warp_groups
            sdVaccum_shape0 = self.tile_n * self.tile_hdimv // self.num_mma_warp_groups
            sdKaccum_layout = cute.make_layout((sdKaccum_shape0, self.num_mma_warp_groups))
            sdVaccum_layout = cute.make_layout((sdVaccum_shape0, self.num_mma_warp_groups))
            head_idx_kv = head_idx // qhead_per_kvhead_divmod
            mdKaccum_cur = mdK[None, head_idx_kv, batch_idx]
            gdKaccum_ = cute.local_tile(mdKaccum_cur, (self.tile_n * self.tile_hdim,), (n_block,))
            gdKaccum = cute.flat_divide(gdKaccum_, (sdKaccum_shape0,))
            mdVaccum_cur = mdV[None, head_idx_kv, batch_idx]
            gdVaccum_ = cute.local_tile(mdVaccum_cur, (self.tile_n * self.tile_hdimv,), (n_block,))
            gdVaccum = cute.flat_divide(gdVaccum_, (sdVaccum_shape0,))
            # These two overlap each other (both alias sV's smem, recast to f32;
            # the two phases below are serialized by waits + barriers).
            sVaccum_ptr = cute.recast_ptr(sV.iterator, dtype=Float32)
            sdKaccum = cute.make_tensor(sVaccum_ptr, sdKaccum_layout)
            sdVaccum = cute.make_tensor(sVaccum_ptr, sdVaccum_layout)
            # 128-bit vectorized register->smem copy, one warp group per column.
            tiled_copy_dKVaccum_r2s = cute.make_tiled_copy_tv(
                cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Float32, num_bits_per_copy=128),
                cute.make_layout((self.num_threads_per_warp_group, self.num_mma_warp_groups)),
                cute.make_layout(128 // Float32.width),
            )
            thr_copy_dKVaccum_r2s = tiled_copy_dKVaccum_r2s.get_slice(tidx)
            tdKsdKaccum = thr_copy_dKVaccum_r2s.partition_D(sdKaccum)
            tdVsdVaccum = thr_copy_dKVaccum_r2s.partition_D(sdVaccum)

            # Phase 1: dK. Drain all outstanding bulk traffic before reusing the
            # aliased smem, stage the accumulator, then reduce-add into gmem.
            cute.arch.cp_async_bulk_wait_group(0, read=True)
            epi_barrier.arrive_and_wait()
            tdKrdKaccum_flat = cute.make_tensor(acc_dK.iterator, tdKsdKaccum.shape)
            cute.autovec_copy(tdKrdKaccum_flat, tdKsdKaccum)
            cute.arch.fence_view_async_shared()
            epi_barrier.arrive_and_wait()
            if warp_idx == 4:
                with cute.arch.elect_one():
                    for wg_idx in cutlass.range_constexpr(self.num_mma_warp_groups):
                        copy_utils.cpasync_reduce_bulk_add_f32(
                            sdKaccum[None, wg_idx].iterator,
                            gdKaccum[None, wg_idx].iterator,
                            self.tma_copy_bytes["dKacc"] // self.num_mma_warp_groups,
                        )
                cute.arch.cp_async_bulk_commit_group()

            # Phase 2: dV, same structure (must wait for the dK reduce-add since
            # sdVaccum aliases the same smem).
            cute.arch.cp_async_bulk_wait_group(0, read=True)
            epi_barrier.arrive_and_wait()
            tdVrdVaccum_flat = cute.make_tensor(acc_dV.iterator, tdVsdVaccum.shape)
            cute.autovec_copy(tdVrdVaccum_flat, tdVsdVaccum)
            cute.arch.fence_view_async_shared()
            epi_barrier.arrive_and_wait()
            if warp_idx == 4:
                with cute.arch.elect_one():
                    for wg_idx in cutlass.range_constexpr(self.num_mma_warp_groups):
                        copy_utils.cpasync_reduce_bulk_add_f32(
                            sdVaccum[None, wg_idx].iterator,
                            gdVaccum[None, wg_idx].iterator,
                            self.tma_copy_bytes["dVacc"] // self.num_mma_warp_groups,
                        )
                cute.arch.cp_async_bulk_commit_group()
1509
    @cute.jit
    def dQaccum_store(
        self,
        mdQaccum: cute.Tensor,
        sdQaccum: cute.Tensor,
        block_info: BlockInfo,
        TileSchedulerCls: cutlass.Constexpr[Callable],
        SeqlenInfoCls: cutlass.Constexpr[Callable],
        blocksparse_tensors: Optional[BlockSparseTensors] = None,
    ):
        """Epilogue loop that reduces the dQ accumulator from shared memory into
        global memory via bulk async reduce-add copies.

        Walks the tile scheduler's work items; for each (n_block, head, batch) it
        iterates the valid m-blocks and, per MMA warp group, synchronizes with the
        producer warp groups through named barriers before issuing a
        cp.async.bulk reduce-add from the smem staging buffer into gmem.

        :param mdQaccum: global dQ accumulator, indexed as [elem, head, batch]
        :param sdQaccum: shared-memory dQ staging buffer, one slice per MMA warp group
        :param block_info: supplies the m-block range for a given seqlen / n_block
        :param TileSchedulerCls: constexpr factory producing the tile scheduler
        :param SeqlenInfoCls: constexpr factory producing per-batch seqlen info
        :param blocksparse_tensors: block-sparsity metadata; required when
            ``self.use_block_sparsity`` is set
        """
        tile_scheduler = TileSchedulerCls()
        work_tile = tile_scheduler.initial_work_tile_info()
        while work_tile.is_valid_tile:
            n_block, head_idx, batch_idx, _ = work_tile.tile_idx
            seqlen = SeqlenInfoCls(batch_idx)
            mdQaccum_cur = mdQaccum[None, head_idx, batch_idx]
            gdQaccum_ = cute.local_tile(mdQaccum_cur, (self.tile_m * self.tile_hdim,), (None,))
            # (M * K / WG, WG, _): each m-block's flat dQ tile is split evenly
            # across the MMA warp groups.
            gdQaccum = cute.flat_divide(
                gdQaccum_, (self.tile_m * self.tile_hdim // self.num_mma_warp_groups,)
            )
            m_block_min, m_block_max = block_info.get_m_block_min_max(seqlen, n_block)
            if const_expr(not self.use_block_sparsity):
                # Local attention can produce an empty m-block range; skip then.
                process_tile = const_expr(not self.is_local) or m_block_min < m_block_max
                loop_count = m_block_max - m_block_min
            else:
                total_block_cnt = get_total_q_block_count_bwd(
                    blocksparse_tensors,
                    batch_idx,
                    head_idx,
                    n_block,
                    subtile_factor=self.subtile_factor,
                    m_block_max=m_block_max,
                )
                process_tile = total_block_cnt > Int32(0)

            if process_tile:
                if const_expr(not self.use_block_sparsity):
                    for iter_idx in cutlass.range(loop_count, unroll=1):
                        m_block = m_block_min + iter_idx
                        m_block_safe = m_block

                        # Drain the oldest outstanding bulk-copy group for each warp
                        # group's smem slice, then signal (dQEmpty barrier) that the
                        # slice may be refilled by the producer warp group.
                        for warp_group_idx in cutlass.range_constexpr(self.num_mma_warp_groups):
                            cute.arch.cp_async_bulk_wait_group(
                                self.num_mma_warp_groups - 1 - warp_group_idx, read=True
                            )
                            cute.arch.barrier_arrive(
                                barrier_id=int(NamedBarrierBwd.dQEmptyWG0) + warp_group_idx,
                                number_of_threads=self.num_threads_per_warp_group
                                + cute.arch.WARP_SIZE,
                            )

                        # Wait (dQFull barrier) for each warp group to publish its dQ
                        # slice; a single elected thread then issues the bulk
                        # reduce-add from smem into gmem and commits the copy group.
                        for warp_group_idx in cutlass.range_constexpr(self.num_mma_warp_groups):
                            cute.arch.barrier(
                                barrier_id=int(NamedBarrierBwd.dQFullWG0) + warp_group_idx,
                                number_of_threads=self.num_threads_per_warp_group
                                + cute.arch.WARP_SIZE,
                            )
                            with cute.arch.elect_one():
                                copy_utils.cpasync_reduce_bulk_add_f32(
                                    sdQaccum[None, warp_group_idx].iterator,
                                    gdQaccum[None, warp_group_idx, m_block_safe].iterator,
                                    self.tma_copy_bytes["dQ"],
                                )
                            cute.arch.cp_async_bulk_commit_group()
                else:
                    # Block-sparse path: the helper visits only the active q-blocks.
                    dQaccum_store_block_sparse_bwd_sm90(
                        blocksparse_tensors,
                        batch_idx,
                        head_idx,
                        n_block,
                        sdQaccum,
                        gdQaccum,
                        subtile_factor=self.subtile_factor,
                        m_block_max=m_block_max,
                        num_mma_warp_groups=self.num_mma_warp_groups,
                        num_threads_per_warp_group=self.num_threads_per_warp_group,
                        tma_copy_bytes_dQ=self.tma_copy_bytes["dQ"],
                    )
            tile_scheduler.advance_to_next_work()
            work_tile = tile_scheduler.get_current_work()

        # Drain every outstanding bulk copy before the epilogue exits.
        cute.arch.cp_async_bulk_wait_group(0, read=True)
build/torch-cuda/flash_fwd.py ADDED
The diff for this file is too large to render. See raw diff
 
build/torch-cuda/flash_fwd_combine.py ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
2
+ # A reimplementation of https://github.com/Dao-AILab/flash-attention/blob/main/hopper/flash_fwd_combine_kernel.h
3
+ # from Cutlass C++ to Cute-DSL.
4
+ import math
5
+ from typing import Type, Optional
6
+ from functools import partial
7
+
8
+ import cuda.bindings.driver as cuda
9
+
10
+ import cutlass
11
+ import cutlass.cute as cute
12
+ from cutlass.cute.nvgpu import cpasync
13
+ from cutlass import Float32, Int32, const_expr
14
+
15
+ from . import utils
16
+ from .cute_dsl_utils import assume_tensor_aligned
17
+ from .seqlen_info import SeqlenInfo
18
+ from cutlass.cute import FastDivmodDivisor
19
+
20
+
21
+ class FlashAttentionForwardCombine:
22
+ def __init__(
23
+ self,
24
+ dtype: Type[cutlass.Numeric],
25
+ dtype_partial: Type[cutlass.Numeric],
26
+ head_dim: int,
27
+ m_block_size: int = 8,
28
+ k_block_size: int = 64,
29
+ log_max_splits: int = 4,
30
+ num_threads: int = 256,
31
+ stages: int = 4,
32
+ ):
33
+ """
34
+ Forward combine kernel for split attention computation.
35
+
36
+ :param dtype: output data type
37
+ :param dtype_partial: partial accumulation data type
38
+ :param head_dim: head dimension
39
+ :param m_block_size: m block size
40
+ :param k_block_size: k block size
41
+ :param log_max_splits: log2 of maximum splits
42
+ :param num_threads: number of threads
43
+ :param varlen: whether using variable length sequences
44
+ :param stages: number of pipeline stages
45
+ """
46
+ self.dtype = dtype
47
+ self.dtype_partial = dtype_partial
48
+ self.head_dim = head_dim
49
+ self.m_block_size = m_block_size
50
+ self.k_block_size = k_block_size
51
+ self.max_splits = 1 << log_max_splits
52
+ self.num_threads = num_threads
53
+ self.is_even_k = head_dim % k_block_size == 0
54
+ self.stages = stages
55
+
56
+ @staticmethod
57
+ def can_implement(
58
+ dtype,
59
+ dtype_partial,
60
+ head_dim,
61
+ m_block_size,
62
+ k_block_size,
63
+ log_max_splits,
64
+ num_threads,
65
+ ) -> bool:
66
+ """Check if the kernel can be implemented with the given parameters."""
67
+ if dtype not in [cutlass.Float16, cutlass.BFloat16, cutlass.Float32]:
68
+ return False
69
+ if dtype_partial not in [cutlass.Float16, cutlass.BFloat16, Float32]:
70
+ return False
71
+ if head_dim % 8 != 0:
72
+ return False
73
+ if num_threads % 32 != 0:
74
+ return False
75
+ if m_block_size % 8 != 0:
76
+ return False
77
+ max_splits = 1 << log_max_splits
78
+ if max_splits > 256:
79
+ return False
80
+ if (m_block_size * max_splits) % num_threads != 0:
81
+ return False
82
+ return True
83
+
84
    def _setup_attributes(self):
        """Build the tiled-copy atoms and shared-memory layouts used by the kernel.

        Populates: ``gmem_tiled_copy_O_partial`` (async G2S load of partial O),
        ``gmem_tiled_copy_O`` (universal store of final O),
        ``gmem_tiled_copy_LSE`` (async G2S load of partial LSE),
        ``s2r_tiled_copy_LSE`` (smem-to-register LSE transpose copy),
        ``smem_threads_per_col_lse``, ``smem_layout_lse`` and ``smem_layout_o``.
        """
        # GMEM copy setup for O partial: 128-bit vectorized async copies.
        universal_copy_bits = 128
        async_copy_elems = universal_copy_bits // self.dtype_partial.width
        assert self.k_block_size % async_copy_elems == 0

        # Widest gmem row tile (in elements) that evenly divides k_block_size.
        k_block_gmem = (
            128 if self.k_block_size % 128 == 0 else (64 if self.k_block_size % 64 == 0 else 32)
        )
        gmem_threads_per_row = k_block_gmem // async_copy_elems
        assert self.num_threads % gmem_threads_per_row == 0

        # Async copy atom for O partial load
        atom_async_copy_partial = cute.make_copy_atom(
            cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL),
            self.dtype_partial,
            num_bits_per_copy=universal_copy_bits,
        )
        tOpartial_layout = cute.make_ordered_layout(
            (self.num_threads // gmem_threads_per_row, gmem_threads_per_row),
            order=(1, 0),
        )
        vOpartial_layout = cute.make_layout((1, async_copy_elems))  # async_copy_elems vals per load
        self.gmem_tiled_copy_O_partial = cute.make_tiled_copy_tv(
            atom_async_copy_partial, tOpartial_layout, vOpartial_layout
        )

        # GMEM copy setup for final O (use universal copy for store)
        atom_universal_copy = cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(),
            self.dtype,
            num_bits_per_copy=async_copy_elems * self.dtype.width,
        )
        self.gmem_tiled_copy_O = cute.make_tiled_copy_tv(
            atom_universal_copy,
            tOpartial_layout,
            vOpartial_layout,  # same thread/value tiling as the partial load
        )

        # LSE copy setup with async copy (alignment = 1)
        lse_copy_bits = Float32.width  # 1 element per copy, width is in bits
        # Widest row tile that evenly divides m_block_size (128/64/32/16/8).
        m_block_smem = (
            128
            if self.m_block_size % 128 == 0
            else (
                64
                if self.m_block_size % 64 == 0
                else (
                    32
                    if self.m_block_size % 32 == 0
                    else (16 if self.m_block_size % 16 == 0 else 8)
                )
            )
        )
        gmem_threads_per_row_lse = m_block_smem
        assert self.num_threads % gmem_threads_per_row_lse == 0

        # Async copy atom for LSE load
        atom_async_copy_lse = cute.make_copy_atom(
            cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.ALWAYS),
            Float32,
            num_bits_per_copy=lse_copy_bits,
        )
        tLSE_layout = cute.make_ordered_layout(
            (self.num_threads // gmem_threads_per_row_lse, gmem_threads_per_row_lse),
            order=(1, 0),
        )
        vLSE_layout = cute.make_layout(1)
        self.gmem_tiled_copy_LSE = cute.make_tiled_copy_tv(
            atom_async_copy_lse, tLSE_layout, vLSE_layout
        )

        # ///////////////////////////////////////////////////////////////////////////////
        # Shared memory
        # ///////////////////////////////////////////////////////////////////////////////

        # Shared memory to register copy for LSE
        self.smem_threads_per_col_lse = self.num_threads // m_block_smem
        assert 32 % self.smem_threads_per_col_lse == 0  # Must divide warp size

        s2r_layout_atom_lse = cute.make_ordered_layout(
            (self.smem_threads_per_col_lse, self.num_threads // self.smem_threads_per_col_lse),
            order=(0, 1),
        )
        self.s2r_tiled_copy_LSE = cute.make_tiled_copy_tv(
            cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Float32),
            s2r_layout_atom_lse,
            cute.make_layout(1),
        )

        # LSE shared memory layout with swizzling to avoid bank conflicts
        # This works for kBlockMSmem = 8, 16, 32, 64, 128, no bank conflicts
        if const_expr(m_block_smem == 8):
            smem_lse_swizzle = cute.make_swizzle(5, 0, 5)
        elif const_expr(m_block_smem == 16):
            smem_lse_swizzle = cute.make_swizzle(4, 0, 4)
        else:
            smem_lse_swizzle = cute.make_swizzle(3, 2, 3)
        smem_layout_atom_lse = cute.make_composed_layout(
            smem_lse_swizzle, 0, cute.make_ordered_layout((8, m_block_smem), order=(1, 0))
        )
        self.smem_layout_lse = cute.tile_to_shape(
            smem_layout_atom_lse, (self.max_splits, self.m_block_size), (0, 1)
        )

        # O partial shared memory layout (simple layout for pipeline stages)
        self.smem_layout_o = cute.make_ordered_layout(
            (self.m_block_size, self.k_block_size, self.stages), order=(1, 0, 2)
        )
193
+
194
    @cute.jit
    def __call__(
        self,
        mO_partial: cute.Tensor,
        mLSE_partial: cute.Tensor,
        mO: cute.Tensor,
        mLSE: Optional[cute.Tensor] = None,
        cu_seqlens: Optional[cute.Tensor] = None,
        seqused: Optional[cute.Tensor] = None,
        num_splits_dynamic_ptr: Optional[cute.Tensor] = None,
        semaphore_to_reset: Optional[cute.Tensor] = None,
        stream: cuda.CUstream = None,
    ):
        """Validate the inputs, permute layouts into kernel order, and launch
        the combine kernel on ``stream``.

        Tensors arrive in user order (batch/seqlen/heads/headdim or varlen
        total_q order) and are re-viewed — without data movement — into the
        (seqlen, d, ...) order the kernel indexes.

        :param mO_partial: per-split partial outputs
        :param mLSE_partial: per-split partial log-sum-exp values
        :param mO: final output tensor (written by the kernel)
        :param mLSE: optional final LSE tensor
        :param cu_seqlens: optional cumulative sequence lengths (varlen mode)
        :param seqused: optional per-batch used sequence lengths
        :param num_splits_dynamic_ptr: optional per-batch dynamic split counts
        :param semaphore_to_reset: optional semaphore zeroed by the last CTA
        :param stream: CUDA stream for the launch
        :raises TypeError: on dtype mismatches
        :raises ValueError: on rank mismatches
        """
        # Type checking
        if const_expr(not (mO_partial.element_type == self.dtype_partial)):
            raise TypeError("O partial tensor must match dtype_partial")
        if const_expr(not (mO.element_type == self.dtype)):
            raise TypeError("O tensor must match dtype")
        if const_expr(mLSE_partial.element_type not in [Float32]):
            raise TypeError("LSE partial tensor must be Float32")
        if const_expr(mLSE is not None and mLSE.element_type not in [Float32]):
            raise TypeError("LSE tensor must be Float32")

        # Shape validation - input tensors are in user format, need to be converted to kernel format
        if const_expr(len(mO_partial.shape) not in [4, 5]):
            raise ValueError(
                "O partial tensor must have 4 or 5 dimensions: (num_splits, batch, seqlen, nheads, headdim) or (num_splits, total_q, nheads, headdim)"
            )
        if const_expr(len(mLSE_partial.shape) not in [3, 4]):
            raise ValueError(
                "LSE partial tensor must have 3 or 4 dimensions: (num_splits, batch, seqlen, nheads) or (num_splits, total_q, nheads)"
            )
        if const_expr(len(mO.shape) not in [3, 4]):
            raise ValueError(
                "O tensor must have 3 or 4 dimensions: (batch, seqlen, nheads, headdim) or (total_q, nheads, headdim)"
            )
        if const_expr(mLSE is not None and len(mLSE.shape) not in [2, 3]):
            raise ValueError(
                "LSE tensor must have 2 or 3 dimensions: (batch, seqlen, nheads) or (total_q, nheads)"
            )

        mO_partial, mO = [assume_tensor_aligned(t) for t in (mO_partial, mO)]
        # (num_splits, b, seqlen, h, d) -> (seqlen, d, num_splits, h, b)
        # or (num_splits, total_q, h, d) -> (total_q, d, num_splits, h)
        O_partial_layout_transpose = (
            [2, 4, 0, 3, 1] if const_expr(cu_seqlens is None) else [1, 3, 0, 2]
        )
        mO_partial = cute.make_tensor(
            mO_partial.iterator, cute.select(mO_partial.layout, mode=O_partial_layout_transpose)
        )
        # (b, seqlen, h, d) -> (seqlen, d, h, b) or (total_q, h, d) -> (total_q, d, h)
        O_layout_transpose = [1, 3, 2, 0] if const_expr(cu_seqlens is None) else [0, 2, 1]
        mO = cute.make_tensor(mO.iterator, cute.select(mO.layout, mode=O_layout_transpose))
        # (num_splits, b, seqlen, h) -> (seqlen, num_splits, h, b)
        # or (num_splits, total_q, h) -> (total_q, num_splits, h)
        LSE_partial_layout_transpose = [2, 0, 3, 1] if const_expr(cu_seqlens is None) else [1, 0, 2]
        mLSE_partial = cute.make_tensor(
            mLSE_partial.iterator,
            cute.select(mLSE_partial.layout, mode=LSE_partial_layout_transpose),
        )
        # (b, seqlen, h) -> (seqlen, h, b) or (total_q, h) -> (total_q, h)
        LSE_layout_transpose = [1, 2, 0] if const_expr(cu_seqlens is None) else [0, 1]
        mLSE = (
            cute.make_tensor(mLSE.iterator, cute.select(mLSE.layout, mode=LSE_layout_transpose))
            if mLSE is not None
            else None
        )

        # Determine if we have variable length sequences
        varlen = const_expr(cu_seqlens is not None or seqused is not None)

        self._setup_attributes()

        @cute.struct
        class SharedStorage:
            sLSE: cute.struct.Align[
                cute.struct.MemRange[Float32, cute.cosize(self.smem_layout_lse)], 128
            ]
            sMaxValidSplit: cute.struct.Align[cute.struct.MemRange[Int32, self.m_block_size], 128]
            sO: cute.struct.Align[
                cute.struct.MemRange[self.dtype_partial, cute.cosize(self.smem_layout_o)], 128
            ]

        smem_size = SharedStorage.size_in_bytes()

        # Grid dimensions: (ceil_div(seqlen * nheads, m_block), ceil_div(head_dim, k_block), batch)
        seqlen = mO_partial.shape[0]
        num_head = mO_partial.shape[3]
        batch_size = (
            mO_partial.shape[4]
            if const_expr(cu_seqlens is None)
            else Int32(cu_seqlens.shape[0] - 1)
        )

        # Create FastDivmodDivisor objects for efficient division
        seqlen_divmod = FastDivmodDivisor(seqlen)
        head_divmod = FastDivmodDivisor(num_head)

        grid_dim = (
            cute.ceil_div(seqlen * num_head, self.m_block_size),
            cute.ceil_div(self.head_dim, self.k_block_size),
            batch_size,
        )

        self.kernel(
            mO_partial,
            mLSE_partial,
            mO,
            mLSE,
            cu_seqlens,
            seqused,
            num_splits_dynamic_ptr,
            semaphore_to_reset,
            SharedStorage,
            self.smem_layout_lse,
            self.smem_layout_o,
            self.gmem_tiled_copy_O_partial,
            self.gmem_tiled_copy_O,
            self.gmem_tiled_copy_LSE,
            self.s2r_tiled_copy_LSE,
            seqlen_divmod,
            head_divmod,
            varlen,
        ).launch(
            grid=grid_dim,
            block=[self.num_threads, 1, 1],
            smem=smem_size,
            stream=stream,
        )
323
+
324
    @cute.kernel
    def kernel(
        self,
        mO_partial: cute.Tensor,
        mLSE_partial: cute.Tensor,
        mO: cute.Tensor,
        mLSE: Optional[cute.Tensor],
        cu_seqlens: Optional[cute.Tensor],
        seqused: Optional[cute.Tensor],
        num_splits_dynamic_ptr: Optional[cute.Tensor],
        semaphore_to_reset: Optional[cute.Tensor],
        SharedStorage: cutlass.Constexpr,
        smem_layout_lse: cute.Layout | cute.ComposedLayout,
        smem_layout_o: cute.Layout,
        gmem_tiled_copy_O_partial: cute.TiledCopy,
        gmem_tiled_copy_O: cute.TiledCopy,
        gmem_tiled_copy_LSE: cute.TiledCopy,
        s2r_tiled_copy_LSE: cute.TiledCopy,
        seqlen_divmod: FastDivmodDivisor,
        head_divmod: FastDivmodDivisor,
        varlen: cutlass.Constexpr[bool],
    ):
        """Device kernel: combine per-split partial O / LSE into the final O (and LSE).

        Per CTA: loads the (max_splits, m_block) tile of partial LSEs into smem,
        computes the softmax-rescaling weight of each split, then streams the
        partial O tiles through a multi-stage smem pipeline, accumulating the
        weighted sum in registers before storing the final O tile to gmem.

        NOTE(review): ``head_divmod`` is accepted but not referenced in this body;
        row decomposition uses ``seqlen_divmod`` — presumably kept for interface
        symmetry, verify against callers before removing.
        """
        # Thread and block indices
        tidx, _, _ = cute.arch.thread_idx()
        m_block, k_block, batch_idx = cute.arch.block_idx()

        # ///////////////////////////////////////////////////////////////////////////////
        # Get shared memory buffer
        # ///////////////////////////////////////////////////////////////////////////////
        smem = cutlass.utils.SmemAllocator()
        storage = smem.allocate(SharedStorage)
        sLSE = storage.sLSE.get_tensor(smem_layout_lse)
        sMaxValidSplit = storage.sMaxValidSplit.get_tensor((self.m_block_size,))
        sO = storage.sO.get_tensor(smem_layout_o)

        # Handle semaphore reset: only the very last CTA in the grid zeroes it.
        if const_expr(semaphore_to_reset is not None):
            if (
                tidx == 0
                and m_block == cute.arch.grid_dim()[0] - 1
                and k_block == cute.arch.grid_dim()[1] - 1
                and batch_idx == cute.arch.grid_dim()[2] - 1
            ):
                semaphore_to_reset[0] = 0

        # Get number of splits (dynamic per-batch count if provided, else static)
        num_splits = (
            num_splits_dynamic_ptr[batch_idx]
            if const_expr(num_splits_dynamic_ptr is not None)
            else mLSE_partial.shape[1]
        )
        # Handle variable length sequences using SeqlenInfo
        seqlen_info = SeqlenInfo.create(
            batch_idx=batch_idx,
            seqlen_static=mO_partial.shape[0],
            cu_seqlens=cu_seqlens,
            seqused=seqused,
        )
        seqlen, offset = seqlen_info.seqlen, seqlen_info.offset

        # Extract number of heads (head index will be determined dynamically)
        num_head = mO_partial.shape[3]
        max_idx = seqlen * num_head

        # Skip the CTA entirely when the dynamic split count is 1 (nothing to
        # combine) or, under varlen, when this m_block lies past the used rows.
        if (const_expr(num_splits_dynamic_ptr is None) or num_splits > 1) and (
            const_expr(not varlen) or m_block * self.m_block_size < max_idx
        ):
            # ===============================
            # Step 1: Load LSE_partial from gmem to shared memory
            # ===============================

            if const_expr(cu_seqlens is None):
                mLSE_partial_cur = mLSE_partial[None, None, None, batch_idx]
            else:
                mLSE_partial_cur = cute.domain_offset((offset, 0, 0), mLSE_partial)
            mLSE_partial_copy = cute.tiled_divide(mLSE_partial_cur, (1,))

            gmem_thr_copy_LSE = gmem_tiled_copy_LSE.get_slice(tidx)
            tLSEsLSE = gmem_thr_copy_LSE.partition_D(sLSE)

            # Create identity tensor for coordinate tracking
            cLSE = cute.make_identity_tensor((self.max_splits, self.m_block_size))
            tLSEcLSE = gmem_thr_copy_LSE.partition_S(cLSE)

            # Load LSE partial values; out-of-range splits are filled with -inf
            # so they contribute zero weight later.
            for m in cutlass.range(cute.size(tLSEcLSE, mode=[2]), unroll_full=True):
                mi = tLSEcLSE[0, 0, m][1]  # Get m coordinate
                idx = m_block * self.m_block_size + mi
                if idx < max_idx:
                    # Calculate actual sequence position and head using FastDivmodDivisor
                    if const_expr(not varlen):
                        head_idx, m_idx = divmod(idx, seqlen_divmod)
                    else:
                        head_idx = idx // seqlen
                        m_idx = idx - head_idx * seqlen
                    mLSE_partial_cur_copy = mLSE_partial_copy[None, m_idx, None, head_idx]
                    for s in cutlass.range(cute.size(tLSEcLSE, mode=[1]), unroll_full=True):
                        si = tLSEcLSE[0, s, 0][0]  # Get split coordinate
                        if si < num_splits:
                            cute.copy(
                                gmem_thr_copy_LSE,
                                mLSE_partial_cur_copy[None, si],
                                tLSEsLSE[None, s, m],
                            )
                        else:
                            tLSEsLSE[None, s, m].fill(-Float32.inf)
            # Don't need to zero out the rest of the LSEs, as we will not write the output to gmem
            cute.arch.cp_async_commit_group()

            # ===============================
            # Step 2: Load O_partial for pipeline stages
            # ===============================

            gmem_thr_copy_O_partial = gmem_tiled_copy_O_partial.get_slice(tidx)
            cO = cute.make_identity_tensor((self.m_block_size, self.k_block_size))
            tOcO = gmem_thr_copy_O_partial.partition_D(cO)
            tOsO_partial = gmem_thr_copy_O_partial.partition_D(sO)
            if const_expr(cu_seqlens is None):
                mO_partial_cur = mO_partial[None, None, None, None, batch_idx]
            else:
                mO_partial_cur = cute.domain_offset((offset, 0, 0, 0), mO_partial)

            # Precompute these values to avoid recomputing them in the loop
            num_rows = const_expr(cute.size(tOcO, mode=[1]))
            tOmidx = cute.make_fragment(num_rows, cutlass.Int32)
            tOhidx = cute.make_fragment(num_rows, cutlass.Int32)
            tOrOptr = cute.make_fragment(num_rows, cutlass.Int64)
            for m in cutlass.range(num_rows, unroll_full=True):
                mi = tOcO[0, m, 0][0]  # m coordinate
                idx = m_block * self.m_block_size + mi
                if const_expr(not varlen):
                    tOhidx[m], tOmidx[m] = divmod(idx, seqlen_divmod)
                else:
                    tOhidx[m] = idx // seqlen
                    tOmidx[m] = idx - tOhidx[m] * seqlen
                tOrOptr[m] = utils.elem_pointer(
                    mO_partial_cur, (tOmidx[m], k_block * self.k_block_size, 0, tOhidx[m])
                ).toint()
                # tOhidx == -1 marks rows past the end; they are skipped everywhere below.
                if idx >= max_idx:
                    tOhidx[m] = -1

            # k-predication mask, only needed when head_dim is not a multiple of k_block.
            tOpO = cute.make_fragment(cute.size(tOcO, [2]), cutlass.Boolean)
            if const_expr(not self.is_even_k):
                for k in cutlass.range(cute.size(tOpO), unroll_full=True):
                    tOpO[k] = tOcO[0, 0, k][1] < mO_partial.shape[1] - k_block * self.k_block_size
            # if cute.arch.thread_idx()[0] == 0 and k_block == 1: cute.print_tensor(tOpO)

            load_O_partial = partial(
                self.load_O_partial,
                gmem_tiled_copy_O_partial,
                tOrOptr,
                tOsO_partial,
                tOhidx,
                tOpO,
                tOcO,
                mO_partial_cur.layout,
            )

            # Load first few stages of O_partial (prologue of the smem pipeline)
            for stage in cutlass.range(self.stages - 1, unroll_full=True):
                if stage < num_splits:
                    load_O_partial(stage, stage)
                cute.arch.cp_async_commit_group()

            # ===============================
            # Step 3: Load and transpose LSE from smem to registers
            # ===============================

            # Wait for LSE and initial O partial stages to complete
            cute.arch.cp_async_wait_group(self.stages - 1)
            cute.arch.sync_threads()
            # if cute.arch.thread_idx()[0] == 0:
            #     # cute.print_tensor(sLSE)
            #     for i in range(64):
            #         cute.printf("sLSE[%d, 0] = %f", i, sLSE[i, 0])
            # cute.arch.sync_threads()

            s2r_thr_copy_LSE = s2r_tiled_copy_LSE.get_slice(tidx)
            ts2rsLSE = s2r_thr_copy_LSE.partition_S(sLSE)
            ts2rrLSE = cute.make_fragment_like(ts2rsLSE)
            cute.copy(s2r_tiled_copy_LSE, ts2rsLSE, ts2rrLSE)

            # ===============================
            # Step 4: Compute final LSE along split dimension
            # ===============================

            lse_sum = cute.make_fragment(cute.size(ts2rrLSE, mode=[2]), Float32)
            ts2rcLSE = s2r_thr_copy_LSE.partition_D(cLSE)
            # We compute the max valid split for each row to short-circuit the computation later
            max_valid_split = cute.make_fragment(cute.size(ts2rrLSE, mode=[2]), Int32)
            assert cute.size(ts2rrLSE, mode=[0]) == 1
            # Compute max, scales, and final LSE for each row
            for m in cutlass.range(cute.size(ts2rrLSE, mode=[2]), unroll_full=True):
                # Find max LSE value across splits
                threads_per_col = const_expr(self.smem_threads_per_col_lse)
                lse_max = cute.arch.warp_reduction_max(
                    ts2rrLSE[None, None, m]
                    .load()
                    .reduce(cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0),
                    threads_in_group=threads_per_col,
                )
                # if cute.arch.thread_idx()[0] == 0: cute.printf(lse_max)
                # Find max valid split index
                max_valid_idx = -1
                for s in cutlass.range(cute.size(ts2rrLSE, mode=[1]), unroll_full=True):
                    if ts2rrLSE[0, s, m] != -Float32.inf:
                        max_valid_idx = ts2rcLSE[0, s, 0][0]  # Get split coordinate
                # if cute.arch.thread_idx()[0] < 32: cute.printf(max_valid_idx)
                max_valid_split[m] = cute.arch.warp_reduction_max(
                    max_valid_idx, threads_in_group=threads_per_col
                )
                # Compute exp scales and sum
                lse_max_cur = (
                    0.0 if lse_max == -Float32.inf else lse_max
                )  # In case all local LSEs are -inf
                LOG2_E = math.log2(math.e)
                lse_sum_cur = 0.0
                for s in cutlass.range(cute.size(ts2rrLSE, mode=[1]), unroll_full=True):
                    # exp(lse - lse_max), computed via exp2 for speed.
                    scale = cute.math.exp2(
                        ts2rrLSE[0, s, m] * LOG2_E - (lse_max_cur * LOG2_E), fastmath=True
                    )
                    lse_sum_cur += scale
                    ts2rrLSE[0, s, m] = scale  # Store scale for later use
                lse_sum_cur = cute.arch.warp_reduction_sum(
                    lse_sum_cur, threads_in_group=threads_per_col
                )
                lse_sum[m] = cute.math.log(lse_sum_cur, fastmath=True) + lse_max
                # Normalize scales; the x != x clause guards against NaN sums.
                inv_sum = (
                    0.0 if (lse_sum_cur == 0.0 or lse_sum_cur != lse_sum_cur) else 1.0 / lse_sum_cur
                )
                ts2rrLSE[None, None, m].store(ts2rrLSE[None, None, m].load() * inv_sum)
            # Store the scales exp(lse - lse_logsum) back to smem
            cute.copy(s2r_tiled_copy_LSE, ts2rrLSE, ts2rsLSE)

            # Store max valid split to smem
            for m in cutlass.range(cute.size(ts2rrLSE, mode=[2]), unroll_full=True):
                if ts2rcLSE[0, 0, m][0] == 0:  # Only thread responsible for s=0 writes
                    mi = ts2rcLSE[0, 0, m][1]
                    if mi < self.m_block_size:
                        sMaxValidSplit[mi] = max_valid_split[m]

            # ===============================
            # Step 5: Store final LSE to gmem
            # ===============================

            if const_expr(mLSE is not None):
                if const_expr(cu_seqlens is None):
                    mLSE_cur = mLSE[None, None, batch_idx]
                else:
                    mLSE_cur = cute.domain_offset((offset, 0), mLSE)
                if k_block == 0:  # Only first k_block writes LSE when mLSE is provided
                    for m in cutlass.range(cute.size(ts2rrLSE, mode=[2]), unroll_full=True):
                        if ts2rcLSE[0, 0, m][0] == 0:  # Only thread responsible for s=0 writes
                            mi = ts2rcLSE[0, 0, m][1]
                            idx = m_block * self.m_block_size + mi
                            if idx < max_idx:
                                if const_expr(not varlen):
                                    head_idx, m_idx = divmod(idx, seqlen_divmod)
                                else:
                                    head_idx = idx // seqlen
                                    m_idx = idx - head_idx * seqlen
                                mLSE_cur[m_idx, head_idx] = lse_sum[m]

            # ===============================
            # Step 6: Read O_partial and accumulate final O
            # ===============================

            cute.arch.sync_threads()

            # Get max valid split for this thread
            thr_max_valid_split = sMaxValidSplit[tOcO[0, 0, 0][0]]
            for m in cutlass.range(1, cute.size(tOcO, mode=[1])):
                thr_max_valid_split = max(thr_max_valid_split, sMaxValidSplit[tOcO[0, m, 0][0]])

            tOrO_partial = cute.make_fragment_like(tOsO_partial[None, None, None, 0])
            tOrO = cute.make_fragment_like(tOrO_partial, Float32)
            tOrO.fill(0.0)

            stage_load = self.stages - 1
            stage_compute = 0

            # Main accumulation loop
            for s in cutlass.range(thr_max_valid_split + 1, unroll=4):
                # Get scales for this split
                scale = cute.make_fragment(num_rows, Float32)
                for m in cutlass.range(num_rows, unroll_full=True):
                    scale[m] = sLSE[s, tOcO[0, m, 0][0]]  # Get scale from smem

                # Load next stage if needed (commit unconditionally so the
                # wait_group count below stays consistent)
                split_to_load = s + self.stages - 1
                if split_to_load <= thr_max_valid_split:
                    load_O_partial(split_to_load, stage_load)
                cute.arch.cp_async_commit_group()
                stage_load = 0 if stage_load == self.stages - 1 else stage_load + 1

                # Wait for the current stage to be ready
                cute.arch.cp_async_wait_group(self.stages - 1)
                # We don't need __syncthreads() because each thread is just reading its own data from smem
                # Copy from smem to registers
                cute.autovec_copy(tOsO_partial[None, None, None, stage_compute], tOrO_partial)
                stage_compute = 0 if stage_compute == self.stages - 1 else stage_compute + 1

                # Accumulate scaled partial results
                for m in cutlass.range(num_rows, unroll_full=True):
                    if tOhidx[m] >= 0 and scale[m] > 0.0:
                        tOrO[None, m, None].store(
                            tOrO[None, m, None].load()
                            + scale[m] * tOrO_partial[None, m, None].load().to(Float32)
                        )

            # ===============================
            # Step 7: Write final O to gmem
            # ===============================

            rO = cute.make_fragment_like(tOrO, self.dtype)
            rO.store(tOrO.load().to(self.dtype))
            if const_expr(cu_seqlens is None):
                mO_cur = mO[None, None, None, batch_idx]
            else:
                mO_cur = cute.domain_offset((offset, 0, 0), mO)
            mO_cur = utils.domain_offset_aligned((0, k_block * self.k_block_size, 0), mO_cur)
            elems_per_store = const_expr(cute.size(gmem_tiled_copy_O.layout_tv_tiled[1]))
            # mO_cur_copy = cute.tiled_divide(mO_cur, (1, elems_per_store,))
            gmem_thr_copy_O = gmem_tiled_copy_O.get_slice(tidx)
            # Write final results
            for m in cutlass.range(num_rows, unroll_full=True):
                if tOhidx[m] >= 0:
                    mO_cur_copy = cute.tiled_divide(
                        mO_cur[tOmidx[m], None, tOhidx[m]], (elems_per_store,)
                    )
                    for k in cutlass.range(cute.size(tOcO, mode=[2]), unroll_full=True):
                        k_idx = tOcO[0, 0, k][1] // elems_per_store
                        if const_expr(self.is_even_k) or tOpO[k]:
                            cute.copy(gmem_thr_copy_O, rO[None, m, k], mO_cur_copy[None, k_idx])
660
+
661
    @cute.jit
    def load_O_partial(
        self,
        gmem_tiled_copy_O_partial: cute.TiledCopy,
        tOrOptr: cute.Tensor,
        tOsO_partial: cute.Tensor,
        tOhidx: cute.Tensor,
        tOpO: cute.Tensor,
        tOcO: cute.Tensor,
        mO_cur_partial_layout: cute.Layout,
        split: Int32,
        stage: Int32,
    ) -> None:
        """Async-load one split of partial O from gmem into one smem pipeline stage.

        :param gmem_tiled_copy_O_partial: tiled async G2S copy for partial O
        :param tOrOptr: per-row precomputed gmem base addresses (Int64)
        :param tOsO_partial: smem destination, last mode indexes the pipeline stage
        :param tOhidx: per-row head index; -1 marks rows past the valid range
        :param tOpO: per-k predication mask (used when head_dim % k_block != 0)
        :param tOcO: identity partition carrying (m, k) coordinates
        :param mO_cur_partial_layout: layout of the current partial-O view
        :param split: which split to load
        :param stage: which smem pipeline stage to fill
        """
        elems_per_load = const_expr(cute.size(gmem_tiled_copy_O_partial.layout_tv_tiled[1]))
        tOsO_partial_cur = tOsO_partial[None, None, None, stage]
        for m in cutlass.range(cute.size(tOcO, [1]), unroll_full=True):
            # Skip rows flagged invalid (tOhidx == -1).
            if tOhidx[m] >= 0:
                # Rebuild a tensor view from the precomputed per-row address;
                # the sliced layout keeps only the (d, split) modes.
                o_gmem_ptr = cute.make_ptr(
                    tOsO_partial.element_type, tOrOptr[m], cute.AddressSpace.gmem, assumed_align=16
                )
                mO_partial_cur = cute.make_tensor(
                    o_gmem_ptr, cute.slice_(mO_cur_partial_layout, (0, None, None, 0))
                )
                mO_partial_cur_copy = cute.tiled_divide(mO_partial_cur, (elems_per_load,))
                for k in cutlass.range(cute.size(tOcO, mode=[2]), unroll_full=True):
                    k_idx = tOcO[0, 0, k][1] // elems_per_load
                    if const_expr(self.is_even_k) or tOpO[k]:
                        cute.copy(
                            gmem_tiled_copy_O_partial,
                            mO_partial_cur_copy[None, k_idx, split],
                            tOsO_partial_cur[None, m, k],
                        )
build/torch-cuda/flash_fwd_sm100.py ADDED
The diff for this file is too large to render. See raw diff
 
build/torch-cuda/interface.py ADDED
@@ -0,0 +1,1855 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
2
+ # [2025-07-04] Version in Cute-DSL, for Hopper and Blackwell. You'll need to install nvidia-cutlass-dsl==4.2.0.
3
+
4
+ # Supported features:
5
+ # - BF16 & FP16 dtype
6
+ # - noncausal & causal attention
7
+ # - MHA, GQA, MQA
8
+ # - hdim 64, 96, 128.
9
+ # - (hdim_qk, hdim_v) = (192, 128) for Blackwell (i.e. DeepSeek shape)
10
+ # - varlen
11
+ # - sliding window
12
+ # - bwd pass for Ampere (will also run on Hopper/Blackwell, but will be slow)
13
+
14
+ # Features not supported yet:
15
+ # - split (i.e. FlashDecoding)
16
+ # - tuned block sizes
17
+ # - paged KV
18
+ # - append KV to existing KV cache
19
+ # - FP8
20
+ # - bwd pass optimized for Hopper/Blackwell
21
+
22
+ import os
23
+ import math
24
+ from functools import lru_cache
25
+ from typing import Optional, Tuple, Callable
26
+
27
+ import torch
28
+
29
+
30
+ import cuda.bindings.driver as cuda
31
+
32
+ import cutlass
33
+ import cutlass.cute as cute
34
+ from .cache_utils import get_jit_cache
35
+ from .testing import is_fake_mode
36
+
37
+
38
+ if os.environ.get("CUTE_DSL_PTXAS_PATH", None) is not None:
39
+ from . import cute_dsl_ptxas # noqa: F401
40
+
41
+ # Patch to dump ptx and then use system ptxas to compile to cubin
42
+ cute_dsl_ptxas.patch()
43
+
44
+
45
+ from . import utils
46
+ from .cute_dsl_utils import (
47
+ to_cute_tensor, to_cute_aux_tensor, get_aux_tensor_metadata, get_broadcast_dims,
48
+ )
49
+ from .flash_fwd import FlashAttentionForwardSm90
50
+ from .flash_fwd_sm100 import FlashAttentionForwardSm100
51
+ from .flash_bwd_preprocess import FlashAttentionBackwardPreprocess
52
+ from .flash_bwd import FlashAttentionBackwardSm80
53
+ from .flash_bwd_sm90 import FlashAttentionBackwardSm90
54
+ from .flash_bwd_sm100 import FlashAttentionBackwardSm100
55
+ from .flash_bwd_postprocess import FlashAttentionBackwardPostprocess
56
+ from .flash_fwd_combine import FlashAttentionForwardCombine
57
+
58
+ from .block_sparsity import (
59
+ BlockSparseTensorsTorch,
60
+ to_cute_block_sparse_tensors,
61
+ normalize_block_sparse_config,
62
+ normalize_block_sparse_config_bwd,
63
+ )
64
+
65
+ @lru_cache(maxsize=None)
66
+ def _get_device_arch():
67
+ """Cached device arch check."""
68
+ major, minor = torch.cuda.get_device_capability()
69
+ return major * 10 + minor
70
+
71
+ def maybe_contiguous(x):
72
+ return x.contiguous() if x is not None and x.stride(-1) != 1 else x
73
+
74
+
75
+ def _validate_tensor(t, name, expected_shape, expected_dtype, expected_device):
76
+ assert t.shape == expected_shape, f"{name} shape {t.shape} != expected {expected_shape}"
77
+ assert t.dtype == expected_dtype, f"{name} dtype {t.dtype} != expected {expected_dtype}"
78
+ assert t.device == expected_device, f"{name} device {t.device} != expected {expected_device}"
79
+ assert t.is_cuda, f"{name} must be on CUDA"
80
+
81
+
82
+ torch2cute_dtype_map = {
83
+ torch.float16: cutlass.Float16,
84
+ torch.bfloat16: cutlass.BFloat16,
85
+ torch.float32: cutlass.Float32,
86
+ }
87
+
88
+
89
+ def num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, max_splits):
90
+ # If num_n_blocks is too small, use 1 split. For example, we never split for hdim = 128 and seqlen_k = 512.
91
+ if num_n_blocks <= 4:
92
+ return 1
93
+
94
+ # NOTE: We should revisit this heuristic after persistence is supported for split KV.
95
+ # Sometimes, it's ideal to over-schedule splits for better efficiency.
96
+ return min(num_SMs // total_mblocks, max_splits, num_n_blocks)
97
+
98
+
99
+ def _flash_attn_fwd(
100
+ q: torch.Tensor,
101
+ k: torch.Tensor,
102
+ v: torch.Tensor,
103
+ cu_seqlens_q: Optional[torch.Tensor] = None,
104
+ cu_seqlens_k: Optional[torch.Tensor] = None,
105
+ seqused_q: Optional[torch.Tensor] = None,
106
+ seqused_k: Optional[torch.Tensor] = None,
107
+ max_seqlen_q: Optional[int] = None,
108
+ max_seqlen_k: Optional[int] = None,
109
+ page_table: Optional[torch.Tensor] = None,
110
+ softmax_scale: Optional[float] = None,
111
+ causal: bool = False,
112
+ softcap: Optional[float] = None,
113
+ window_size_left: Optional[int] = None,
114
+ window_size_right: Optional[int] = None,
115
+ learnable_sink: Optional[torch.Tensor] = None,
116
+ # m_block_size: int = 128,
117
+ # n_block_size: int = 64,
118
+ # num_threads: int = 128,
119
+ m_block_size: int = 128,
120
+ n_block_size: int = 128,
121
+ num_threads: int = 384,
122
+ num_splits: int = 1,
123
+ pack_gqa: Optional[bool] = None,
124
+ _arch: Optional[int] = None,
125
+ score_mod: Optional[Callable] = None,
126
+ mask_mod: Optional[Callable] = None,
127
+ block_sparse_tensors: Optional[BlockSparseTensorsTorch] = None,
128
+ return_lse: bool = False,
129
+ out: Optional[torch.Tensor] = None,
130
+ lse: Optional[torch.Tensor] = None,
131
+ aux_tensors: Optional[list[torch.Tensor]] = None,
132
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
133
+ """Forward pass for FlashAttention.
134
+
135
+ Args:
136
+ ...
137
+ score_mod: A callable that takes the attention scores and applies a modification.
138
+ mask_mod: A callable that takes token position information and selectively masks
139
+ block_sparse_tensors: A tuple of tensors used for block sparsity.
140
+ return_lse: Whether to return the log softmax of the attention scores. If set to True will always calculate
141
+ Note: the returned LSE currently does not support taking gradient.
142
+ out: Optional pre-allocated output tensor. If None, will be allocated internally.
143
+ lse: Optional pre-allocated log-sum-exp tensor. If None, will be allocated when needed.
144
+ aux_tensors: Some score_mods will want to read from global aux_tensors. This is how we thread them through to the inner kernel.
145
+ """
146
+ q, k, v = [maybe_contiguous(t) for t in (q, k, v)]
147
+ num_head, head_dim = q.shape[-2:]
148
+ if cu_seqlens_q is None:
149
+ batch_size, seqlen_q = q.shape[:2]
150
+ total_q = batch_size * seqlen_q
151
+ else:
152
+ batch_size = cu_seqlens_q.shape[0] - 1
153
+ seqlen_q = None
154
+ total_q = q.shape[0]
155
+ if page_table is not None:
156
+ assert cu_seqlens_k is None, "page_table is not supported with cu_seqlens_k"
157
+ assert page_table.dtype == torch.int32, "page_table must be int32"
158
+ assert page_table.stride(-1) == 1, "page_table must be contiguous in the last dimension"
159
+ max_num_pages_per_seq = page_table.shape[1]
160
+ assert page_table.shape == (batch_size, max_num_pages_per_seq)
161
+ num_pages, page_size = k.shape[:2]
162
+ seqlen_k = num_pages * page_size
163
+ else:
164
+ num_pages, page_size = None, None
165
+ seqlen_k = k.shape[-3]
166
+ num_head_kv = k.shape[-2]
167
+ head_dim_v = v.shape[-1]
168
+ if cu_seqlens_k is None:
169
+ if page_table is None:
170
+ assert k.shape == (batch_size, seqlen_k, num_head_kv, head_dim)
171
+ assert v.shape == (batch_size, seqlen_k, num_head_kv, head_dim_v)
172
+ else:
173
+ assert k.shape == (num_pages, page_size, num_head_kv, head_dim)
174
+ assert v.shape == (num_pages, page_size, num_head_kv, head_dim_v)
175
+ else:
176
+ assert k.shape == (seqlen_k, num_head_kv, head_dim)
177
+ assert v.shape == (seqlen_k, num_head_kv, head_dim_v)
178
+ assert cu_seqlens_k.shape == (batch_size + 1,), (
179
+ "cu_seqlens_k must have shape (batch_size + 1,)"
180
+ )
181
+
182
+ if cu_seqlens_q is not None:
183
+ assert cu_seqlens_q.shape == (batch_size + 1,), (
184
+ "cu_seqlens_q must have shape (batch_size + 1,)"
185
+ )
186
+ assert seqused_q is None or seqused_q.shape == (batch_size,), (
187
+ "seqused_q must have shape (batch_size,)"
188
+ )
189
+ assert seqused_k is None or seqused_k.shape == (batch_size,), (
190
+ "seqused_k must have shape (batch_size,)"
191
+ )
192
+ assert q.dtype in [torch.float16, torch.bfloat16], "inputs must be float16 or bfloat16"
193
+ assert q.dtype == k.dtype == v.dtype, "inputs must have the same dtype"
194
+ for t in [cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k]:
195
+ if t is not None:
196
+ assert t.dtype == torch.int32, (
197
+ "cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be int32"
198
+ )
199
+ assert t.stride(0) == 1, (
200
+ "cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be contiguous"
201
+ )
202
+ if learnable_sink is not None:
203
+ assert learnable_sink.shape == (num_head,)
204
+ assert learnable_sink.dtype == torch.bfloat16, "learnable_sink must be bfloat16"
205
+
206
+ assert all(
207
+ t is None or t.is_cuda
208
+ for t in (
209
+ q,
210
+ k,
211
+ v,
212
+ cu_seqlens_q,
213
+ cu_seqlens_k,
214
+ seqused_q,
215
+ seqused_k,
216
+ page_table,
217
+ learnable_sink,
218
+ )
219
+ ), "inputs must be on CUDA device"
220
+ assert num_head % num_head_kv == 0, "num_head must be divisible by num_head_kv"
221
+ assert head_dim <= 256, "head_dim must be less than or equal to 256"
222
+ alignment = 16 // q.element_size()
223
+ assert head_dim % alignment == 0, f"head_dim must be divisible by {alignment}"
224
+ assert head_dim_v % alignment == 0, f"head_dim_v must be divisible by {alignment}"
225
+ if softmax_scale is None:
226
+ softmax_scale = 1.0 / math.sqrt(head_dim)
227
+ if softcap == 0.0:
228
+ softcap = None
229
+ qhead_per_kvhead = num_head // num_head_kv
230
+ if pack_gqa is None:
231
+ pack_gqa = qhead_per_kvhead > 1
232
+
233
+ out_torch_dtype = q.dtype
234
+ device = q.device
235
+ q_batch_seqlen_shape = (batch_size, seqlen_q) if cu_seqlens_q is None else (total_q,)
236
+ lse_shape = (batch_size, num_head, seqlen_q) if cu_seqlens_q is None else (num_head, total_q)
237
+ requires_grad = q.requires_grad or k.requires_grad or v.requires_grad
238
+
239
+ if out is None:
240
+ out = torch.empty(
241
+ *q_batch_seqlen_shape, num_head, head_dim_v, dtype=out_torch_dtype, device=device
242
+ )
243
+ else:
244
+ _validate_tensor(out, "out", (*q_batch_seqlen_shape, num_head, head_dim_v), out_torch_dtype, device)
245
+
246
+ if lse is None:
247
+ lse = (
248
+ torch.empty(lse_shape, dtype=torch.float32, device=device)
249
+ if requires_grad or return_lse
250
+ else None
251
+ )
252
+ elif lse is not None:
253
+ _validate_tensor(lse, "lse", lse_shape, torch.float32, device)
254
+
255
+ dtype = torch2cute_dtype_map[q.dtype]
256
+ arch = _get_device_arch() if _arch is None else _arch
257
+
258
+ assert arch // 10 in [9, 10, 11], "Unsupported compute capability. Supported: 9.x, 10.x, 11.x"
259
+
260
+ use_block_sparsity = block_sparse_tensors is not None
261
+
262
+ if mask_mod is None:
263
+ if causal:
264
+ window_size_right = 0
265
+ if window_size_left is not None and window_size_right is not None and window_size_left + window_size_right < 0:
266
+ window_size_left = None
267
+ window_size_right = None
268
+ local = window_size_left is not None or window_size_right is not None
269
+ if window_size_left is not None or window_size_right is not None:
270
+ if window_size_left is None and window_size_right == 0:
271
+ causal, local = True, False
272
+ window_size_right = None
273
+ else:
274
+ causal, local = False, True
275
+ else:
276
+ causal, local = False, False
277
+
278
+ current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
279
+
280
+ if arch // 10 == 9: # TODO: tune block size according to hdim.
281
+ if head_dim == head_dim_v == 128 and not causal and not local and not use_block_sparsity:
282
+ n_block_size = 192
283
+
284
+ if arch // 10 in [10, 11]:
285
+ if (
286
+ pack_gqa
287
+ and (128 % qhead_per_kvhead != 0)
288
+ ):
289
+ pack_gqa = False
290
+ # TODO: fix GQA + SplitKV + non-varlen
291
+ if pack_gqa and num_splits != 1 and cu_seqlens_q is None:
292
+ pack_gqa = False
293
+
294
+ if max_seqlen_q is None:
295
+ max_seqlen_q = seqlen_q if cu_seqlens_q is None else total_q
296
+ if max_seqlen_k is None:
297
+ max_seqlen_k = seqlen_k
298
+ seqlen_q_packgqa = max_seqlen_q * qhead_per_kvhead
299
+ if arch // 10 == 10:
300
+ q_stage = 2 if seqlen_q_packgqa > m_block_size else 1
301
+ else:
302
+ q_stage = 1
303
+
304
+ if num_splits < 1:
305
+ m_block_size_effective = q_stage * m_block_size
306
+ seqlen_k_loaded = max_seqlen_k if not local else max(0, min(max_seqlen_k, window_size_right + window_size_left + 1 + m_block_size))
307
+ num_n_blocks = (seqlen_k_loaded + n_block_size - 1) // n_block_size
308
+ num_m_blocks = (seqlen_q_packgqa + m_block_size_effective - 1) // m_block_size_effective
309
+ total_mblocks = batch_size * num_head_kv * num_m_blocks
310
+ num_splits = num_splits_heuristic(
311
+ total_mblocks,
312
+ torch.cuda.get_device_properties(device).multi_processor_count,
313
+ num_n_blocks,
314
+ 128,
315
+ )
316
+
317
+ is_split_kv = num_splits > 1
318
+ if is_split_kv:
319
+ out_partial = torch.empty(num_splits, *q_batch_seqlen_shape, num_head, head_dim_v, dtype=torch.float32, device=device)
320
+ lse_partial = torch.empty(num_splits, *lse_shape, dtype=torch.float32, device=device)
321
+
322
+ # hash score and mask mods for compile cache
323
+ score_mod_hash = utils.hash_callable(score_mod) if score_mod is not None else False
324
+ mask_mod_hash = utils.hash_callable(mask_mod) if mask_mod is not None else False
325
+
326
+ if softcap is not None:
327
+ assert score_mod is None, "softcap and score_mod cannot be used together"
328
+ score_mod = utils.create_softcap_scoremod(softcap)
329
+
330
+ is_varlen = (
331
+ cu_seqlens_q is not None
332
+ or cu_seqlens_k is not None
333
+ or seqused_q is not None
334
+ or seqused_k is not None
335
+ )
336
+
337
+ if mask_mod is not None:
338
+ if is_varlen:
339
+ raise NotImplementedError(
340
+ "mask_mod with aux_tensors is not yet supported for varlen sequences. This will be fixed in a future PR."
341
+ )
342
+
343
+ if use_block_sparsity:
344
+ if is_varlen:
345
+ raise NotImplementedError(
346
+ "Block sparsity is not yet supported for varlen sequences. This will be fixed in a future PR."
347
+ )
348
+ # NB: pack_gqa requires block sparse head dim == 1 (broadcasted)
349
+ if pack_gqa and block_sparse_tensors.mask_block_cnt.shape[1] != 1:
350
+ pack_gqa = False
351
+ if is_split_kv:
352
+ raise NotImplementedError(
353
+ "Block sparsity is not yet supported with SplitKV. TODO: partition sparse block lists per split."
354
+ )
355
+
356
+ # See get_broadcast_dims for why this is needed in compile key
357
+ block_sparse_broadcast_pattern = None
358
+ normalized_block_sparse_tensors = None
359
+ q_subtile_factor = None
360
+ if block_sparse_tensors is not None:
361
+ if seqlen_q is None:
362
+ raise ValueError("Block sparsity requires fixed-length sequences (seqlen_q must be known).")
363
+ (
364
+ normalized_block_sparse_tensors,
365
+ block_sparse_broadcast_pattern,
366
+ q_subtile_factor,
367
+ ) = normalize_block_sparse_config(
368
+ block_sparse_tensors,
369
+ batch_size=batch_size,
370
+ num_head=num_head,
371
+ seqlen_q=seqlen_q,
372
+ seqlen_k=seqlen_k,
373
+ block_size=(m_block_size, n_block_size),
374
+ q_stage=q_stage,
375
+ )
376
+ if aux_tensors is not None:
377
+ aux_tensor_metadata = get_aux_tensor_metadata(aux_tensors)
378
+ else:
379
+ aux_tensor_metadata = None
380
+
381
+ compile_key = (
382
+ dtype,
383
+ head_dim,
384
+ head_dim_v,
385
+ qhead_per_kvhead,
386
+ causal,
387
+ score_mod_hash,
388
+ mask_mod_hash,
389
+ use_block_sparsity,
390
+ block_sparse_broadcast_pattern,
391
+ aux_tensor_metadata,
392
+ lse is None,
393
+ cu_seqlens_q is None,
394
+ cu_seqlens_k is None,
395
+ seqused_q is None,
396
+ seqused_k is None,
397
+ page_table is not None,
398
+ window_size_left is not None,
399
+ window_size_right is not None,
400
+ learnable_sink is not None,
401
+ m_block_size,
402
+ n_block_size,
403
+ q_stage,
404
+ num_threads,
405
+ is_split_kv,
406
+ pack_gqa,
407
+ arch,
408
+ page_size not in [None, 128], # paged KV non-TMA
409
+ q_subtile_factor,
410
+ )
411
+ if compile_key not in _flash_attn_fwd.compile_cache:
412
+ (
413
+ cu_seqlens_q_tensor,
414
+ cu_seqlens_k_tensor,
415
+ seqused_q_tensor,
416
+ seqused_k_tensor,
417
+ learnable_sink_tensor,
418
+ ) = [
419
+ to_cute_tensor(t, assumed_align=4, leading_dim=0)
420
+ if t is not None
421
+ else None
422
+ for t in (cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k, learnable_sink)
423
+ ]
424
+ page_table_tensor = (
425
+ to_cute_tensor(page_table, assumed_align=4, leading_dim=1)
426
+ if page_table is not None
427
+ else None
428
+ )
429
+ q_tensor, k_tensor, v_tensor, o_tensor = [
430
+ to_cute_tensor(t) for t in (q, k, v, out if not is_split_kv else out_partial)
431
+ ]
432
+ if is_split_kv:
433
+ lse_tensor = to_cute_tensor(lse_partial, assumed_align=4)
434
+ elif lse is not None:
435
+ lse_tensor = to_cute_tensor(lse, assumed_align=4)
436
+ else:
437
+ lse_tensor = None
438
+
439
+ sparse_tensors = None
440
+ if normalized_block_sparse_tensors is not None:
441
+ sparse_tensors = to_cute_block_sparse_tensors(normalized_block_sparse_tensors)
442
+
443
+ cute_aux_tensors = None
444
+ aux_tensor_metadata = None
445
+ if aux_tensors is not None:
446
+ cute_aux_tensors = [to_cute_aux_tensor(buf) for buf in aux_tensors]
447
+
448
+ if arch // 10 == 9:
449
+ assert page_table is None, "paged KV not supported on SM 9.0"
450
+ assert not is_split_kv, "SplitKV not supported on SM 9.0"
451
+ # fa_fwd = FlashAttentionForwardSm80(
452
+ fa_fwd = FlashAttentionForwardSm90(
453
+ dtype,
454
+ head_dim,
455
+ head_dim_v,
456
+ qhead_per_kvhead,
457
+ is_causal=causal,
458
+ is_local=local,
459
+ pack_gqa=pack_gqa,
460
+ tile_m=m_block_size,
461
+ tile_n=n_block_size,
462
+ # num_stages=1,
463
+ num_stages=2,
464
+ num_threads=num_threads,
465
+ Q_in_regs=False,
466
+ intra_wg_overlap=True,
467
+ mma_pv_is_rs=True,
468
+ mask_mod=mask_mod,
469
+ score_mod=score_mod,
470
+ has_aux_tensors=aux_tensors is not None,
471
+ q_subtile_factor=q_subtile_factor,
472
+ )
473
+ elif arch // 10 in [10, 11]:
474
+ head_dim_padded = int(math.ceil(head_dim / 16) * 16)
475
+ head_dim_v_padded = int(math.ceil(head_dim / 16) * 16)
476
+ use_2cta_instrs = (
477
+ not causal
478
+ and not local
479
+ and not is_split_kv
480
+ and cu_seqlens_q is None
481
+ and seqused_q is None
482
+ and not use_block_sparsity
483
+ and page_size in [None, 128]
484
+ and head_dim_padded == 128
485
+ and head_dim_v_padded == 128
486
+ )
487
+ fa_fwd = FlashAttentionForwardSm100(
488
+ head_dim,
489
+ head_dim_v,
490
+ qhead_per_kvhead=qhead_per_kvhead,
491
+ is_causal=causal,
492
+ is_local=local,
493
+ is_split_kv=is_split_kv,
494
+ pack_gqa=pack_gqa,
495
+ m_block_size=m_block_size,
496
+ n_block_size=n_block_size,
497
+ q_stage=q_stage,
498
+ is_persistent=not causal
499
+ and not local
500
+ and cu_seqlens_q is None
501
+ and seqused_q is None
502
+ and not is_split_kv,
503
+ score_mod=score_mod,
504
+ mask_mod=mask_mod,
505
+ has_aux_tensors=aux_tensors is not None,
506
+ paged_kv_non_tma=page_size not in [None, 128],
507
+ is_varlen_q=cu_seqlens_q is not None or seqused_q is not None,
508
+ q_subtile_factor=q_subtile_factor,
509
+ use_2cta_instrs=use_2cta_instrs,
510
+ )
511
+ else:
512
+ raise ValueError(
513
+ f"Unsupported compute capability: {arch}. Supported: 9.x, 10.x, 11.x"
514
+ )
515
+ # TODO: check @can_implement
516
+ _flash_attn_fwd.compile_cache[compile_key] = cute.compile(
517
+ fa_fwd,
518
+ q_tensor,
519
+ k_tensor,
520
+ v_tensor,
521
+ o_tensor,
522
+ lse_tensor,
523
+ softmax_scale,
524
+ current_stream,
525
+ cu_seqlens_q_tensor,
526
+ cu_seqlens_k_tensor,
527
+ seqused_q_tensor,
528
+ seqused_k_tensor,
529
+ page_table_tensor,
530
+ window_size_left,
531
+ window_size_right,
532
+ learnable_sink_tensor,
533
+ sparse_tensors,
534
+ cute_aux_tensors,
535
+ options="--enable-tvm-ffi",
536
+ )
537
+
538
+ # In "fake mode", we will take torch fake tensors as input and the expected behaviors are:
539
+ # - Use those fake metadata to populate compilation cache
540
+ # - Return "fake" output tensors, which could be needed in follow-up fake operations
541
+ # Thus, we skip the actual kernel invocation here.
542
+ if not is_fake_mode():
543
+ _flash_attn_fwd.compile_cache[compile_key](
544
+ q.detach(),
545
+ k.detach(),
546
+ v.detach(),
547
+ out.detach() if not is_split_kv else out_partial,
548
+ lse_partial if is_split_kv else lse,
549
+ softmax_scale,
550
+ current_stream,
551
+ cu_seqlens_q,
552
+ cu_seqlens_k,
553
+ seqused_q,
554
+ seqused_k,
555
+ page_table,
556
+ window_size_left,
557
+ window_size_right,
558
+ learnable_sink,
559
+ normalized_block_sparse_tensors[:4] if normalized_block_sparse_tensors is not None else None,
560
+ aux_tensors,
561
+ )
562
+ if is_split_kv:
563
+ _flash_attn_fwd_combine(
564
+ out_partial,
565
+ lse_partial.transpose(-1, -2),
566
+ out,
567
+ lse.transpose(-1, -2) if lse is not None else None,
568
+ cu_seqlens_q,
569
+ seqused_q,
570
+ )
571
+ return out, lse
572
+
573
+
574
+ _flash_attn_fwd.compile_cache = get_jit_cache("fwd")
575
+
576
+
577
+ def _flash_attn_bwd(
578
+ q: torch.Tensor,
579
+ k: torch.Tensor,
580
+ v: torch.Tensor,
581
+ out: torch.Tensor,
582
+ dout: torch.Tensor,
583
+ lse: torch.Tensor,
584
+ softmax_scale: Optional[float] = None,
585
+ causal: bool = False,
586
+ softcap: float = 0.0,
587
+ window_size_left: Optional[int] = None,
588
+ window_size_right: Optional[int] = None,
589
+ m_block_size: int = 64,
590
+ n_block_size: int = 128,
591
+ num_threads: int = 256,
592
+ pack_gqa: bool = False,
593
+ num_stages_Q: int = 2,
594
+ num_stages_dO: int = 2,
595
+ SdP_swapAB: bool = False,
596
+ dKV_swapAB: bool = False,
597
+ dQ_swapAB: bool = False,
598
+ AtomLayoutMSdP: int = 2,
599
+ AtomLayoutNdKV: int = 2,
600
+ AtomLayoutMdQ: int = 2,
601
+ V_in_regs: bool = False,
602
+ cu_seqlens_q: Optional[torch.Tensor] = None,
603
+ cu_seqlens_k: Optional[torch.Tensor] = None,
604
+ seqused_q: Optional[torch.Tensor] = None,
605
+ seqused_k: Optional[torch.Tensor] = None,
606
+ max_seqlen_q: Optional[int] = None,
607
+ max_seqlen_k: Optional[int] = None,
608
+ deterministic: bool = False,
609
+ dq: Optional[torch.Tensor] = None,
610
+ dk: Optional[torch.Tensor] = None,
611
+ dv: Optional[torch.Tensor] = None,
612
+ score_mod: Optional[Callable] = None,
613
+ score_mod_bwd: Optional[Callable] = None,
614
+ mask_mod: Optional[Callable] = None,
615
+ aux_tensors: Optional[list[torch.Tensor]] = None,
616
+ block_sparse_tensors: Optional[BlockSparseTensorsTorch] = None,
617
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
618
+ arch = _get_device_arch()
619
+ assert arch // 10 in [9, 10, 11], "Unsupported compute capability. Supported: 9.x, 10.x, 11.x"
620
+
621
+ num_head, head_dim = q.shape[-2:]
622
+
623
+ if causal:
624
+ window_size_right = 0
625
+ if window_size_left is not None and window_size_right is not None and window_size_left + window_size_right < 0:
626
+ window_size_left = None
627
+ window_size_right = None
628
+ local = window_size_left is not None or window_size_right is not None
629
+ if local:
630
+ if window_size_left is None and window_size_right == 0:
631
+ causal, local = True, False
632
+ window_size_right = None
633
+ else:
634
+ causal, local = False, True
635
+
636
+ if arch // 10 == 9:
637
+ m_block_size = 80 if not causal else 64
638
+ n_block_size = 128
639
+ num_stages_Q = 2
640
+ num_stages_dO = 2
641
+ num_stages_PdS = 2
642
+ SdP_swapAB = True
643
+ dKV_swapAB = False
644
+ dQ_swapAB = not causal
645
+ AtomLayoutMSdP = 1
646
+ AtomLayoutNdKV = 2
647
+ AtomLayoutMdQ = 1
648
+ cluster_size = 1
649
+ use_2cta_instrs = False
650
+ assert window_size_left is None and window_size_right is None, "local not supported yet on 9.x"
651
+ is_varlen = (
652
+ cu_seqlens_q is not None
653
+ or cu_seqlens_k is not None
654
+ or seqused_q is not None
655
+ or seqused_k is not None
656
+ )
657
+ assert not is_varlen, "varlen backward is not yet supported on sm90"
658
+ else:
659
+ m_block_size = 128
660
+ n_block_size = 128
661
+ dQ_swapAB = False
662
+ dKV_swapAB = False
663
+ AtomLayoutMdQ = 1
664
+ AtomLayoutNdKV = 1
665
+ disable_2cta = (
666
+ local
667
+ or score_mod is not None
668
+ or score_mod_bwd is not None
669
+ or mask_mod is not None
670
+ )
671
+ cluster_size = 2 if head_dim >= 128 and not disable_2cta else 1
672
+ use_2cta_instrs = cluster_size==2
673
+
674
+ q, k, v, out, dout, lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k = [
675
+ maybe_contiguous(t)
676
+ for t in (q, k, v, out, dout, lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k)
677
+ ]
678
+ if cu_seqlens_q is None:
679
+ batch_size, seqlen_q = q.shape[:2]
680
+ total_q = batch_size * seqlen_q
681
+ else:
682
+ batch_size = cu_seqlens_q.shape[0] - 1
683
+ total_q = q.shape[0]
684
+ seqlen_q = max_seqlen_q if max_seqlen_q is not None else total_q
685
+
686
+ if cu_seqlens_k is None:
687
+ batch_size, seqlen_k = k.shape[:2]
688
+ total_k = batch_size * seqlen_k
689
+ else:
690
+ batch_size = cu_seqlens_k.shape[0] - 1
691
+ total_k = k.shape[0]
692
+ seqlen_k = max_seqlen_k if max_seqlen_k is not None else total_k
693
+
694
+ num_head_kv = k.shape[-2]
695
+ head_dim_v = v.shape[-1]
696
+
697
+ use_block_sparsity = block_sparse_tensors is not None
698
+
699
+ # SM90 block-sparse backward: tile_m=64 is the GCD between a m_block_size that fits,
700
+ # the base block_m of 128 from forward, and block-sparse size for subtiling.
701
+ if arch // 10 == 9 and use_block_sparsity:
702
+ m_block_size = 64
703
+ # dQ_swapAB tuning: use False when m_block_size=64 (same as causal case)
704
+ dQ_swapAB = False
705
+
706
+ # NB: this could be derived from the block_sparse_tensors but for now we hardcode it to 2
707
+ subtile_factor = 2
708
+ seqlen_q_rounded = (seqlen_q + m_block_size - 1) // m_block_size * m_block_size
709
+ seqlen_k_rounded = (seqlen_k + n_block_size - 1) // n_block_size * n_block_size
710
+ num_n_blocks = seqlen_k_rounded // n_block_size
711
+ if cluster_size == 2 and num_n_blocks % cluster_size != 0:
712
+ seqlen_k_rounded = seqlen_k_rounded + n_block_size
713
+
714
+ if cu_seqlens_k is None:
715
+ assert k.shape == (batch_size, seqlen_k, num_head_kv, head_dim)
716
+ assert v.shape == (batch_size, seqlen_k, num_head_kv, head_dim_v)
717
+ else:
718
+ assert k.shape == (total_k, num_head_kv, head_dim)
719
+ assert v.shape == (total_k, num_head_kv, head_dim_v)
720
+ assert cu_seqlens_k.shape == (batch_size + 1,), (
721
+ "cu_seqlens_k must have shape (batch_size + 1,)"
722
+ )
723
+
724
+ if cu_seqlens_q is not None:
725
+ assert cu_seqlens_q.shape == (batch_size + 1,), (
726
+ "cu_seqlens_q must have shape (batch_size + 1,)"
727
+ )
728
+
729
+ assert out.shape == (total_q, num_head, head_dim_v)
730
+ assert dout.shape == (total_q, num_head, head_dim_v)
731
+ assert lse.shape == (num_head, total_q), "lse must have shape (num_head, total_q)"
732
+ else:
733
+ assert out.shape == (batch_size, seqlen_q, num_head, head_dim_v)
734
+ assert dout.shape == (batch_size, seqlen_q, num_head, head_dim_v)
735
+ assert lse.shape == (batch_size, num_head, seqlen_q), (
736
+ "lse must have shape (batch_size, num_head, seqlen_q)"
737
+ )
738
+
739
+ assert q.dtype in [torch.float16, torch.bfloat16], "inputs must be float16 or bfloat16"
740
+ assert q.dtype == k.dtype == v.dtype == out.dtype == dout.dtype, (
741
+ "inputs must have the same dtype"
742
+ )
743
+ for t in [cu_seqlens_q, cu_seqlens_k]:
744
+ if t is not None:
745
+ assert t.dtype == torch.int32, "cu_seqlens_q, cu_seqlens_k must be int32"
746
+ assert lse.dtype == torch.float32, "lse must be float32"
747
+ assert all(
748
+ t is None or t.is_cuda for t in (q, k, v, out, dout, lse, cu_seqlens_q, cu_seqlens_k)
749
+ ), "inputs must be on CUDA device"
750
+ assert num_head % num_head_kv == 0, "num_head must be divisible by num_head_kv"
751
+ assert head_dim <= 256, "head_dim must be less than or equal to 256"
752
+ alignment = 16 // q.element_size()
753
+ assert head_dim % alignment == 0, f"head_dim must be divisible by {alignment}"
754
+ assert head_dim_v % alignment == 0, f"head_dim_v must be divisible by {alignment}"
755
+ if softmax_scale is None:
756
+ softmax_scale = 1.0 / math.sqrt(head_dim)
757
+ qhead_per_kvhead = num_head // num_head_kv
758
+ if pack_gqa is None:
759
+ pack_gqa = qhead_per_kvhead > 1
760
+ # pack_gqa backward not yet supported in bwd
761
+ pack_gqa = False
762
+ if arch // 10 not in [10, 11]:
763
+ assert deterministic is False, "bwd deterministic only supported for sm100/sm110 for now"
764
+
765
+ if score_mod is not None:
766
+ assert score_mod_bwd is not None, "score_mod_bwd is required when score_mod is provided"
767
+ assert softcap == 0.0, "softcap and score_mod are mutually exclusive (different log2 scaling)"
768
+ assert cu_seqlens_q is None and cu_seqlens_k is None, (
769
+ "varlen + score_mod not supported in bwd yet"
770
+ )
771
+
772
+ device = q.device
773
+ out_torch_dtype = q.dtype
774
+
775
+ if dq is None:
776
+ dq = torch.empty_like(q)
777
+ else:
778
+ _validate_tensor(dq, "dq", q.shape, out_torch_dtype, device)
779
+
780
+ if dk is None:
781
+ dk = torch.empty_like(k)
782
+ else:
783
+ _validate_tensor(dk, "dk", k.shape, out_torch_dtype, device)
784
+
785
+ if dv is None:
786
+ dv = torch.empty_like(v)
787
+ else:
788
+ _validate_tensor(dv, "dv", v.shape, out_torch_dtype, device)
789
+
790
+ head_dim_rounded = (head_dim + 32 - 1) // 32 * 32
791
+
792
+ if cu_seqlens_q is None:
793
+ dq_accum = torch.empty(
794
+ batch_size,
795
+ num_head,
796
+ seqlen_q_rounded * head_dim_rounded,
797
+ dtype=torch.float32,
798
+ device=device,
799
+ )
800
+ dpsum = torch.empty(
801
+ batch_size, num_head, seqlen_q_rounded, dtype=torch.float32, device=device
802
+ )
803
+ lse_log2 = torch.empty(
804
+ batch_size, num_head, seqlen_q_rounded, dtype=torch.float32, device=device
805
+ )
806
+ else:
807
+ total_q_rounded_padded = (
808
+ (total_q + cu_seqlens_q.shape[0] * m_block_size - 1) // m_block_size * m_block_size
809
+ )
810
+ dq_accum = torch.empty(
811
+ num_head, total_q_rounded_padded * head_dim_rounded, dtype=torch.float32, device=device
812
+ )
813
+ dpsum = torch.empty(num_head, total_q_rounded_padded, dtype=torch.float32, device=device)
814
+ lse_log2 = torch.empty(num_head, total_q_rounded_padded, dtype=torch.float32, device=device)
815
+
816
+ dKV_postprocess = qhead_per_kvhead > 1
817
+ if dKV_postprocess:
818
+ head_dim_v_rounded = (head_dim_v + 32 - 1) // 32 * 32
819
+ if cu_seqlens_k is None:
820
+ dk_accum = torch.zeros(
821
+ batch_size,
822
+ num_head_kv,
823
+ seqlen_k_rounded * head_dim_rounded,
824
+ dtype=torch.float32,
825
+ device=device,
826
+ )
827
+ dv_accum = torch.zeros(
828
+ batch_size,
829
+ num_head_kv,
830
+ seqlen_k_rounded * head_dim_v_rounded,
831
+ dtype=torch.float32,
832
+ device=device,
833
+ )
834
+ else:
835
+ cluster_tile_n = cluster_size * n_block_size
836
+ total_k_rounded_padded = (
837
+ (total_k + cu_seqlens_k.shape[0] * cluster_tile_n - 1) // cluster_tile_n * cluster_tile_n
838
+ )
839
+ dk_accum = torch.zeros(
840
+ num_head_kv,
841
+ total_k_rounded_padded * head_dim_rounded,
842
+ dtype=torch.float32,
843
+ device=device,
844
+ )
845
+ dv_accum = torch.zeros(
846
+ num_head_kv,
847
+ total_k_rounded_padded * head_dim_v_rounded,
848
+ dtype=torch.float32,
849
+ device=device,
850
+ )
851
+
852
+ dtype = torch2cute_dtype_map[q.dtype]
853
+ current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
854
+
855
+ if deterministic:
856
+ dQ_semaphore = torch.zeros(batch_size, num_head, seqlen_q_rounded // m_block_size, cluster_size, dtype=torch.int32, device="cuda")
857
+ else:
858
+ dQ_semaphore = None
859
+
860
+ if deterministic and qhead_per_kvhead > 1:
861
+ dK_semaphore = torch.zeros(batch_size, num_head_kv, seqlen_k_rounded // n_block_size, 2, dtype=torch.int32, device="cuda")
862
+ dV_semaphore = torch.zeros(batch_size, num_head_kv, seqlen_k_rounded // n_block_size, 2, dtype=torch.int32, device="cuda")
863
+ else:
864
+ dK_semaphore = None
865
+ dV_semaphore = None
866
+
867
+ # Preprocess kernel: compute (o * dout).sum(dim=-1), lse * log2_e, and zero out dq_accum.
868
+ compile_key_pre = (
869
+ arch,
870
+ dtype,
871
+ head_dim,
872
+ head_dim_v,
873
+ m_block_size,
874
+ num_threads,
875
+ cu_seqlens_q is None,
876
+ seqused_q is None,
877
+ get_broadcast_dims(out),
878
+ get_broadcast_dims(dout),
879
+ )
880
+ if compile_key_pre not in _flash_attn_bwd.compile_cache_pre:
881
+ o_tensor, do_tensor = [to_cute_tensor(t) for t in (out, dout)]
882
+ dq_accum_tensor, dpsum_tensor, lse_log2_tensor = [
883
+ to_cute_tensor(t) for t in (dq_accum, dpsum, lse_log2)
884
+ ]
885
+ lse_tensor = to_cute_tensor(lse, assumed_align=4)
886
+ cu_seqlens_q_tensor, seqused_q_tensor = [
887
+ to_cute_tensor(t, assumed_align=4) if t is not None else None
888
+ for t in (cu_seqlens_q, seqused_q)
889
+ ]
890
+ fa_bwd_pre = FlashAttentionBackwardPreprocess(
891
+ dtype,
892
+ head_dim,
893
+ head_dim_v,
894
+ arch,
895
+ m_block_size,
896
+ num_threads=num_threads,
897
+ )
898
+ # TODO: check @can_implement
899
+ _flash_attn_bwd.compile_cache_pre[compile_key_pre] = cute.compile(
900
+ fa_bwd_pre,
901
+ o_tensor,
902
+ do_tensor,
903
+ dpsum_tensor,
904
+ lse_tensor,
905
+ lse_log2_tensor,
906
+ dq_accum_tensor,
907
+ cu_seqlens_q_tensor,
908
+ seqused_q_tensor,
909
+ current_stream,
910
+ options="--enable-tvm-ffi",
911
+ )
912
+ if not is_fake_mode():
913
+ _flash_attn_bwd.compile_cache_pre[compile_key_pre](
914
+ out,
915
+ dout,
916
+ dpsum,
917
+ lse,
918
+ lse_log2,
919
+ dq_accum,
920
+ cu_seqlens_q,
921
+ seqused_q,
922
+ current_stream,
923
+ )
924
+
925
+ # NB num_threads application for 3 kernels
926
+ # There are pre, main, post processing kernels, currenlty num_threads is only actually
927
+ # used for the pre proc, and then we hard code to 384 for the main and post proc, and we do
928
+ # before cache key gen
929
+ num_threads = 384
930
+
931
+ # Backward kernel: compute dk, dv, dq_accum.
932
+ score_mod_hash = utils.hash_callable(score_mod) if score_mod else False
933
+ score_mod_bwd_hash = utils.hash_callable(score_mod_bwd) if score_mod_bwd else False
934
+ mask_mod_hash = utils.hash_callable(mask_mod) if mask_mod else False
935
+ num_aux_tensors = len(aux_tensors) if aux_tensors else 0
936
+ cute_aux_tensors = None
937
+ if aux_tensors is not None:
938
+ cute_aux_tensors = [to_cute_tensor(buf, assumed_align=None, fully_dynamic=True) for buf in aux_tensors]
939
+
940
+ block_sparse_broadcast_pattern = None
941
+ normalized_block_sparse_tensors = None
942
+ if block_sparse_tensors is not None:
943
+ (
944
+ normalized_block_sparse_tensors,
945
+ block_sparse_broadcast_pattern,
946
+ ) = normalize_block_sparse_config_bwd(
947
+ block_sparse_tensors,
948
+ batch_size=batch_size,
949
+ num_head=num_head,
950
+ seqlen_q=seqlen_q,
951
+ seqlen_k=seqlen_k,
952
+ block_size=(m_block_size, n_block_size),
953
+ subtile_factor=subtile_factor,
954
+ )
955
+
956
+ if arch // 10 == 9:
957
+ compile_key = (
958
+ arch,
959
+ dtype,
960
+ head_dim,
961
+ head_dim_v,
962
+ qhead_per_kvhead,
963
+ causal,
964
+ softcap != 0.0,
965
+ m_block_size,
966
+ n_block_size,
967
+ num_threads,
968
+ pack_gqa,
969
+ num_stages_Q,
970
+ num_stages_dO,
971
+ SdP_swapAB,
972
+ dKV_swapAB,
973
+ dQ_swapAB,
974
+ AtomLayoutMSdP,
975
+ AtomLayoutNdKV,
976
+ AtomLayoutMdQ,
977
+ V_in_regs,
978
+ cu_seqlens_q is None,
979
+ cu_seqlens_k is None,
980
+ seqused_q is None,
981
+ seqused_k is None,
982
+ score_mod_hash,
983
+ score_mod_bwd_hash,
984
+ mask_mod_hash,
985
+ num_aux_tensors,
986
+ use_block_sparsity,
987
+ block_sparse_broadcast_pattern,
988
+ get_broadcast_dims(q),
989
+ get_broadcast_dims(k),
990
+ get_broadcast_dims(v),
991
+ get_broadcast_dims(dout),
992
+ )
993
+ else:
994
+ compile_key = (
995
+ arch,
996
+ dtype,
997
+ head_dim,
998
+ head_dim_v,
999
+ qhead_per_kvhead,
1000
+ causal,
1001
+ window_size_left is not None,
1002
+ window_size_right is not None,
1003
+ softcap != 0.0,
1004
+ m_block_size,
1005
+ n_block_size,
1006
+ num_threads,
1007
+ pack_gqa,
1008
+ cluster_size,
1009
+ use_2cta_instrs,
1010
+ deterministic,
1011
+ score_mod_hash,
1012
+ score_mod_bwd_hash,
1013
+ mask_mod_hash,
1014
+ num_aux_tensors,
1015
+ use_block_sparsity,
1016
+ block_sparse_broadcast_pattern,
1017
+ cu_seqlens_q is None,
1018
+ cu_seqlens_k is None,
1019
+ seqused_q is None,
1020
+ seqused_k is None,
1021
+ get_broadcast_dims(q),
1022
+ get_broadcast_dims(k),
1023
+ get_broadcast_dims(v),
1024
+ get_broadcast_dims(dout),
1025
+ )
1026
+ if compile_key not in _flash_attn_bwd.compile_cache:
1027
+ q_tensor, k_tensor, v_tensor, do_tensor, dq_tensor, dk_tensor, dv_tensor = [
1028
+ to_cute_tensor(t) for t in (q, k, v, dout, dq, dk, dv)
1029
+ ]
1030
+ dq_accum_tensor, dpsum_tensor, lse_log2_tensor = [
1031
+ to_cute_tensor(t) for t in (dq_accum, dpsum, lse_log2)
1032
+ ]
1033
+ if dKV_postprocess:
1034
+ dk_accum_tensor, dv_accum_tensor = [
1035
+ to_cute_tensor(t) for t in (dk_accum, dv_accum)
1036
+ ]
1037
+ cu_seqlens_q_tensor, cu_seqlens_k_tensor, seqused_q_tensor, seqused_k_tensor = [
1038
+ to_cute_tensor(t, assumed_align=4) if t is not None else None
1039
+ for t in (cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k)
1040
+ ]
1041
+ dQ_semaphore_tensor, dK_semaphore_tensor, dV_semaphore_tensor = [
1042
+ utils.convert_from_dlpack_leading_static(t.detach(), leading_dim=3, alignment=4, stride_order=t.dim_order())
1043
+ if t is not None else None
1044
+ for t in (dQ_semaphore, dK_semaphore, dV_semaphore)
1045
+ ]
1046
+ fa_bwd_sm80 = FlashAttentionBackwardSm80(
1047
+ dtype,
1048
+ head_dim,
1049
+ head_dim_v,
1050
+ qhead_per_kvhead,
1051
+ m_block_size,
1052
+ n_block_size,
1053
+ num_stages_Q,
1054
+ num_stages_dO,
1055
+ num_threads,
1056
+ pack_gqa,
1057
+ causal,
1058
+ SdP_swapAB,
1059
+ dKV_swapAB,
1060
+ dQ_swapAB,
1061
+ AtomLayoutMSdP,
1062
+ AtomLayoutNdKV,
1063
+ AtomLayoutMdQ,
1064
+ V_in_regs=V_in_regs,
1065
+ )
1066
+ if arch // 10 == 9:
1067
+ fa_bwd_obj = FlashAttentionBackwardSm90(
1068
+ dtype,
1069
+ head_dim,
1070
+ head_dim_v,
1071
+ qhead_per_kvhead,
1072
+ causal,
1073
+ m_block_size,
1074
+ n_block_size,
1075
+ num_stages_Q,
1076
+ num_stages_dO,
1077
+ num_stages_PdS,
1078
+ SdP_swapAB,
1079
+ dKV_swapAB,
1080
+ dQ_swapAB,
1081
+ AtomLayoutMSdP,
1082
+ AtomLayoutNdKV,
1083
+ AtomLayoutMdQ,
1084
+ num_threads,
1085
+ V_in_regs=V_in_regs,
1086
+ score_mod=score_mod,
1087
+ score_mod_bwd=score_mod_bwd,
1088
+ mask_mod=mask_mod,
1089
+ has_aux_tensors=aux_tensors is not None,
1090
+ subtile_factor=subtile_factor,
1091
+ )
1092
+ else:
1093
+ fa_bwd_obj = FlashAttentionBackwardSm100(
1094
+ head_dim,
1095
+ head_dim_v,
1096
+ is_causal=causal,
1097
+ is_local=local,
1098
+ qhead_per_kvhead=qhead_per_kvhead,
1099
+ tile_m=m_block_size,
1100
+ tile_n=n_block_size,
1101
+ cluster_size=cluster_size,
1102
+ use_2cta_instrs=use_2cta_instrs,
1103
+ deterministic=deterministic,
1104
+ score_mod=score_mod,
1105
+ score_mod_bwd=score_mod_bwd,
1106
+ mask_mod=mask_mod,
1107
+ has_aux_tensors=aux_tensors is not None,
1108
+ subtile_factor=subtile_factor,
1109
+ )
1110
+
1111
+ # Block sparse tensors for backward use Q-direction indexing (transposed from forward).
1112
+ sparse_tensors_compile = None
1113
+ if normalized_block_sparse_tensors is not None:
1114
+ sparse_tensors_compile = to_cute_block_sparse_tensors(normalized_block_sparse_tensors)
1115
+
1116
+ # TODO: check @can_implement
1117
+ _flash_attn_bwd.compile_cache[compile_key] = cute.compile(
1118
+ fa_bwd_obj,
1119
+ q_tensor,
1120
+ k_tensor,
1121
+ v_tensor,
1122
+ do_tensor,
1123
+ lse_log2_tensor,
1124
+ dpsum_tensor,
1125
+ dq_accum_tensor,
1126
+ dk_tensor if not dKV_postprocess else dk_accum_tensor,
1127
+ dv_tensor if not dKV_postprocess else dv_accum_tensor,
1128
+ softmax_scale,
1129
+ current_stream,
1130
+ cu_seqlens_q_tensor,
1131
+ cu_seqlens_k_tensor,
1132
+ seqused_q_tensor,
1133
+ seqused_k_tensor,
1134
+ None, # softcap - not yet supported in backward
1135
+ window_size_left,
1136
+ window_size_right,
1137
+ dQ_semaphore_tensor,
1138
+ dK_semaphore_tensor,
1139
+ dV_semaphore_tensor,
1140
+ cute_aux_tensors,
1141
+ sparse_tensors_compile,
1142
+ options="--enable-tvm-ffi",
1143
+ )
1144
+ if not is_fake_mode():
1145
+ _flash_attn_bwd.compile_cache[compile_key](
1146
+ q.detach(),
1147
+ k.detach(),
1148
+ v.detach(),
1149
+ dout,
1150
+ lse_log2,
1151
+ dpsum,
1152
+ dq_accum,
1153
+ dk if not dKV_postprocess else dk_accum,
1154
+ dv if not dKV_postprocess else dv_accum,
1155
+ softmax_scale,
1156
+ current_stream,
1157
+ cu_seqlens_q,
1158
+ cu_seqlens_k,
1159
+ seqused_q,
1160
+ seqused_k,
1161
+ None, # softcap - not yet supported in backward
1162
+ window_size_left,
1163
+ window_size_right,
1164
+ dQ_semaphore,
1165
+ dK_semaphore,
1166
+ dV_semaphore,
1167
+ aux_tensors,
1168
+ normalized_block_sparse_tensors[:4] if normalized_block_sparse_tensors is not None else None,
1169
+ )
1170
+
1171
+ num_threads = 256 if arch // 10 == 9 else 128
1172
+ # Postprocess kernel: convert dq_accum from float32 to dq in bf16/fp16
1173
+ compile_key_post = (
1174
+ arch,
1175
+ dtype,
1176
+ head_dim,
1177
+ m_block_size,
1178
+ num_threads,
1179
+ AtomLayoutMdQ,
1180
+ dQ_swapAB,
1181
+ cu_seqlens_q is None,
1182
+ seqused_q is None,
1183
+ use_2cta_instrs,
1184
+ 1, # no cluster for tile_m
1185
+ get_broadcast_dims(dq_accum),
1186
+ get_broadcast_dims(dq),
1187
+ )
1188
+ if compile_key_post not in _flash_attn_bwd.compile_cache_post:
1189
+ dq_accum_tensor = to_cute_tensor(dq_accum)
1190
+ dq_tensor = to_cute_tensor(dq)
1191
+ cu_seqlens_q_tensor, seqused_q_tensor = [
1192
+ to_cute_tensor(t, assumed_align=4) if t is not None else None
1193
+ for t in (cu_seqlens_q, seqused_q)
1194
+ ]
1195
+ fa_bwd_post = FlashAttentionBackwardPostprocess(
1196
+ dtype, head_dim, arch, m_block_size, num_threads, AtomLayoutMdQ, dQ_swapAB,
1197
+ use_2cta_instrs=use_2cta_instrs,
1198
+ )
1199
+ # TODO: check @can_implement
1200
+ _flash_attn_bwd.compile_cache_post[compile_key_post] = cute.compile(
1201
+ fa_bwd_post,
1202
+ dq_accum_tensor,
1203
+ dq_tensor,
1204
+ softmax_scale,
1205
+ cu_seqlens_q_tensor,
1206
+ seqused_q_tensor,
1207
+ current_stream,
1208
+ options="--enable-tvm-ffi",
1209
+ )
1210
+
1211
+ if not is_fake_mode():
1212
+ _flash_attn_bwd.compile_cache_post[compile_key_post](
1213
+ dq_accum,
1214
+ dq,
1215
+ softmax_scale,
1216
+ cu_seqlens_q,
1217
+ seqused_q,
1218
+ current_stream,
1219
+ )
1220
+
1221
+ if dKV_postprocess:
1222
+ # Postprocess kernel: convert dk_accum & dv_accum from float32 to bf16/fp16
1223
+ compile_key_post = (
1224
+ arch,
1225
+ dtype,
1226
+ head_dim,
1227
+ n_block_size,
1228
+ num_threads,
1229
+ AtomLayoutNdKV,
1230
+ dKV_swapAB,
1231
+ cu_seqlens_k is None,
1232
+ seqused_k is None,
1233
+ False, # even for 2cta, is split along hdim, so always False
1234
+ cluster_size, # cluster is for tile_n
1235
+ get_broadcast_dims(dk_accum),
1236
+ get_broadcast_dims(dk),
1237
+ )
1238
+ if compile_key_post not in _flash_attn_bwd.compile_cache_post:
1239
+ dk_accum_tensor = to_cute_tensor(dk_accum)
1240
+ dk_tensor = to_cute_tensor(dk)
1241
+ cu_seqlens_k_tensor, seqused_k_tensor = [
1242
+ to_cute_tensor(t, assumed_align=4) if t is not None else None
1243
+ for t in (cu_seqlens_k, seqused_k)
1244
+ ]
1245
+ fa_bwd_post = FlashAttentionBackwardPostprocess(
1246
+ dtype, head_dim, arch, n_block_size, num_threads, AtomLayoutNdKV, dKV_swapAB,
1247
+ cluster_size=cluster_size,
1248
+ )
1249
+ # TODO: check @can_implement
1250
+ _flash_attn_bwd.compile_cache_post[compile_key_post] = cute.compile(
1251
+ fa_bwd_post,
1252
+ dk_accum_tensor,
1253
+ dk_tensor,
1254
+ softmax_scale,
1255
+ cu_seqlens_k_tensor,
1256
+ seqused_k_tensor,
1257
+ current_stream,
1258
+ options="--enable-tvm-ffi",
1259
+ )
1260
+ if not is_fake_mode():
1261
+ _flash_attn_bwd.compile_cache_post[compile_key_post](
1262
+ dk_accum,
1263
+ dk,
1264
+ softmax_scale,
1265
+ cu_seqlens_k,
1266
+ seqused_k,
1267
+ current_stream,
1268
+ )
1269
+ compile_key_post = (
1270
+ arch,
1271
+ dtype,
1272
+ head_dim_v,
1273
+ n_block_size,
1274
+ num_threads,
1275
+ AtomLayoutNdKV,
1276
+ dKV_swapAB,
1277
+ cu_seqlens_k is None,
1278
+ seqused_k is None,
1279
+ False,
1280
+ cluster_size,
1281
+ get_broadcast_dims(dv_accum),
1282
+ get_broadcast_dims(dv),
1283
+ )
1284
+ if compile_key_post not in _flash_attn_bwd.compile_cache_post:
1285
+ dv_accum_tensor = to_cute_tensor(dv_accum)
1286
+ dv_tensor = to_cute_tensor(dv)
1287
+ cu_seqlens_k_tensor, seqused_k_tensor = [
1288
+ to_cute_tensor(t, assumed_align=4) if t is not None else None
1289
+ for t in (cu_seqlens_k, seqused_k)
1290
+ ]
1291
+ fa_bwd_post = FlashAttentionBackwardPostprocess(
1292
+ dtype, head_dim_v, arch, n_block_size, num_threads, AtomLayoutNdKV, dKV_swapAB,
1293
+ cluster_size=cluster_size,
1294
+ )
1295
+ # TODO: check @can_implement
1296
+ _flash_attn_bwd.compile_cache_post[compile_key_post] = cute.compile(
1297
+ fa_bwd_post,
1298
+ dv_accum_tensor,
1299
+ dv_tensor,
1300
+ cutlass.Float32(1.0),
1301
+ cu_seqlens_k_tensor,
1302
+ seqused_k_tensor,
1303
+ current_stream,
1304
+ options="--enable-tvm-ffi",
1305
+ )
1306
+ if not is_fake_mode():
1307
+ _flash_attn_bwd.compile_cache_post[compile_key_post](
1308
+ dv_accum,
1309
+ dv,
1310
+ 1.0,
1311
+ cu_seqlens_k,
1312
+ seqused_k,
1313
+ current_stream,
1314
+ )
1315
+
1316
+ return dq, dk, dv
1317
+
1318
+
1319
# Persistent JIT-compile caches for the three backward-pass kernels
# (preprocess, main backward, postprocess), keyed by the compile_key tuples
# built inside _flash_attn_bwd.
_flash_attn_bwd.compile_cache_pre = get_jit_cache("bwd_pre")
_flash_attn_bwd.compile_cache = get_jit_cache("bwd")
_flash_attn_bwd.compile_cache_post = get_jit_cache("bwd_post")
1322
+
1323
+
1324
class FlashAttnFunc(torch.autograd.Function):
    """Autograd bridge for fixed-length (batched) flash attention.

    ``forward`` dispatches to the compiled CuTe forward kernel and stashes
    everything ``backward`` needs on ``ctx``; ``backward`` dispatches to the
    compiled backward kernels and returns (dq, dk, dv).
    """

    @staticmethod
    def forward(
        ctx,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        softmax_scale: Optional[float] = None,
        causal: bool = False,
        window_size: Tuple[Optional[int], Optional[int]] = (None, None),
        learnable_sink: Optional[torch.Tensor] = None,
        softcap: float = 0.0,
        num_splits: int = 1,
        pack_gqa: Optional[bool] = None,
        deterministic: bool = False,
        mask_mod: Optional[Callable] = None,
        full_block_cnt: Optional[torch.Tensor] = None,
        full_block_idx: Optional[torch.Tensor] = None,
        mask_block_cnt: Optional[torch.Tensor] = None,
        mask_block_idx: Optional[torch.Tensor] = None,
        block_size: Optional[Tuple[int, int]] = None,
        return_lse: bool = False,
    ):
        # Bundle the block-sparse metadata only when the caller supplied any
        # piece of it; otherwise pass None straight through to the kernel.
        sparse_parts = (full_block_cnt, full_block_idx, mask_block_cnt, mask_block_idx)
        if all(part is None for part in sparse_parts):
            block_sparse_tensors = None
        else:
            block_sparse_tensors = BlockSparseTensorsTorch(
                full_block_cnt=full_block_cnt,
                full_block_idx=full_block_idx,
                mask_block_cnt=mask_block_cnt,
                mask_block_idx=mask_block_idx,
                block_size=block_size,
            )
        out, lse = _flash_attn_fwd(
            q,
            k,
            v,
            softmax_scale=softmax_scale,
            causal=causal,
            window_size_left=window_size[0],
            window_size_right=window_size[1],
            learnable_sink=learnable_sink,
            softcap=softcap,
            num_splits=num_splits,
            pack_gqa=pack_gqa,
            mask_mod=mask_mod,
            block_sparse_tensors=block_sparse_tensors,
            return_lse=return_lse,
        )
        ctx.save_for_backward(q, k, v, out, lse)
        # Non-tensor configuration consumed by backward().
        ctx.softmax_scale = softmax_scale
        ctx.causal = causal
        ctx.window_size = window_size
        ctx.softcap = softcap
        ctx.deterministic = deterministic
        # LSE gradient is not supported yet
        if lse is not None:
            ctx.mark_non_differentiable(lse)
        return out, lse

    @staticmethod
    def backward(ctx, dout, *args):
        # NOTE(review): mask_mod and the block-sparse metadata used in
        # forward() are not forwarded to the backward kernel here — confirm
        # whether that is intentional (bwd support status) against the kernels.
        q, k, v, out, lse = ctx.saved_tensors
        grads = _flash_attn_bwd(
            q,
            k,
            v,
            out,
            dout,
            lse,
            ctx.softmax_scale,
            ctx.causal,
            ctx.softcap,
            window_size_left=ctx.window_size[0],
            window_size_right=ctx.window_size[1],
            deterministic=ctx.deterministic,
        )
        dq, dk, dv = grads
        # One None per non-tensor forward argument; extra Nones are fine.
        return (dq, dk, dv) + (None,) * 20
1402
+
1403
+
1404
class FlashAttnVarlenFunc(torch.autograd.Function):
    """Autograd bridge for variable-length (cu_seqlens-based) flash attention.

    Mirrors ``FlashAttnFunc`` but threads the varlen bookkeeping tensors
    (cu_seqlens_*, seqused_*, max_seqlen_*) through forward and backward.
    """

    @staticmethod
    def forward(
        ctx,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        cu_seqlens_q: Optional[torch.Tensor],
        cu_seqlens_k: Optional[torch.Tensor],
        seqused_q: Optional[torch.Tensor] = None,
        seqused_k: Optional[torch.Tensor] = None,
        max_seqlen_q: Optional[int] = None,
        max_seqlen_k: Optional[int] = None,
        page_table: Optional[torch.Tensor] = None,
        softmax_scale: Optional[float] = None,
        causal: bool = False,
        window_size: Tuple[Optional[int], Optional[int]] = (None, None),
        learnable_sink: Optional[torch.Tensor] = None,
        softcap: float = 0.0,
        num_splits: int = 1,
        pack_gqa: Optional[bool] = None,
        deterministic: bool = False,
        score_mod: Optional[Callable] = None,
        aux_tensors: Optional[list] = None,
        return_lse: bool = False,
    ):
        out, lse = _flash_attn_fwd(
            q,
            k,
            v,
            cu_seqlens_q,
            cu_seqlens_k,
            seqused_q,
            seqused_k,
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
            page_table=page_table,
            softmax_scale=softmax_scale,
            causal=causal,
            window_size_left=window_size[0],
            window_size_right=window_size[1],
            learnable_sink=learnable_sink,
            softcap=softcap,
            num_splits=num_splits,
            pack_gqa=pack_gqa,
            score_mod=score_mod,
            aux_tensors=aux_tensors,
            return_lse=return_lse,
        )
        # Saved tensors include the varlen bookkeeping needed by backward().
        ctx.save_for_backward(q, k, v, out, lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k)
        ctx.softmax_scale = softmax_scale
        ctx.causal = causal
        ctx.window_size = window_size
        ctx.softcap = softcap
        ctx.deterministic = deterministic
        ctx.max_seqlen_q = max_seqlen_q
        ctx.max_seqlen_k = max_seqlen_k
        # LSE gradient is not supported yet
        if lse is not None:
            ctx.mark_non_differentiable(lse)
        return out, lse

    @staticmethod
    def backward(ctx, dout, *args):
        (
            q,
            k,
            v,
            out,
            lse,
            cu_seqlens_q,
            cu_seqlens_k,
            seqused_q,
            seqused_k,
        ) = ctx.saved_tensors
        # softcap is not supported in the varlen backward path.
        assert ctx.softcap == 0.0
        dq, dk, dv = _flash_attn_bwd(
            q,
            k,
            v,
            out,
            dout,
            lse,
            ctx.softmax_scale,
            ctx.causal,
            ctx.softcap,
            window_size_left=ctx.window_size[0],
            window_size_right=ctx.window_size[1],
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            seqused_q=seqused_q,
            seqused_k=seqused_k,
            max_seqlen_q=ctx.max_seqlen_q,
            max_seqlen_k=ctx.max_seqlen_k,
            deterministic=ctx.deterministic,
        )
        # One None per non-tensor forward argument; extra Nones are fine.
        return (dq, dk, dv) + (None,) * 20
1492
+
1493
+
1494
def flash_attn_func(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    softmax_scale: Optional[float] = None,
    causal: bool = False,
    window_size: Tuple[Optional[int], Optional[int]] = (None, None),
    learnable_sink: Optional[torch.Tensor] = None,
    softcap: float = 0.0,
    num_splits: int = 1,
    pack_gqa: Optional[bool] = None,
    deterministic: bool = False,
    mask_mod: Optional[Callable] = None,
    full_block_cnt: Optional[torch.Tensor] = None,
    full_block_idx: Optional[torch.Tensor] = None,
    mask_block_cnt: Optional[torch.Tensor] = None,
    mask_block_idx: Optional[torch.Tensor] = None,
    block_size: Optional[Tuple[int, int]] = None,
    return_lse: bool = False,
):
    """Flash attention over fixed-length batched q/k/v.

    Thin convenience wrapper: forwards every argument positionally to
    ``FlashAttnFunc.apply`` (``autograd.Function.apply`` accepts no keywords),
    so the order here must match ``FlashAttnFunc.forward``.
    """
    return FlashAttnFunc.apply(
        q, k, v,
        softmax_scale, causal, window_size, learnable_sink, softcap,
        num_splits, pack_gqa, deterministic, mask_mod,
        full_block_cnt, full_block_idx, mask_block_cnt, mask_block_idx,
        block_size, return_lse,
    )
1534
+
1535
+
1536
def flash_attn_varlen_func(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    cu_seqlens_q: Optional[torch.Tensor] = None,
    cu_seqlens_k: Optional[torch.Tensor] = None,
    max_seqlen_q: Optional[int] = None,
    max_seqlen_k: Optional[int] = None,
    seqused_q: Optional[torch.Tensor] = None,
    seqused_k: Optional[torch.Tensor] = None,
    page_table: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    causal: bool = False,
    window_size: Tuple[Optional[int], Optional[int]] = (None, None),
    learnable_sink: Optional[torch.Tensor] = None,
    softcap: float = 0.0,
    num_splits: int = 1,
    pack_gqa: Optional[bool] = None,
    deterministic: bool = False,
    score_mod: Optional[Callable] = None,
    aux_tensors: Optional[list] = None,
    return_lse: bool = False,
):
    """Flash attention over variable-length (cu_seqlens-packed) sequences.

    Thin convenience wrapper around ``FlashAttnVarlenFunc.apply``. Note the
    argument order passed to ``apply`` follows ``FlashAttnVarlenFunc.forward``
    (seqused_* before max_seqlen_*), which differs from this wrapper's own
    parameter order.
    """
    return FlashAttnVarlenFunc.apply(
        q, k, v,
        cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k,
        max_seqlen_q, max_seqlen_k, page_table,
        softmax_scale, causal, window_size, learnable_sink, softcap,
        num_splits, pack_gqa, deterministic,
        score_mod, aux_tensors, return_lse,
    )
1582
+
1583
+
1584
def _flash_attn_fwd_combine(
    out_partial: torch.Tensor,
    lse_partial: torch.Tensor,
    out: torch.Tensor,
    lse: Optional[torch.Tensor] = None,
    cu_seqlens: Optional[torch.Tensor] = None,
    seqused: Optional[torch.Tensor] = None,
    num_splits_dynamic_ptr: Optional[torch.Tensor] = None,
    semaphore_to_reset: Optional[torch.Tensor] = None,
) -> None:
    """Forward combine kernel for split attention computation.

    Combines partial outputs and log-sum-exp values from multiple splits
    of attention computation into final outputs. Results are written into
    ``out`` (and ``lse`` if given) in place.

    Args:
        out_partial: Partial outputs tensor (num_splits, batch, seqlen, nheads, headdim) or
            (num_splits, total_q, nheads, headdim) if there's cu_seqlens
        lse_partial: Partial LSE tensor (num_splits, batch, seqlen, nheads) or
            (num_splits, total_q, nheads) if there's cu_seqlens
        out: Output tensor (batch, seqlen, nheads, headdim) or (total_q, nheads, headdim) if there's cu_seqlens
        lse: Output LSE tensor (batch, seqlen, nheads) or (total_q, nheads) if there's cu_seqlens.
        cu_seqlens: Cumulative sequence lengths for variable length sequences
        seqused: Used sequence lengths for each batch
        num_splits_dynamic_ptr: Dynamic number of splits per batch
        semaphore_to_reset: Semaphore for synchronization

    Returns:
        None
    """
    # Input validation
    assert out_partial.dim() in [4, 5], "out_partial must have 4 or 5 dimensions"
    assert lse_partial.dim() in [3, 4], "lse_partial must have 3 or 4 dimensions"
    assert out_partial.dtype in [torch.float16, torch.bfloat16, torch.float32], (
        "out_partial must be fp16, bf16, or fp32"
    )
    assert lse_partial.dtype == torch.float32, "lse_partial must be fp32"
    assert out_partial.is_cuda and lse_partial.is_cuda, "tensors must be on CUDA device"
    assert out_partial.stride(-1) == 1, "out_partial must be contiguous in the last dimension"
    assert lse_partial.stride(-2) == 1, "lse_partial must be contiguous in the seqlen dimension"
    assert lse_partial.shape == out_partial.shape[:-1]

    # Determine if this is variable length based on dimensions:
    # 4-D out_partial means (num_splits, total_q, nheads, headdim).
    is_varlen = out_partial.dim() == 4

    # Validate output tensor shapes and types (outputs drop the leading num_splits dim).
    assert out.shape == out_partial.shape[1:], "out shape mismatch"
    if lse is not None:
        assert lse.shape == lse_partial.shape[1:], "lse shape mismatch"
        assert lse.dtype == torch.float32, "lse must be fp32"

    # Validate optional int32 bookkeeping tensors
    for t, name in [
        (cu_seqlens, "cu_seqlens"),
        (seqused, "seqused"),
        (num_splits_dynamic_ptr, "num_splits_dynamic_ptr"),
    ]:
        if t is not None:
            assert t.dtype == torch.int32, f"{name} must be int32"
            assert t.is_cuda, f"{name} must be on CUDA device"
            assert t.is_contiguous(), f"{name} must be contiguous"

    head_dim = out_partial.shape[-1]
    num_splits = out_partial.shape[0]
    assert num_splits <= 256
    # If hdim is 96 or 192, it's faster to round them to 128 or 256 respectively
    # so that kBlockM is smaller and we have more parallelism.
    # NOTE(review): the code below picks 64/128 by threshold only and does not
    # perform the 96->128 / 192->256 rounding described above — confirm intent.
    k_block_size = 64 if head_dim <= 64 else 128
    # We want kBlockM to be as small as possible to maximize parallelism.
    # E.g., if hdim is 64, we want kBlockM to be 16 so that we can use 256 threads, each reading 4 elements (floats).
    m_block_size = 8 if k_block_size % 128 == 0 else (16 if k_block_size % 64 == 0 else 32)
    log_max_splits = max(math.ceil(math.log2(num_splits)), 4)
    if m_block_size == 8:
        # If kBlockM == 8 then the minimum number of splits is 32.
        # TODO: we can deal w this by using 128 threads instead
        log_max_splits = max(log_max_splits, 5)

    # Launch on PyTorch's current CUDA stream.
    current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)

    # Create combine kernel configuration
    dtype = torch2cute_dtype_map[out.dtype]
    dtype_partial = torch2cute_dtype_map[out_partial.dtype]

    # Cache key: every parameter that affects codegen of the combine kernel.
    compile_key = (
        dtype,
        dtype_partial,
        head_dim,
        m_block_size,
        k_block_size,
        log_max_splits,
        cu_seqlens is not None,
        seqused is not None,
        lse is not None,
    )

    if compile_key not in _flash_attn_fwd_combine.compile_cache:
        # Wrap torch tensors as CuTe tensors; leading_dim differs for varlen
        # because the batch dimension is folded into total_q.
        out_partial_tensor = to_cute_tensor(
            out_partial, leading_dim=4 if not is_varlen else 3
        )
        lse_partial_tensor = to_cute_tensor(
            lse_partial, assumed_align=4, leading_dim=lse_partial.ndim - 2
        )
        out_tensor = to_cute_tensor(out, leading_dim=3 if not is_varlen else 2)
        lse_tensor = (
            to_cute_tensor(lse, assumed_align=4, leading_dim=lse.ndim - 2)
            if lse is not None
            else None
        )

        optional_tensors = [
            to_cute_tensor(t, assumed_align=4, leading_dim=0)
            if t is not None
            else None
            for t in (cu_seqlens, seqused, num_splits_dynamic_ptr, semaphore_to_reset)
        ]
        cu_seqlens_tensor, seqused_tensor, num_splits_dynamic_tensor, semaphore_tensor = (
            optional_tensors
        )
        fa_combine = FlashAttentionForwardCombine(
            dtype=dtype,
            dtype_partial=dtype_partial,
            head_dim=head_dim,
            m_block_size=m_block_size,
            k_block_size=k_block_size,
            log_max_splits=log_max_splits,
        )

        # Check if implementation is supported before paying compile cost.
        if not fa_combine.can_implement(
            dtype,
            dtype_partial,
            head_dim,
            m_block_size,
            k_block_size,
            log_max_splits,
            num_threads=256,
        ):
            raise RuntimeError(
                "FlashAttention combine kernel cannot be implemented with given parameters"
            )

        _flash_attn_fwd_combine.compile_cache[compile_key] = cute.compile(
            fa_combine,
            out_partial_tensor,
            lse_partial_tensor,
            out_tensor,
            lse_tensor,
            cu_seqlens_tensor,
            seqused_tensor,
            num_splits_dynamic_tensor,
            semaphore_tensor,
            current_stream,
            options="--enable-tvm-ffi",
        )
    # Skip the actual launch under fake-tensor tracing (e.g. torch.compile).
    if not is_fake_mode():
        _flash_attn_fwd_combine.compile_cache[compile_key](
            out_partial,
            lse_partial,
            out,
            lse,
            cu_seqlens,
            seqused,
            num_splits_dynamic_ptr,
            semaphore_to_reset,
            current_stream,
        )
1751
+
1752
+
1753
# Persistent JIT-compile cache for the forward combine kernel.
_flash_attn_fwd_combine.compile_cache = get_jit_cache("fwd_combine")
1754
+
1755
+
1756
def flash_attn_combine(
    out_partial: torch.Tensor,
    lse_partial: torch.Tensor,
    out: Optional[torch.Tensor] = None,
    out_dtype: Optional[torch.dtype] = None,
    cu_seqlens: Optional[torch.Tensor] = None,
    seqused: Optional[torch.Tensor] = None,
    return_lse: bool = True,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    """Combine partial split-attention results into the final output.

    Merges per-split partial outputs and log-sum-exp values (as produced by
    split attention, first dimension = num_splits) into a single output and,
    optionally, a combined LSE tensor. Permuting from user format to kernel
    format happens inside the kernel.

    Args:
        out_partial: fp32 partial outputs, either
            (num_splits, batch_size, seqlen, num_heads, head_size) or, for
            variable-length input, (num_splits, total_q, num_heads, head_size).
        lse_partial: fp32 partial LSE, (num_splits, batch_size, seqlen, num_heads)
            or (num_splits, total_q, num_heads) for varlen.
        out: Optional pre-allocated output tensor; allocated here if None.
        out_dtype: Optional output dtype; defaults to out_partial's dtype.
        cu_seqlens: Cumulative sequence lengths for variable-length sequences.
        seqused: Used sequence lengths per batch element.
        return_lse: When False, no combined LSE is produced and None is returned
            in its place.

    Returns:
        (out, lse): out has shape (batch_size, seqlen, num_heads, head_size)
        or (total_q, num_heads, head_size) for varlen; lse has shape
        (batch_size, seqlen, num_heads) or (total_q, num_heads), or None when
        return_lse is False.
    """
    # Validate input ranks/dtypes before touching shapes.
    assert out_partial.dim() in [4, 5], "out_partial must have 4 or 5 dimensions"
    assert lse_partial.dim() in [3, 4], "lse_partial must have 3 or 4 dimensions"
    assert out_partial.dtype == torch.float32, "out_partial must be fp32 (from accumulation)"
    assert lse_partial.dtype == torch.float32, "lse_partial must be fp32"

    # A 4-D out_partial means varlen packing (no explicit batch dimension).
    varlen = out_partial.dim() == 4
    if varlen:
        num_splits, total_q, num_heads, head_size = out_partial.shape
        assert lse_partial.shape == (num_splits, total_q, num_heads), (
            "lse_partial shape mismatch for varlen"
        )
        out_shape = (total_q, num_heads, head_size)
        # LSE is allocated head-major then transposed so seqlen is the leading
        # user-visible dim while memory layout stays head-major.
        lse_alloc_shape = (num_heads, total_q)
        lse_swap = (0, 1)
    else:
        num_splits, batch_size, seqlen, num_heads, head_size = out_partial.shape
        assert lse_partial.shape == (num_splits, batch_size, seqlen, num_heads), (
            "lse_partial shape mismatch"
        )
        out_shape = (batch_size, seqlen, num_heads, head_size)
        lse_alloc_shape = (batch_size, num_heads, seqlen)
        lse_swap = (1, 2)

    target_dtype = out_partial.dtype if out_dtype is None else out_dtype
    device = out_partial.device

    if out is None:
        out = torch.empty(out_shape, dtype=target_dtype, device=device)

    lse = None
    if return_lse:
        lse = torch.empty(lse_alloc_shape, dtype=torch.float32, device=device).transpose(*lse_swap)

    _flash_attn_fwd_combine(
        out_partial,
        lse_partial,
        out,
        lse,
        cu_seqlens,
        seqused,
    )
    return out, lse
build/torch-cuda/mask.py ADDED
@@ -0,0 +1,653 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ from typing import Optional, Callable
4
+ from dataclasses import dataclass
5
+
6
+ import cutlass
7
+ import cutlass.cute as cute
8
+ from cutlass import Float32, Int32, const_expr
9
+
10
+ from .quack import layout_utils
11
+ from . import utils
12
+ from .seqlen_info import SeqlenInfoQK
13
+
14
+
15
@cute.jit
def mask_r2p(X: cute.Tensor, col_limit: Int32, arch: int = 90, rank1: bool = False) -> None:
    """Set entries of X whose (arch-transformed) column index is >= col_limit to -inf.

    The predicate is materialized as a bitmask per group of 24 columns so the
    compiler can lower the per-element test to a single R2P instruction.
    ``rank1`` selects whether X is treated as a flat fragment or as (rows, cols).
    """
    # Bit manipulation, compiles down to the R2P instruction
    # For sm100: we know that tScS_t2r[i][1] == i, for the particular tmem copy atom we're using.
    # For sm90: instead of comparing limit to 0, 1, 8, 9, 16, 17, ...,
    # we compare a transformed version of limit to 0, 1, 2, 3, 4, 5, ...
    if const_expr(arch == 90):
        col_limit_transformed = col_limit // 8 * 2 + min(col_limit % 8, 2)
    else:
        col_limit_transformed = col_limit
    ncol = const_expr(cute.size(X.shape[cute.rank(X) - 1]) if not rank1 else cute.size(X.shape))
    # Ideally we'd move by 32 instead of 24, but mask >> i isn't correct for i == 31
    for s in cutlass.range_constexpr(cute.ceil_div(ncol, 24)):
        # Don't need to clamp to 32 since the shr.u32 instruction does that already
        col_limit_right_s = max(col_limit_transformed - s * 24, 0)
        # 0 -> 0b00...00, 1 -> 0b00...01, ..., 31 -> 0b01...11, 32 -> 0b11...11
        mask = (1 << col_limit_right_s) - 1
        # This needs to be range_constexpr, o/w the compiler can't generate the R2P instruction
        for i in cutlass.range_constexpr(min(24, ncol - s * 24)):
            in_bound = cutlass.Boolean(mask & (1 << i))
            c = s * 24 + i
            if const_expr(rank1):
                X[c] = X[c] if in_bound else -Float32.inf
                # This is the equivalent of:
                # X[s * 24 + i] = X[s * 24 + i] if col_limit_right_s <= i else -Float32.inf
            else:
                # Apply the same column predicate to every row of the fragment.
                for r in cutlass.range_constexpr(cute.size(X.shape[0])):
                    X[r, c] = X[r, c] if in_bound else -Float32.inf
43
+
44
+
45
@cute.jit
def mask_r2p_transposed(X: cute.Tensor, row_limit_top: Int32, num_rep: int) -> None:
    """Set entries of X whose (transformed) row index is < row_limit_top to -inf.

    Transposed counterpart of ``mask_r2p``: bits below the limit are set in the
    mask and the corresponding elements are masked OUT (top-masking), again
    shaped so the compiler emits R2P.
    """
    # Bit manipulation, compiles down to the R2P instruction
    # For sm100: we know that tScS_t2r[i][0] has the form 0, 1, ..., 31, 64, ..., 127
    # or 0, 1, ..., 15, 32, ..., 47, 64, ...
    # We compare a transformed version of limit to 0, 1, 2, 3, 4, 5, ...
    # Here we hardcode for the case of 2 warp groups.
    num_wg = 2
    row_limit_top_transformed = row_limit_top // (num_rep * num_wg) * num_rep + min(
        row_limit_top % (num_rep * num_wg), num_rep
    )
    ncol = cute.size(X.shape)
    # Ideally we'd move by 32 instead of 24, but mask >> i isn't correct for i == 31
    for s in cutlass.range_constexpr(cute.ceil_div(ncol, 24)):
        row_limit_top_s = max(row_limit_top_transformed - s * 24, 0)
        # 0 -> 0b00...00, 1 -> 0b00...01, ..., 31 -> 0b01...11, 32 -> 0b11...11
        mask = (1 << row_limit_top_s) - 1
        # This needs to be range_constexpr, o/w the compiler can't generate the R2P instruction
        for i in cutlass.range_constexpr(min(24, ncol - s * 24)):
            # NOTE: here a set bit means "outside the valid region" (index below the top limit).
            out_bound = cutlass.Boolean(mask & (1 << i))
            c = s * 24 + i
            X[c] = -Float32.inf if out_bound else X[c]
            # tidx = cute.arch.thread_idx()[0] % 256
            # if tidx == 128:
            #     cute.printf("tidx = {}, s = {}, i = {}, row_limit_top = {}, row_limit_top_s = {}, mask = {}, out_bound = {}", tidx, s, i, row_limit_top, row_limit_top_s, mask, out_bound)
70
+
71
+
72
@cute.jit
def mask_r2p_dual_bound(
    X: cute.Tensor,
    col_limit_left: Int32,  # Inclusive lower bound
    col_limit_right: Int32,  # Exclusive upper bound
) -> None:
    """
    Dual-bound masking using two bitmasks for SM100, following mask_r2p.
    Masks elements where: NOT (col_limit_left <= col < col_limit_right)

    Uses bit manipulation to create a range mask:
        mask_right = (1 << right) - 1        -> bits (right-1)..0 are 1
        mask_left  = (1 << left) - 1         -> bits (left-1)..0 are 1
        mask_range = mask_right & ~mask_left -> bits (right-1)..left are 1
    """
    ncol = const_expr(cute.size(X.shape))

    for s in cutlass.range_constexpr(cute.ceil_div(ncol, 24)):
        right_s = max(col_limit_right - s * 24, 0)
        left_s = max(col_limit_left - s * 24, 0)

        # otherwise cute dsl complains about python int too large to convert into c long
        right_s = min(right_s, 24)
        left_s = min(left_s, 24)

        # bits (right-1)..left are 1
        mask_right = (1 << right_s) - 1
        mask_left = (1 << left_s) - 1
        mask_range = mask_right & ~mask_left

        # This needs to be range_constexpr, o/w the compiler can't generate the R2P instruction
        for i in cutlass.range_constexpr(min(24, ncol - s * 24)):
            in_bound = cutlass.Boolean(mask_range & (1 << i))
            c = s * 24 + i
            X[c] = X[c] if in_bound else -Float32.inf
107
+
108
+
109
@dataclass(frozen=True)
class AttentionMask:
    """Applies attention masking (seqlen bounds, causal, local/sliding-window,
    or a FlexAttention-style ``mask_mod``) to an accumulator tile of scores by
    writing -inf into masked-out entries.

    Masked entries become -inf so the subsequent softmax assigns them zero
    probability. ``swap_AB`` flips which coordinate component of the identity
    tensor corresponds to the Q (row) vs KV (column) axis.
    """

    # Tile extents along the Q (m) and KV (n) axes.
    tile_m: cutlass.Constexpr[int]
    tile_n: cutlass.Constexpr[int]
    # Source of the effective seqlen_q / seqlen_k values.
    seqlen_info: SeqlenInfoQK
    # Sliding-window sizes; None disables the corresponding bound.
    window_size_left: Optional[Int32] = None
    window_size_right: Optional[Int32] = None
    qhead_per_kvhead_packgqa: cutlass.Constexpr[int] = 1  # only pass in if we're doing PackGQA
    swap_AB: cutlass.Constexpr[bool] = False

    @property
    def seqlen_q(self) -> Int32:
        # Effective query sequence length, forwarded from seqlen_info.
        return self.seqlen_info.seqlen_q

    @property
    def seqlen_k(self) -> Int32:
        # Effective key/value sequence length, forwarded from seqlen_info.
        return self.seqlen_info.seqlen_k

    @cute.jit
    def apply_mask(
        self,
        acc_S: cute.Tensor,
        batch_idx: cutlass.Int32,
        head_idx: cutlass.Int32,
        m_block: cutlass.Int32,
        n_block: cutlass.Int32,
        thr_mma: cute.TiledMma,
        mask_seqlen: cutlass.Constexpr[bool],
        mask_causal: cutlass.Constexpr[bool],
        mask_local: cutlass.Constexpr[bool] = False,
        mask_mod: cutlass.Constexpr[Optional[Callable]] = None,
        aux_tensors: Optional[list] = None,
        fastdiv_mods=(None, None),
    ) -> None:
        """Mask the accumulator tile ``acc_S`` in place (sm90-style path).

        Exactly one of the mask families is selected at compile time:
        seqlen-only, ``mask_mod`` (FlexAttention), or causal/local. Masked
        entries are overwritten with -Float32.inf.
        """
        assert not (mask_causal and mask_local), "mask_causal and mask_local cannot be both True"
        acc_S_mn = layout_utils.reshape_acc_to_mn(acc_S, transpose=self.swap_AB)
        acc_shape = (self.tile_m, self.tile_n)
        cS = cute.make_identity_tensor(acc_shape if not self.swap_AB else acc_shape[::-1])
        tScS_mn = layout_utils.reshape_acc_to_mn(thr_mma.partition_C(cS), transpose=self.swap_AB)
        # We use t0ScS as these indices are known at compile time. We then must subtract the
        # column limit by the thread column offset.
        t0ScS_mn = layout_utils.reshape_acc_to_mn(
            thr_mma.get_slice(0).partition_C(cS), transpose=self.swap_AB
        )
        ROW = 0 if const_expr(not self.swap_AB) else 1
        COL = 1 if const_expr(not self.swap_AB) else 0
        thr_col_offset = tScS_mn[0][COL]
        # To handle edge cases of completely masked out rows where n_block_max = 0,
        # we treat negative n_blocks as 0th n_block
        # TODO: find more transparent solution
        if n_block < 0:
            n_block = 0
        seqlenk_col_limit = self.seqlen_k - n_block * self.tile_n - thr_col_offset
        if const_expr(not mask_causal and not mask_local and mask_mod is None):
            if const_expr(mask_seqlen):
                # The compiler now choses not to use R2P
                r2p = const_expr(False and not self.swap_AB)
                if const_expr(not r2p):
                    # traverse column index.
                    for c in cutlass.range(cute.size(tScS_mn.shape[1]), unroll_full=True):
                        oob = t0ScS_mn[0, c][COL] >= seqlenk_col_limit
                        for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                            acc_S_mn[r, c] = -Float32.inf if oob else acc_S_mn[r, c]
                else:
                    mask_r2p(acc_S_mn, seqlenk_col_limit, arch=90)

        elif const_expr(
            not mask_causal and not mask_local and mask_mod is not None
        ):  # FlexAttention mask mod
            nrow = const_expr(cute.size(tScS_mn.shape[0]))
            ncol = const_expr(cute.size(tScS_mn.shape[1]))
            has_fastdiv = const_expr(
                fastdiv_mods is not None
                and fastdiv_mods[0] is not None
                and fastdiv_mods[1] is not None
            )
            wrap_aux_indices = const_expr(
                has_fastdiv and mask_seqlen and const_expr(aux_tensors is not None)
            )

            for r in cutlass.range_constexpr(nrow):
                # Respect swap_AB: ROW/COL determine which coordinate component corresponds to Q/KV.
                local_row = tScS_mn[r, 0][ROW]
                global_row_idx = local_row + m_block * self.tile_m
                row_for_mod = global_row_idx
                head_idx_for_mod = head_idx
                if const_expr(self.qhead_per_kvhead_packgqa != 1):
                    # PackGQA: decode packed row into (logical row, head offset).
                    head_offset = global_row_idx % self.qhead_per_kvhead_packgqa
                    head_idx_for_mod = head_idx * self.qhead_per_kvhead_packgqa + head_offset
                    row_for_mod = global_row_idx // self.qhead_per_kvhead_packgqa
                row_for_seqlen = row_for_mod
                if const_expr(wrap_aux_indices):
                    _, row_for_mod = divmod(row_for_mod, fastdiv_mods[0])

                for col in cutlass.range_constexpr(ncol):
                    col_idx_local = t0ScS_mn[0, col][COL]
                    # Convert to absolute column index
                    global_col_idx = thr_col_offset + col_idx_local + n_block * self.tile_n
                    col_for_mod = global_col_idx
                    if const_expr(wrap_aux_indices):
                        _, col_for_mod = divmod(global_col_idx, fastdiv_mods[1])

                    batch_idx_ssa = utils.scalar_to_ssa(batch_idx, cutlass.Int32)
                    head_idx_ssa = utils.scalar_to_ssa(head_idx_for_mod, cutlass.Int32)
                    q_idx_ssa = utils.scalar_to_ssa(row_for_mod, cutlass.Int32)
                    kv_idx_ssa = utils.scalar_to_ssa(col_for_mod, cutlass.Int32)
                    mask_value = mask_mod(
                        batch_idx_ssa,
                        head_idx_ssa,
                        q_idx_ssa,
                        kv_idx_ssa,
                        self.seqlen_info,
                        aux_tensors,
                    )
                    cond = cutlass.Boolean(utils.ssa_to_scalar(mask_value))
                    if const_expr(mask_seqlen):
                        out_of_bounds = (row_for_seqlen >= self.seqlen_q) or (
                            global_col_idx >= self.seqlen_k
                        )
                        if out_of_bounds:
                            acc_S_mn[r, col] = -cutlass.Float32.inf
                        else:
                            acc_S_mn[r, col] = acc_S_mn[r, col] if cond else -cutlass.Float32.inf
                    else:
                        acc_S_mn[r, col] = acc_S_mn[r, col] if cond else -cutlass.Float32.inf

        else:  # Causal or local
            if const_expr(not self.swap_AB):
                # If PackGQA, we split the work of compute divmod among threads in the same row
                threads_per_row = thr_mma.tv_layout_C.shape[0][0]
                mma_m_idx = None
                if const_expr(self.qhead_per_kvhead_packgqa != 1):
                    assert not self.swap_AB, "swap_AB with PackGQA not supported yet"
                    assert cute.arch.WARP_SIZE % threads_per_row == 0, (
                        "threads_per_row must divide WARP_SIZE"
                    )
                    assert cute.size(acc_S_mn.shape[0]) <= threads_per_row
                    tidx = thr_mma.thr_idx
                    mma_m_idx = (
                        m_block * self.tile_m + tScS_mn[tidx % threads_per_row, 0][0]
                    ) // self.qhead_per_kvhead_packgqa
                causal_row_offset = (
                    1 + self.seqlen_k - n_block * self.tile_n - self.seqlen_q - thr_col_offset
                )
                if const_expr(mask_causal):
                    r2p = const_expr(not self.swap_AB)  # R2P trick, see apply_mask_sm100
                    for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                        # get the column index limit based on current row. Only consider the row index, so the column index sets to 0.
                        if const_expr(self.qhead_per_kvhead_packgqa == 1):
                            row_idx = tScS_mn[r, 0][0] + m_block * self.tile_m
                        else:
                            # Fetch the precomputed divmod result from the owning lane.
                            row_idx = utils.shuffle_sync(
                                mma_m_idx, r % threads_per_row, width=threads_per_row
                            )
                        col_limit_right = row_idx + causal_row_offset
                        if const_expr(mask_seqlen):
                            col_limit_right = cutlass.min(col_limit_right, seqlenk_col_limit)
                        if const_expr(not r2p):
                            # traverse column index.
                            for c in cutlass.range(cute.size(tScS_mn.shape[1]), unroll_full=True):
                                acc_S_mn[r, c] = (
                                    -Float32.inf
                                    if t0ScS_mn[0, c][1] >= col_limit_right
                                    else acc_S_mn[r, c]
                                )
                        else:
                            mask_r2p(acc_S_mn[r, None], col_limit_right, arch=90, rank1=True)
                else:  # Local
                    local_row_offset_right = (
                        causal_row_offset + self.window_size_right
                        if const_expr(self.window_size_right is not None)
                        else None
                    )
                    local_row_offset_left = (
                        causal_row_offset - 1 - self.window_size_left
                        if const_expr(self.window_size_left is not None)
                        else None
                    )
                    for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                        if const_expr(self.qhead_per_kvhead_packgqa == 1):
                            row_idx = tScS_mn[r, 0][0] + m_block * self.tile_m
                        else:
                            row_idx = utils.shuffle_sync(
                                mma_m_idx, r % threads_per_row, width=threads_per_row
                            )
                        if const_expr(self.window_size_right is not None):
                            col_limit_right = row_idx + local_row_offset_right
                        else:
                            col_limit_right = self.tile_n
                        if const_expr(mask_seqlen):
                            col_limit_right = cutlass.min(col_limit_right, seqlenk_col_limit)
                        col_limit_left = (
                            row_idx + local_row_offset_left
                            if const_expr(self.window_size_left is not None)
                            else 0
                        )
                        # if cute.arch.thread_idx()[0] == 128: cute.printf("n_block = {}, r = {}, row_idx = {}, causal_row_offset = {}, col_limit_right = {}, col_limit_left = {}", n_block, r, row_idx, causal_row_offset, col_limit_right, col_limit_left)
                        # traverse column index.
                        for c in cutlass.range(cute.size(tScS_mn.shape[1]), unroll_full=True):
                            col_idx = t0ScS_mn[0, c][1]
                            # only consider the column index, so the row index sets to 0.
                            if col_idx >= col_limit_right or col_idx < col_limit_left:
                                acc_S_mn[r, c] = -Float32.inf
            else:  # swap_AB
                assert self.qhead_per_kvhead_packgqa == 1
                thr_row_offset = tScS_mn[0][ROW]
                causal_row_offset = (
                    seqlenk_col_limit - self.seqlen_q + m_block * self.tile_m + thr_row_offset
                )
                if const_expr(mask_causal):
                    for c in cutlass.range(cute.size(tScS_mn.shape[1]), unroll_full=True):
                        col0 = t0ScS_mn[0, c][COL]
                        # If col0 is beyond the column limit, we want to mask out the entire
                        # column, by setting row limit to be self.tile_m.
                        row_limit_top = (
                            self.tile_m
                            if col0 >= seqlenk_col_limit and mask_seqlen
                            else col0 - causal_row_offset
                        )
                        for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                            acc_S_mn[r, c] = (
                                -Float32.inf
                                if t0ScS_mn[r, 0][ROW] < row_limit_top
                                else acc_S_mn[r, c]
                            )
                else:
                    for c in cutlass.range(cute.size(tScS_mn.shape[1]), unroll_full=True):
                        col0 = t0ScS_mn[0, c][COL]
                        # If col0 is beyond the column limit, we want to mask out the entire
                        # column, by setting row limit to be self.tile_m.
                        row_limit_top = (
                            self.tile_m
                            if col0 >= seqlenk_col_limit
                            else col0 - causal_row_offset - self.window_size_right
                        )
                        # TODO: do we need col_limit_sink?
                        row_limit_bot = col0 - causal_row_offset + self.window_size_left
                        for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                            row_idx = t0ScS_mn[r, 0][ROW]
                            acc_S_mn[r, c] = (
                                -Float32.inf
                                if row_idx < row_limit_top or row_idx > row_limit_bot
                                else acc_S_mn[r, c]
                            )

    @cute.jit
    def apply_mask_sm100(
        self,
        acc_S: cute.Tensor,
        m_block: Int32,
        n_block: Int32,
        thr_mma: cute.TiledMma,
        thr_tmem_load: cute.TiledCopy,
        mask_seqlen: cutlass.Constexpr[bool],
        mask_causal: cutlass.Constexpr[bool],
        mask_local: cutlass.Constexpr[bool] = False,
        mask_mod: cutlass.Constexpr[Optional[Callable]] = None,
        batch_idx: Int32 = None,
        head_idx: Int32 = None,
        aux_tensors: Optional[list] = None,
        fastdiv_mods=(None, None),
        head_divmod=None,
        check_q_boundary: bool = False,
    ) -> None:
        """Mask the accumulator fragment ``acc_S`` in place (sm100 tmem-load path).

        Same mask families as ``apply_mask``, but the fragment is a flat tensor
        whose per-element coordinates come from the tmem-load partitioning
        (``tScS_t2r``); the R2P bitmask helpers are used where possible.
        """
        assert not (mask_causal and mask_local), "mask_causal and mask_local cannot be both True"
        acc_shape = (self.tile_m, self.tile_n)
        cS = cute.make_identity_tensor(acc_shape if not self.swap_AB else acc_shape[::-1])
        tScS = thr_mma.partition_C(cS)
        tScS = tScS[(None, None), 0, 0]
        tScS_t2r = thr_tmem_load.partition_D(tScS)
        # To handle edge cases of completely masked out rows where n_block_max = 0,
        # we treat negative n_blocks as 0th n_block
        # TODO: find more transparent solution
        if n_block < 0:
            n_block = 0
        seqlenk_col_limit = self.seqlen_k - n_block * self.tile_n
        r2p = True
        if const_expr(not mask_causal and not mask_local and mask_mod is None):
            if const_expr(mask_seqlen):
                if const_expr(not r2p):
                    for i in cutlass.range(cute.size(tScS_t2r.shape), unroll_full=True):
                        # if tScS_t2r[i][1] >= seqlenk_col_limit:
                        #     acc_S[i] = -Float32.inf
                        # For some reason the 2 lines above generate really bad SASS
                        acc_S[i] = -Float32.inf if tScS_t2r[i][1] >= seqlenk_col_limit else acc_S[i]
                else:
                    mask_r2p(acc_S, seqlenk_col_limit, arch=100, rank1=True)

        elif const_expr(not mask_causal and not mask_local and mask_mod is not None):
            # Block sparse case w/ mask_mod
            has_fastdiv = const_expr(
                fastdiv_mods is not None
                and fastdiv_mods[0] is not None
                and fastdiv_mods[1] is not None
            )
            batch_idx_ssa = utils.scalar_to_ssa(batch_idx, cutlass.Int32)

            ncol = const_expr(cute.size(tScS_t2r.shape))
            for i in cutlass.range_constexpr(ncol):
                row_coord = tScS_t2r[i][0] if not self.swap_AB else tScS_t2r[i][1]
                col_coord = tScS_t2r[i][1] if not self.swap_AB else tScS_t2r[i][0]
                global_row = row_coord + m_block * self.tile_m
                global_col = col_coord + n_block * self.tile_n

                if const_expr(self.qhead_per_kvhead_packgqa != 1):
                    # PackGQA: split packed row into (logical row, head offset).
                    assert head_divmod is not None
                    mask_row, head_offset = divmod(global_row, head_divmod)
                    head_idx_for_mod = head_idx * self.qhead_per_kvhead_packgqa + head_offset
                else:
                    head_idx_for_mod = head_idx
                    mask_row = global_row

                mask_row_for_mod = mask_row
                if const_expr(has_fastdiv and aux_tensors is not None):
                    if check_q_boundary:
                        _, mask_row_for_mod = divmod(mask_row, fastdiv_mods[0])
                global_col_for_mod = global_col
                if const_expr(has_fastdiv and mask_seqlen and aux_tensors is not None):
                    _, global_col_for_mod = divmod(global_col, fastdiv_mods[1])

                head_idx_ssa = utils.scalar_to_ssa(head_idx_for_mod, cutlass.Int32)
                mask_row_ssa = utils.scalar_to_ssa(mask_row_for_mod, cutlass.Int32)
                kv_idx_ssa = utils.scalar_to_ssa(global_col_for_mod, cutlass.Int32)
                mask_value = mask_mod(
                    batch_idx_ssa,
                    head_idx_ssa,
                    mask_row_ssa,
                    kv_idx_ssa,
                    self.seqlen_info,
                    aux_tensors,
                )
                cond = cutlass.Boolean(utils.ssa_to_scalar(mask_value))
                acc_S[i] = acc_S[i] if cond else -Float32.inf
                if const_expr(mask_seqlen):
                    acc_S[i] = -Float32.inf if global_col >= self.seqlen_k else acc_S[i]
                    if check_q_boundary:
                        acc_S[i] = -Float32.inf if mask_row >= self.seqlen_q else acc_S[i]

        else:  # Causal or local
            causal_row_offset = 1 + self.seqlen_k - n_block * self.tile_n - self.seqlen_q
            row_idx = tScS_t2r[0][0] + m_block * self.tile_m
            if const_expr(self.qhead_per_kvhead_packgqa != 1):
                row_idx = row_idx // self.qhead_per_kvhead_packgqa
            if const_expr(mask_causal):
                col_limit_right = row_idx + causal_row_offset
                if const_expr(mask_seqlen):
                    col_limit_right = cutlass.min(col_limit_right, seqlenk_col_limit)
                # if cute.arch.thread_idx()[0] % 32 == 0:
                #     cute.printf("tidx = %d, tidx tmem = %d, row_idx = %d, col_limit_right = %d, causal_row_offset = %d\n", cute.arch.thread_idx()[0], thr_tmem_load.thr_idx, row_idx, col_limit_right, causal_row_offset)
                ncol = const_expr(cute.size(tScS_t2r.shape))
                if const_expr(not r2p):
                    for i in cutlass.range(ncol, unroll_full=True):
                        acc_S[i] = -Float32.inf if tScS_t2r[i][1] >= col_limit_right else acc_S[i]
                else:
                    mask_r2p(acc_S, col_limit_right, arch=100, rank1=True)
            else:
                local_row_offset_right = (
                    causal_row_offset + self.window_size_right
                    if const_expr(self.window_size_right is not None)
                    else None
                )
                local_row_offset_left = (
                    causal_row_offset - 1 - self.window_size_left
                    if const_expr(self.window_size_left is not None)
                    else None
                )
                if const_expr(self.window_size_right is not None):
                    col_limit_right = row_idx + local_row_offset_right
                else:
                    col_limit_right = self.tile_n
                if const_expr(mask_seqlen):
                    col_limit_right = cutlass.min(col_limit_right, seqlenk_col_limit)
                col_limit_left = (
                    row_idx + local_row_offset_left
                    if const_expr(self.window_size_left is not None)
                    else 0
                )
                if const_expr(not r2p):
                    # if cute.arch.thread_idx()[0] == 0 or cute.arch.thread_idx()[0] == 128: cute.printf("m_block = {}, n_block = {}, row_idx = {}, causal_row_offset = {}, col_limit_right = {}, col_limit_left = {}", m_block, n_block, row_idx, causal_row_offset, col_limit_right, col_limit_left)
                    for i in cutlass.range(cute.size(tScS_t2r.shape), unroll_full=True):
                        col_idx = tScS_t2r[i][1]
                        acc_S[i] = (
                            -Float32.inf
                            if col_idx >= col_limit_right or col_idx < col_limit_left
                            else acc_S[i]
                        )
                else:
                    # XOR-based R2P dual bound masking
                    mask_r2p_dual_bound(acc_S, col_limit_left, col_limit_right)

    @cute.jit
    def apply_mask_sm100_transposed(
        self,
        acc_S: cute.Tensor,
        tScS_t2r: cute.Tensor,
        t0ScS_t2r: cute.Tensor,
        m_block: cutlass.Int32,
        n_block: cutlass.Int32,
        mask_seqlen: cutlass.Constexpr,
        mask_causal: cutlass.Constexpr,
        mask_local: cutlass.Constexpr,
        mask_mod: cutlass.Constexpr[Optional[Callable]] = None,
        batch_idx: Int32 = None,
        head_idx: Int32 = None,
        aux_tensors: Optional[list] = None,
        fastdiv_mods=(None, None),
        is_full_block: bool = False,
        check_m_boundary: bool = True,
    ) -> None:
        """
        Backward pass: mask S = K @ Q.T where n_block tiles seqlen_k and m_block tiles seqlen_q.

        Coordinate convention:
            - ROW corresponds to Q (m_block)
            - COL corresponds to KV (n_block)

        is_full_block: If True, skip mask_mod (all elements valid). Only apply seqlen masking.
        check_m_boundary: If False, skip seqlen_q boundary check (optimization for non-boundary m_blocks).
            When iterating m_blocks in forward order, only the last m_block may be partial.
        """
        assert not (mask_causal and mask_local), "mask_causal and mask_local cannot be both True"
        ROW = 0 if const_expr(not self.swap_AB) else 1
        COL = 1 if const_expr(not self.swap_AB) else 0
        # assert t0ScS_t2r[0][COL] == 0, "col0 == 0" # tmp comment for 2-cta bwd
        thr_col_offset = tScS_t2r[0][COL]
        seqlenk_col_limit = self.seqlen_k - n_block * self.tile_n - thr_col_offset

        if const_expr(not mask_causal and not mask_local and mask_mod is not None):
            # Block sparse case with mask_mod (backward)
            #
            # Coordinate convention: ROW → Q (m_block), COL → KV (n_block).
            # These already account for swap_AB.
            #
            # FULL blocks: mask_mod returns True for all elements, so skip it.
            #   Still need seqlen bounds check (elements may be OOB on last m_block).
            # PARTIAL blocks: apply mask_mod element-wise, then seqlen bounds.
            if is_full_block:
                if const_expr(mask_seqlen):
                    if seqlenk_col_limit <= 0:
                        # Entire tile is OOB for K
                        for i in cutlass.range(cute.size(acc_S.shape), unroll_full=True):
                            acc_S[i] = -cutlass.Float32.inf
                    elif check_m_boundary:
                        # Last m_block: check Q and K boundaries
                        ncol = const_expr(cute.size(tScS_t2r.shape))
                        for i in cutlass.range_constexpr(ncol):
                            row_coord = tScS_t2r[i][ROW]
                            col_coord = tScS_t2r[i][COL]
                            global_q = row_coord + m_block * self.tile_m
                            global_kv = col_coord + n_block * self.tile_n
                            q_out_of_bounds = global_q >= self.seqlen_q
                            kv_out_of_bounds = global_kv >= self.seqlen_k
                            out_of_bounds = q_out_of_bounds or kv_out_of_bounds
                            acc_S[i] = -cutlass.Float32.inf if out_of_bounds else acc_S[i]
            else:
                # Partial block
                has_fastdiv = const_expr(
                    fastdiv_mods is not None
                    and fastdiv_mods[0] is not None
                    and fastdiv_mods[1] is not None
                )
                wrap_aux_indices = const_expr(
                    has_fastdiv and mask_seqlen and const_expr(aux_tensors is not None)
                )
                batch_idx_ssa = utils.scalar_to_ssa(batch_idx, cutlass.Int32)
                head_idx_ssa = utils.scalar_to_ssa(head_idx, cutlass.Int32)

                ncol = const_expr(cute.size(tScS_t2r.shape))
                for i in cutlass.range_constexpr(ncol):
                    row_coord = tScS_t2r[i][ROW]
                    col_coord = tScS_t2r[i][COL]
                    global_q = row_coord + m_block * self.tile_m
                    global_kv = col_coord + n_block * self.tile_n

                    q_idx_for_mod = global_q
                    kv_idx_for_mod = global_kv
                    if const_expr(wrap_aux_indices):
                        _, q_idx_for_mod = divmod(global_q, fastdiv_mods[0])
                        _, kv_idx_for_mod = divmod(global_kv, fastdiv_mods[1])

                    q_idx_ssa = utils.scalar_to_ssa(q_idx_for_mod, cutlass.Int32)
                    kv_idx_ssa = utils.scalar_to_ssa(kv_idx_for_mod, cutlass.Int32)

                    mask_value = mask_mod(
                        batch_idx_ssa,
                        head_idx_ssa,
                        q_idx_ssa,
                        kv_idx_ssa,
                        self.seqlen_info,
                        aux_tensors,
                    )
                    cond = cutlass.Boolean(utils.ssa_to_scalar(mask_value))
                    acc_S[i] = acc_S[i] if cond else -cutlass.Float32.inf

                    if const_expr(mask_seqlen):
                        # check_m_boundary=False skips q check for non-boundary m_blocks
                        q_out_of_bounds = check_m_boundary and (global_q >= self.seqlen_q)
                        kv_out_of_bounds = global_kv >= self.seqlen_k
                        out_of_bounds = q_out_of_bounds or kv_out_of_bounds
                        acc_S[i] = -cutlass.Float32.inf if out_of_bounds else acc_S[i]

        elif const_expr(not mask_causal and not mask_local):
            if const_expr(mask_seqlen):
                if seqlenk_col_limit <= 0:
                    for i in cutlass.range(cute.size(acc_S.shape), unroll_full=True):
                        acc_S[i] = -cutlass.Float32.inf
        else:  # Causal or local
            thr_row_offset = tScS_t2r[0][ROW]
            seqlenq_row_limit = self.seqlen_q - m_block * self.tile_m - thr_row_offset
            causal_offset = seqlenq_row_limit - seqlenk_col_limit
            if const_expr(mask_causal):
                # tidx = cute.arch.thread_idx()[0] % 256
                # if tidx < 32:
                #     cute.printf("tidx = {}, {} {}, {} {}", tidx, tScS_t2r[0][0], tScS_t2r[0][1], tScS_t2r[1][0], tScS_t2r[1][1])
                row_limit_top = causal_offset
                if const_expr(mask_seqlen):
                    # If col is beyond the column limit, we want to mask out the entire
                    # column, by setting row limit to be self.tile_m.
                    if seqlenk_col_limit <= 0:
                        row_limit_top = self.tile_m
                r2p = True
                if const_expr(not r2p):
                    for i in cutlass.range(cute.size(acc_S.shape), unroll_full=True):
                        acc_S[i] = (
                            -cutlass.Float32.inf if t0ScS_t2r[i][ROW] < row_limit_top else acc_S[i]
                        )
                else:
                    num_rep = cute.size(tScS_t2r, mode=[0])  # 16 or 32
                    mask_r2p_transposed(acc_S, row_limit_top, num_rep)
            else:
                if const_expr(self.window_size_right is not None):
                    row_limit_top = causal_offset - self.window_size_right
                else:
                    row_limit_top = 0
                if const_expr(self.window_size_left is not None):
                    row_limit_bot = causal_offset + self.window_size_left
                if const_expr(mask_seqlen):
                    if seqlenk_col_limit <= 0:
                        row_limit_top = self.tile_m
                for i in cutlass.range(cute.size(acc_S.shape), unroll_full=True):
                    row_idx = t0ScS_t2r[i][ROW]
                    local_mask = row_idx < row_limit_top
                    if const_expr(self.window_size_left is not None):
                        local_mask |= row_idx > row_limit_bot
                    acc_S[i] = -cutlass.Float32.inf if local_mask else acc_S[i]
build/torch-cuda/metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 0,
3
+ "python-depends": [
4
+ "einops",
5
+ "tvm-ffi",
6
+ "nvidia-cutlass-dsl"
7
+ ]
8
+ }
build/torch-cuda/mma_sm100_desc.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+ # Ported Cutlass code from C++ to Python:
3
+ # https://github.com/NVIDIA/cutlass/blob/main/include/cute/arch/mma_sm100_desc.hpp
4
+ # https://github.com/NVIDIA/cutlass/blob/main/include/cute/atom/mma_traits_sm100.hpp
5
+
6
+ from enum import IntEnum
7
+
8
+ import cutlass
9
+ import cutlass.cute as cute
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Enumerations that match the HW encodings (values MUST stay identical)
13
+ # ---------------------------------------------------------------------------
14
+
15
+
16
class Major(IntEnum):
    """Operand major-ness ("layout" in the ISA docs); 1-bit HW encoding."""

    K = 0
    MN = 1


class ScaleIn(IntEnum):
    """Operand negate flags; 1-bit HW encoding."""

    One = 0
    Neg = 1


class Saturate(IntEnum):
    """Accumulator saturation flag; 1-bit HW encoding."""

    False_ = 0
    True_ = 1


class CFormat(IntEnum):
    """Accumulator element format; 2-bit field (descriptor bits 4-5)."""

    F16 = 0
    F32 = 1
    S32 = 2


class F16F32Format(IntEnum):
    """A/B element format for the f16/f32 MMA family; 3-bit field."""

    F16 = 0
    BF16 = 1
    TF32 = 2


class S8Format(IntEnum):
    """A/B element format for the 8-bit integer MMA family."""

    UINT8 = 0
    INT8 = 1


class MXF8F6F4Format(IntEnum):
    """A/B element format for the FP8/FP6/FP4 MMA family.

    NOTE: value 2 is intentionally skipped — these are the raw HW encodings.
    """

    E4M3 = 0
    E5M2 = 1
    E2M3 = 3
    E3M2 = 4
    E2M1 = 5


class MaxShift(IntEnum):
    """Max-shift field encoding (descriptor bits 30-31)."""

    NoShift = 0
    MaxShift8 = 1
    MaxShift16 = 2
    MaxShift32 = 3
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # CUTLASS-type → encoding helpers
65
+ # ---------------------------------------------------------------------------
66
+
67
+
68
def to_UMMA_format(cutlass_type) -> int:
    """
    Map a CUTLASS scalar class to the 3-bit encoding for Matrix A/B.

    Comparison is by class identity (`is`), so only the exact CUTLASS scalar
    classes are accepted.

    Raises:
        TypeError: if the scalar type has no UMMA A/B encoding here.
    """
    if cutlass_type is cutlass.Int8:
        return S8Format.INT8
    # Unsigned 8-bit (if available in your CUTLASS build)
    if cutlass_type is cutlass.Uint8:
        return S8Format.UINT8
    # FP-16 / BF-16
    if cutlass_type is cutlass.Float16:
        return F16F32Format.F16
    if cutlass_type is cutlass.BFloat16:
        return F16F32Format.BF16
    # TensorFloat-32 (8-bit exponent, 10-bit mantissa packed in 19 bits)
    if cutlass_type is cutlass.TFloat32:
        return F16F32Format.TF32
    # Float-8 / Float-6 / Float-4 – add whenever CUTLASS exposes them.
    # NOTE(review): only E4M3/E5M2 are mapped; E2M3/E3M2/E2M1 exist in
    # MXF8F6F4Format but have no CUTLASS class wired up here yet.
    if cutlass_type is cutlass.FloatE4M3FN:
        return MXF8F6F4Format.E4M3
    if cutlass_type is cutlass.FloatE5M2:
        return MXF8F6F4Format.E5M2
    raise TypeError(f"Unsupported CUTLASS scalar type for A/B: {cutlass_type!r}")
91
+
92
+
93
def to_C_format(cutlass_type) -> int:
    """
    Map a CUTLASS scalar class to the 2-bit accumulator encoding.

    Classes hash by identity, so the dict lookup below is equivalent to a
    chain of `is` comparisons against the supported scalar classes.
    """
    acc_encoding = {
        cutlass.Float16: CFormat.F16,
        cutlass.Float32: CFormat.F32,
        cutlass.Int32: CFormat.S32,
    }
    try:
        return acc_encoding[cutlass_type]
    except KeyError:
        raise TypeError(
            f"Unsupported CUTLASS scalar type for accumulator: {cutlass_type!r}"
        ) from None
104
+
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # The constructor – accepts only CUTLASS scalar classes
108
+ # ---------------------------------------------------------------------------
109
+
110
+
111
def make_instr_desc(
    a_type,  # CUTLASS scalar class, e.g. cutlass.Int8
    b_type,
    c_type,
    M: int,  # 64, 128 or 256
    N: int,  # 8 … 256 (multiple of 8)
    a_major: Major,
    b_major: Major,
    a_neg: ScaleIn = ScaleIn.One,
    b_neg: ScaleIn = ScaleIn.One,
    c_sat: Saturate = Saturate.False_,
    is_sparse: bool = False,
    max_shift: MaxShift = MaxShift.NoShift,
) -> int:
    """
    Build the 32-bit instruction descriptor for Blackwell MMA.
    All matrix/accumulator **types must be CUTLASS scalar classes** –
    passing integers is forbidden.

    Bit layout packed below (LSB first): sparse_id2 [0:2), sparse_flag [2],
    saturate [3], c_format [4:6), a_format [7:10), b_format [10:13),
    a_negate [13], b_negate [14], a_major [15], b_major [16], n_dim [17:23),
    m_dim [24:29), max_shift [30:32).

    Raises:
        TypeError: if a type is not a supported CUTLASS scalar class.
        ValueError: if M or N is outside the supported range.
    """
    # --- encode element formats -------------------------------------------------
    a_fmt = int(to_UMMA_format(a_type))
    b_fmt = int(to_UMMA_format(b_type))
    c_fmt = int(to_C_format(c_type))

    # --- range checks on M/N -----------------------------------------------------
    if M not in (64, 128, 256):
        raise ValueError("M must be 64, 128 or 256")
    if N < 8 or N > 256 or (N & 7):
        raise ValueError("N must be a multiple of 8 in the range 8…256")

    # HW stores the tile dims scaled down: M in units of 16, N in units of 8.
    m_dim = M >> 4  # 5-bit field
    n_dim = N >> 3  # 6-bit field

    # fmt: off
    # --- pack the bit-fields -----------------------------------------------------
    desc = 0
    desc |= (0 & 0x3) << 0                  # sparse_id2 (always 0 here)
    desc |= (int(is_sparse) & 0x1) << 2     # sparse_flag
    desc |= (int(c_sat) & 0x1) << 3         # saturate
    desc |= (c_fmt & 0x3) << 4              # c_format
    desc |= (a_fmt & 0x7) << 7              # a_format (bit 6 is padding)
    desc |= (b_fmt & 0x7) << 10             # b_format
    desc |= (int(a_neg) & 0x1) << 13        # a_negate
    desc |= (int(b_neg) & 0x1) << 14        # b_negate
    desc |= (int(a_major) & 0x1) << 15      # a_major
    desc |= (int(b_major) & 0x1) << 16      # b_major
    desc |= (n_dim & 0x3F) << 17            # n_dim (6 bits)
    desc |= (m_dim & 0x1F) << 24            # m_dim (5 bits, bit 23 is padding)
    desc |= (int(max_shift) & 0x3) << 30    # max_shift (2 bits)
    # fmt: on

    return desc & 0xFFFF_FFFF  # ensure 32-bit result
163
+
164
+
165
def mma_op_to_idesc(op: cute.nvgpu.tcgen05.mma.MmaOp):
    """Build the 32-bit instruction descriptor from a tcgen05 MmaOp."""
    major_mode = cute.nvgpu.tcgen05.mma.OperandMajorMode
    a_major = Major.K if op.a_major_mode == major_mode.K else Major.MN
    b_major = Major.K if op.b_major_mode == major_mode.K else Major.MN
    m_size, n_size = op.shape_mnk[0], op.shape_mnk[1]
    return make_instr_desc(
        op.a_dtype,
        op.b_dtype,
        op.acc_dtype,
        m_size,
        n_size,
        a_major,
        b_major,
    )
175
+
176
+
177
class LayoutType(IntEnum):
    """Shared-memory swizzle family; occupies the top-3 descriptor bits [61:64)."""

    SWIZZLE_NONE = 0  # (a.k.a. "INTERLEAVE" in older docs)
    SWIZZLE_128B_BASE32B = 1
    SWIZZLE_128B = 2
    SWIZZLE_64B = 4
    SWIZZLE_32B = 6
    # values 3,5,7 are reserved / illegal for UMMA
184
+
185
+
186
+ # ---------------------------------------------------------------------------
187
+ # Helpers – figure out the SWIZZLE_* family from the tensor layout
188
+ # ---------------------------------------------------------------------------
189
+
190
+
191
def _layout_type(swizzle: cute.Swizzle) -> LayoutType:
    """Classify a cute.Swizzle triple into the UMMA SWIZZLE_* family."""
    bits, base, shift = swizzle.num_bits, swizzle.num_base, swizzle.num_shift

    if base == 4:  # Swizzle<B,4,3>: the bit count selects the family
        if shift != 3:
            raise ValueError("Unexpected swizzle shift – want S==3 for M==4")
        families = (
            LayoutType.SWIZZLE_NONE,
            LayoutType.SWIZZLE_32B,
            LayoutType.SWIZZLE_64B,
            LayoutType.SWIZZLE_128B,
        )
        if not 0 <= bits < len(families):
            # Same exception as the original dict lookup on an invalid B.
            raise KeyError(bits)
        return families[bits]

    if base == 5:  # Swizzle<2,5,2> (the only legal triple for M==5)
        if (bits, shift) != (2, 2):
            raise ValueError("Only Swizzle<2,5,2> supported for 128B_BASE32B")
        return LayoutType.SWIZZLE_128B_BASE32B

    # Any other (M,B,S) triple is not a UMMA-legal shared-memory layout
    raise ValueError("Unsupported swizzle triple for UMMA smem descriptor")
210
+
211
+
212
def make_smem_desc_base(layout: cute.Layout, swizzle: cute.Swizzle, major: Major) -> int:
    """
    Convert a 2-D *shared-memory* Cute layout into the Blackwell 64-bit
    smem-descriptor, without the smem start address.
    layout must correspond to layout of an uint128 tensor.

    The layout is first divided by the swizzle-atom tile; the divided layout
    must be "canonical" (congruent to ((1,1),(1,1)) with the expected strides),
    otherwise a ValueError is raised.
    """
    # ------------------------------------------------------------------ meta
    layout_type = _layout_type(swizzle)  # resolve SWIZZLE_* family

    VERSION = 1  # bits 46–47
    LBO_MODE = 0  # bit 52
    BASE_OFFSET = 0  # bits 49–51 (CUTLASS always 0)

    # ---------------------------------------------------------- strides (units: uint128_t = 16 B)
    # MN extent (in uint128 units) of one swizzle atom for each family.
    swizzle_atom_mn_size = {
        LayoutType.SWIZZLE_NONE: 1,
        LayoutType.SWIZZLE_32B: 2,
        LayoutType.SWIZZLE_64B: 4,
        LayoutType.SWIZZLE_128B: 8,
        LayoutType.SWIZZLE_128B_BASE32B: 8,
    }[layout_type]

    if major is Major.MN:
        swizzle_atom_k_size = 4 if layout_type is LayoutType.SWIZZLE_128B_BASE32B else 8
        # Split layout into (atom, rest) along both modes and validate shape/strides.
        canonical_layout = cute.logical_divide(layout, (swizzle_atom_mn_size, swizzle_atom_k_size))
        if not cute.is_congruent(canonical_layout, ((1, 1), (1, 1))):
            raise ValueError("Not a canonical UMMA_MN Layout: Expected profile failure.")
        stride_00 = canonical_layout.stride[0][0]
        if layout_type is not LayoutType.SWIZZLE_NONE and stride_00 != 1:
            raise ValueError("Not a canonical UMMA_MN Layout: Expected stride failure.")
        stride_10 = canonical_layout.stride[1][0]
        if stride_10 != swizzle_atom_mn_size:
            raise ValueError("Not a canonical UMMA_MN Layout: Expected stride failure.")
        stride_01, stride_11 = canonical_layout.stride[0][1], canonical_layout.stride[1][1]
        # LBO/SBO roles swap between the swizzled and non-swizzled cases.
        if layout_type is LayoutType.SWIZZLE_NONE:
            stride_byte_offset, leading_byte_offset = stride_01, stride_11
        else:
            stride_byte_offset, leading_byte_offset = stride_11, stride_01
    else:
        if layout_type == LayoutType.SWIZZLE_128B_BASE32B:
            raise ValueError("SWIZZLE_128B_BASE32B is invalid for Major-K")
        if not cute.size(layout.shape[0]) % 8 == 0:
            raise ValueError("Not a canonical UMMA_K Layout: Expected MN-size multiple of 8.")
        canonical_layout = cute.logical_divide(layout, (8, 2))
        if not cute.is_congruent(canonical_layout, ((1, 1), (1, 1))):
            raise ValueError("Not a canonical UMMA_K Layout: Expected profile failure.")
        stride_00 = canonical_layout.stride[0][0]
        if stride_00 != swizzle_atom_mn_size:
            raise ValueError("Not a canonical UMMA_K Layout: Expected stride failure.")
        stride_10 = canonical_layout.stride[1][0]
        if layout_type is not LayoutType.SWIZZLE_NONE and stride_10 != 1:
            raise ValueError("Not a canonical UMMA_K Layout: Expected stride failure.")
        stride_01 = canonical_layout.stride[0][1]
        stride_byte_offset, leading_byte_offset = stride_01, stride_10

    # ------------------------------------------------------------------ pack
    desc = 0
    # leading_byte_offset_ [16:30)
    desc |= (leading_byte_offset & 0x3FFF) << 16
    # stride_byte_offset_ [32:46)
    desc |= (stride_byte_offset & 0x3FFF) << 32
    # version_ [46:48)
    desc |= (VERSION & 0x3) << 46
    # base_offset_ [49:52)
    desc |= (BASE_OFFSET & 0x7) << 49
    # lbo_mode_ [52:53)
    desc |= (LBO_MODE & 0x1) << 52
    # layout_type_ [61:64)
    desc |= (int(layout_type) & 0x7) << 61

    return desc & 0xFFFF_FFFF_FFFF_FFFF  # force 64-bit width
283
+
284
+
285
def make_smem_desc_start_addr(start_addr: cute.Pointer) -> cutlass.Int32:
    """Encode an smem pointer as the descriptor's 14-bit start-address field."""
    # Keep address bits [4, 18) then drop the 4 LSBs (16-byte granularity),
    # producing the 14-bit value stored at descriptor bits [0, 14).
    raw_addr = start_addr.toint()
    return (raw_addr & 0x3FFFF) >> 4
288
+
289
+
290
def smem_desc_base_from_tensor(sA: cute.Tensor, major: Major) -> int:
    """Build the smem-descriptor base (without the start address) for tensor sA."""
    swizzle = sA.iterator.type.swizzle_type
    # Recast the first layout mode into uint128 units, which is what
    # make_smem_desc_base expects.
    layout_u128 = cute.recast_layout(128, sA.element_type.width, sA.layout[0])
    return make_smem_desc_base(layout_u128, swizzle, major)
build/torch-cuda/named_barrier.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
2
+
3
+ import enum
4
+
5
+
6
class NamedBarrierFwd(enum.IntEnum):
    """Named-barrier IDs for the forward kernel.

    `enum.auto()` makes IDs start at 1; barrier 0 is reserved for
    sync_threads(). Member order therefore defines the barrier IDs.
    """

    Epilogue = enum.auto()  # starts from 1 as barrier 0 is reserved for sync_threads()
    WarpSchedulerWG1 = enum.auto()
    WarpSchedulerWG2 = enum.auto()
    WarpSchedulerWG3 = enum.auto()
    PFull = enum.auto()
    PEmpty = enum.auto()


class NamedBarrierBwd(enum.IntEnum):
    """Named-barrier IDs for the backward kernel (IDs again start at 1)."""

    Epilogue = enum.auto()
    WarpSchedulerWG1 = enum.auto()
    WarpSchedulerWG2 = enum.auto()
    WarpSchedulerWG3 = enum.auto()
    PdS = enum.auto()
    dQFullWG0 = enum.auto()
    dQFullWG1 = enum.auto()
    dQEmptyWG0 = enum.auto()
    dQEmptyWG1 = enum.auto()


class NamedBarrierBwdSm100(enum.IntEnum):
    """Named-barrier IDs for the SM100 (Blackwell) backward kernel."""

    EpilogueWG1 = enum.auto()
    EpilogueWG2 = enum.auto()
    Compute = enum.auto()
    dQaccReduce = enum.auto()
    TmemPtr = enum.auto()
+ TmemPtr = enum.auto()
build/torch-cuda/pack_gqa.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+
4
+ import cutlass
5
+ import cutlass.cute as cute
6
+
7
+ from .quack import layout_utils
8
+ from . import utils
9
+
10
+
11
class PackGQA:
    """Pack-GQA helpers: treat the (qhead_per_kvhead, seqlen_q) pair as one
    flattened row dimension so that all query heads sharing a KV head are
    handled inside a single tile of rows.

    A flattened row index `idx` decomposes as
    ``idx = m_idx * qhead_per_kvhead + h_idx`` (see compute_ptr), i.e. query
    heads vary fastest within the packed dimension.
    """

    def __init__(
        self,
        m_block_size: cutlass.Constexpr[int],
        head_dim_padded: cutlass.Constexpr[int],
        check_hdim_oob: cutlass.Constexpr[bool],
        # NOTE: was annotated Constexpr[bool]; it is used as an integer
        # divisor/multiplier throughout, so int is the correct annotation.
        qhead_per_kvhead: cutlass.Constexpr[int],
    ):
        self.m_block_size = m_block_size
        self.head_dim_padded = head_dim_padded
        self.check_hdim_oob = check_hdim_oob
        self.qhead_per_kvhead = qhead_per_kvhead

    @cute.jit
    def compute_ptr(
        self,
        tensor: cute.Tensor,
        cRows: cute.Tensor,
        tidx: cutlass.Int32,
        block: cutlass.Int32,
        threads_per_row: cutlass.Constexpr[int],
        num_threads: cutlass.Constexpr[int],
    ):
        """Precompute per-thread 64-bit gmem pointers for the packed rows this
        tile touches; one pointer per (row-group) handled by the thread's row
        lane. Consumers redistribute them across the row group via
        shuffle_sync."""
        num_ptr_per_thread = cute.ceil_div(cute.size(cRows), threads_per_row)
        tPrPtr = cute.make_fragment(num_ptr_per_thread, cutlass.Int64)
        for i in cutlass.range_constexpr(num_ptr_per_thread):
            row = i * num_threads + cRows[tidx % threads_per_row][0]
            idx = block * self.m_block_size + row
            # Split the flattened index into (head, sequence-position).
            m_idx = idx // self.qhead_per_kvhead
            h_idx = idx - m_idx * self.qhead_per_kvhead
            tPrPtr[i] = utils.elem_pointer(tensor, ((h_idx, m_idx),)).toint()
        return tPrPtr

    @cute.jit
    def load_Q(
        self,
        mQ: cute.Tensor,  # ((qhead_per_kvhead, seqlen_q), headdim)
        sQ: cute.Tensor,  # (m_block_size, head_dim_padded)
        gmem_tiled_copy: cute.TiledCopy,
        tidx: cutlass.Int32,
        block: cutlass.Int32,
        seqlen: cutlass.Int32,
    ):
        """Gather packed-GQA Q rows from gmem into smem tile sQ, one row at a
        time through per-row pointers (rows of one tile are not contiguous in
        gmem once heads are packed)."""
        gmem_thr_copy = gmem_tiled_copy.get_slice(tidx)
        cQ = cute.make_identity_tensor((self.m_block_size, self.head_dim_padded))
        tQsQ = gmem_thr_copy.partition_D(sQ)
        tQcQ = gmem_thr_copy.partition_S(cQ)
        # Thread-0 partition: used for row-limit compares so the comparison is
        # against a compile-time-known base coordinate.
        t0QcQ = gmem_thr_copy.get_slice(0).partition_S(cQ)
        tQpQ = utils.predicate_k(tQcQ, limit=mQ.shape[1])
        tQcQ_row = tQcQ[0, None, 0]
        threads_per_row = gmem_tiled_copy.layout_tv_tiled.shape[0][0]
        assert cute.arch.WARP_SIZE % threads_per_row == 0, "threads_per_row must divide WARP_SIZE"
        num_threads = gmem_tiled_copy.size
        tPrQPtr = self.compute_ptr(mQ[None, 0], tQcQ_row, tidx, block, threads_per_row, num_threads)
        for m in cutlass.range_constexpr(cute.size(tQsQ.shape[1])):
            # Fetch the row pointer from whichever lane computed it.
            q_ptr_i64 = utils.shuffle_sync(
                tPrQPtr[m // threads_per_row], m % threads_per_row, width=threads_per_row
            )
            q_gmem_ptr = cute.make_ptr(
                mQ.element_type, q_ptr_i64, cute.AddressSpace.gmem, assumed_align=16
            )
            # Row-bound check against the packed sequence length.
            if (
                t0QcQ[0, m, 0][0]
                < seqlen * self.qhead_per_kvhead - block * self.m_block_size - tQcQ_row[0][0]
            ):
                mQ_cur = cute.make_tensor(q_gmem_ptr, (self.head_dim_padded,))
                elems_per_load = cute.size(tQsQ.shape[0][0])
                mQ_cur_copy = cute.tiled_divide(mQ_cur, (elems_per_load,))
                for k in cutlass.range_constexpr(cute.size(tQsQ.shape[2])):
                    ki = tQcQ[0, 0, k][1] // elems_per_load
                    cute.copy(
                        gmem_thr_copy,
                        mQ_cur_copy[None, ki],
                        tQsQ[None, m, k],
                        pred=tQpQ[None, m, k] if cutlass.const_expr(self.check_hdim_oob) else None,
                    )
        # We don't need to clear the sQ smem tiles since we'll only write out the valid outputs

    @cute.jit
    def store_LSE(
        self,
        mLSE: cute.Tensor,  # (qhead_per_kvhead, seqlen_q)
        tLSErLSE: cute.Tensor,  # (m_block_size, head_dim_padded)
        tiled_mma: cute.TiledMma,
        tidx: cutlass.Int32,
        block: cutlass.Int32,
        seqlen: cutlass.Int32,
    ):
        """Scatter per-row LSE values to gmem through per-row pointers; only
        the thread owning accumulator column 0 of a row writes it."""
        thr_mma = tiled_mma.get_slice(tidx)
        caccO = cute.make_identity_tensor((self.m_block_size, self.head_dim_padded))
        taccOcO = thr_mma.partition_C(caccO)
        taccOcO_row = layout_utils.reshape_acc_to_mn(taccOcO)[None, 0]
        assert cute.size(tLSErLSE) == cute.size(taccOcO_row)
        threads_per_row = tiled_mma.tv_layout_C.shape[0][0]
        assert cute.arch.WARP_SIZE % threads_per_row == 0, "threads_per_row must divide WARP_SIZE"
        assert cute.size(tLSErLSE) <= threads_per_row
        num_threads = tiled_mma.size
        tPrLSEPtr = self.compute_ptr(mLSE, taccOcO_row, tidx, block, threads_per_row, num_threads)
        for m in cutlass.range_constexpr(cute.size(tLSErLSE)):
            lse_ptr_i64 = utils.shuffle_sync(
                tPrLSEPtr[m // threads_per_row],
                m % threads_per_row,
                width=threads_per_row,
            )
            lse_gmem_ptr = cute.make_ptr(
                mLSE.element_type, lse_ptr_i64, cute.AddressSpace.gmem, assumed_align=4
            )
            row = block * self.m_block_size + taccOcO_row[m][0]
            # Only the thread corresponding to column 0 writes out the lse to gmem
            if taccOcO[0][1] == 0 and row < seqlen * self.qhead_per_kvhead:
                mLSE_copy = cute.make_tensor(lse_gmem_ptr, (1,))
                mLSE_copy[0] = tLSErLSE[m]

    @cute.jit
    def store_O(
        self,
        mO: cute.Tensor,  # ((qhead_per_kvhead, seqlen_q), headdim)
        tOrO: cute.Tensor,  # (m_block_size, head_dim_padded) split across threads according to gmem_tiled_copy
        gmem_tiled_copy: cute.TiledCopy,
        tidx: cutlass.Int32,
        block: cutlass.Int32,
        seqlen: cutlass.Int32,
    ):
        """Scatter the output tile from registers to gmem; mirror image of
        load_Q (same per-row pointer + shuffle scheme, copy direction
        reversed)."""
        gmem_thr_copy = gmem_tiled_copy.get_slice(tidx)
        cO = cute.make_identity_tensor((self.m_block_size, self.head_dim_padded))
        tOcO = gmem_thr_copy.partition_S(cO)
        t0OcO = gmem_thr_copy.get_slice(0).partition_S(cO)
        tOpO = utils.predicate_k(tOcO, limit=mO.shape[1])
        tOcO_row = tOcO[0, None, 0]
        threads_per_row = gmem_tiled_copy.layout_tv_tiled.shape[0][0]
        assert cute.arch.WARP_SIZE % threads_per_row == 0, "threads_per_row must divide WARP_SIZE"
        num_threads = gmem_tiled_copy.size
        tPrOPtr = self.compute_ptr(mO[None, 0], tOcO_row, tidx, block, threads_per_row, num_threads)
        for m in cutlass.range_constexpr(cute.size(tOrO.shape[1])):
            o_ptr_i64 = utils.shuffle_sync(
                tPrOPtr[m // threads_per_row], m % threads_per_row, width=threads_per_row
            )
            o_gmem_ptr = cute.make_ptr(
                mO.element_type, o_ptr_i64, cute.AddressSpace.gmem, assumed_align=16
            )
            if (
                t0OcO[0, m, 0][0]
                < seqlen * self.qhead_per_kvhead - block * self.m_block_size - tOcO_row[0][0]
            ):
                mO_cur = cute.make_tensor(o_gmem_ptr, (self.head_dim_padded,))
                elems_per_load = cute.size(tOrO.shape[0][0])
                mO_cur_copy = cute.tiled_divide(mO_cur, (elems_per_load,))
                for k in cutlass.range_constexpr(cute.size(tOrO.shape[2])):
                    ki = tOcO[0, 0, k][1] // elems_per_load
                    cute.copy(
                        gmem_thr_copy,
                        tOrO[None, m, k],
                        mO_cur_copy[None, ki],
                        pred=tOpO[None, m, k] if cutlass.const_expr(self.check_hdim_oob) else None,
                    )
+ )
build/torch-cuda/paged_kv.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Type
2
+ from dataclasses import dataclass
3
+
4
+ import cutlass
5
+ import cutlass.cute as cute
6
+ from cutlass.cute.nvgpu import cpasync
7
+ from cutlass import Int32, const_expr
8
+
9
+ from . import utils
10
+ from .quack.cute_dsl_utils import ParamsBase
11
+ from cutlass.cute import FastDivmodDivisor
12
+
13
+ import math
14
+
15
+
16
+ @dataclass
17
@dataclass
class PagedKVManager(ParamsBase):
    """Paged-KV-cache loader: resolves page-table entries to gmem pointers and
    issues cp.async copies of K/V tiles into shared memory.

    Built via :meth:`create`; the dataclass fields are the precomputed,
    per-(batch, head, thread) state that the load methods consume.
    """

    # Batch-sliced page table: mPageTable[i] is the page id for logical page i.
    mPageTable: cute.Tensor
    # Head-sliced paged K/V pools (see create() for the slicing applied).
    mK_paged: cute.Tensor
    mV_paged: cute.Tensor
    thread_idx: Int32

    # Fast divmod by the page size, used to split a row index into
    # (page index, offset within page).
    page_size_divmod: FastDivmodDivisor
    seqlen_k: Int32
    leftpad_k: Int32
    n_block_size: Int32
    num_threads: cutlass.Constexpr[Int32]
    head_dim_padded: cutlass.Constexpr[Int32]
    head_dim_v_padded: cutlass.Constexpr[Int32]

    gmem_threads_per_row: cutlass.Constexpr[Int32]
    # How many page-table entries each thread caches in registers.
    page_entry_per_thread: Int32
    async_copy_elems: Int32

    gmem_tiled_copy_KV: cute.TiledCopy
    gmem_thr_copy_KV: cute.TiledCopy
    # Register caches filled by load_page_table().
    tPrPage: cute.Tensor
    tPrPageOffset: cute.Tensor
    # Head-dim OOB predicates for K and V copies.
    tKpK: cute.Tensor
    tVpV: cute.Tensor

    @staticmethod
    def create(
        mPageTable: cute.Tensor,
        mK_paged: cute.Tensor,
        mV_paged: cute.Tensor,
        page_size_divmod: FastDivmodDivisor,
        bidb: Int32,
        bidh: Int32,
        thread_idx: Int32,
        seqlen_k: Int32,
        leftpad_k: Int32,
        n_block_size: cutlass.Constexpr[Int32],
        head_dim_padded: cutlass.Constexpr[Int32],
        head_dim_v_padded: cutlass.Constexpr[Int32],
        num_threads: cutlass.Constexpr[Int32],
        dtype: Type[cutlass.Numeric],
    ):
        """Build the per-(batch, head) manager: derive the cp.async tiled-copy
        geometry, slice the page table and K/V pools to this batch/head, and
        precompute the head-dim predicates."""
        universal_copy_bits = 128  # one cp.async instruction moves 16 B
        async_copy_elems = universal_copy_bits // dtype.width
        dtype_bytes = dtype.width // 8
        gmem_k_block_size = math.gcd(
            head_dim_padded,
            head_dim_v_padded,
            128 // dtype_bytes,
        )
        assert gmem_k_block_size % async_copy_elems == 0
        gmem_threads_per_row = gmem_k_block_size // async_copy_elems
        assert cute.arch.WARP_SIZE % gmem_threads_per_row == 0
        atom_async_copy = cute.make_copy_atom(
            cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL),
            dtype,
            num_bits_per_copy=universal_copy_bits,
        )
        thr_layout = cute.make_ordered_layout(
            (num_threads // gmem_threads_per_row, gmem_threads_per_row),
            order=(1, 0),
        )
        val_layout = cute.make_layout((1, async_copy_elems))
        gmem_tiled_copy_KV = cute.make_tiled_copy_tv(atom_async_copy, thr_layout, val_layout)
        gmem_thr_copy_KV = gmem_tiled_copy_KV.get_slice(thread_idx)
        page_entry_per_thread = n_block_size // num_threads

        tPrPage = cute.make_rmem_tensor((page_entry_per_thread,), Int32)
        tPrPageOffset = cute.make_rmem_tensor((page_entry_per_thread,), Int32)

        # Slice to this batch's page table and this head's K/V pools.
        mPageTable = mPageTable[bidb, None]
        mK_paged = mK_paged[None, None, bidh, None]
        mV_paged = mV_paged[None, None, bidh, None]

        cK = cute.make_identity_tensor((n_block_size, head_dim_padded))
        tKcK = gmem_thr_copy_KV.partition_S(cK)
        tKpK = utils.predicate_k(tKcK, limit=mK_paged.shape[1])

        if const_expr(head_dim_padded == head_dim_v_padded):
            # Same head-dim -> the K predicate is reusable for V.
            tVpV = tKpK
        else:
            cV = cute.make_identity_tensor((n_block_size, head_dim_v_padded))
            tVcV = gmem_thr_copy_KV.partition_S(cV)
            # NOTE: V uses shape[0] as the limit — the V pool has the head dim
            # in mode 0 (compute_X_ptr indexes V as (0, page_offset, page)).
            tVpV = utils.predicate_k(tVcV, limit=mV_paged.shape[0])

        return PagedKVManager(
            mPageTable,
            mK_paged,
            mV_paged,
            thread_idx,
            page_size_divmod,
            seqlen_k,
            leftpad_k,
            n_block_size,
            num_threads,
            head_dim_padded,
            head_dim_v_padded,
            gmem_threads_per_row,
            page_entry_per_thread,
            async_copy_elems,
            gmem_tiled_copy_KV,
            gmem_thr_copy_KV,
            tPrPage,
            tPrPageOffset,
            tKpK,
            tVpV,
        )

    @cute.jit
    def load_page_table(self, n_block: Int32):
        """Cache the (page id, in-page offset) pair for every row of block
        `n_block` into this thread's registers; out-of-range rows get page 0."""
        for i in cutlass.range(self.page_entry_per_thread, unroll=1):
            row = (
                i * self.num_threads
                + (self.thread_idx % self.gmem_threads_per_row)
                * (self.num_threads // self.gmem_threads_per_row)
                + (self.thread_idx // self.gmem_threads_per_row)
            )
            row_idx = n_block * self.n_block_size + row

            page_idx, page_offset = divmod(row_idx + self.leftpad_k, self.page_size_divmod)

            is_valid = (
                (i + 1) * self.num_threads <= self.n_block_size or row < self.n_block_size
            ) and row_idx < self.seqlen_k
            page = self.mPageTable[page_idx] if is_valid else 0

            self.tPrPage[i] = page
            self.tPrPageOffset[i] = page_offset

    @cute.jit
    def compute_X_ptr(self, K_or_V: str):
        """Turn cached (page, offset) pairs into 64-bit gmem row pointers for
        the K or V pool. K is indexed (seq, hdim, page); V (hdim, seq, page)."""
        tPrXPtr = cute.make_rmem_tensor((self.page_entry_per_thread,), cutlass.Int64)
        for i in cutlass.range(self.page_entry_per_thread, unroll=1):
            page = self.tPrPage[i]
            page_offset = self.tPrPageOffset[i]
            if const_expr(K_or_V == "K"):
                tPrXPtr[i] = utils.elem_pointer(self.mK_paged, (page_offset, 0, page)).toint()
            else:
                tPrXPtr[i] = utils.elem_pointer(self.mV_paged, (0, page_offset, page)).toint()
        return tPrXPtr

    @cute.jit
    def load_KV(self, n_block: Int32, sX: cute.Tensor, K_or_V: str):
        """cp.async the K or V tile for block `n_block` into smem tensor sX,
        predicated on sequence-length validity (and head-dim validity via the
        precomputed tKpK/tVpV — note: the `should_load` predicate here covers
        the row bound only)."""
        assert K_or_V in ("K", "V")

        tPrXPtr = self.compute_X_ptr(K_or_V)

        # Finesse sX layout to be (M, N).
        sX_pi = cute.make_tensor(
            sX.iterator,
            cute.make_layout(
                (sX.shape[0][0], (sX.shape[0][1], sX.shape[2])),
                stride=(sX.stride[0][0], (sX.stride[0][1], sX.stride[2])),
            ),
        )

        if const_expr(K_or_V == "V"):
            # Need to transpose V
            sX_pi = cute.make_tensor(sX_pi.iterator, cute.select(sX_pi.layout, mode=[1, 0]))

        head_dim = self.head_dim_v_padded if const_expr(K_or_V == "V") else self.head_dim_padded
        cX = cute.make_identity_tensor((self.n_block_size, head_dim))
        tXsX = self.gmem_thr_copy_KV.partition_D(sX_pi)
        tXcX = self.gmem_thr_copy_KV.partition_S(cX)
        tXc0X = self.gmem_thr_copy_KV.get_slice(0).partition_S(cX)

        seqlenk_row_limit = (
            self.seqlen_k - n_block * self.n_block_size - tXcX[0][0] if n_block >= 0 else 0
        )
        for m in cutlass.range_constexpr(cute.size(tXsX, mode=[1])):
            row_valid = tXc0X[0, m, 0][0] < seqlenk_row_limit
            should_load = cute.make_fragment_like(tXsX[(0, None), m, 0], cute.Boolean)
            should_load.fill(row_valid)

            # Broadcast the row pointer from the lane that computed it.
            x_ptr_i64 = utils.shuffle_sync(
                tPrXPtr[m // self.gmem_threads_per_row],
                m % self.gmem_threads_per_row,
                width=self.gmem_threads_per_row,
            )
            x_gmem_ptr = cute.make_ptr(
                self.mK_paged.element_type, x_ptr_i64, cute.AddressSpace.gmem, assumed_align=16
            )
            mX_paged_cur = cute.make_tensor(x_gmem_ptr, cute.make_layout((head_dim,)))
            mX_paged_cur_copy = cute.tiled_divide(mX_paged_cur, (self.async_copy_elems,))

            for k in cutlass.range_constexpr(cute.size(tXsX, mode=[2])):
                ki = tXcX[0, 0, k][1] // self.async_copy_elems
                mX_paged_cur_copy_ki = mX_paged_cur_copy[None, ki]
                tXsX_k = tXsX[None, m, k]
                # Re-layout the gmem fragment to match the smem fragment.
                mX_paged_cur_copy_ki = cute.make_tensor(
                    mX_paged_cur_copy_ki.iterator, tXsX_k.layout
                )
                cute.copy(
                    self.gmem_tiled_copy_KV,
                    mX_paged_cur_copy_ki,
                    tXsX_k,
                    pred=should_load,
                )
build/torch-cuda/pipeline.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ # import math
4
+ from typing import Optional
5
+ from dataclasses import dataclass
6
+
7
+ import cutlass.cute as cute
8
+ from cutlass import Boolean, Int32, const_expr
9
+ from cutlass.cutlass_dsl import if_generate, dsl_user_op
10
+ from cutlass.pipeline import PipelineState
11
+ from cutlass.pipeline import PipelineUserType
12
+ from cutlass.pipeline import NamedBarrier as NamedBarrierOg
13
+ from cutlass.pipeline import PipelineAsync as PipelineAsyncOg
14
+ from cutlass.pipeline import PipelineTmaAsync as PipelineTmaAsyncOg
15
+ from cutlass.pipeline import PipelineTmaUmma as PipelineTmaUmmaOg
16
+ from cutlass.pipeline import PipelineUmmaAsync as PipelineUmmaAsyncOg
17
+ from cutlass.pipeline import PipelineAsyncUmma as PipelineAsyncUmmaOg
18
+
19
+
20
class PipelineStateSimple:
    """
    Pipeline state contains an index and phase bit corresponding to the current position in the circular buffer.
    Use a single Int32 to store both the index and phase bit, then we use divmod to get the
    index and phase. If stages is a power of 2, divmod turns into bit twiddling.
    """

    def __init__(self, stages: int, phase_index: Int32):
        # assert stages < 2**16
        # self._log_stages = int(math.log2(stages))
        # assert 1 << self._log_stages == stages, "Number of stages must be a power of 2."
        # Packed representation: _phase_index == phase * stages + index
        # (for stages == 1 it degenerates to just the phase bit).
        self._stages = stages
        self._phase_index = phase_index

    def clone(self) -> "PipelineStateSimple":
        """Return an independent copy with the same stages and packed state."""
        return PipelineStateSimple(self.stages, self._phase_index)

    @property
    def stages(self) -> int:
        # return 1 << self._log_stages
        return self._stages

    @property
    def index(self) -> Int32:
        """Current slot index in the circular buffer: packed value mod stages."""
        # return self._phase_index & 0xFFFF
        # return self._phase_index & ((1 << self._log_stages) - 1)
        if const_expr(self._stages == 1):
            return Int32(0)
        else:
            return self._phase_index % self._stages

    @property
    def phase(self) -> Int32:
        """Phase counter: packed value divided by stages (not reduced mod 2)."""
        # return self._phase_index >> 16
        # PTX docs say that the phase parity needs to be 0 or 1, so by right we need to
        # take modulo 2. But in practice just passing the phase in without modulo works fine.
        # return (self._phase_index >> self._log_stages) % 2
        # return self._phase_index >> self._log_stages
        if const_expr(self._stages == 1):
            return self._phase_index
        else:
            return self._phase_index // self._stages

    def advance(self):
        """Move to the next buffer slot; the phase flips implicitly whenever
        the packed counter crosses a multiple of stages."""
        if const_expr(self._stages == 1):
            self._phase_index ^= 1
        else:
            self._phase_index += 1

        # def then_body(phase_index):
        #     # XOR the phase bit and set the index to 0
        #     return (phase_index & 0xFFFF0000) ^ (1 << 16)

        # def else_body(phase_index):
        #     return phase_index

        # self._phase_index = if_generate(
        #     (self._phase_index & 0xFFFF) == self.stages,
        #     then_body,
        #     else_body,
        #     [self._phase_index],
        #     [Int32],
        # )

    def __extract_mlir_values__(self):
        # DSL plumbing: expose the single dynamic value (the packed counter)
        # to the MLIR tracer; `stages` is static and not extracted.
        phase_index = self._phase_index
        return [phase_index.ir_value()]

    def __new_from_mlir_values__(self, values):
        # DSL plumbing: rebuild this state from traced MLIR values.
        return PipelineStateSimple(self.stages, Int32(values[0]))
+
91
+
92
def make_pipeline_state(type: PipelineUserType, stages: int):
    """
    Creates a pipeline state. Producers are assumed to start with an empty buffer
    and have a flipped phase bit of 1.

    Args:
        type: PipelineUserType.Producer or PipelineUserType.Consumer.
        stages: number of circular-buffer stages.

    Returns:
        A PipelineStateSimple positioned at index 0 with the appropriate phase.

    Raises:
        ValueError: if `type` is neither Producer nor Consumer.
    """
    if type is PipelineUserType.Producer:
        # Packed value phase*stages + index == stages, i.e. phase 1, index 0.
        return PipelineStateSimple(stages, Int32(stages))
    elif type is PipelineUserType.Consumer:
        # Phase 0, index 0.
        return PipelineStateSimple(stages, Int32(0))
    else:
        # Was `assert False, ...`: asserts are stripped under `python -O`,
        # which would let an invalid user type fall through and return None.
        raise ValueError("Error: invalid PipelineUserType specified for make_pipeline_state.")
+ assert False, "Error: invalid PipelineUserType specified for make_pipeline_state."
103
+
104
+
105
@dataclass(frozen=True)
class NamedBarrier(NamedBarrierOg):
    """NamedBarrierOg extended with *_w_index variants that offset the barrier id."""

    @staticmethod
    def create(*args, **kwargs):
        obj = NamedBarrierOg.create(*args, **kwargs)
        # Can't assign to __class__ directly since the dataclass is frozen
        object.__setattr__(obj, "__class__", NamedBarrier)
        return obj

    @dsl_user_op
    def arrive_w_index(self, index: Int32, *, loc=None, ip=None) -> None:
        """
        The aligned flavor of arrive is used when all threads in the CTA will execute the
        same instruction. See PTX documentation.
        """
        # Arrive (without waiting) on named barrier `barrier_id + index`.
        cute.arch.barrier_arrive(
            barrier_id=self.barrier_id + index,
            number_of_threads=self.num_threads,
            loc=loc,
            ip=ip,
        )

    @dsl_user_op
    def arrive_and_wait_w_index(self, index: Int32, *, loc=None, ip=None) -> None:
        # Arrive and block until all num_threads have arrived on barrier
        # `barrier_id + index`.
        cute.arch.barrier(
            barrier_id=self.barrier_id + index,
            number_of_threads=self.num_threads,
            loc=loc,
            ip=ip,
        )
135
+
136
+
137
@dataclass(frozen=True)
class PipelineAsync(PipelineAsyncOg):
    """PipelineAsyncOg extended with *_w_index(_phase) variants.

    These take the buffer index (and phase) directly instead of a PipelineState,
    for callers that track index/phase themselves.
    """

    @staticmethod
    def create(*args, **kwargs):
        obj = PipelineAsyncOg.create(*args, **kwargs)
        # Can't assign to __class__ directly since the dataclass is frozen
        object.__setattr__(obj, "__class__", PipelineAsync)
        return obj

    @dsl_user_op
    def producer_acquire_w_index_phase(
        self,
        index: Int32,
        phase: Int32,
        try_acquire_token: Optional[Boolean] = None,
        *,
        loc=None,
        ip=None,
    ):
        # Wait for the slot to be empty unless a prior try_acquire already
        # succeeded (token != 0).
        if_generate(
            try_acquire_token is None or try_acquire_token == 0,
            lambda: self.sync_object_empty.wait(index, phase, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )

    @dsl_user_op
    def producer_commit_w_index(self, index: Int32, *, loc=None, ip=None):
        # Mark the slot as full.
        self.sync_object_full.arrive(index, self.producer_mask, loc=loc, ip=ip)

    @dsl_user_op
    def consumer_wait_w_index_phase(
        self,
        index: Int32,
        phase: Int32,
        try_wait_token: Optional[Boolean] = None,
        *,
        loc=None,
        ip=None,
    ):
        # Wait for the slot to be full unless a prior try_wait already
        # succeeded (token != 0).
        if_generate(
            try_wait_token is None or try_wait_token == 0,
            lambda: self.sync_object_full.wait(index, phase, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )

    @dsl_user_op
    def consumer_release_w_index(self, index: Int32, *, loc=None, ip=None):
        # Mark the slot as empty again.
        self.sync_object_empty.arrive(index, self.consumer_mask, loc=loc, ip=ip)
188
+
189
+
190
@dataclass(frozen=True)
class PipelineTmaAsync(PipelineTmaAsyncOg):
    """
    Override producer_acquire to take in extra_tx_count parameter.
    """

    @staticmethod
    def create(*args, **kwargs):
        obj = PipelineTmaAsyncOg.create(*args, **kwargs)
        # Can't assign to __class__ directly since the dataclass is frozen
        object.__setattr__(obj, "__class__", PipelineTmaAsync)
        return obj

    @dsl_user_op
    def producer_acquire(
        self,
        state: PipelineState,
        try_acquire_token: Optional[Boolean] = None,
        extra_tx_count: int = 0,
        *,
        loc=None,
        ip=None,
    ):
        """
        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks.
        """
        # Skip the empty-wait if a prior try_acquire already succeeded (token != 0).
        if_generate(
            try_acquire_token is None or try_acquire_token == 0,
            lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )
        if const_expr(extra_tx_count == 0):
            # Default transaction count baked into the sync object.
            self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip)
        else:
            # Expect extra bytes on top of the default transaction count (e.g. when
            # additional data is loaded into the same buffer slot).
            tx_count = self.sync_object_full.tx_count + extra_tx_count
            self.sync_object_full.arrive_and_expect_tx(state.index, tx_count, loc=loc, ip=ip)
227
+
228
+
229
@dataclass(frozen=True)
class PipelineTmaUmma(PipelineTmaUmmaOg):
    """
    Override producer_acquire to take in extra_tx_count parameter.
    """

    @staticmethod
    def create(*args, **kwargs):
        obj = PipelineTmaUmmaOg.create(*args, **kwargs)
        # Can't assign to __class__ directly since the dataclass is frozen
        object.__setattr__(obj, "__class__", PipelineTmaUmma)
        return obj

    @dsl_user_op
    def producer_acquire(
        self,
        state: PipelineState,
        try_acquire_token: Optional[Boolean] = None,
        extra_tx_count: int = 0,
        *,
        loc=None,
        ip=None,
    ):
        """
        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks.
        """
        # Skip the empty-wait if a prior try_acquire already succeeded (token != 0).
        if_generate(
            try_acquire_token is None or try_acquire_token == 0,
            lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )
        # Only the leader CTA of the (cluster) group arrives on the full barrier.
        if const_expr(extra_tx_count == 0):
            if_generate(
                self.is_leader_cta,
                lambda: self.sync_object_full.arrive(
                    state.index, self.producer_mask, loc=loc, ip=ip
                ),
                loc=loc,
                ip=ip,
            )
        else:
            # Expect extra bytes on top of the default transaction count.
            tx_count = self.sync_object_full.tx_count + extra_tx_count
            if_generate(
                self.is_leader_cta,
                lambda: self.sync_object_full.arrive_and_expect_tx(
                    state.index, tx_count, loc=loc, ip=ip
                ),
                loc=loc,
                ip=ip,
            )

    @dsl_user_op
    def producer_acquire_w_index_phase(
        self,
        index: Int32,
        phase: Int32,
        try_acquire_token: Optional[Boolean] = None,
        *,
        loc=None,
        ip=None,
    ):
        """
        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks.
        """
        if_generate(
            try_acquire_token is None or try_acquire_token == 0,
            lambda: self.sync_object_empty.wait(index, phase, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )
        # Leader CTA arrives on the full barrier with the default tx count.
        if_generate(
            self.is_leader_cta,
            lambda: self.sync_object_full.arrive(index, self.producer_mask, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )

    @dsl_user_op
    def consumer_wait_w_index_phase(
        self,
        index: Int32,
        phase: Int32,
        try_wait_token: Optional[Boolean] = None,
        *,
        loc=None,
        ip=None,
    ):
        # Wait for the slot to be full unless a prior try_wait already succeeded.
        if_generate(
            try_wait_token is None or try_wait_token == 0,
            lambda: self.sync_object_full.wait(index, phase, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )

    @dsl_user_op
    def consumer_release_w_index(self, index: Int32, *, loc=None, ip=None):
        """
        UMMA consumer release buffer empty, cta_group needs to be provided.
        """
        self.sync_object_empty.arrive(index, self.consumer_mask, self.cta_group, loc=loc, ip=ip)
331
+
332
+
333
@dataclass(frozen=True)
class PipelineUmmaAsync(PipelineUmmaAsyncOg):
    """PipelineUmmaAsyncOg extended with *_w_index(_phase) variants."""

    @staticmethod
    def create(*args, **kwargs):
        obj = PipelineUmmaAsyncOg.create(*args, **kwargs)
        # Can't assign to __class__ directly since the dataclass is frozen
        object.__setattr__(obj, "__class__", PipelineUmmaAsync)
        return obj

    @dsl_user_op
    def producer_acquire_w_index_phase(
        self,
        index: Int32,
        phase: Int32,
        try_acquire_token: Optional[Boolean] = None,
        *,
        loc=None,
        ip=None,
    ):
        # Wait for the slot to be empty unless a prior try_acquire already succeeded.
        if_generate(
            try_acquire_token is None or try_acquire_token == 0,
            lambda: self.sync_object_empty.wait(index, phase, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )

    @dsl_user_op
    def producer_commit_w_index(self, index: Int32, *, loc=None, ip=None):
        """
        UMMA producer commit buffer full, cta_group needs to be provided.
        """
        self.sync_object_full.arrive(index, self.producer_mask, self.cta_group, loc=loc, ip=ip)

    @dsl_user_op
    def consumer_wait_w_index_phase(
        self,
        index: Int32,
        phase: Int32,
        try_wait_token: Optional[Boolean] = None,
        *,
        loc=None,
        ip=None,
    ):
        # Wait for the slot to be full unless a prior try_wait already succeeded.
        if_generate(
            try_wait_token is None or try_wait_token == 0,
            lambda: self.sync_object_full.wait(index, phase, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )

    @dsl_user_op
    def consumer_release_w_index(self, index: Int32, *, loc=None, ip=None):
        # Mark the slot as empty again (no cta_group on the empty barrier here).
        self.sync_object_empty.arrive(index, self.consumer_mask, loc=loc, ip=ip)
386
+
387
+
388
@dataclass(frozen=True)
class PipelineAsyncUmma(PipelineAsyncUmmaOg):
    """PipelineAsyncUmmaOg extended with *_w_index(_phase) variants."""

    @staticmethod
    def create(*args, **kwargs):
        obj = PipelineAsyncUmmaOg.create(*args, **kwargs)
        # Can't assign to __class__ directly since the dataclass is frozen
        object.__setattr__(obj, "__class__", PipelineAsyncUmma)
        return obj

    @dsl_user_op
    def producer_acquire_w_index_phase(
        self,
        index: Int32,
        phase: Int32,
        try_acquire_token: Optional[Boolean] = None,
        *,
        loc=None,
        ip=None,
    ):
        # Wait for the slot to be empty unless a prior try_acquire already succeeded.
        if_generate(
            try_acquire_token is None or try_acquire_token == 0,
            lambda: self.sync_object_empty.wait(index, phase, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )

    @dsl_user_op
    def producer_commit_w_index(self, index: Int32, *, loc=None, ip=None):
        # Mark the slot as full.
        self.sync_object_full.arrive(index, self.producer_mask, loc=loc, ip=ip)

    @dsl_user_op
    def consumer_wait_w_index_phase(
        self,
        index: Int32,
        phase: Int32,
        try_wait_token: Optional[Boolean] = None,
        *,
        loc=None,
        ip=None,
    ):
        # Wait for the slot to be full unless a prior try_wait already succeeded.
        if_generate(
            try_wait_token is None or try_wait_token == 0,
            lambda: self.sync_object_full.wait(index, phase, loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )

    @dsl_user_op
    def consumer_release_w_index(self, index: Int32, *, loc=None, ip=None):
        """
        UMMA consumer release buffer empty, cta_group needs to be provided.
        """
        self.sync_object_empty.arrive(index, self.consumer_mask, self.cta_group, loc=loc, ip=ip)
build/torch-cuda/quack/__init__.py ADDED
File without changes
build/torch-cuda/quack/activation.py ADDED
@@ -0,0 +1,568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ import math
4
+ from typing import Tuple
5
+ from functools import partial
6
+
7
+ import cutlass.cute as cute
8
+ from cutlass import Float32, Boolean, const_expr
9
+ from cutlass.cutlass_dsl import T, dsl_user_op
10
+ from cutlass._mlir.dialects import llvm, nvvm
11
+
12
+
13
+ F32_or_F32x2 = Float32 | Tuple[Float32, Float32]
14
+
15
+
16
# Packed two-lane f32 subtraction built from the generic packed-f32x2 helper
# (no accumulate operand, hence src_c=None).
sub_packed_f32x2 = partial(
    cute.arch.calc_packed_f32x2_op,
    src_c=None,
    calc_func=nvvm.sub_packed_f32x2,
)
21
+
22
+
23
@dsl_user_op
def tanh(a: float | Float32, *, loc=None, ip=None) -> Float32:
    """Approximate hardware tanh via inline PTX `tanh.approx.f32` (MUFU unit)."""
    return Float32(
        llvm.inline_asm(
            T.f32(),
            [Float32(a).ir_value(loc=loc, ip=ip)],
            "tanh.approx.f32 $0, $1;",
            "=f,f",
            has_side_effects=False,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    )
36
+
37
+
38
@dsl_user_op
def sigmoid(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """sigmoid(x) = 0.5 + 0.5 * tanh(0.5 * x); scalar or packed f32 pair."""
    if const_expr(not isinstance(x, tuple)):
        # return 0.5 + 0.5 * cute.math.tanh(0.5 * x, fastmath=True)
        return 0.5 + 0.5 * tanh(0.5 * x)
    else:
        x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x)
        tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
        # 0.5 * tanh(x/2) + 0.5, both lanes at once.
        return cute.arch.fma_packed_f32x2(tanh_x_half, (0.5, 0.5), (0.5, 0.5))
47
+
48
+
49
@dsl_user_op
def dsigmoid_from_output(out: Float32, dout: Float32, *, loc=None, ip=None) -> Float32:
    """Sigmoid backward from the forward output: dout * out * (1 - out)."""
    # return dout * out * (1.0 - out)
    # Rewritten so the (1 - out) subtraction folds into the multiply chain.
    return dout * (out - out * out)
53
+
54
+
55
@dsl_user_op
def relu(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """relu(x) = max(x, 0); scalar or packed f32 pair (applied lane-wise)."""
    if const_expr(not isinstance(x, tuple)):
        return cute.arch.fmax(x, Float32(0.0))
    else:
        return cute.arch.fmax(x[0], Float32(0.0)), cute.arch.fmax(x[1], Float32(0.0))
61
+
62
+
63
@dsl_user_op
@cute.jit
def drelu(
    x: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2]:
    """ReLU backward: returns (dx, relu(x)) with dx = dout where x > 0, else 0."""
    if const_expr(not isinstance(x, tuple)):
        x_pos = Boolean(x > 0)
        return dout if x_pos else Float32(0.0), cute.arch.fmax(x, Float32(0.0))
    else:
        x0_pos = Boolean(x[0] > 0)
        x1_pos = Boolean(x[1] > 0)
        dx = (dout[0] if x0_pos else Float32(0.0), dout[1] if x1_pos else Float32(0.0))
        return dx, relu(x)
76
+
77
+
78
@dsl_user_op
def relu_sq(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """ReLU-squared: max(x, 0) * x  (equals x^2 for x > 0, else 0)."""
    if const_expr(not isinstance(x, tuple)):
        return cute.arch.fmax(x, Float32(0.0)) * x
    else:
        relu_x = (cute.arch.fmax(x[0], Float32(0.0)), cute.arch.fmax(x[1], Float32(0.0)))
        return cute.arch.mul_packed_f32x2(relu_x, x)
85
+
86
+
87
@dsl_user_op
@cute.jit
def drelu_sq(
    x: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2]:
    """
    ReLU squared backward pass: computes gradient w.r.t. x and recomputes forward
    Given: relu_sq_out = max(x, 0) * x, and dout = grad w.r.t. relu_sq_out
    Returns: (dx, relu_sq_out) where:
        - dx = dout * 2 * x if x > 0, else 0
        - relu_sq_out = max(x, 0) * x
    """
    if const_expr(not isinstance(x, tuple)):
        relu_x = relu(x)
        relu_sq_out = relu_x * x
        # Derivative: d/dx[max(x,0) * x] = 2*x if x > 0, else 0
        # (dout * relu_x already zeroes the negative side).
        dx = 2.0 * (dout * relu_x)
        return dx, relu_sq_out
    else:
        relu_x = relu(x)
        relu_sq_out = cute.arch.mul_packed_f32x2(relu_x, x)
        dx = cute.arch.mul_packed_f32x2((2.0, 2.0), cute.arch.mul_packed_f32x2(dout, relu_x))
        return dx, relu_sq_out
110
+
111
+
112
@dsl_user_op
def gelu_tanh_approx(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """
    gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
            = 0.5 * x * (1 + tanh(x * (0.797885 + 0.0356774 * x * x)))
    """
    sqrt_2_over_pi = math.sqrt(2 / math.pi)  # ~0.797885
    sqrt_2_over_pi_coeff = 0.044715 * sqrt_2_over_pi  # ~0.0356774
    if const_expr(not isinstance(x, tuple)):
        return 0.5 * (
            x
            # Currently cute.math.tanh(x, fastmath=True) generates very slow code
            # * (1 + cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x)), fastmath=True))
            * (1.0 + tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x))))
        )
    else:
        # Packed path: z = x * (c1 + c2 * x^2), result = 0.5 * (x * tanh(z) + x).
        x_sq = cute.arch.mul_packed_f32x2(x, x)
        x_sq_scaled = cute.arch.fma_packed_f32x2(
            x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
        )
        z = cute.arch.mul_packed_f32x2(x, x_sq_scaled)
        tanh_z = (tanh(z[0]), tanh(z[1]))
        x_tanh_z = cute.arch.fma_packed_f32x2(tanh_z, x, x)
        return cute.arch.mul_packed_f32x2((0.5, 0.5), x_tanh_z)
136
+
137
+
138
@dsl_user_op
def dgelu_tanh_approx(
    x: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2]:
    """
    GELU tanh approximation backward pass: computes gradient w.r.t. x and recomputes forward
    Given: gelu_out = 0.5 * x * (1 + tanh(x * (c1 + c2 * x^2))), and dout = grad w.r.t. gelu_out
    Returns: (dx, gelu_out)

    Derivative uses the chain rule:
    d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
    where z = x * (c1 + c2 * x^2), dz/dx = c1 + 3 * c2 * x^2
    and sech^2(z) = 1 - tanh^2(z)
    """
    sqrt_2_over_pi = math.sqrt(2 / math.pi)  # c1 ~0.797885
    sqrt_2_over_pi_coeff = 0.044715 * sqrt_2_over_pi  # c2 ~0.0356774
    sqrt_2_over_pi_coeff_3 = 3.0 * sqrt_2_over_pi_coeff  # c3 ~0.01070322

    if const_expr(not isinstance(x, tuple)):
        # Compute z = x * (c1 + c2 * x^2)
        x_sq = x * x
        # tanh_z = cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq), fastmath=True)
        tanh_z = tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq))
        half_tanh_z_plus_one = 0.5 + 0.5 * tanh_z
        gelu_out = x * half_tanh_z_plus_one

        # Compute gradient
        # sech^2(z) = 1 - tanh^2(z)
        sech2_z = 1 - tanh_z * tanh_z
        # dz/dx = c1 + 3 * c2 * x^2
        dz_dx = sqrt_2_over_pi + sqrt_2_over_pi_coeff_3 * x_sq
        # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
        dgelu = half_tanh_z_plus_one + x * (0.5 * (sech2_z * dz_dx))

        dx = dout * dgelu
        return dx, gelu_out
    else:
        # Compute z = x * (c1 + c2 * x^2)
        x_sq = cute.arch.mul_packed_f32x2(x, x)
        x_sq_scaled = cute.arch.fma_packed_f32x2(
            x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
        )
        z = cute.arch.mul_packed_f32x2(x, x_sq_scaled)
        tanh_z = (tanh(z[0]), tanh(z[1]))
        half_tanh_z_plus_one = cute.arch.fma_packed_f32x2(tanh_z, (0.5, 0.5), (0.5, 0.5))
        gelu_out = cute.arch.mul_packed_f32x2(x, half_tanh_z_plus_one)

        # Compute gradient
        # sech^2(z) = 1 - tanh^2(z), via fma: tanh_z * (-tanh_z) + 1
        sech2_z = cute.arch.fma_packed_f32x2(tanh_z, (-tanh_z[0], -tanh_z[1]), (1.0, 1.0))
        # dz/dx = c1 + 3 * c2 * x^2
        dz_dx = cute.arch.fma_packed_f32x2(
            x_sq, (sqrt_2_over_pi_coeff_3, sqrt_2_over_pi_coeff_3), (sqrt_2_over_pi, sqrt_2_over_pi)
        )
        # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
        sech2_dz_dx = cute.arch.mul_packed_f32x2(sech2_z, dz_dx)
        x_sech2_dz_dx = cute.arch.mul_packed_f32x2(x, sech2_dz_dx)
        dgelu = cute.arch.fma_packed_f32x2(x_sech2_dz_dx, (0.5, 0.5), half_tanh_z_plus_one)

        dx = cute.arch.mul_packed_f32x2(dout, dgelu)
        return dx, gelu_out
199
+
200
+
201
@dsl_user_op
@cute.jit
def softplus(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """softplus(x) = log(1 + exp(x)), switching to the identity for x > 20
    where exp(x) would dominate/overflow and softplus(x) ~= x."""
    if const_expr(not isinstance(x, tuple)):
        use_linear = Boolean(x > 20.0)
        return (
            cute.math.log(Float32(cute.math.exp(x, fastmath=True)) + 1.0, fastmath=True)
            if not use_linear
            else x
        )
    else:
        # Packed path computed in base 2: exp(x) = 2^(x*log2(e)), log(y) = log2(y)*ln(2).
        log2_e = math.log2(math.e)
        x_log2e = cute.arch.mul_packed_f32x2(x, (log2_e, log2_e))
        x_exp = (cute.math.exp(x_log2e[0], fastmath=True), cute.math.exp(x_log2e[1], fastmath=True))
        x_exp_p1 = cute.arch.add_packed_f32x2(x_exp, (1.0, 1.0))
        log_x_exp_p1 = (
            cute.math.log2(x_exp_p1[0], fastmath=True),
            cute.math.log2(x_exp_p1[1], fastmath=True),
        )
        ln2 = math.log(2.0)
        softplus_x = cute.arch.mul_packed_f32x2(log_x_exp_p1, (ln2, ln2))
        use_linear_0 = Boolean(x[0] > 20.0)
        use_linear_1 = Boolean(x[1] > 20.0)
        return (
            softplus_x[0] if not use_linear_0 else x[0],
            softplus_x[1] if not use_linear_1 else x[1],
        )
228
+
229
+
230
@dsl_user_op
@cute.jit
def dsoftplus_from_output(out: Float32, dout: Float32, *, loc=None, ip=None) -> Float32:
    """Softplus backward from the forward output: dout * (1 - exp(-out)).

    Uses the same >20 linear-region cutoff as the forward (out ~= x there, d/dx = 1).
    """
    use_linear = Boolean(out > 20.0)
    # dx = dout * (1.0 - cute.math.exp(-out, fastmath=True)) if not use_linear else dout
    dx = dout - dout * cute.math.exp(-out, fastmath=True)
    return dx if not use_linear else dout
237
+
238
+
239
@dsl_user_op
def silu(x: F32_or_F32x2, *, already_halved: bool = False, loc=None, ip=None) -> F32_or_F32x2:
    """
    silu(x) = x * sigmoid(x) = x * (1 + tanh(x / 2)) / 2 = (0.5 * x) * tanh(0.5 * x) + (0.5 * x)
    This compiles down to 3 SASS instructions: FMUL to get 0.5 * x, MUFU.TANH, and FFMA.

    If already_halved, the caller has pre-scaled x by 0.5 (saves the FMUL).
    """
    if const_expr(not isinstance(x, tuple)):
        x_half = 0.5 * x if const_expr(not already_halved) else x
        # return x_half * cute.math.tanh(x_half, fastmath=True) + x_half
        return x_half * tanh(x_half) + x_half
    else:
        x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x) if const_expr(not already_halved) else x
        tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
        return cute.arch.fma_packed_f32x2(x_half, tanh_x_half, x_half)
253
+
254
+
255
@dsl_user_op
def swiglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """SwiGLU: silu(x) * y (x is the gate, y the up-projection)."""
    if const_expr(not isinstance(x, tuple)):
        return silu(x) * y
    else:
        return cute.arch.mul_packed_f32x2(silu(x), y)
261
+
262
+
263
@dsl_user_op
def dswiglu(
    x: F32_or_F32x2,
    y: F32_or_F32x2,
    dout: F32_or_F32x2,
    *,
    already_halved: bool = False,
    loc=None,
    ip=None,
) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
    """
    SwiGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: swiglu_out = silu(x) * y, and dout = grad w.r.t. swiglu_out
    Returns: (dx, dy, swiglu_out) where dx = dout * y * d_silu(x), dy = dout * silu(x)

    d_silu(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))

    This has been optimized to use fewer instructions (i.e. we expand things out
    to use FFMA instead of FADD and FMUL).
    """
    if const_expr(not isinstance(x, tuple)):
        # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(0.5 * x))
        # FMUL, MUFU.TANH, then FFMA
        if const_expr(not already_halved):
            sigmoid_x = sigmoid(x)
            silu_x = x * sigmoid_x  # FMUL
        else:
            # x is pre-scaled by 0.5; note silu(x) = x*tanh(x/2) + x in that form.
            tanh_x = tanh(x)  # MUFU.TANH
            sigmoid_x = 0.5 * tanh_x + 0.5  # FFMA
            silu_x = x * tanh_x + x  # FFMA
        silu_x_dout = silu_x * dout  # FMUL
        # d_silu(x) * dout
        # = sigmoid_x * (1 + x * (1 - sigmoid_x)) * dout
        # = (sigmoid_x + sigmoid_x * x * (1 - sigmoid_x)) * dout
        # = (sigmoid_x + silu_x * (1 - sigmoid_x)) * dout
        # = (sigmoid_x + silu_x - silu_x * sigmoid_x) * dout
        # = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
        d_silu_x_dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x_dout  # FFMA, FFMA
        dx = d_silu_x_dout * y  # FMUL
        dy = silu_x_dout
        swiglu_out = silu_x * y  # FMUL
        # Overall it's 1 MUFU.TANH, 5 FMUL, 3 FFMA
        return dx, dy, swiglu_out
    else:
        # Compute sigmoid(x) and silu(x)
        if const_expr(not already_halved):
            sigmoid_x = sigmoid(x)
            silu_x = cute.arch.mul_packed_f32x2(x, sigmoid_x)
        else:
            tanh_x = (tanh(x[0]), tanh(x[1]))
            sigmoid_x = cute.arch.fma_packed_f32x2(tanh_x, (0.5, 0.5), (0.5, 0.5))
            silu_x = cute.arch.fma_packed_f32x2(x, tanh_x, x)
        silu_x_dout = cute.arch.mul_packed_f32x2(silu_x, dout)
        # d_silu(x) * dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
        sigmoid_x_minus_silu_x_sigmoid_x = cute.arch.fma_packed_f32x2(
            sigmoid_x, (-silu_x[0], -silu_x[1]), sigmoid_x
        )
        d_silu_x_dout = cute.arch.fma_packed_f32x2(
            sigmoid_x_minus_silu_x_sigmoid_x, dout, silu_x_dout
        )
        dx = cute.arch.mul_packed_f32x2(d_silu_x_dout, y)
        dy = silu_x_dout
        swiglu_out = cute.arch.mul_packed_f32x2(silu_x, y)
        return dx, dy, swiglu_out
327
+
328
+
329
@dsl_user_op
def swiglu_oai(
    x: F32_or_F32x2, y: F32_or_F32x2, alpha: float = 1.702, *, loc=None, ip=None
) -> F32_or_F32x2:
    """The swiglu variant used in gpt-oss, which has a scaling factor on x and bias of 1 to y.
    https://github.com/openai/gpt-oss/blob/7be9334950053a888e24887a57dac797a17d6e00/gpt_oss/torch/model.py#L249
    x * sigmoid(alpha * x) * (y + 1)
    Compile down to FMUL, FMUL, TANH, FFMA, FFMA
    """
    # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
    if const_expr(not isinstance(x, tuple)):
        x_half = 0.5 * x
        # silu_x = x_half * cute.math.tanh(alpha * x_half, fastmath=True) + x_half
        silu_x = x_half * tanh(alpha * x_half) + x_half
        # silu_x * (y + 1) expanded to a single FFMA.
        return silu_x * y + silu_x
    else:
        x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x)
        alpha_x_half = cute.arch.mul_packed_f32x2((alpha, alpha), x_half)
        tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
        silu_x = cute.arch.fma_packed_f32x2(x_half, tanh_alpha_x_half, x_half)
        return cute.arch.fma_packed_f32x2(silu_x, y, silu_x)
350
+
351
+
352
@dsl_user_op
def dswiglu_oai(
    x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, alpha: float = 1.702, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
    """
    Swiglu OAI backward pass: computes gradients w.r.t. x and y
    Given: swiglu_oai_out = x * sigmoid(alpha * x) * (y + 1), and dout = grad w.r.t. swiglu_oai_out
    Returns: (dx, dy, swiglu_oai_out)

    Derivative of x * sigmoid(alpha * x) w.r.t. x:
    d/dx[x * sigmoid(alpha * x)] = sigmoid(alpha * x) + alpha * x * sigmoid(alpha * x) * (1 - sigmoid(alpha * x))
    """
    if const_expr(not isinstance(x, tuple)):
        # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
        alpha_x_half = (0.5 * alpha) * x  # FMUL
        # MUFU.TANH, then FFMA
        # sigmoid_alpha_x = 0.5 + 0.5 * cute.math.tanh(alpha_x_half, fastmath=True)
        sigmoid_alpha_x = 0.5 + 0.5 * tanh(alpha_x_half)
        silu_x = x * sigmoid_alpha_x  # FMUL
        silu_x_dout = silu_x * dout  # FMUL
        # FFMA, FFMA, FMUL
        d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
        dx = d_silu_x_dout * y + d_silu_x_dout  # FFMA, instead of multiply by y + 1
        dy = silu_x_dout
        swiglu_out = silu_x * y + silu_x  # FFMA, instead of multiply by y + 1
        # Overall it's 1 MUFU.TANH, 4 FMUL, 5 FFMA
        return dx, dy, swiglu_out
    else:
        # Compute sigmoid(alpha * x)
        alpha_x_half = cute.arch.mul_packed_f32x2(((0.5 * alpha), (0.5 * alpha)), x)
        tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
        sigmoid_alpha_x = cute.arch.fma_packed_f32x2(tanh_alpha_x_half, (0.5, 0.5), (0.5, 0.5))
        silu_x = cute.arch.mul_packed_f32x2(x, sigmoid_alpha_x)
        silu_x_dout = cute.arch.mul_packed_f32x2(silu_x, dout)
        # d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
        silu_x_minus_product = cute.arch.fma_packed_f32x2(
            silu_x, (-sigmoid_alpha_x[0], -sigmoid_alpha_x[1]), silu_x
        )
        sigmoid_plus_alpha_diff = cute.arch.fma_packed_f32x2(
            (alpha, alpha), silu_x_minus_product, sigmoid_alpha_x
        )
        d_silu_x_dout = cute.arch.mul_packed_f32x2(sigmoid_plus_alpha_diff, dout)
        dx = cute.arch.fma_packed_f32x2(d_silu_x_dout, y, d_silu_x_dout)
        dy = silu_x_dout
        swiglu_out = cute.arch.fma_packed_f32x2(silu_x, y, silu_x)
        return dx, dy, swiglu_out
398
+
399
+
400
@dsl_user_op
def glu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """GLU: Gated Linear Unit
    glu(x, y) = sigmoid(x) * y
    Using tanh to compute sigmoid: sigmoid(x) = 0.5 * (1 + tanh(x/2))
    """
    if const_expr(not isinstance(x, tuple)):
        sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
        return sigmoid_x * y  # FMUL
    else:
        sigmoid_x = sigmoid(x)
        return cute.arch.mul_packed_f32x2(sigmoid_x, y)
412
+
413
+
414
@dsl_user_op
def dglu(
    x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
    """
    GLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: glu_out = sigmoid(x) * y, and dout = grad w.r.t. glu_out
    Returns: (dx, dy, glu_out) where:
        - dx = dout * y * sigmoid(x) * (1 - sigmoid(x))
        - dy = dout * sigmoid(x)
        - glu_out = sigmoid(x) * y
    """
    if const_expr(not isinstance(x, tuple)):
        # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(x/2))
        sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
        sigmoid_x_dout = sigmoid_x * dout  # FMUL
        glu_out = sigmoid_x * y  # FMUL
        # dx = y * sigmoid(x) * (1 - sigmoid(x)) * dout
        #    = y * (1 - sigmoid(x)) * sigmoid_x_dout
        #    = (y - y * sigmoid(x)) * sigmoid_x_dout
        #    = (y - glu_out) * sigmoid_x_dout
        dx = (y - glu_out) * sigmoid_x_dout  # FADD, FMUL
        dy = sigmoid_x_dout
        # Total: 1 MUFU.TANH, 4 FMUL, 1 FADD, 1 FFMA
        return dx, dy, glu_out
    else:
        sigmoid_x = sigmoid(x)
        sigmoid_x_dout = cute.arch.mul_packed_f32x2(sigmoid_x, dout)
        glu_out = cute.arch.mul_packed_f32x2(sigmoid_x, y)
        # dx = (y - glu_out) * sigmoid_x_dout
        y_minus_glu_out = sub_packed_f32x2(y, glu_out)
        dx = cute.arch.mul_packed_f32x2(y_minus_glu_out, sigmoid_x_dout)
        dy = sigmoid_x_dout
        return dx, dy, glu_out
448
+
449
+
450
@dsl_user_op
def reglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """ReGLU: ReLU Gated Linear Unit
    reglu(x, y) = relu(x) * y = max(x, 0) * y
    """
    if const_expr(not isinstance(x, tuple)):
        return cute.arch.fmax(x, Float32(0.0)) * y
    else:
        relu_x = relu(x)
        return cute.arch.mul_packed_f32x2(relu_x, y)
460
+
461
+
462
@dsl_user_op
@cute.jit
def dreglu(
    x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
    """
    ReGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: reglu_out = relu(x) * y, and dout = grad w.r.t. reglu_out
    Returns: (dx, dy, reglu_out) where:
        - dx = dout * y if x > 0, else 0
        - dy = dout * relu(x)
        - reglu_out = relu(x) * y
    """
    if const_expr(not isinstance(x, tuple)):
        x_pos = Boolean(x > 0)
        relu_x = cute.arch.fmax(x, Float32(0.0))
        dx = (dout * y) if x_pos else Float32(0.0)
        dy = dout * relu_x
        reglu_out = relu_x * y
        return dx, dy, reglu_out
    else:
        x0_pos = Boolean(x[0] > 0)
        x1_pos = Boolean(x[1] > 0)
        relu_x = relu(x)
        dout_y = cute.arch.mul_packed_f32x2(dout, y)
        dx = ((dout_y[0] if x0_pos else Float32(0.0)), (dout_y[1] if x1_pos else Float32(0.0)))
        dy = cute.arch.mul_packed_f32x2(dout, relu_x)
        reglu_out = cute.arch.mul_packed_f32x2(relu_x, y)
        return dx, dy, reglu_out
491
+
492
+
493
@dsl_user_op
def geglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """GeGLU: GELU Gated Linear Unit
    geglu(x, y) = gelu(x) * y
    Uses the tanh approximation of GELU
    """
    if const_expr(not isinstance(x, tuple)):
        return gelu_tanh_approx(x) * y
    else:
        return cute.arch.mul_packed_f32x2(gelu_tanh_approx(x), y)
503
+
504
+
505
@dsl_user_op
def dgeglu(
    x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
    """
    GeGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: geglu_out = gelu(x) * y, and dout = grad w.r.t. geglu_out
    Returns: (dx, dy, geglu_out) where:
        - dx = dout * y * d_gelu(x)
        - dy = dout * gelu(x)
        - geglu_out = gelu(x) * y
    """
    if const_expr(not isinstance(x, tuple)):
        # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
        dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
        # Compute gradients for geglu
        dx = dgelu_x_dout * y
        dy = gelu_x * dout
        geglu_out = gelu_x * y
        return dx, dy, geglu_out
    else:
        # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
        dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
        # Compute gradients for geglu
        dx = cute.arch.mul_packed_f32x2(dgelu_x_dout, y)
        dy = cute.arch.mul_packed_f32x2(gelu_x, dout)
        geglu_out = cute.arch.mul_packed_f32x2(gelu_x, y)
        return dx, dy, geglu_out
533
+
534
+
535
# ============================================================================
# Activation name -> function maps
# ============================================================================

# Forward (single-input) activations; key None disables the activation.
act_fn_map = {
    None: None,
    "silu": silu,
    "relu": relu,
    "relu_sq": relu_sq,
    "gelu_tanh_approx": gelu_tanh_approx,
}

# Backward (single-input) activations; each returns (dx, recomputed forward).
# NOTE(review): no "silu" entry here even though act_fn_map has one — its
# backward appears only fused as "swiglu" below; confirm that is intentional.
dact_fn_map = {
    None: None,
    "relu": drelu,
    "relu_sq": drelu_sq,
    "gelu_tanh_approx": dgelu_tanh_approx,
}

# Gated (two-input) activations: fn(x_gate, y_up) -> out.
gate_fn_map = {
    "swiglu": swiglu,
    "swiglu_oai": swiglu_oai,
    "reglu": reglu,
    "geglu": geglu,
    "glu": glu,
}

# Backward gated activations: fn(x, y, dout) -> (dx, dy, recomputed out).
dgate_fn_map = {
    "swiglu": dswiglu,
    "swiglu_oai": dswiglu_oai,
    "reglu": dreglu,
    "geglu": dgeglu,
    "glu": dglu,
}
build/torch-cuda/quack/compile_utils.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
2
+
3
+ from typing import Optional
4
+
5
+ import cutlass.cute as cute
6
+
7
+
8
def make_fake_tensor(dtype, shape, divisibility=1, leading_dim=-1) -> Optional[cute.Tensor]:
    """Create a fake (symbolic) tensor for ahead-of-time compilation.

    The leading (contiguous) dimension gets stride 1; every other dimension gets
    a symbolic 64-bit stride guaranteed to be a multiple of ``divisibility``.

    :param dtype: element type, or None to indicate "no tensor"
    :param shape: tensor shape (tuple of ints)
    :param divisibility: guaranteed divisor of the symbolic strides; also drives
        the assumed alignment (``divisibility`` elements' worth of bytes)
    :param leading_dim: index of the stride-1 dimension; negative counts from the end
    :return: a fake tensor, or None when ``dtype`` is None
    """
    # Guard first: nothing to build when there is no dtype.
    if dtype is None:
        return None
    if leading_dim < 0:
        leading_dim = len(shape) + leading_dim
    stride = tuple(
        cute.sym_int64(divisibility=divisibility) if i != leading_dim else 1
        for i in range(len(shape))
    )
    # Alignment in bytes: divisibility elements, dtype.width bits each.
    return cute.runtime.make_fake_tensor(
        dtype, shape, stride=stride, assumed_align=divisibility * dtype.width // 8
    )
build/torch-cuda/quack/copy_utils.py ADDED
@@ -0,0 +1,1007 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
2
+
3
+ from typing import Optional, Type, Tuple, Callable, Sequence
4
+ from functools import partial
5
+
6
+ import cutlass
7
+ import cutlass.cute as cute
8
+
9
+ from cutlass import Int32, Int16, Boolean, const_expr
10
+ from cutlass.cute.nvgpu import cpasync, warp, warpgroup
11
+ from cutlass.cute.nvgpu.tcgen05.mma import CtaGroup # noqa
12
+ from cutlass.cutlass_dsl import dsl_user_op
13
+ import cutlass.pipeline
14
+ from cutlass._mlir.dialects import llvm
15
+ from cutlass._mlir import ir
16
+ from cutlass._mlir.dialects import cute_nvgpu as _cute_nvgpu_ir
17
+
18
+
19
# Mask that clears bit 24 of an smem barrier address — presumably the "peer CTA"
# bit, used so transaction-byte updates land on CTA0's barrier in 2-CTA MMA
# (see its use in tma_gather4_load below).
Sm100MmaPeerBitMask = 0xFEFFFFFF
20
+
21
+
22
@dsl_user_op
def cvt_copy(
    tiled_copy: cute.TiledCopy,
    src: cute.Tensor,
    dst: cute.Tensor,
    *,
    pred: Optional[cute.Tensor] = None,
    retile: bool = False,
    loc=None,
    ip=None,
    **kwargs,
) -> None:
    """Copy ``src`` (must be in registers) to ``dst``, converting element type first if needed.

    If ``retile`` is set, ``src`` is retiled with ``tiled_copy`` before the copy.
    """
    assert isinstance(src.iterator, cute.Pointer) and src.memspace == cute.AddressSpace.rmem
    if const_expr(src.element_type != dst.element_type):
        # Materialize a converted copy of src in registers before the actual copy.
        src_cvt = cute.make_rmem_tensor_like(src, dst.element_type)
        src_cvt.store(src.load().to(dst.element_type))
        src = src_cvt
    if const_expr(retile):
        src = tiled_copy.retile(src)
    cute.copy(tiled_copy, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
42
+
43
+
44
@dsl_user_op
def load_s2r(src: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
    """Load ``src`` into a freshly allocated register tensor of the same shape and type."""
    dst = cute.make_rmem_tensor_like(src, src.element_type, loc=loc, ip=ip)
    cute.autovec_copy(src, dst, loc=loc, ip=ip)
    return dst
49
+
50
+
51
@dsl_user_op
def load_s2r_retile(
    tiled_copy: cute.TiledCopy,
    src: cute.Tensor,
    dst_shape: cute.Tensor | cute.Shape,
    *,
    loc=None,
    ip=None,
) -> cute.Tensor:
    """Load ``src`` into a register tensor of shape ``dst_shape``, retiling the
    destination with ``tiled_copy`` for the copy; returns the destination tensor.
    """
    # Will also accept dst_shape being a tensor, in which case we write into that tensor
    if const_expr(not isinstance(dst_shape, cute.Tensor)):
        dst = cute.make_rmem_tensor(dst_shape, src.element_type, loc=loc, ip=ip)
    else:
        dst = dst_shape
    cute.copy(tiled_copy, src, tiled_copy.retile(dst), loc=loc, ip=ip)
    return dst
67
+
68
+
69
@dsl_user_op
def load_t2r(
    thr_copy: cute.ThrCopy, shape: cute.Shape, src: cute.Tensor, *, loc=None, ip=None
) -> cute.Tensor:
    """Load ``src`` into a new register tensor shaped by ``thr_copy``'s D-partition of ``shape``."""
    cDst = cute.make_identity_tensor(shape)
    dst = cute.make_rmem_tensor(thr_copy.partition_D(cDst).shape, src.element_type, loc=loc, ip=ip)
    cute.copy(thr_copy, src, dst, loc=loc, ip=ip)
    return dst
77
+
78
+
79
@dsl_user_op
def get_copy_atom(
    dtype: Type[cutlass.Numeric], num_copy_elems: int, is_async: bool = False, *, loc=None, ip=None
) -> cute.CopyAtom:
    """Make a copy atom moving ``num_copy_elems`` elements of ``dtype``, capped at 128 bits."""
    num_copy_bits = const_expr(min(128, num_copy_elems * dtype.width))
    # Async variant issues cp.async (gmem -> smem); otherwise a plain universal copy.
    copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
    return cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
86
+
87
+
88
@dsl_user_op
def copy(
    src: cute.Tensor,
    dst: cute.Tensor,
    *,
    pred: Optional[cute.Tensor] = None,
    is_async: bool = False,
    loc=None,
    ip=None,
    **kwargs,
) -> None:
    """Copy ``src`` to ``dst`` with a copy atom vectorized from ``src``'s layout."""
    # Vector width from the innermost mode; assumes src was partitioned so that
    # src.shape[0][0] is the per-thread contiguous element count — confirm at call sites.
    num_copy_elems = src.shape[0][0]
    copy_atom = get_copy_atom(src.element_type, num_copy_elems, is_async)
    cute.copy(copy_atom, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
102
+
103
+
104
def tiled_copy_1d(
    dtype: Type[cutlass.Numeric], num_threads: int, num_copy_elems: int = 1, is_async: bool = False
) -> cute.TiledCopy:
    """Build a tiled copy over a flat 1D thread arrangement.

    ``num_threads`` threads each move ``num_copy_elems`` contiguous elements of
    ``dtype`` per copy; ``is_async`` selects cp.async (gmem -> smem) atoms.
    """
    if is_async:
        op = cpasync.CopyG2SOp()
    else:
        op = cute.nvgpu.CopyUniversalOp()
    atom = cute.make_copy_atom(op, dtype, num_bits_per_copy=dtype.width * num_copy_elems)
    return cute.make_tiled_copy_tv(
        atom, cute.make_layout(num_threads), cute.make_layout(num_copy_elems)
    )
113
+
114
+
115
def tiled_copy_2d(
    dtype: Type[cutlass.Numeric],
    threads_per_row: int,
    num_threads: int,
    num_copy_elems: int = 1,
    is_async: bool = False,
) -> cute.TiledCopy:
    """Build a 2D tiled copy where ``threads_per_row`` threads span the contiguous
    (second) mode, each moving ``num_copy_elems`` elements of ``dtype`` per copy.
    """
    assert num_threads % threads_per_row == 0
    if is_async:
        op = cpasync.CopyG2SOp()
    else:
        op = cute.nvgpu.CopyUniversalOp()
    atom = cute.make_copy_atom(op, dtype, num_bits_per_copy=dtype.width * num_copy_elems)
    num_rows = num_threads // threads_per_row
    thread_layout = cute.make_ordered_layout((num_rows, threads_per_row), order=(1, 0))
    value_layout = cute.make_layout((1, num_copy_elems))
    return cute.make_tiled_copy_tv(atom, thread_layout, value_layout)
132
+
133
+
134
@cute.jit
def predicate_k(tAcA: cute.Tensor, limit: Int32) -> cute.Tensor:
    """Build a boolean predicate tensor for the k dimension: True where the k
    coordinate of ``tAcA`` is below ``limit``.
    """
    # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if"
    tApA = cute.make_rmem_tensor(
        cute.make_layout(
            (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
            stride=(cute.size(tAcA, mode=[2]), 0, 1),
        ),
        Boolean,
    )
    # Compile-time-unrolled loops over the rest-of-vector and rest-of-k modes.
    for rest_v in cutlass.range_constexpr(tApA.shape[0]):
        for rest_k in cutlass.range_constexpr(tApA.shape[2]):
            tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
    return tApA
148
+
149
+
150
+ # def tiled_copy_2d(
151
+ # dtype: Type[cutlass.Numeric], major_mode_size: int, num_threads: int, is_async: bool = False
152
+ # ) -> cute.TiledCopy:
153
+ # num_copy_bits = math.gcd(major_mode_size, 128 // dtype.width) * dtype.width
154
+ # copy_elems = num_copy_bits // dtype.width
155
+ # copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
156
+ # copy_atom = cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
157
+ # gmem_threads_per_row = major_mode_size // copy_elems
158
+ # assert num_threads % gmem_threads_per_row == 0
159
+ # thr_layout = cute.make_ordered_layout(
160
+ # (num_threads // gmem_threads_per_row, gmem_threads_per_row),
161
+ # order=(1, 0),
162
+ # )
163
+ # val_layout = cute.make_layout((1, copy_elems))
164
+ # return cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)
165
+
166
+
167
+ # Ragged tensor trick for TMA: encodes variable-length sequences into a higher-rank
168
+ # tensor so that TMA's out-of-bounds checking handles sequence boundaries.
169
+ #
170
+ # Given a tensor T with a ragged dimension (variable-length across batches), we create
171
+ # a higher-rank tensor where the ragged dim is replaced with a fixed size `big_int`, and
172
+ # extra dim(s) are appended. When indexing into a specific sequence at (offset, length),
173
+ # `offset_ragged_tensor` computes coordinates such that:
174
+ # ragged_coord = big_int - length (OOB check clamps reads past the sequence end)
175
+ # extra_coord(s) = f(offset, length) (selects the correct memory region)
176
+ #
177
+ # ptr_shift=True: 1-extra-dim approach (adds 1 dim, supports up to 4D input):
178
+ # Shape: (*before, big_int, *after, max_int)
179
+ # Stride: (*original_strides, stride_r) where stride_r = T.stride[ragged_dim]
180
+ # Pointer shifted backward by big_int * stride_r elements.
181
+ # Address for coords (big_int - length) in ragged dim, (offset + length) in extra dim:
182
+ # addr = (base - big_int * s_r) + (big_int - length) * s_r + (offset + length) * s_r
183
+ # = base + offset * s_r [correct]
184
+ # Works for epilogue TMA store. Does NOT work for TMA load with large big_int
185
+ # — the shifted pointer must land in physically mapped GPU memory.
186
+ #
187
+ # ptr_shift=False: 2-extra-dim approach (adds 2 dims, supports up to 3D input):
188
+ # Shape: (*before, big_int, *after, max_int, max_int)
189
+ # Stride: (*before_strides, stride_r, *after_strides, 2^34 - stride_r, stride_r)
190
+ # No pointer shift. Uses 64-bit address wraparound to cancel the ragged offset.
191
+ # Let W = 2^34 - stride_r. Address for coords (big_int - length) in ragged dim,
192
+ # big_int in extra dim 0, (offset + length) in extra dim 1:
193
+ # addr = base + (big_int - length) * s_r + big_int * W + (offset + length) * s_r
194
+ # = base + big_int * (s_r + W) - length * s_r + (offset + length) * s_r
195
+ # = base + big_int * 2^34 + offset * s_r
196
+ # Since big_int = 2^30: big_int * 2^34 = 2^64 ≡ 0 (mod 2^64), so:
197
+ # addr = base + offset * s_r [correct]
198
+ # Works for all TMA paths since the base pointer is never shifted.
199
+ #
200
+ # Ragged tensor was adapted from the implementation from Triton, but here we have an option that
201
+ # only needs 1 extra dimension instead of 2.
202
+ # https://github.com/triton-lang/triton/blob/main/python/triton/tools/ragged_tma.py
203
# Fixed extent used in place of the variable-length (ragged) dimension.
BIG_INT = 2**30
# Largest 32-bit signed value; extent of the appended extra dimension(s).
MAX_INT = 2**31 - 1
# 2**34: multiplier such that BIG_INT * BIG_INT_INV == 2**64 wraps to 0 in 64-bit
# address arithmetic (the 2-extra-dim scheme described in the comment above).
BIG_INT_INV = 2**64 // BIG_INT
206
+
207
+
208
@dsl_user_op
def create_ragged_tensor_for_tma(
    T: cute.Tensor,
    ragged_dim: int = 0,
    ptr_shift: bool = False,
    *,
    loc=None,
    ip=None,
) -> cute.Tensor:
    """Encode the ragged dimension of ``T`` into a higher-rank tensor for TMA.

    See the scheme description in the comment block above: ``ptr_shift=True``
    adds one extra dim and shifts the base pointer back by BIG_INT strides;
    ``ptr_shift=False`` adds two extra dims and relies on 64-bit wraparound.
    """
    rank = cute.rank(T)
    if ragged_dim < 0:
        ragged_dim += rank
    if ptr_shift:
        assert rank <= 4, "ptr_shift ragged tensor only supports up to 4 dimensions"
        # Ragged dim replaced by BIG_INT; one extra dim with the ragged stride.
        new_shape = T.shape[:ragged_dim] + (BIG_INT,) + T.shape[ragged_dim + 1 :] + (MAX_INT,)
        new_stride = T.stride + (T.stride[ragged_dim],)
        # Shift the base pointer back by BIG_INT * stride_r elements.
        ptr_offset = (None,) * ragged_dim + (-BIG_INT,) + (None,) * (rank - ragged_dim - 1)
        new_ptr = cute.domain_offset(ptr_offset, T).iterator
        return cute.make_tensor(new_ptr, cute.make_layout(new_shape, stride=new_stride))
    else:
        assert rank <= 3, "non-ptr_shift ragged tensor only supports up to 3 dimensions"
        stride_r = T.stride[ragged_dim]
        # Two extra dims whose strides sum (with the ragged dim's) to a multiple
        # of 2^64, so the ragged offset cancels without moving the base pointer.
        new_shape = (
            T.shape[:ragged_dim] + (BIG_INT,) + T.shape[ragged_dim + 1 :] + (MAX_INT, MAX_INT)
        )
        new_stride = (
            T.stride[:ragged_dim]
            + (stride_r,)
            + T.stride[ragged_dim + 1 :]
            + (BIG_INT_INV - stride_r, stride_r)
        )
        return cute.make_tensor(T.iterator, cute.make_layout(new_shape, stride=new_stride))
240
+
241
+
242
@dsl_user_op
def offset_ragged_tensor(
    T: cute.Tensor,
    offset: Int32,
    length: Int32,
    ragged_dim: int = 0,
    ptr_shift: bool = False,
    *,
    loc=None,
    ip=None,
) -> cute.Tensor:
    """Index a tensor built by ``create_ragged_tensor_for_tma`` at one sequence.

    ``offset``/``length`` select the sequence; the ragged coordinate is set to
    (big_int - length) so TMA's out-of-bounds check clamps reads past the end.
    ``ptr_shift`` must match the value used when creating the ragged tensor.
    """
    rank = cute.rank(T)
    if ragged_dim < 0:
        ragged_dim += rank
    big_int = cute.size(T, mode=[ragged_dim])
    offset_val = big_int - length
    if ptr_shift:
        # 1-extra-dim: rank = original_rank + 1
        assert rank >= ragged_dim + 2
        offset_tuple = (None,) * ragged_dim + (offset_val,) + (None,) * (rank - ragged_dim - 2)
        index_tuple = (None,) * (rank - 1) + (offset + length,)
    else:
        # 2-extra-dim: rank = original_rank + 2, last 2 modes are the wraparound dims
        assert rank >= ragged_dim + 3
        offset_tuple = (None,) * ragged_dim + (offset_val,) + (None,) * (rank - ragged_dim - 3)
        index_tuple = (None,) * (rank - 2) + (big_int, offset + length)
    return cute.domain_offset(offset_tuple, T[index_tuple])
269
+
270
+
271
def swizzle_int(ptr_int: Int32, b: int, m: int, s: int) -> Int32:
    """XOR-swizzle an integer address: fold the ``b`` bits located above position
    ``m + s`` back down by ``s`` positions (CuTe Swizzle<b, m, s> semantics).
    """
    selected_bits = ((1 << b) - 1) << (m + s)
    return ptr_int ^ ((ptr_int & selected_bits) >> s)
275
+
276
+
277
def swizzle_ptr(ptr: cute.Pointer):
    """Return a new pointer whose integer address has the pointer's own swizzle applied."""
    swz = ptr.type.swizzle_type
    ptr_int = swizzle_int(ptr.toint(), swz.num_bits, swz.num_base, swz.num_shift)
    return cute.make_ptr(ptr.dtype, ptr_int, ptr.memspace, assumed_align=ptr.alignment)
281
+
282
+
283
def as_position_independent_swizzle_tensor(tensor: cute.Tensor) -> cute.Tensor:
    """Fold the pointer's swizzle into the tensor's layout, returning a tensor
    whose addressing no longer depends on the (swizzled) pointer value.
    """
    outer = tensor.layout
    width = tensor.element_type.width
    swizzle_type = tensor.iterator.type.swizzle_type
    inner = cute.make_swizzle(swizzle_type.num_bits, swizzle_type.num_base, swizzle_type.num_shift)
    # Need to recast the swizzle from byte units (e.g. <3, 4, 3>) to element units
    # (e.g. <3, 3, 3> for 16 bits and <3, 2, 3> for 32 bits)
    new_layout = cute.recast_layout(
        width, 8, cute.make_composed_layout(inner, 0, cute.recast_layout(8, width, outer))
    )
    # recast_ptr to remove the pointer swizzle
    return cute.make_tensor(cute.recast_ptr(tensor.iterator, dtype=tensor.element_type), new_layout)
295
+
296
+
297
def partition_D_position_independent(
    thr_copy: cute.core.ThrCopy, tensor: cute.Tensor
) -> cute.Tensor:
    """Destination-partition ``tensor`` with the swizzle applied to the pointer and
    the layout separately (position-independent addressing)."""
    return cute.make_tensor(
        swizzle_ptr(thr_copy.partition_D(tensor).iterator),
        thr_copy.partition_D(as_position_independent_swizzle_tensor(tensor)).layout,
    )
304
+
305
+
306
def partition_S_position_independent(
    thr_copy: cute.core.ThrCopy, tensor: cute.Tensor
) -> cute.Tensor:
    """Source-partition ``tensor`` with the swizzle applied to the pointer and
    the layout separately (position-independent addressing)."""
    return cute.make_tensor(
        swizzle_ptr(thr_copy.partition_S(tensor).iterator),
        thr_copy.partition_S(as_position_independent_swizzle_tensor(tensor)).layout,
    )
313
+
314
+
315
@dsl_user_op
def sm90_get_smem_load_op(
    layout_c: cutlass.utils.LayoutEnum,
    elem_ty_c: Type[cutlass.Numeric],
    *,
    loc=None,
    ip=None,
) -> cute.CopyAtom:
    """
    Selects the largest vectorized smem load atom available subject to constraint of gmem layout.

    Parameters:
    -----------
    layout_c : LayoutEnum
        The layout enum of the output tensor D.

    elem_ty_c : Type[Numeric]
        The element type for output tensor D.

    Returns:
    --------
    A CopyAtom wrapping LdMatrix8x8x16b (x4) for 16-bit element types, otherwise
    a universal SIMT copy atom.
    """

    if not isinstance(elem_ty_c, cutlass.cutlass_dsl.NumericMeta):
        raise TypeError(f"elem_ty_c must be a Numeric, but got {elem_ty_c}")
    is_m_major = layout_c.is_m_major_c()
    if elem_ty_c.width == 16:
        return cute.make_copy_atom(warp.LdMatrix8x8x16bOp(is_m_major, 4), elem_ty_c, loc=loc, ip=ip)
    else:
        return cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), elem_ty_c, loc=loc, ip=ip)
346
+
347
+
348
def get_smem_store_atom(
    arch: cutlass.Constexpr[int],
    element_type: Type[cute.Numeric],
    transpose: bool = False,
    major_mode_size: Optional[int] = None,
) -> cute.CopyAtom:
    """Pick an rmem->smem store atom: stmatrix for 16-bit types on SM90+, else a
    universal copy (2 elements per copy when not transposed, 1 when transposed)."""
    if const_expr(arch < 90 or element_type.width != 16):
        return cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(),
            element_type,
            num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
        )
    else:
        # Use as many 8x8 matrices per stmatrix as the major-mode size allows (4, 2, or 1).
        num_matrices = (
            4
            if major_mode_size is None or major_mode_size % 16 == 0
            else (2 if major_mode_size % 8 == 0 else 1)
        )
        return cute.make_copy_atom(
            warp.StMatrix8x8x16bOp(transpose=transpose, num_matrices=num_matrices),
            element_type,
        )
370
+
371
+
372
def get_smem_load_atom(
    arch: cutlass.Constexpr[int],
    element_type: Type[cute.Numeric],
    transpose: bool = False,
    major_mode_size: Optional[int] = None,
) -> cute.CopyAtom:
    """Pick an smem->rmem load atom: ldmatrix for 16-bit types on SM90+, else a
    universal copy (mirrors get_smem_store_atom)."""
    if const_expr(arch < 90 or element_type.width != 16):
        return cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(),
            element_type,
            num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
        )
    else:
        # Use as many 8x8 matrices per ldmatrix as the major-mode size allows (4, 2, or 1).
        num_matrices = (
            4
            if major_mode_size is None or major_mode_size % 16 == 0
            else (2 if major_mode_size % 8 == 0 else 1)
        )
        return cute.make_copy_atom(
            warp.LdMatrix8x8x16bOp(transpose=transpose, num_matrices=num_matrices),
            element_type,
        )
394
+
395
+
396
def get_smem_store_C(
    tiled_mma: cute.TiledMma,
    sC: cute.Tensor,
    tidx: Int32,
    arch: int,
    transpose: bool = False,
    position_independent=False,
    major_mode_size: Optional[int] = None,
) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
    """Build a store of the MMA accumulator C into smem ``sC``; returns
    (copy_fn, per-thread copy slice, partitioned smem destination)."""
    dtype = sC.element_type
    copy_atom = get_smem_store_atom(arch, dtype, transpose, major_mode_size=major_mode_size)
    tiled_copy = cute.make_tiled_copy_C(copy_atom, tiled_mma)
    thr_copy = tiled_copy.get_slice(tidx)
    if const_expr(not position_independent):
        tRS_sC = thr_copy.partition_D(sC)
    else:
        tRS_sC = partition_D_position_independent(thr_copy, sC)

    def copy_fn(src: cute.Tensor, dst_idx: Optional[Int32] = None, **new_kwargs):
        # dst_idx selects a pipeline stage (last mode of tRS_sC) when provided.
        dst_tensor = tRS_sC if const_expr(dst_idx is None) else tRS_sC[None, None, None, dst_idx]
        cvt_copy(tiled_copy, src, dst_tensor, retile=True, **new_kwargs)

    return copy_fn, thr_copy, tRS_sC
419
+
420
+
421
def get_smem_load_C(
    tiled_mma: cute.TiledMma,
    sC: cute.Tensor,
    tidx: Int32,
    arch: int,
    transpose: bool = False,
    position_independent=False,
) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
    """Build a load of smem ``sC`` into registers; returns
    (copy_fn, per-thread copy slice, partitioned smem source)."""
    dtype = sC.element_type
    copy_atom = get_smem_load_atom(arch, dtype, transpose)
    tiled_copy = cute.make_tiled_copy_C(copy_atom, tiled_mma)
    thr_copy = tiled_copy.get_slice(tidx)
    if const_expr(not position_independent):
        tSR_sC = thr_copy.partition_S(sC)
    else:
        tSR_sC = partition_S_position_independent(thr_copy, sC)
    # Register destination shape matches what the corresponding store-side
    # partitioning would produce, so loads round-trip with get_smem_store_C.
    copy_atom_RS = get_smem_store_atom(arch, dtype, transpose)
    thr_copy_RS = cute.make_tiled_copy_C(copy_atom_RS, tiled_mma).get_slice(tidx)
    tRS_shape = thr_copy_RS.partition_S(cute.make_identity_tensor(sC.shape[:2])).shape

    def copy_fn(src_idx: Optional[Int32] = None, **new_kwargs):
        # src_idx selects a pipeline stage (last mode of tSR_sC) when provided.
        src_tensor = tSR_sC if const_expr(src_idx is None) else tSR_sC[None, None, None, src_idx]
        return load_s2r_retile(tiled_copy, src_tensor, dst_shape=tRS_shape, **new_kwargs)

    return copy_fn, thr_copy, tSR_sC
446
+
447
+
448
def epilog_smem_copy_atom(
    tiled_mma: cute.TiledMma, epi_tile: cute.Shape, transpose: bool = False
) -> cute.TiledCopy:
    """Build the epilogue copy-atom tiling from an stmatrix atom over the MMA's C layout."""
    copy_atom_C = cute.make_copy_atom(
        # 4 matrices when the epilogue tile's N extent allows, else 2.
        warp.StMatrix8x8x16bOp(transpose, num_matrices=4 if epi_tile[1] % 16 == 0 else 2),
        cutlass.Float16,  # this is just to get the right source layout
    )
    tiled_copy_C_atom = cute.make_tiled_copy_C_atom(copy_atom_C, tiled_mma)
    return tiled_copy_C_atom
457
+
458
+
459
def get_smem_store_epi(
    tiled_mma: cute.TiledMma,
    epi_tile: cute.Shape,
    sC: Optional[cute.Tensor],
    tidx: Int32,
    arch: int,
    transpose: bool = False,
    position_independent=False,
) -> Tuple[Optional[Callable], cute.TiledCopy, Optional[cute.Tensor], cute.Tensor]:
    """Build the epilogue rmem->smem store; returns
    (copy_fn or None, per-thread copy slice, partitioned smem dest or None, rmem staging tensor).

    When ``sC`` is None only the staging tensor and thread slice are produced
    (copy_fn and the smem partition are None).
    """
    dtype = sC.element_type if const_expr(sC is not None) else cutlass.Float16
    tiled_copy_C_atom = epilog_smem_copy_atom(tiled_mma, epi_tile)
    copy_atom = get_smem_store_atom(arch, dtype, transpose)
    tiled_copy = cute.make_tiled_copy_S(copy_atom, tiled_copy_C_atom)
    thr_copy = tiled_copy.get_slice(tidx)
    tRS_sC = None
    if const_expr(sC is not None):
        if const_expr(not position_independent):
            tRS_sC = thr_copy.partition_D(sC)
        else:
            tRS_sC = partition_D_position_independent(thr_copy, sC)
    sC_shape = sC.shape[:2] if sC is not None else epi_tile
    # (R2S, R2S_M, R2S_N, PIPE_C)
    tRS_rC_shape = thr_copy.partition_S(cute.make_identity_tensor(sC_shape)).shape
    # Register staging tensor in the accumulator dtype.
    tRS_rC = cute.make_rmem_tensor(tRS_rC_shape, tiled_mma.op.acc_dtype)

    def copy_fn(src: cute.Tensor, dst_idx: Int32, **new_kwargs):
        # dst_idx selects the smem pipeline stage.
        cvt_copy(tiled_copy, src, tRS_sC[None, None, None, dst_idx], **new_kwargs)

    return copy_fn if const_expr(sC is not None) else None, thr_copy, tRS_sC, tRS_rC
488
+
489
+
490
def get_smem_store_A(
    tiled_mma: cute.TiledMma, sA: cute.Tensor, tidx: Int32, arch: int, position_independent=False
) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
    """Build a store of the MMA A operand from registers into smem ``sA``; returns
    (copy_fn, per-thread copy slice, partitioned smem destination)."""
    dtype = sA.element_type
    # stmatrix transpose is needed when A is MN-major for this MMA.
    transpose = tiled_mma.op.a_major_mode == warpgroup.OperandMajorMode.MN
    copy_atom = get_smem_store_atom(arch, dtype, transpose)
    tiled_copy = cute.make_tiled_copy_A(copy_atom, tiled_mma)
    thr_copy = tiled_copy.get_slice(tidx)
    if const_expr(not position_independent):
        tRS_sA = thr_copy.partition_D(sA)
    else:
        tRS_sA = partition_D_position_independent(thr_copy, sA)

    def copy_fn(src: cute.Tensor, dst_idx: Int32, **new_kwargs):
        # dst_idx selects the smem pipeline stage.
        cvt_copy(tiled_copy, src, tRS_sA[None, None, None, dst_idx], retile=True, **new_kwargs)

    return copy_fn, thr_copy, tRS_sA
507
+
508
+
509
def get_smem_load_A(
    tiled_mma: cute.TiledMma,
    sA: cute.Tensor,
    tidx: Int32,
    arch: int,
    with_dst_tensor: bool = False,
    position_independent=False,
) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
    """Build a load of the MMA A operand from smem ``sA`` into registers; returns
    (copy_fn, per-thread copy slice, partitioned smem source).

    ``with_dst_tensor`` selects a copy_fn variant that writes into a caller-provided
    register tensor instead of allocating one.
    """
    dtype = sA.element_type
    # ldmatrix transpose is needed when A is MN-major for this MMA.
    transpose = tiled_mma.op.a_major_mode == warpgroup.OperandMajorMode.MN
    copy_atom = get_smem_load_atom(arch, dtype, transpose)
    tiled_copy = cute.make_tiled_copy_A(copy_atom, tiled_mma)
    thr_copy = tiled_copy.get_slice(tidx)
    if const_expr(not position_independent):
        tSR_sA = thr_copy.partition_S(sA)
    else:
        tSR_sA = partition_S_position_independent(thr_copy, sA)
    tRS_shape = tiled_mma.partition_shape_A(sA.shape[:2])

    def copy_fn(src_idx: Int32, **new_kwargs):
        return load_s2r_retile(
            tiled_copy, tSR_sA[None, None, None, src_idx], dst_shape=tRS_shape, **new_kwargs
        )

    def copy_fn_w_dst_tensor(src_idx: Int32, dst: cute.Tensor, **new_kwargs):
        return load_s2r_retile(tiled_copy, tSR_sA[None, None, None, src_idx], dst, **new_kwargs)

    return copy_fn if not with_dst_tensor else copy_fn_w_dst_tensor, thr_copy, tSR_sA
537
+
538
+
539
@dsl_user_op
def cpasync_reduce_bulk_add_f32(
    smem_ptr: cute.Pointer,
    gmem_ptr: cute.Pointer,
    store_bytes: int | Int32,
    *,
    loc=None,
    ip=None,
):
    """Issue a bulk async f32 reduction-add of ``store_bytes`` bytes from smem to gmem
    (PTX ``cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32``)."""
    smem_ptr_i32 = smem_ptr.toint(loc=loc, ip=ip).ir_value()
    # cache_hint = cutlass.Int64(0x14F0000000000000) # EVICT_LAST
    llvm.inline_asm(
        None,
        [gmem_ptr.llvm_ptr, smem_ptr_i32, Int32(store_bytes).ir_value()],
        "cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [$0], [$1], $2;",
        "l,r,r",
        # [gmem_ptr.llvm_ptr, smem_ptr_i32, Int32(store_bytes).ir_value(), cache_hint.ir_value()],
        # "cp.reduce.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.add.f32 [$0], [$1], $2, $3;",
        # "l,r,r,l",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
562
+
563
+
564
@dsl_user_op
def get_tma_desc_addr(tma_atom: cute.CopyAtom, *, loc=None, ip=None) -> cute.Pointer:
    """
    Get the address of the TMA descriptor embedded in a TMA Copy Atom.

    Extracts the constant memory address of the TMA descriptor for use with
    custom PTX instructions.

    :param tma_atom: TMA Copy Atom from make_tiled_tma_atom
    :return: Pointer to TMA descriptor in constant memory

    Example:
        >>> desc_ptr = get_tma_desc_addr(tma_atom)
    """
    exec_atom = _cute_nvgpu_ir.atom_make_exec_tma(tma_atom._trait.value, loc=loc, ip=ip)
    tma_desc_ptr_type = ir.Type.parse(
        "!cute.ptr<!cute_nvgpu.tma_descriptor_tiled, generic, align<128>>"
    )
    return _cute_nvgpu_ir.get_tma_desc_addr(tma_desc_ptr_type, exec_atom, loc=loc, ip=ip)
583
+
584
+
585
@dsl_user_op
def tma_gather4_load(
    tma_desc_ptr: cute.Pointer,
    dst_smem_ptr: cute.Pointer,
    mbarrier_ptr: cute.Pointer,
    col_idx: Int32,
    row_indices: Sequence[Int32],
    *,
    num_cta: int = 1,
    multicast_mask=None,
    loc=None,
    ip=None,
) -> None:
    """
    Perform TMA gather4 load from global memory to shared memory.

    Issues PTX instruction:
        cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes
            [dstMem], [tensorMap, {col_idx, row0, row1, row2, row3}], [smem_bar];

    This loads 4 rows (specified by row_indices) from a 2D tensor at the given
    column index into shared memory, using the TMA descriptor.

    :param tma_desc_ptr: Pointer to TMA descriptor in constant memory (128-byte aligned)
    :type tma_desc_ptr: Pointer
    :param dst_smem_ptr: Destination address in shared memory
    :type dst_smem_ptr: Pointer
    :param mbarrier_ptr: Pointer to mbarrier in shared memory for completion tracking
    :type mbarrier_ptr: Pointer
    :param col_idx: Column index
    :type col_idx: Int32
    :param row_indices: Sequence of exactly 4 row indices
    :type row_indices: Sequence[Int32]
    :param num_cta: Number of CTAs participating (default: 1)
    :type num_cta: int
    :param multicast_mask: Optional multicast mask (currently rejected — not supported yet)
    :type multicast_mask: Int16

    Requirements:
    - row_indices must contain exactly 4 elements
    - Compute capability >= SM_100 (Blackwell)
    - TMA descriptor must be properly initialized for 2D tensor

    Example:
        >>> from cutlass.cute.nvgpu import cpasync
        >>> from cutlass.cute import core
        >>>
        >>> # Create TMA descriptor
        >>> tma_atom, tma_tensor = cpasync.make_tiled_tma_atom(...)
        >>> tma_desc_ptr = get_tma_desc_addr(tma_atom)
        >>>
        >>> # Compute indices (typically from kernel logic)
        >>> col_idx = core.get(...) or 5  # Int32 value
        >>> row_indices = [core.get(...) for _ in range(4)]  # 4 Int32 values
        >>>
        >>> # Gather 4 rows at computed column
        >>> tma_gather4_load(
        ...     tma_desc_ptr=tma_desc_ptr,
        ...     dst_smem_ptr=smem_ptr,
        ...     mbarrier_ptr=barrier_ptr,
        ...     col_idx=col_idx,
        ...     row_indices=row_indices
        ... )
    """
    if len(row_indices) != 4:
        raise ValueError(f"gather4 requires exactly 4 row indices, got {len(row_indices)}")
    col_val = Int32(col_idx).ir_value()
    row_vals = [Int32(row_idx).ir_value() for row_idx in row_indices]
    # Convert pointers to integer addresses
    desc_addr = tma_desc_ptr.toint(loc=loc, ip=ip).ir_value()
    dst_addr = dst_smem_ptr.toint(loc=loc, ip=ip).ir_value()
    mbar_addr = mbarrier_ptr.toint(loc=loc, ip=ip)
    if num_cta > 1:
        # Executed by both CTAs. Set peer bit to 0 so that the
        # transaction bytes will update CTA0's barrier.
        mbar_addr = mbar_addr & Sm100MmaPeerBitMask
    mbar_addr = mbar_addr.ir_value()
    # Handle multicast_mask - may already be ir.Value or Python int
    multicast_mask_val = None
    if multicast_mask is not None:
        multicast_mask_val = Int16(multicast_mask).ir_value()
    assert multicast_mask_val is None, "multicast is not supported yet"
    # Emit inline PTX for TMA gather4
    # PTX: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes
    # [dstMem], [tensorMap, {col, row0, row1, row2, row3}], [smem_bar];
    ptx = (
        f"cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::{num_cta} "
        "[$0], [$1, {$2, $3, $4, $5, $6}], [$7];"
    )

    llvm.inline_asm(
        None,
        [
            dst_addr,
            desc_addr,
            col_val,
            row_vals[0],
            row_vals[1],
            row_vals[2],
            row_vals[3],
            mbar_addr,
        ],
        ptx,
        "r,l,r,r,r,r,r,r",  # constraints: dst(r), desc(l), col + 4 rows + mbar (6x r)
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
        loc=loc,
        ip=ip,
    )
695
+
696
+
697
def cpasync_bulk_get_copy_fn(
    src_tensor: cute.Tensor,
    dst_tensor: cute.Tensor,
    single_stage: bool = False,
    **kwargs,
) -> Callable:
    """Return a closure that issues a bulk cp.async gmem->smem copy (one elected
    thread), indexed by stage unless ``single_stage`` is set."""
    # Group all but the last mode (the stage mode) unless single_stage.
    group_rank_src = const_expr(cute.rank(src_tensor) - (1 if not single_stage else 0))
    group_rank_dst = const_expr(cute.rank(dst_tensor) - (1 if not single_stage else 0))
    # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
    src = cute.group_modes(src_tensor, 0, group_rank_src)
    dst = cute.group_modes(dst_tensor, 0, group_rank_dst)

    def copy_bulk(src_idx, dst_idx, tma_bar_ptr: cute.Pointer, **new_kwargs):
        atom = cute.make_copy_atom(cpasync.CopyBulkG2SOp(), src.element_type)
        # Only one thread per CTA issues the bulk copy.
        with cute.arch.elect_one():
            cute.copy(
                atom,
                src[None, src_idx],
                dst[None, dst_idx],
                mbar_ptr=tma_bar_ptr,
                **new_kwargs,
                **kwargs,
            )

    def copy_bulk_single_stage(tma_bar_ptr: cute.Pointer, **new_kwargs):
        atom = cute.make_copy_atom(cpasync.CopyBulkG2SOp(), src.element_type)
        with cute.arch.elect_one():
            cute.copy(atom, src, dst, mbar_ptr=tma_bar_ptr, **new_kwargs, **kwargs)

    return copy_bulk if const_expr(not single_stage) else copy_bulk_single_stage
727
+
728
+
729
@dsl_user_op
def tma_get_copy_fn(
    atom: cute.CopyAtom,
    cta_coord: cute.Coord,
    cta_layout: cute.Layout,
    src_tensor: cute.Tensor,
    dst_tensor: cute.Tensor,
    filter_zeros: bool = False,
    single_stage: bool = False,
    *,
    loc=None,
    ip=None,
    **kwargs,
) -> Tuple[Callable, cute.Tensor, cute.Tensor]:
    """TMA-partition src/dst and return (copy_fn, smem_partition, gmem_partition).

    Works for both load (gmem src) and store (smem src); the direction is
    detected from which tensor lives in smem. NOTE(review): the original return
    annotation said ``Callable`` but a 3-tuple is returned; annotation corrected.
    """
    src_is_smem = const_expr(
        isinstance(src_tensor.iterator, cute.Pointer)
        and src_tensor.memspace == cute.AddressSpace.smem
    )
    smem_tensor, gmem_tensor = (src_tensor, dst_tensor) if src_is_smem else (dst_tensor, src_tensor)
    # Group all but the last mode (the stage mode) unless single_stage.
    group_rank_smem = const_expr(cute.rank(smem_tensor) - (1 if not single_stage else 0))
    group_rank_gmem = const_expr(cute.rank(gmem_tensor) - (1 if not single_stage else 0))
    # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
    s, g = cpasync.tma_partition(
        atom,
        cta_coord,
        cta_layout,
        cute.group_modes(smem_tensor, 0, group_rank_smem),
        cute.group_modes(gmem_tensor, 0, group_rank_gmem),
        loc=loc,
        ip=ip,
    )
    if const_expr(filter_zeros):
        s = cute.filter_zeros(s)
        g = cute.filter_zeros(g)
    src, dst = (s, g) if src_is_smem else (g, s)

    @dsl_user_op
    def copy_tma(src_idx, dst_idx, *, loc=None, ip=None, **new_kwargs):
        cute.copy(
            atom, src[None, src_idx], dst[None, dst_idx], **new_kwargs, **kwargs, loc=loc, ip=ip
        )

    @dsl_user_op
    def copy_tma_single_stage(*, loc=None, ip=None, **new_kwargs):
        cute.copy(atom, src, dst, **new_kwargs, **kwargs, loc=loc, ip=ip)

    return (copy_tma if const_expr(not single_stage) else copy_tma_single_stage), s, g
776
+
777
+
778
def tma_producer_copy_fn(copy: Callable, pipeline: cutlass.pipeline.PipelineAsync):
    """Bind a TMA copy closure to a producer pipeline.

    The returned closure maps the producer state's stage index to the copy's
    destination index and passes the matching mbarrier from the pipeline.
    """

    def copy_fn(src_idx, producer_state: cutlass.pipeline.PipelineState, **new_kwargs):
        stage_barrier = pipeline.producer_get_barrier(producer_state)
        copy(
            src_idx=src_idx,
            dst_idx=producer_state.index,
            tma_bar_ptr=stage_barrier,
            **new_kwargs,
        )

    return copy_fn
788
+
789
+
790
@cute.jit
def gather_m_get_copy_fn(
    thr_copy_A: cute.ThrCopy,
    mA: cute.Tensor,  # (whatever, K)
    sA: cute.Tensor,  # (tile_M, tile_K, STAGE)
    gsAIdx: cute.Tensor,  # (tile_M), either gmem or smem
    limit_m: Int32,
    limit_k: Int32,
) -> Callable:
    """Build a gmem->smem copy closure that gathers rows of A via an index tensor.

    Row indices are read once from ``gsAIdx`` and cached in registers; the
    returned ``copy_fn(src_idx, dst_idx, pred)`` then copies one K-tile of the
    gathered rows into stage ``dst_idx`` of ``sA``, with optional K-predication.
    """
    tile_shape_mk = (cute.size(sA, mode=[0]), cute.size(sA, mode=[1]))
    tAsA = thr_copy_A.partition_D(sA)
    # k-major: exactly one load per row per tile
    assert tAsA.shape[2] == 1
    tAsA = cute.group_modes(cute.slice_(tAsA, (None, None, 0, None)), 0, 2)

    is_even_m_smem = tile_shape_mk[0] % thr_copy_A.tiler_mn[0].shape == 0
    if const_expr(not is_even_m_smem):
        limit_m = min(limit_m, tile_shape_mk[0])
    elems_per_load = cute.size(tAsA.shape[0][0])
    cA = cute.make_identity_tensor(tile_shape_mk)
    tAcA = thr_copy_A.partition_S(cA)
    t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
    # Instead of comparing tAcA to limit_m, compare t0AcA to limit_m - tAcA[0][0],
    # since tAcA[m][0] = t0AcA[m][0] + tAcA[0][0]. This keeps the compared
    # coordinate (t0AcA) known at compile time.
    limit_m = limit_m - tAcA[0][0]
    limit_k = limit_k - tAcA[0][1]
    # Read and cache the gather indices for A.
    rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
    cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
    tApA_m = cute.make_rmem_tensor(rows_per_thread, Boolean)
    for m in cutlass.range(rows_per_thread, unroll_full=True):
        tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
    m_idx = cute.make_rmem_tensor(rows_per_thread, Int32)
    for m in cutlass.range(rows_per_thread, unroll_full=True):
        row_idx = tAcA[0, m, 0][0]
        if tApA_m[m]:
            m_idx[m] = gsAIdx[row_idx]
        else:
            m_idx[m] = 0  # OOB rows fall back to row 0, which is always safe to load

    mA_k = cute.logical_divide(mA, (None, tile_shape_mk[1]))

    def copy_fn(src_idx, dst_idx, pred: bool = False):
        tApA_k = None
        if const_expr(pred):
            tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
            limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
            for k in cutlass.range(cols_per_thread, unroll_full=True):
                tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
        mA_cur = mA_k[None, (None, src_idx)]
        for m in cutlass.range_constexpr(tAcA.shape[1]):
            # cute.tiled_divide(mA_cur[m_idx[m], None], (elems_per_load,)) would
            # give shape ((elems_per_load), thread_per_row), but tAsA expects
            # ((elems_per_load, 1), thread_per_row): append a unit mode first,
            # tiled-divide, then slice.
            mA_row = cute.tiled_divide(
                cute.append_ones(mA_cur[m_idx[m], None], up_to_rank=2), (elems_per_load, 1)
            )[None, None, 0]
            if const_expr(is_even_m_smem) or tApA_m[m]:
                # There's only 1 load per row.
                assert cute.size(tAcA.shape, mode=[2]) == 1
                ki = tAcA[0, 0, 0][1] // elems_per_load
                cute.copy(thr_copy_A, mA_row[None, ki], tAsA[(None, m), dst_idx], pred=tApA_k)

    return copy_fn
856
+
857
+
858
@cute.jit
def gather_k_get_copy_fn(
    thr_copy_A: cute.ThrCopy,
    mA: cute.Tensor,  # (tile_M, whatever)
    sA: cute.Tensor,  # (tile_M, tile_K, STAGE)
    gsAIdx: cute.Tensor,  # (tile_K, RestK), either gmem or smem
    limit_m: Int32,
    limit_k: Int32,
) -> Callable:
    """Build copy/prefetch closures that gather columns (K) of A by index.

    Returns ``(copy_fn, prefetch_fn)``: the prefetch closure reads the column
    indices for one K-tile into registers — from gmem or, synchronized through
    a pipeline, from smem, depending on where ``gsAIdx`` lives — and
    ``copy_fn`` uses the cached indices to copy the gathered columns into a
    stage of ``sA``.
    """
    gAIdx, sAIdx = None, None
    if const_expr(gsAIdx.memspace == cute.AddressSpace.gmem):
        gAIdx = gsAIdx
    else:
        assert gsAIdx.memspace == cute.AddressSpace.smem
        sAIdx = gsAIdx
    tile_shape_mk = (cute.size(sA, mode=[0]), cute.size(sA, mode=[1]))
    # (atom_v, CPY_M, 1, STAGE)
    tAsA = thr_copy_A.partition_D(sA)
    # m-major
    tAsA = cute.group_modes(tAsA, 0, 3)

    is_even_m_smem = tile_shape_mk[0] % thr_copy_A.tiler_mn[0].shape == 0
    if const_expr(not is_even_m_smem):
        limit_m = min(limit_m, tile_shape_mk[0])
    elems_per_load = cute.size(tAsA.shape[0][0])
    cA = cute.make_identity_tensor(tile_shape_mk)
    tAcA = thr_copy_A.partition_S(cA)
    t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
    # Compare thread-0 coordinates (compile-time known) against shifted limits
    # instead of per-thread coordinates: tAcA[m][0] = t0AcA[m][0] + tAcA[0][0].
    limit_m = limit_m - tAcA[0][0]
    limit_k = limit_k - tAcA[0][1]
    # Read and cache indices for A.
    rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
    cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
    tApA_m = cute.make_rmem_tensor(rows_per_thread, Boolean)
    for m in cutlass.range(rows_per_thread, unroll_full=True):
        tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
    threads_per_col = const_expr(thr_copy_A.tiler_mn[0].shape // elems_per_load)
    # Convoluted but necessary reshaping: for tile_M=128, flat_divide gives
    # (8, 16, K), then logical_divide gives ((8, 1), (8, 2), K).
    tidx = thr_copy_A.thr_idx
    tAmA = cute.logical_divide(
        cute.flat_divide(mA, (elems_per_load,)), (elems_per_load, threads_per_col)
    )[None, (tidx % threads_per_col, None), None]  # ((8, 1), 2, K)

    def prefetch_from_gmem_fn(src_idx, pred: bool = False) -> Tuple[cute.Tensor, cute.Tensor]:
        # Prefetch mAIdx early, even before smem is free.
        tApA_k = None
        if const_expr(pred):
            tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
            limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
            for k in cutlass.range(cols_per_thread, unroll_full=True):
                tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
        gAIdx_cur = gAIdx[None, src_idx]
        k_idx = cute.make_rmem_tensor(cols_per_thread, Int32)
        for k in cutlass.range(cols_per_thread):
            col_idx = tAcA[0, 0, k][1]
            if const_expr(not pred):
                k_idx[k] = gAIdx_cur[col_idx]
            else:
                if tApA_k[k]:
                    k_idx[k] = gAIdx_cur[col_idx]
                else:
                    k_idx[k] = -1
        return k_idx, tApA_k

    def prefetch_from_smem_fn(
        a_prefetch_pipeline, src_idx, dst_idx, a_prefetch_consumer_state, pred: bool = False
    ) -> Tuple[cute.Tensor, cute.Tensor]:
        tApA_k = None
        if const_expr(pred):
            tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
            limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
            for k in cutlass.range(cols_per_thread, unroll_full=True):
                tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
        a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
        sAIdx_cur = sAIdx[None, dst_idx]
        k_idx = cute.make_rmem_tensor(cols_per_thread, Int32)
        for k in cutlass.range(cols_per_thread):
            col_idx = tAcA[0, 0, k][1]
            k_idx[k] = sAIdx_cur[col_idx]
        cute.arch.sync_warp()
        with cute.arch.elect_one():
            a_prefetch_pipeline.consumer_release(a_prefetch_consumer_state)
        return k_idx, tApA_k

    def copy_fn(
        src_idx, dst_idx, k_idx_tApA_k: Tuple[cute.Tensor, cute.Tensor], pred: bool = False
    ):
        k_idx, tApA_k = k_idx_tApA_k
        tApA_k_pred = None
        if const_expr(pred):
            tApA_k_pred = cute.prepend_ones(tApA_k, up_to_rank=2)  # (1, cols_per_thread)
        for k in cutlass.range_constexpr(tAcA.shape[2]):
            for m in cutlass.range_constexpr(tAcA.shape[1]):
                if tApA_m[m]:
                    cute.copy(
                        thr_copy_A,
                        tAmA[None, m, k_idx[k]],
                        tAsA[(None, m, k), dst_idx],
                        pred=None if const_expr(tApA_k_pred is None) else tApA_k_pred[None, k],
                    )

    prefetch_fn = prefetch_from_gmem_fn if const_expr(gAIdx is not None) else prefetch_from_smem_fn
    return copy_fn, prefetch_fn
968
+
969
+
970
@cute.jit
def gather_m_get_tma_copy_fn(
    tma_atom: cute.CopyAtom,
    mA: cute.Tensor,  # (whatever, K)
    sA: cute.Tensor,  # ((4, 32), (64, 1), STAGE)
    sAIdx: cute.Tensor,  # (tile_M)
    warp_idx: Int32,
    num_warps: int,
    num_cta: int = 1,
) -> Callable:
    """Build a TMA gather4 copy closure selecting rows of A via smem indices.

    Row indices are loaded smem->rmem once (128-bit vectorized, 4 per thread);
    the returned ``copy_fn(src_idx, dst_idx, tma_bar_ptr)`` issues one
    ``tma.gather4`` load per group of 4 rows into stage ``dst_idx`` of ``sA``.
    """
    tile_M = cute.size(sAIdx, mode=[0])
    tile_K = cute.size(sA[None, None, 0]) // tile_M
    assert tile_M % 4 == 0
    # cta_group = 1 if tma_atom.op.cta_group == CtaGroup.ONE else 2
    cta_group = num_cta  # Somehow all tma_atom has CtaGroup.ONE inside the kernel

    copy_AIdx_s2r = cute.make_tiled_copy_tv(
        cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Int32, num_bits_per_copy=128),
        cute.make_layout(num_warps),  # thr_layout
        cute.make_layout(4),  # val_layout
    )
    warp_copy_AIdx_s2r = copy_AIdx_s2r.get_slice(warp_idx)
    tSR_sAIdx = warp_copy_AIdx_s2r.partition_S(sAIdx)
    # ((4, 1), 8, (64, 1), STAGE)
    tSR_sA = warp_copy_AIdx_s2r.partition_S(sA)
    tSR_rAIdx = load_s2r(tSR_sAIdx)
    tma_desc_ptr = get_tma_desc_addr(tma_atom)
    tma_gather4_load_fn = partial(tma_gather4_load, tma_desc_ptr, num_cta=cta_group)

    def copy_fn(src_idx, dst_idx, tma_bar_ptr: cute.Pointer):
        col_idx = tile_K * src_idx
        for m in cutlass.range(cute.size(tSR_rAIdx, mode=[1]), unroll_full=True):
            row_indices = [tSR_rAIdx[v, m] for v in range(4)]
            smem_ptr = tSR_sA[None, m, None, dst_idx].iterator
            with cute.arch.elect_one():
                tma_gather4_load_fn(smem_ptr, tma_bar_ptr, col_idx, row_indices)

    return copy_fn
build/torch-cuda/quack/cute_dsl_utils.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ from typing import Tuple, get_origin
4
+ from functools import lru_cache
5
+ from dataclasses import dataclass, fields
6
+
7
+ import torch
8
+
9
+ try:
10
+ from triton.tools.disasm import extract
11
+ except ImportError:
12
+ extract = None
13
+
14
+ import cutlass
15
+ import cutlass.cute as cute
16
+ from cutlass import Int32, Int64, Float16, BFloat16, Float32
17
+ from cutlass.base_dsl.typing import JitArgument
18
+ from cutlass.base_dsl.tvm_ffi_builder import spec
19
+ from cutlass.cutlass_dsl import NumericMeta
20
+
21
+
22
+ StaticTypes = (cutlass.Constexpr, NumericMeta, int, bool, str, float, type(None))
23
+
24
+
25
+ load_cubin_module_data_og = cutlass.base_dsl.runtime.cuda.load_cubin_module_data
26
+ cute_compile_og = cute.compile
27
+
28
+
29
+ # Patch TVM-FFI converter to handle Constexpr type annotations as compile-time constants.
30
+ # Fields annotated with cutlass.Constexpr[T] are emitted as ConstNone (not runtime args).
31
+ # At call time, pass None for these fields; the compile-time value is baked in.
32
+ import cutlass.cute._tvm_ffi_args_spec_converter as _converter_module # noqa
33
+
34
+ _original_convert_single_arg = _converter_module._convert_single_arg
35
+
36
+
37
def _patched_convert_single_arg(arg, arg_name, arg_type, ctx):
    """Converter override for the TVM-FFI argument spec.

    Arguments annotated ``cutlass.Constexpr[...]`` are compile-time constants:
    emit ``ConstNone`` so they are not treated as runtime arguments. NamedTuple
    values whose annotation lacks ``_fields`` (e.g. annotated as plain ``tuple``)
    are redirected so the converter uses the NamedTuple's own type hints.
    """
    if arg_type is not None and get_origin(arg_type) is cutlass.Constexpr:
        return spec.ConstNone(arg_name)
    arg_is_namedtuple = isinstance(arg, tuple) and hasattr(type(arg), "_fields")
    annotation_lacks_fields = arg_type is None or not hasattr(arg_type, "_fields")
    if arg_is_namedtuple and annotation_lacks_fields:
        return _original_convert_single_arg(arg, arg_name, type(arg), ctx)
    return _original_convert_single_arg(arg, arg_name, arg_type, ctx)


# Install the patch.
_converter_module._convert_single_arg = _patched_convert_single_arg
52
+
53
+
54
# Map from torch dtypes to the corresponding cutlass numeric types.
torch2cute_dtype_map = {
    torch.float16: Float16,
    torch.bfloat16: BFloat16,
    torch.float32: Float32,
    torch.int32: Int32,
    torch.int64: Int64,
}
61
+
62
+
63
@lru_cache
def get_max_active_clusters(cluster_size):
    """Cached query of the max number of concurrently active clusters of this size."""
    hw_info = cutlass.utils.HardwareInfo()
    return hw_info.get_max_active_clusters(cluster_size=cluster_size)
66
+
67
+
68
@lru_cache
def get_device_capacity(device: torch.device = None) -> Tuple[int, int]:
    """Return the cached (major, minor) CUDA compute capability for *device*."""
    return torch.cuda.get_device_capability(device)
71
+
72
+
73
def _partition_fields(obj):
    """Split dataclass fields of *obj* into ``(static_dict, dynamic_dict)``.

    Static means the value's type is in ``StaticTypes`` (compile-time constants);
    everything else is dynamic and participates in MLIR value extraction.
    """
    static, dynamic = {}, {}
    for field in fields(obj):
        value = getattr(obj, field.name)
        target = static if isinstance(value, StaticTypes) else dynamic
        target[field.name] = value
    return static, dynamic
79
+
80
+
81
def _new_from_mlir_values(self, values):
    """Rebuild a Params/Arguments dataclass from a flat list of MLIR values.

    Static (constexpr) fields are reused from ``self``; each dynamic field
    consumes the value count previously recorded in ``self._values_pos``.
    """
    constexpr_fields, non_constexpr_fields = _partition_fields(self)
    remaining = values
    for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
        non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, remaining[:n_items])
        remaining = remaining[n_items:]
    return self.__class__(**non_constexpr_fields, **constexpr_fields)
87
+
88
+
89
def _namedtuple_new_from_mlir_values(self, values):
    """Generic ``__new_from_mlir_values__`` for NamedTuples.

    Applied to NamedTuple classes via the ``@mlir_namedtuple`` decorator.

    Fields that are None or static (``StaticTypes``) are preserved from ``self``
    (the compile-time template); only non-static fields consume MLIR values,
    each taking exactly as many values as its MLIR type expands to (via
    ``cutlass.new_from_mlir_values``).

    Constexpr fields (annotated ``cutlass.Constexpr[T]``) are baked into the
    compiled kernel via the converter patch above; pass None for them at call
    time.
    """
    from cutlass.base_dsl.typing import get_mlir_types

    remaining = list(values)
    rebuilt = []
    for field_val in self:
        if field_val is None or isinstance(field_val, StaticTypes):
            # Static/absent fields carry over unchanged from the template.
            rebuilt.append(field_val)
        else:
            n_items = len(get_mlir_types(field_val))
            rebuilt.append(cutlass.new_from_mlir_values(field_val, remaining[:n_items]))
            remaining = remaining[n_items:]
    return self.__class__(*rebuilt)
113
+
114
+
115
def mlir_namedtuple(cls):
    """Class decorator adding MLIR value reconstruction to a NamedTuple.

    Usage::

        @mlir_namedtuple
        class MyArgs(NamedTuple):
            tensor_arg: cute.Tensor
            const_arg: cutlass.Constexpr[int] = 0
    """
    cls.__new_from_mlir_values__ = _namedtuple_new_from_mlir_values
    return cls
127
+
128
+
129
@dataclass
class ParamsBase:
    """Base for device-side parameter dataclasses bridged to/from MLIR values."""

    def __extract_mlir_values__(self):
        # Flatten dynamic fields into a single value list, remembering the
        # per-field value counts so reconstruction can split them back.
        _, dynamic_fields = _partition_fields(self)
        values = []
        self._values_pos = []
        for obj in dynamic_fields.values():
            obj_values = cutlass.extract_mlir_values(obj)
            values.extend(obj_values)
            self._values_pos.append(len(obj_values))
        return values

    __new_from_mlir_values__ = _new_from_mlir_values
141
+
142
+
143
@dataclass
class ArgumentsBase(JitArgument):
    """Base for host-side argument dataclasses passed to JIT-compiled kernels."""

    def __c_pointers__(self):
        # Only dynamic (non-constexpr) fields are materialized as C pointers.
        _, dynamic_fields = _partition_fields(self)
        c_ptrs = []
        for obj in dynamic_fields.values():
            if hasattr(obj, "__c_pointers__"):
                c_ptrs.extend(obj.__c_pointers__())
        return c_ptrs

    def __get_mlir_types__(self):
        # Collect MLIR types of dynamic fields and record per-field counts for
        # later reconstruction via _new_from_mlir_values.
        _, dynamic_fields = _partition_fields(self)
        types = []
        self._values_pos = []
        for obj in dynamic_fields.values():
            if hasattr(obj, "__get_mlir_types__"):
                obj_types = obj.__get_mlir_types__()
                types.extend(obj_types)
                self._values_pos.append(len(obj_types))
            else:
                # Fields without an MLIR representation contribute zero values.
                self._values_pos.append(0)
        return types

    __new_from_mlir_values__ = _new_from_mlir_values
build/torch-cuda/quack/layout_utils.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
2
+
3
+
4
+ import cutlass
5
+ import cutlass.cute as cute
6
+
7
+ from cutlass import Int32, const_expr
8
+
9
+
10
def transpose_view(a: cute.Tensor) -> cute.Tensor:
    """Return a view of *a* with its first two modes swapped (typically smem)."""
    swapped_shape = (a.shape[1], a.shape[0], *a.shape[2:])
    mode_order = (1, 0, *range(2, cute.rank(a)))
    return cute.composition(a, cute.make_ordered_layout(swapped_shape, order=mode_order))
15
+
16
+
17
def select(a: cute.Tensor, mode: list[int]) -> cute.Tensor:
    """Return a view of *a* keeping only the layout modes listed in *mode*."""
    selected_layout = cute.select(a.layout, mode)
    return cute.make_tensor(a.iterator, selected_layout)
19
+
20
+
21
def expand(a: cute.Tensor, dim: int, size: Int32 | int) -> cute.Tensor:
    """Insert a broadcast (stride-0) mode of extent *size* at position *dim*."""
    new_shape = (*a.shape[:dim], size, *a.shape[dim:])
    new_stride = (*a.layout.stride[:dim], 0, *a.layout.stride[dim:])
    return cute.make_tensor(a.iterator, cute.make_layout(new_shape, stride=new_stride))
25
+
26
+
27
@cute.jit
def permute_gated_Cregs_b16(t: cute.Tensor) -> None:
    """In-place permutation of 16-bit C register fragments across a lane quad.

    Pairs of b16 values are recast to u32, exchanged within groups of 4 lanes
    via width-4 shuffles, and re-interleaved with PRMT byte permutes.
    """
    assert t.element_type.width == 16
    assert cute.size(t.shape) % 4 == 0, "Tensor size must be a multiple of 4 for b16 permutation"
    t_u32 = cute.recast_tensor(t, Int32)

    quad_idx = cute.arch.lane_idx() % 4
    lane_03 = quad_idx == 0 or quad_idx == 3
    selector_upper = Int32(0x5410) if lane_03 else Int32(0x1054)
    selector_lower = Int32(0x7632) if lane_03 else Int32(0x3276)
    # upper_map = [0, 3, 1, 2], lower_map = [1, 2, 0, 3]; table indexing isn't
    # supported in the DSL, so the maps are computed arithmetically.
    upper_idx = quad_idx // 2 if quad_idx % 2 == 0 else 3 - quad_idx // 2
    lower_idx = upper_idx ^ 1

    # Shuffle masks by width: 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100,
    # 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
    width = 4
    mask = cute.arch.WARP_SIZE - width
    clamp = cute.arch.WARP_SIZE - 1
    mask_and_clamp = mask << 8 | clamp

    for i in cutlass.range(cute.size(t_u32.shape) // 2, unroll_full=True):
        upper, lower = t_u32[i * 2 + 0], t_u32[i * 2 + 1]
        upper0 = upper if lane_03 else lower
        lower0 = lower if lane_03 else upper
        upper0 = cute.arch.shuffle_sync(upper0, offset=upper_idx, mask_and_clamp=mask_and_clamp)
        lower0 = cute.arch.shuffle_sync(lower0, offset=lower_idx, mask_and_clamp=mask_and_clamp)
        t_u32[i * 2 + 0] = cute.arch.prmt(upper0, lower0, selector_upper)
        t_u32[i * 2 + 1] = cute.arch.prmt(upper0, lower0, selector_lower)
58
+
59
+
60
@cute.jit
def permute_Cregs_b32_for_stsm(t: cute.Tensor) -> None:
    """Permute and shuffle within 4 threads to change the layout from
    T0 | T1 | T2 | T3
    a b | c d | e f | g h
    to
    T0 | T1 | T2 | T3 | T0 | T1 | T2 | T3
    a | b | c | d | e | f | g | h
    so STSM (instead of STS.64) can store C registers without bank conflict.
    """
    assert t.element_type.width == 32
    assert cute.size(t.shape) % 4 == 0, "Tensor size must be a multiple of 4 for b32 permutation"

    quad_idx = cute.arch.lane_idx() % 4
    # left_map = [0, 2, 1, 3], right_map = [2, 0, 3, 1]; table indexing isn't
    # supported in the DSL, so the maps are computed arithmetically.
    left_idx = quad_idx // 2 if quad_idx % 2 == 0 else 2 + quad_idx // 2
    right_idx = left_idx ^ 0b10

    # Shuffle masks by width: 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100,
    # 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
    width = 4
    mask = cute.arch.WARP_SIZE - width
    clamp = cute.arch.WARP_SIZE - 1
    mask_and_clamp = mask << 8 | clamp

    for i in cutlass.range(cute.size(t.shape) // 4, unroll_full=True):
        for r in cutlass.range(2, unroll_full=True):
            left, right = t[i * 4 + r * 2 + 0], t[i * 4 + r * 2 + 1]
            # a b | c d | e f | g h -> a b | c d | f e | h g
            left0 = left if quad_idx < 2 else right
            right0 = right if quad_idx < 2 else left
            # a b | c d | f e | h g -> a b | f d | c e | h g
            left0 = cute.arch.shuffle_sync(left0, offset=left_idx, mask_and_clamp=mask_and_clamp)
            # a b | f d | c e | h g -> a e | f b | c g | h d
            right0 = cute.arch.shuffle_sync(right0, offset=right_idx, mask_and_clamp=mask_and_clamp)
            # a e | f b | c g | h d -> a e | b f | c g | d h
            t[i * 4 + r * 2 + 0] = left0 if quad_idx % 2 == 0 else right0
            t[i * 4 + r * 2 + 1] = right0 if quad_idx % 2 == 0 else left0
        t[i * 4 + 1], t[i * 4 + 2] = t[i * 4 + 2], t[i * 4 + 1]
101
+
102
+
103
@cute.jit
def permute_Cregs_b32_for_ldsm(t: cute.Tensor) -> None:
    """Permute and shuffle within 4 threads to change the layout from
    T0 | T1 | T2 | T3 | T0 | T1 | T2 | T3
    a | b | c | d | e | f | g | h
    to
    T0 | T1 | T2 | T3
    a b | c d | e f | g h
    so LDSM (instead of LDS.64) can load C registers without bank conflict.
    This is the exact inverse of ``permute_Cregs_b32_for_stsm``.
    """
    assert t.element_type.width == 32
    assert cute.size(t.shape) % 4 == 0, "Tensor size must be a multiple of 4 for b32 permutation"

    quad_idx = cute.arch.lane_idx() % 4
    # left_map = [0, 2, 1, 3], right_map = [1, 3, 0, 2]; table indexing isn't
    # supported in the DSL, so the maps are computed arithmetically.
    left_idx = quad_idx // 2 if quad_idx % 2 == 0 else 2 + quad_idx // 2
    right_idx = left_idx ^ 0b01

    # Shuffle masks by width: 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100,
    # 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
    width = 4
    mask = cute.arch.WARP_SIZE - width
    clamp = cute.arch.WARP_SIZE - 1
    mask_and_clamp = mask << 8 | clamp

    # Inverse of permute_Cregs_b32_for_stsm: undo the swap, then the shuffles.
    for i in cutlass.range(cute.size(t.shape) // 4, unroll_full=True):
        t[i * 4 + 1], t[i * 4 + 2] = t[i * 4 + 2], t[i * 4 + 1]
        for r in cutlass.range(2, unroll_full=True):
            left, right = t[i * 4 + r * 2 + 0], t[i * 4 + r * 2 + 1]
            # a e | b f | c g | d h -> a e | f b | c g | h d
            left0 = left if quad_idx % 2 == 0 else right
            right0 = right if quad_idx % 2 == 0 else left
            # a e | f b | c g | h d -> a b | f d | c e | h g
            right0 = cute.arch.shuffle_sync(right0, offset=right_idx, mask_and_clamp=mask_and_clamp)
            # a b | f d | c e | h g -> a b | c d | f e | h g
            left0 = cute.arch.shuffle_sync(left0, offset=left_idx, mask_and_clamp=mask_and_clamp)
            # a b | c d | f e | h g -> a b | c d | e f | g h
            t[i * 4 + r * 2 + 0] = left0 if quad_idx < 2 else right0
            t[i * 4 + r * 2 + 1] = right0 if quad_idx < 2 else left0
145
+
146
+
147
@cute.jit
def concat_layout(*layouts: cute.Layout) -> cute.Layout:
    """Concatenate layouts mode-wise: one result mode per input layout."""
    shapes = tuple(l.shape for l in layouts)
    strides = tuple(l.stride for l in layouts)
    return cute.make_layout(shapes, stride=strides)
153
+
154
+
155
def convert_layout_acc_mn(acc_layout: cute.Layout, transpose: bool = False) -> cute.Layout:
    """Regroup an accumulator layout into explicit (M, N, ...) modes.

    For Sm80: ((2, 2), MMA_M, MMA_N, ...) -> ((2, MMA_M), (2, MMA_N), ...).
    For Sm90: ((2, 2, V), MMA_M, MMA_N, ...) -> ((2, MMA_M), (2, V, MMA_N), ...).
    With ``transpose`` the M and N groups are swapped.
    """
    # Build a column-major reference layout of the same shape, regroup it, and
    # compose back onto the original so the original strides are preserved.
    canonical = cute.make_layout(acc_layout.shape)
    m_shape = (canonical.shape[0][1], canonical.shape[1])
    n_shape = (canonical.shape[0][0], *canonical.shape[0][2:], canonical.shape[2])
    m_stride = (canonical.stride[0][1], canonical.stride[1])
    n_stride = (canonical.stride[0][0], *canonical.stride[0][2:], canonical.stride[2])
    shape = (m_shape, n_shape, *canonical.shape[3:])
    stride = (m_stride, n_stride, *canonical.stride[3:])
    if const_expr(transpose):
        shape = (shape[1], shape[0], *shape[2:])
        stride = (stride[1], stride[0], *stride[2:])
    acc_layout_mn = cute.make_layout(shape, stride=stride)
    return cute.composition(acc_layout, acc_layout_mn)
184
+
185
+
186
def make_acc_tensor_mn_view(acc: cute.Tensor, transpose: bool = False) -> cute.Tensor:
    """View an accumulator fragment with the (M, N) layout from convert_layout_acc_mn."""
    mn_layout = convert_layout_acc_mn(acc.layout, transpose=transpose)
    return cute.make_tensor(acc.iterator, mn_layout)
188
+
189
+
190
def reshape_acc_to_mn(acc: cute.Tensor, transpose: bool = False) -> cute.Tensor:
    """Alias of :func:`make_acc_tensor_mn_view`, kept for API compatibility.

    Delegates instead of duplicating the implementation so the two names
    cannot drift apart.
    """
    return make_acc_tensor_mn_view(acc, transpose=transpose)
192
+
193
+
194
@cute.jit
def convert_layout_acc_frgA(acc_layout: cute.Layout) -> cute.Layout:
    """Convert a GEMM-0 accumulator layout into the A-fragment layout for GEMM 1.

    Sm80 (mma shape 16x8x16): (4, MMA_M, MMA_N) -> ((4, 2), MMA_M, MMA_N / 2).
    Sm90 FP16/BF16: ((2, 2, N / 8), MMA_M, MMA_N) -> ((2, 2, 2), MMA_M, (N / 16, MMA_N));
    if N / 8 is odd this degenerates to ((2, 2, 1), MMA_M, (N / 8, MMA_N)).
    TODO: Sm90 FP8.
    """
    if const_expr(cute.rank(acc_layout.shape[0]) == 3):  # Sm90
        div = 2 if const_expr(acc_layout.shape[0][2] % 2 == 0) else 1
        # ((2, 2, (2, N / 16)), MMA_M, MMA_N)
        split = cute.logical_divide(acc_layout, ((None, None, div), None, None))
        rA_mma_view = cute.make_layout(
            (
                (split.shape[0][0], split.shape[0][1], split.shape[0][2][0]),
                split.shape[1],
                (split.shape[0][2][1], split.shape[2]),
            ),
            stride=(
                (split.stride[0][0], split.stride[0][1], split.stride[0][2][0]),
                split.stride[1],
                (split.stride[0][2][1], split.stride[2]),
            ),
        )
    else:  # Sm80
        # (4, MMA_M, MMA_N) -> (4, MMA_M, (2, MMA_N / 2))
        split = cute.logical_divide(acc_layout, (None, None, 2))
        rA_mma_view = cute.make_layout(
            (
                (split.shape[0], split.shape[2][0]),
                split.shape[1],
                split.shape[2][1],
            ),
            stride=(
                (split.stride[0], split.stride[2][0]),
                split.stride[1],
                split.stride[2][1],
            ),
        )
    return rA_mma_view
234
+
235
+
236
def reshape_acc_to_frgA(acc: cute.Tensor) -> cute.Tensor:
    """View *acc* with the A-fragment layout a second GEMM expects."""
    frgA_layout = convert_layout_acc_frgA(acc.layout)
    return cute.make_tensor(acc.iterator, frgA_layout)
238
+
239
+
240
def convert_layout_zero_stride(
    input: cute.Tensor | cute.Layout, ref_layout: cute.Layout
) -> cute.Tensor | cute.Layout:
    """Regroup a layout into (nonzero-stride modes, zero-stride modes) per ref_layout.

    Modes whose corresponding stride in ``ref_layout`` is nonzero are grouped
    into the first output mode; zero-stride (broadcast) modes go into the
    second. Accepts either a Tensor or a Layout and returns the same kind.
    (The original return annotation claimed ``cute.Layout`` only, which was
    wrong for Tensor input — fixed here.)
    """
    layout = input.layout if const_expr(isinstance(input, cute.Tensor)) else input
    # Group the modes with non-zero stride in the ref_layout together,
    # and the modes with zero stride together.
    layout_flat = cute.flatten(layout)
    ref_layout_flat = cute.flatten(ref_layout)
    nonzero_modes = [i for i in range(cute.rank(layout_flat)) if ref_layout_flat[i].stride != 0]
    zero_modes = [i for i in range(cute.rank(layout_flat)) if ref_layout_flat[i].stride == 0]
    # Edge case: all modes may have zero stride; keep a unit first mode then.
    new_shape = (
        tuple(layout_flat[i].shape for i in nonzero_modes) if len(nonzero_modes) > 0 else (1,),
        tuple(layout_flat[i].shape for i in zero_modes),
    )
    new_stride = (
        tuple(layout_flat[i].stride for i in nonzero_modes) if len(nonzero_modes) > 0 else (0,),
        tuple(layout_flat[i].stride for i in zero_modes),
    )
    out_layout = cute.make_layout(new_shape, stride=new_stride)
    if const_expr(isinstance(input, cute.Tensor)):
        return cute.make_tensor(input.iterator, out_layout)
    else:
        return out_layout
264
+
265
+
266
def mma_partition_C_vec(
    sVec: cute.Tensor, thr_mma: cute.core.ThrMma, expand_shape: int, is_colvec: bool
) -> cute.Tensor:
    """Broadcast a staged smem vector and partition it as an MMA C operand.

    The (len, STAGE) vector is expanded with a stride-0 mode (columns for a
    column vector, rows for a row vector), partitioned with ``partition_C``,
    viewed in (M, N) form, and the singleton broadcast mode is sliced away.
    """
    assert cute.rank(sVec) == 2
    assert sVec.stride[0] == 1
    stage = sVec.shape[1]
    if const_expr(is_colvec):
        shape = (sVec.shape[0], expand_shape, stage)
        stride = (1, 0, sVec.stride[1])
    else:
        shape = (expand_shape, sVec.shape[0], stage)
        stride = (0, 1, sVec.stride[1])
    sVec_mma = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
    tC_sVec = make_acc_tensor_mn_view(thr_mma.partition_C(sVec_mma))
    return tC_sVec[None, 0, None] if const_expr(is_colvec) else tC_sVec[0, None, None]
281
+
282
+
283
def mma_partition_A_vec(
    sVec: cute.Tensor, thr_mma: cute.core.ThrMma, expand_shape: int, is_colvec: bool
) -> cute.Tensor:
    """Broadcast a staged smem vector and partition it as an MMA A operand.

    Same scheme as ``mma_partition_C_vec`` but using ``partition_A``: expand
    with a stride-0 mode, partition, view in (M, N) form, slice away the
    singleton broadcast mode.
    """
    assert cute.rank(sVec) == 2
    assert sVec.stride[0] == 1
    stage = sVec.shape[1]
    if const_expr(is_colvec):
        shape = (sVec.shape[0], expand_shape, stage)
        stride = (1, 0, sVec.stride[1])
    else:
        shape = (expand_shape, sVec.shape[0], stage)
        stride = (0, 1, sVec.stride[1])
    sVec_mma = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
    tA_sVec = make_acc_tensor_mn_view(thr_mma.partition_A(sVec_mma))
    return tA_sVec[None, 0, None] if const_expr(is_colvec) else tA_sVec[0, None, None]
build/torch-cuda/quack/sm90_utils.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ from typing import Type, Union, Optional
4
+
5
+ import cutlass
6
+ import cutlass.cute as cute
7
+ import cutlass.utils.hopper_helpers as sm90_utils_og
8
+ from cutlass.cute.nvgpu import warpgroup
9
+ from cutlass.cutlass_dsl import Numeric, dsl_user_op
10
+ from cutlass import Float32, Int32, Boolean, const_expr
11
+ from cutlass.utils import LayoutEnum
12
+
13
+
14
@dsl_user_op
def make_smem_layout(
    dtype: Type[Numeric],
    layout: LayoutEnum,
    tile: cute.Tile,
    stage: Optional[int] = None,
    major_mode_size: Optional[int] = None,
    *,
    loc=None,
    ip=None,
) -> Union[cute.Layout, cute.ComposedLayout]:
    """Build a (possibly multi-stage) swizzled smem layout for one tile on SM90.

    :param dtype: element type, used to pick the swizzle atom
    :param layout: LayoutEnum telling whether the tile is M- or N-major
    :param tile: the (M, N) tile to cover
    :param stage: if given, a stage mode of this size is appended for pipelining
    :param major_mode_size: override for the contiguous-mode size; defaults to the
        tile extent along the major mode
    :return: the tiled smem layout (swizzled layouts come back as ComposedLayout)
    """
    shape = cute.product_each(cute.shape(tile, loc=loc, ip=ip), loc=loc, ip=ip)
    if const_expr(major_mode_size is None):
        # The swizzle atom is chosen from the size of the contiguous (major) mode.
        major_mode_size = shape[1] if layout.is_n_major_c() else shape[0]
    smem_layout_atom = warpgroup.make_smem_layout_atom(
        sm90_utils_og.get_smem_layout_atom(layout, dtype, major_mode_size),
        dtype,
    )
    # Tiling order: iterate the major mode first so the atom repeats contiguously.
    order = (1, 0, 2) if const_expr(layout.is_m_major_c()) else (0, 1, 2)
    smem_layout_staged = cute.tile_to_shape(
        smem_layout_atom,
        cute.append(shape, stage) if const_expr(stage is not None) else shape,
        order=order if const_expr(stage is not None) else order[:2],
    )
    return smem_layout_staged
39
+
40
+
41
# Alias for source compatibility with blackwell_helpers.py, which exposes a
# separate epilogue-layout builder; on SM90 the same construction is used.
make_smem_layout_epi = make_smem_layout
43
+
44
+
45
@dsl_user_op
def partition_for_epilogue(
    cT: cute.Tensor,
    epi_tile: cute.Tile,
    tiled_copy: cute.TiledCopy,
    tidx: Int32,
    reference_src: bool,  # do register tensors reference the src or dst layout of the tiled copy
    *,
    loc=None,
    ip=None,
) -> cute.Tensor:
    """Divide cT into epilogue tiles and partition the result for thread tidx.

    The returned tensor has shape (CPY, CPY_M, CPY_N, EPI_M, EPI_N); whether the
    source or destination side of tiled_copy is partitioned is selected by
    reference_src.
    """
    this_thread = tiled_copy.get_slice(tidx)
    cT_tiled = cute.flat_divide(cT, epi_tile)
    # (CPY, CPY_M, CPY_N, EPI_M, EPI_N)
    if const_expr(reference_src):
        partitioned = this_thread.partition_S(cT_tiled, loc=loc, ip=ip)
    else:
        partitioned = this_thread.partition_D(cT_tiled, loc=loc, ip=ip)
    return partitioned
63
+
64
+
65
@cute.jit
def gemm(
    tiled_mma: cute.TiledMma,
    acc: cute.Tensor,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    zero_init: cutlass.Constexpr[bool] = False,
    wg_wait: cutlass.Constexpr[int] = 0,
    # A_in_regs: cutlass.Constexpr[bool] = False,
    swap_AB: cutlass.Constexpr[bool] = False,
) -> None:
    """Issue a warpgroup MMA over all K sub-tiles, accumulating into acc.

    :param acc: accumulator fragment, read-modified unless zero_init
    :param tCrA: A operand fragments, shape (..., ..., K_tiles)
    :param tCrB: B operand fragments, shape (..., ..., K_tiles)
    :param zero_init: if True, the first K iteration overwrites acc instead of accumulating
    :param wg_wait: argument to wait_group after commit; negative skips the wait
        so the caller can overlap and wait later
    :param swap_AB: swap the operand roles (implemented by recursing with A/B exchanged)
    """
    if const_expr(swap_AB):
        gemm(tiled_mma, acc, tCrB, tCrA, zero_init=zero_init, wg_wait=wg_wait, swap_AB=False)
    else:
        warpgroup.fence()
        # We make a new mma_atom since we'll be modifying its attribute (accumulate).
        # Otherwise the compiler complains "operand #0 does not dominate this use"
        mma_atom = cute.make_mma_atom(tiled_mma.op)
        mma_atom.set(warpgroup.Field.ACCUMULATE, not zero_init)
        for k in cutlass.range_constexpr(cute.size(tCrA.shape[2])):
            cute.gemm(mma_atom, acc, tCrA[None, None, k], tCrB[None, None, k], acc)
            # After the first K step the accumulator holds partial sums, so every
            # subsequent step must accumulate regardless of zero_init.
            mma_atom.set(warpgroup.Field.ACCUMULATE, True)
        warpgroup.commit_group()
        if const_expr(wg_wait >= 0):
            warpgroup.wait_group(wg_wait)
90
+
91
+
92
def gemm_zero_init(
    tiled_mma: cute.TiledMma,
    shape: cute.Shape,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    A_idx: Optional[Int32] = None,
    B_idx: Optional[Int32] = None,
    wg_wait: int = -1,
    swap_AB: bool = False,
) -> cute.Tensor:
    """Allocate a fresh Float32 accumulator, run gemm with zero_init, and return it.

    A_idx / B_idx optionally select one pipeline stage (last mode) of the operand
    fragments; swap_AB exchanges the operand roles by recursing with a reversed
    output shape.
    """
    if const_expr(swap_AB):
        # Canonicalize: swapped call with operands, indices, and shape exchanged.
        return gemm_zero_init(
            tiled_mma, shape[::-1], tCrB, tCrA, B_idx, A_idx, wg_wait, swap_AB=False
        )
    acc = cute.make_rmem_tensor(tiled_mma.partition_shape_C(shape), Float32)
    if const_expr(A_idx is None):
        operand_a = tCrA
    else:
        operand_a = tCrA[None, None, None, A_idx]
    if const_expr(B_idx is None):
        operand_b = tCrB
    else:
        operand_b = tCrB[None, None, None, B_idx]
    gemm(tiled_mma, acc, operand_a, operand_b, zero_init=True, wg_wait=wg_wait)
    return acc
112
+
113
+
114
def gemm_w_idx(
    tiled_mma: cute.TiledMma,
    acc: cute.Tensor,
    tCrA: cute.Tensor,
    tCrB: cute.Tensor,
    zero_init: Boolean,
    A_idx: Optional[Int32] = None,
    B_idx: Optional[Int32] = None,
    wg_wait: int = -1,
    swap_AB: bool = False,
) -> None:
    """Run gemm into an existing accumulator, optionally selecting one pipeline
    stage (last mode) of each operand via A_idx / B_idx.

    swap_AB exchanges operand roles by recursing with operands and indices swapped.
    """
    if const_expr(swap_AB):
        gemm_w_idx(tiled_mma, acc, tCrB, tCrA, zero_init, B_idx, A_idx, wg_wait, swap_AB=False)
        return
    if const_expr(A_idx is not None):
        operand_a = tCrA[None, None, None, A_idx]
    else:
        operand_a = tCrA
    if const_expr(B_idx is not None):
        operand_b = tCrB[None, None, None, B_idx]
    else:
        operand_b = tCrB
    gemm(tiled_mma, acc, operand_a, operand_b, zero_init=zero_init, wg_wait=wg_wait)
131
+
132
+
133
def partition_fragment_ABC(
    thr_mma: cute.ThrMma,
    shape_mnk: cute.Shape,
    sA: Optional[cute.Tensor],
    sB: Optional[cute.Tensor],
    swap_AB: bool = False,
):
    """Create the accumulator and operand fragments for a warpgroup MMA.

    :param thr_mma: per-thread MMA
    :param shape_mnk: (M, N, K) problem shape of the tile
    :param sA: smem tensor for A; may be None when the MMA sources A from registers
    :param sB: smem tensor for B; may be None when swap_AB and B comes from registers
    :param swap_AB: compute B@A instead of A@B — the accumulator is shaped (N, M)
        and A/B take each other's operand slots in the MMA
    :return: (acc, tCrA, tCrB)
    """
    # Whether the MMA atom reads its first operand from registers (RS) vs smem (SS).
    is_rs = thr_mma.op.a_src == warpgroup.OperandSource.RMEM
    if const_expr(not swap_AB):
        acc = cute.make_rmem_tensor(thr_mma.partition_shape_C(shape_mnk[:2]), Float32)
        if const_expr(not is_rs):
            assert sA is not None
            tCrA = thr_mma.make_fragment_A(thr_mma.partition_A(sA))
        else:
            # RS MMA: A lives in registers; only its shape is needed here.
            tCrA = thr_mma.make_fragment_A(thr_mma.partition_shape_A((shape_mnk[0], shape_mnk[2])))
        assert sB is not None
        tCrB = thr_mma.make_fragment_B(thr_mma.partition_B(sB))
    else:
        # Swapped: accumulator is (N, M); B fills the MMA's A slot and vice versa.
        acc = cute.make_rmem_tensor(
            thr_mma.partition_shape_C((shape_mnk[1], shape_mnk[0])), Float32
        )
        if const_expr(not is_rs):
            assert sB is not None
            tCrB = thr_mma.make_fragment_A(thr_mma.partition_A(sB))
        else:  # B in rmem
            tCrB = thr_mma.make_fragment_A(thr_mma.partition_shape_A((shape_mnk[1], shape_mnk[2])))
        assert sA is not None
        tCrA = thr_mma.make_fragment_B(thr_mma.partition_B(sA))
    return acc, tCrA, tCrB
build/torch-cuda/seqlen_info.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ from dataclasses import dataclass
3
+
4
+ import cutlass
5
+ import cutlass.cute as cute
6
+ from cutlass import Int32, const_expr
7
+
8
+ """
9
+ This consolidates all the info related to sequence length. This is so that we can do all
10
+ the gmem reads once at the beginning of each tile, rather than having to repeat these reads
11
+ to compute various things like n_block_min, n_block_max, etc.
12
+ """
13
+
14
+
15
@dataclass(frozen=True)
class SeqlenInfo:
    """Per-batch sequence offset and length, resolved once from gmem."""

    offset: cutlass.Int32
    seqlen: cutlass.Int32

    @staticmethod
    def create(
        batch_idx: cutlass.Int32,
        seqlen_static: cutlass.Int32,
        cu_seqlens: Optional[cute.Tensor] = None,
        seqused: Optional[cute.Tensor] = None,
    ):
        """Resolve offset/seqlen for one batch.

        Precedence for the length: seqused entry, then cu_seqlens difference,
        then the static length. The offset is nonzero only in varlen mode.
        """
        if const_expr(cu_seqlens is not None):
            start = cu_seqlens[batch_idx]
        else:
            start = 0
        if const_expr(seqused is not None):
            length = seqused[batch_idx]
        else:
            if const_expr(cu_seqlens is not None):
                length = cu_seqlens[batch_idx + 1] - cu_seqlens[batch_idx]
            else:
                length = seqlen_static
        return SeqlenInfo(start, length)
35
+
36
+
37
@dataclass(frozen=True)
class SeqlenInfoQK:
    """Sequence-length info for both Q and K sides of attention.

    All gmem reads (cu_seqlens / seqused) happen once in create(); the
    has_* flags are compile-time constants recording which varlen inputs
    were provided.
    """

    offset_q: cutlass.Int32
    offset_k: cutlass.Int32
    # Offsets rounded so each batch starts on a tile boundary; used by kernels
    # that index padded/tile-aligned intermediate buffers.
    padded_offset_q: cutlass.Int32
    padded_offset_k: cutlass.Int32
    seqlen_q: cutlass.Int32
    seqlen_k: cutlass.Int32
    has_cu_seqlens_q: cutlass.Constexpr[bool]
    has_cu_seqlens_k: cutlass.Constexpr[bool]
    has_seqused_q: cutlass.Constexpr[bool]
    has_seqused_k: cutlass.Constexpr[bool]

    @staticmethod
    def create(
        batch_idx: cutlass.Int32,
        seqlen_q_static: cutlass.Int32,
        seqlen_k_static: cutlass.Int32,
        mCuSeqlensQ: Optional[cute.Tensor] = None,
        mCuSeqlensK: Optional[cute.Tensor] = None,
        mSeqUsedQ: Optional[cute.Tensor] = None,
        mSeqUsedK: Optional[cute.Tensor] = None,
        tile_m: cutlass.Constexpr[cutlass.Int32] = 128,
        tile_n: cutlass.Constexpr[cutlass.Int32] = 128,
    ):
        """Resolve all offsets/lengths for one batch.

        Length precedence per side: seqused entry, then cu_seqlens difference,
        then the static length.
        """
        offset_q = 0 if const_expr(mCuSeqlensQ is None) else mCuSeqlensQ[batch_idx]
        offset_k = 0 if const_expr(mCuSeqlensK is None) else mCuSeqlensK[batch_idx]
        # Round the varlen offset up to a tile boundary; adding batch_idx * tile
        # before dividing gives each batch its own tile-aligned slot.
        padded_offset_q = (
            0
            if const_expr(mCuSeqlensQ is None)
            else (offset_q + batch_idx * tile_m) // tile_m * tile_m
        )
        padded_offset_k = (
            0
            if const_expr(mCuSeqlensK is None)
            else (offset_k + batch_idx * tile_n) // tile_n * tile_n
        )
        if const_expr(mSeqUsedQ is not None):
            seqlen_q = mSeqUsedQ[batch_idx]
        else:
            seqlen_q = (
                seqlen_q_static
                if const_expr(mCuSeqlensQ is None)
                else mCuSeqlensQ[batch_idx + 1] - offset_q
            )
        if const_expr(mSeqUsedK is not None):
            seqlen_k = mSeqUsedK[batch_idx]
        else:
            seqlen_k = (
                seqlen_k_static
                if const_expr(mCuSeqlensK is None)
                else mCuSeqlensK[batch_idx + 1] - offset_k
            )
        has_cu_seqlens_q: bool = mCuSeqlensQ is not None
        has_cu_seqlens_k: bool = mCuSeqlensK is not None
        has_seqused_q: bool = mSeqUsedQ is not None
        has_seqused_k: bool = mSeqUsedK is not None
        return SeqlenInfoQK(
            offset_q,
            offset_k,
            padded_offset_q,
            padded_offset_k,
            seqlen_q,
            seqlen_k,
            has_cu_seqlens_q,
            has_cu_seqlens_k,
            has_seqused_q,
            has_seqused_k,
        )

    def offset_batch_Q(
        self,
        mQ: cute.Tensor,
        batch_idx: Int32,
        dim: int,
        padded: cutlass.Constexpr[bool] = False,
    ) -> cute.Tensor:
        """Slice (fixed-length) or offset (varlen) mQ to this batch.

        Seqlen must be the first dimension of mQ; in fixed-length mode, `dim`
        is the batch dimension to slice away.
        """
        if const_expr(not self.has_cu_seqlens_q):
            idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mQ) - 1 - dim)
            return mQ[idx]
        else:
            offset_q = self.offset_q if const_expr(not padded) else self.padded_offset_q
            # Mode 0 may itself be hierarchical; offset only its first component.
            offset = offset_q if const_expr(cute.rank(mQ.shape[0]) == 1) else (0, offset_q)
            idx = (offset,) + (0,) * (cute.rank(mQ) - 1)
            return cute.domain_offset(idx, mQ)

    def offset_batch_K(
        self,
        mK: cute.Tensor,
        batch_idx: Int32,
        dim: int,
        padded: cutlass.Constexpr[bool] = False,
    ) -> cute.Tensor:
        """Slice (fixed-length) or offset (varlen) mK to this batch.

        Seqlen must be the first dimension of mK.
        """
        if const_expr(not self.has_cu_seqlens_k):
            idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mK) - 1 - dim)
            return mK[idx]
        else:
            offset_k = self.offset_k if const_expr(not padded) else self.padded_offset_k
            idx = (offset_k,) + (0,) * (cute.rank(mK) - 1)
            return cute.domain_offset(idx, mK)
build/torch-cuda/softmax.py ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ import math
4
+ import operator
5
+ from typing import Tuple
6
+ from dataclasses import dataclass
7
+
8
+ import cutlass
9
+ import cutlass.cute as cute
10
+ from cutlass import Float32
11
+
12
+ from .quack import layout_utils
13
+ from . import utils
14
+ from .quack.cute_dsl_utils import ParamsBase
15
+ from .seqlen_info import SeqlenInfoQK
16
+
17
+
18
@dataclass
class Softmax(ParamsBase):
    """Online (streaming) softmax state for attention: running row max and row sum
    kept in registers, updated block-by-block in log2 space (scale_log2 is the
    softmax scale pre-multiplied by log2(e) — presumably; confirm against callers).
    """

    scale_log2: Float32
    num_rows: cutlass.Constexpr[int]
    row_max: cute.Tensor
    row_sum: cute.Tensor
    arch: cutlass.Constexpr[int] = 80
    softmax_scale: Float32 | None = None

    @staticmethod
    def create(
        scale_log2: Float32,
        num_rows: cutlass.Constexpr[int],
        arch: cutlass.Constexpr[int] = 80,
        softmax_scale: Float32 | None = None,
    ):
        """Allocate rmem state (row_max, row_sum) and build a Softmax instance."""
        row_max = cute.make_rmem_tensor(num_rows, Float32)
        row_sum = cute.make_rmem_tensor(num_rows, Float32)
        return Softmax(scale_log2, num_rows, row_max, row_sum, arch, softmax_scale)

    def reset(self) -> None:
        """Re-initialize the running statistics (-inf max, zero sum)."""
        self.row_max.fill(-Float32.inf)
        self.row_sum.fill(0.0)

    def _compute_row_max(
        self, acc_S_row: cute.TensorSSA, init_val: float | Float32 | None = None
    ) -> Float32:
        """Max-reduce one row, optionally seeded with a previous running max."""
        return utils.fmax_reduce(acc_S_row, init_val, arch=self.arch)

    def _compute_row_sum(
        self, acc_S_row_exp: cute.TensorSSA, init_val: float | Float32 | None = None
    ) -> Float32:
        """Sum-reduce one (already exponentiated) row, optionally seeded."""
        return utils.fadd_reduce(acc_S_row_exp, init_val, arch=self.arch)

    @cute.jit
    def online_softmax(
        self,
        acc_S: cute.Tensor,
        is_first: cutlass.Constexpr[bool] = False,
        check_inf: cutlass.Constexpr[bool] = True,
    ) -> cute.Tensor:
        """Apply online softmax and return the row_scale to rescale O.

        acc_S is overwritten in place with exp2(scale * S - scale * rowmax);
        row_max/row_sum are updated. The returned row_scale is the factor by
        which the caller must rescale the partial O accumulator.

        :param acc_S: acc_S tensor
        :type acc_S: cute.Tensor
        :param is_first: is first n_block
        :type is_first: cutlass.Constexpr
        :param check_inf: replace a -inf row max with 0 to avoid NaNs from inf - inf
        """
        # Change acc_S to M,N layout view.
        acc_S_mn = layout_utils.reshape_acc_to_mn(acc_S)
        row_scale = cute.make_fragment_like(self.row_max, Float32)

        row_max = self.row_max
        row_sum = self.row_sum
        scale_log2 = self.scale_log2
        arch = self.arch

        # Each iteration processes one row of acc_S
        for r in cutlass.range(cute.size(row_max), unroll_full=True):
            acc_S_row = acc_S_mn[r, None].load()  # (n_block_size)

            row_max_cur = utils.fmax_reduce(
                acc_S_row,
                init_val=row_max[r] if cutlass.const_expr(not is_first) else None,
                arch=arch,
            )

            # Reduce across the 4 threads that share a row of the MMA accumulator.
            row_max_cur = cute.arch.warp_reduction_max(row_max_cur, threads_in_group=4)
            # Update row_max before changing row_max_cur to safe value for -inf
            row_max_prev = row_max[r]
            row_max[r] = row_max_cur

            if cutlass.const_expr(check_inf):
                row_max_cur = 0.0 if row_max_cur == -Float32.inf else row_max_cur

            if cutlass.const_expr(is_first):
                row_max_cur_scaled = row_max_cur * scale_log2
                acc_S_row_exp = cute.math.exp2(
                    acc_S_row * scale_log2 - row_max_cur_scaled, fastmath=True
                )
                acc_S_row_sum = utils.fadd_reduce(acc_S_row_exp, init_val=None, arch=arch)
                # First block: nothing accumulated yet, so O needs no rescale.
                row_scale[r] = 1.0
            else:
                row_max_cur_scaled = row_max_cur * scale_log2
                acc_S_row_exp = cute.math.exp2(
                    acc_S_row * scale_log2 - row_max_cur_scaled, fastmath=True
                )
                # row_scale[r] = cute.math.exp2(row_max_prev * self.scale_log2 - row_max_cur_scaled)
                row_scale[r] = cute.math.exp2(
                    (row_max_prev - row_max_cur) * scale_log2, fastmath=True
                )
                # Fold the previous running sum (rescaled to the new max) into this block's sum.
                acc_S_row_sum = utils.fadd_reduce(
                    acc_S_row_exp, init_val=row_sum[r] * row_scale[r], arch=arch
                )

            row_sum[r] = acc_S_row_sum
            acc_S_mn[r, None].store(acc_S_row_exp)

        return row_scale

    @cute.jit
    def finalize(
        self, final_scale: Float32 = 1.0, sink_val: Float32 | cute.Tensor | None = None
    ) -> cute.Tensor:
        """Finalize the online softmax by computing the scale and logsumexp.

        Returns row_scale = final_scale / row_sum (per row); row_sum is
        overwritten in place with the logsumexp (natural log). An optional
        attention-sink logit sink_val is folded into the sum first.
        """
        if cutlass.const_expr(sink_val is not None and isinstance(sink_val, cute.Tensor)):
            assert cute.size(sink_val) == cute.size(self.row_sum)
        row_sum = self.row_sum
        row_max = self.row_max
        scale_log2 = self.scale_log2

        # quad reduction for row_sum as we didn't do it during each iteration of online softmax
        row_sum.store(utils.warp_reduce(row_sum.load(), operator.add, width=4))
        row_scale = cute.make_fragment_like(row_max, Float32)

        for r in cutlass.range(cute.size(row_sum), unroll_full=True):
            if cutlass.const_expr(sink_val is not None):
                sink_val_cur = sink_val if not isinstance(sink_val, cute.Tensor) else sink_val[r]
                LOG2_E = math.log2(math.e)
                # Note: the sink logit is NOT multiplied by the softmax scale, only by log2(e).
                row_sum[r] += cute.math.exp2(
                    sink_val_cur * LOG2_E - row_max[r] * scale_log2, fastmath=True
                )

            # if row_sum is zero or nan, set acc_O_mn_row to 1.0
            acc_O_mn_row_is_zero_or_nan = row_sum[r] == 0.0 or row_sum[r] != row_sum[r]
            row_scale[r] = (
                cute.arch.rcp_approx(row_sum[r] if not acc_O_mn_row_is_zero_or_nan else 1.0)
            ) * final_scale
            row_sum_cur = row_sum[r]
            LN2 = math.log(2.0)
            # logsumexp in natural log: (max * scale_log2 + log2(sum)) * ln(2).
            row_sum[r] = (
                (row_max[r] * scale_log2 + cute.math.log2(row_sum_cur, fastmath=True)) * LN2
                if not acc_O_mn_row_is_zero_or_nan
                else -Float32.inf
            )
        return row_scale

    @cute.jit
    def rescale_O(self, acc_O: cute.Tensor, row_scale: cute.Tensor) -> None:
        """Scale each row of acc_O by the given scale tensor.
        :param acc_O: input tensor
        :type acc_O: cute.Tensor
        :param row_scale: row_scale tensor
        :type row_scale: cute.Tensor
        """
        acc_O_mn = layout_utils.reshape_acc_to_mn(acc_O)
        assert cute.size(row_scale) == cute.size(acc_O_mn, mode=[0])
        for r in cutlass.range(cute.size(row_scale), unroll_full=True):
            acc_O_mn[r, None].store(acc_O_mn[r, None].load() * row_scale[r])
167
+
168
+
169
@dataclass
class SoftmaxSm100(Softmax):
    """SM100 (Blackwell) variant of the online softmax: single-row state,
    optional skip of O-rescaling when the max barely moved (rescale_threshold),
    and packed-f32x2 FMA / exp2-emulation paths for the scale-and-exp step.
    """

    # If > 0, skip rescaling when (old_max - new_max) * scale_log2 stays within
    # this threshold, keeping the previous max instead.
    rescale_threshold: cutlass.Constexpr[float] = 0.0

    @staticmethod
    def create(
        scale_log2: Float32,
        rescale_threshold: cutlass.Constexpr[float] = 0.0,
        softmax_scale: Float32 | None = None,
    ):
        """Allocate single-row rmem state and build a SoftmaxSm100 (arch fixed at 100)."""
        num_rows = 1
        arch = 100
        row_max = cute.make_rmem_tensor(num_rows, Float32)
        row_sum = cute.make_rmem_tensor(num_rows, Float32)
        return SoftmaxSm100(
            scale_log2,
            num_rows,
            row_max,
            row_sum,
            arch,
            softmax_scale,
            rescale_threshold=rescale_threshold,
        )

    @cute.jit
    def update_row_max(self, acc_S_row: cute.TensorSSA, is_first: int) -> Tuple[Float32, Float32]:
        """Update the running max from one row of scores.

        Returns (row_max_safe, acc_scale): the -inf-sanitized new max, and the
        factor to rescale the O accumulator (0.0 on the first block, 1.0 when
        the rescale is skipped under rescale_threshold).
        """
        if cutlass.const_expr(is_first):
            row_max_new = self._compute_row_max(acc_S_row)
            row_max_safe = row_max_new if row_max_new != -cutlass.Float32.inf else 0.0
            acc_scale = 0.0
        else:
            row_max_old = self.row_max[0]
            row_max_new = self._compute_row_max(acc_S_row, init_val=row_max_old)
            row_max_safe = row_max_new if row_max_new != -cutlass.Float32.inf else 0.0
            acc_scale_ = (row_max_old - row_max_safe) * self.scale_log2
            acc_scale = cute.math.exp2(acc_scale_, fastmath=True)
            if cutlass.const_expr(self.rescale_threshold > 0.0):
                # Max moved by less than the threshold: keep the old max and skip
                # the (expensive) rescale of the accumulator.
                if acc_scale_ >= -self.rescale_threshold:
                    row_max_new = row_max_old
                    row_max_safe = row_max_old
                    acc_scale = 1.0
        self.row_max[0] = row_max_new
        return row_max_safe, acc_scale

    def update_row_sum(
        self, acc_S_row_exp: cute.TensorSSA, row_scale: Float32, is_first: int = False
    ) -> None:
        """Fold one exponentiated row into the running sum, rescaling the old sum
        by row_scale (skipped on the first block)."""
        init_val = self.row_sum[0] * row_scale if cutlass.const_expr(not is_first) else None
        # self.row_sum[0] = self._compute_row_sum(acc_S_row_exp, init_val=self.row_sum[0] * row_scale)
        self.row_sum[0] = self._compute_row_sum(acc_S_row_exp, init_val=init_val)
        # tmp = self._compute_row_sum(acc_S_row_exp)
        # self.row_sum[0] = self.row_sum[0] * row_scale + tmp

    @cute.jit
    def scale_subtract_rowmax(
        self,
        acc_S_row: cute.Tensor,
        row_max: Float32,
    ):
        """In place: acc_S_row <- acc_S_row * scale_log2 - row_max * scale_log2,
        two elements at a time via packed f32x2 FMA."""
        assert cute.size(acc_S_row.shape) % 2 == 0, "acc_S_row must have an even number of elements"
        row_max_scaled = row_max * self.scale_log2
        for i in cutlass.range(0, cute.size(acc_S_row.shape), 2, unroll_full=True):
            acc_S_row[i], acc_S_row[i + 1] = cute.arch.fma_packed_f32x2(
                (acc_S_row[i], acc_S_row[i + 1]),
                (self.scale_log2, self.scale_log2),
                (-row_max_scaled, -row_max_scaled),
            )

    @cute.jit
    def apply_exp2_convert(
        self,
        acc_S_row: cute.Tensor,
        acc_S_row_converted: cute.Tensor,
        ex2_emu_freq: cutlass.Constexpr[int] = 0,
        ex2_emu_res: cutlass.Constexpr[int] = 4,
        ex2_emu_start_frg: cutlass.Constexpr[int] = 0,
    ):
        """Exponentiate acc_S_row (exp2) in 32-element fragments and store the
        converted result into acc_S_row_converted.

        When ex2_emu_freq > 0, a subset of element pairs (ex2_emu_res out of every
        ex2_emu_freq, skipping the last fragment and fragments before
        ex2_emu_start_frg) uses utils.ex2_emulation_2 instead of the hardware
        exp2 — presumably to spread work off the SFU; confirm with the kernel
        tuning notes.
        """
        assert cute.size(acc_S_row.shape) % 2 == 0, "acc_S_row must have an even number of elements"
        frg_tile = 32
        assert frg_tile % 2 == 0
        frg_cnt = cute.size(acc_S_row) // frg_tile
        assert cute.size(acc_S_row) % frg_tile == 0
        acc_S_row_frg = cute.logical_divide(acc_S_row, cute.make_layout(frg_tile))
        acc_S_row_converted_frg = cute.logical_divide(
            acc_S_row_converted, cute.make_layout(frg_tile)
        )
        for j in cutlass.range_constexpr(frg_cnt):
            for k in cutlass.range_constexpr(0, cute.size(acc_S_row_frg, mode=[0]), 2):
                # acc_S_row_frg[k, j] = cute.math.exp2(acc_S_row_frg[k, j], fastmath=True)
                # acc_S_row_frg[k + 1, j] = cute.math.exp2(acc_S_row_frg[k + 1, j], fastmath=True)
                if cutlass.const_expr(ex2_emu_freq == 0):
                    acc_S_row_frg[k, j] = cute.math.exp2(acc_S_row_frg[k, j], fastmath=True)
                    acc_S_row_frg[k + 1, j] = cute.math.exp2(acc_S_row_frg[k + 1, j], fastmath=True)
                else:
                    if cutlass.const_expr(
                        k % ex2_emu_freq < ex2_emu_freq - ex2_emu_res
                        or j >= frg_cnt - 1
                        or j < ex2_emu_start_frg
                    ):
                        acc_S_row_frg[k, j] = cute.math.exp2(acc_S_row_frg[k, j], fastmath=True)
                        acc_S_row_frg[k + 1, j] = cute.math.exp2(
                            acc_S_row_frg[k + 1, j], fastmath=True
                        )
                    else:
                        # acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j] = utils.e2e_asm2(acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j])
                        acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j] = utils.ex2_emulation_2(
                            acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j]
                        )
            acc_S_row_converted_frg[None, j].store(
                acc_S_row_frg[None, j].load().to(acc_S_row_converted.element_type)
            )

    @cute.jit
    def scale_apply_exp2_convert(
        self,
        acc_S_row: cute.Tensor,
        row_max: Float32,
        acc_S_row_converted: cute.Tensor,
    ):
        """Fused scale-subtract-rowmax (packed FMA) followed by exp2 and
        conversion/store into acc_S_row_converted, fragment by fragment."""
        assert cute.size(acc_S_row.shape) % 2 == 0, "acc_S_row must have an even number of elements"
        minus_row_max_scaled = -row_max * self.scale_log2
        for i in cutlass.range_constexpr(0, cute.size(acc_S_row.shape), 2):
            acc_S_row[i], acc_S_row[i + 1] = cute.arch.fma_packed_f32x2(
                (acc_S_row[i], acc_S_row[i + 1]),
                (self.scale_log2, self.scale_log2),
                (minus_row_max_scaled, minus_row_max_scaled),
            )

        # for i in cutlass.range_constexpr(0, cute.size(acc_S_row.shape), 2):
        #     acc_S_row[i], acc_S_row[i + 1] = cute.arch.fma_packed_f32x2(
        #         (acc_S_row[i], acc_S_row[i + 1]),
        #         (self.scale_log2, self.scale_log2),
        #         (minus_row_max_scaled, minus_row_max_scaled),
        #     )
        #     acc_S_row[i] = cute.math.exp2(acc_S_row[i], fastmath=True)
        #     acc_S_row[i + 1] = cute.math.exp2(acc_S_row[i + 1], fastmath=True)

        frg_tile = 32
        assert frg_tile % 2 == 0
        frg_cnt = cute.size(acc_S_row) // frg_tile
        assert cute.size(acc_S_row) % frg_tile == 0
        acc_S_row_frg = cute.logical_divide(acc_S_row, cute.make_layout(frg_tile))
        acc_S_row_converted_frg = cute.logical_divide(
            acc_S_row_converted, cute.make_layout(frg_tile)
        )
        for j in cutlass.range_constexpr(frg_cnt):
            for k in cutlass.range_constexpr(0, cute.size(acc_S_row_frg, mode=[0]), 2):
                # acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j] = (
                #     cute.arch.fma_packed_f32x2(
                #         (acc_S_row_frg[k, j], acc_S_row_frg[k + 1, j]),
                #         (self.scale_log2, self.scale_log2),
                #         (minus_row_max_scaled, minus_row_max_scaled),
                #     )
                # )
                # acc_S_row_frg[k, j] = cute.math.exp2(acc_S_row_frg[k, j], fastmath=True)
                # acc_S_row_frg[k + 1, j] = cute.math.exp2(acc_S_row_frg[k + 1, j], fastmath=True)
                acc_S_row_frg[k, j] = cute.math.exp2(acc_S_row_frg[k, j], fastmath=True)
                acc_S_row_frg[k + 1, j] = cute.math.exp2(acc_S_row_frg[k + 1, j], fastmath=True)
            acc_S_row_converted_frg[None, j].store(
                acc_S_row_frg[None, j].load().to(acc_S_row_converted.element_type)
            )
330
+
331
+
332
@cute.jit
def floor_if_packed(
    q_idx,
    qhead_per_kvhead: cutlass.Constexpr[int],
):
    """Recover the logical query index from a Pack-GQA packed q_idx.

    With Pack-GQA, qhead_per_kvhead query heads are interleaved along the
    query dimension, so the logical query position is q_idx // qhead_per_kvhead.
    When qhead_per_kvhead == 1 the index is already logical and is returned
    unchanged.

    Note: the original return annotation (`-> cute.Tensor`) was wrong — callers
    store the result into Int32 register vectors, so this returns an index of
    the same type as q_idx, not a tensor.
    """
    if cutlass.const_expr(qhead_per_kvhead == 1):
        return q_idx
    return q_idx // qhead_per_kvhead
341
+
342
+
343
@cute.jit
def apply_score_mod_inner(
    score_tensor,
    index_tensor,
    score_mod: cutlass.Constexpr,
    batch_idx,
    head_idx,
    softmax_scale,
    vec_size: cutlass.Constexpr,
    qk_acc_dtype: cutlass.Constexpr,
    aux_tensors,
    fastdiv_mods,
    seqlen_info: SeqlenInfoQK,
    constant_q_idx: cutlass.Constexpr,
    qhead_per_kvhead: cutlass.Constexpr[int] = 1,
    transpose_indices: cutlass.Constexpr[bool] = False,
):
    """Shared implementation for applying score modification.

    Scores are processed in vec_size chunks: each chunk is scaled by
    softmax_scale, the per-element (q_idx, kv_idx) coordinates are resolved
    (unpacking Pack-GQA and wrapping via fast-divmod when aux tensors will be
    indexed), score_mod is called on the SSA vectors, and the modified scores
    are written back into score_tensor in place.

    Args:
        score_tensor: The scores to modify (acc_S for flash_fwd, tSrS_t2r for sm100)
        index_tensor: Index positions (tScS for flash_fwd, tScS_t2r for sm100)
        score_mod: The score modification function to apply
        batch_idx: Batch index
        head_idx: Head index
        softmax_scale: Scale to apply
        vec_size: Vector size for processing elements
        qk_acc_dtype: Data type for accumulator
        aux_tensors: Optional aux_tensors for FlexAttention
        fastdiv_mods: Tuple of (seqlen_q_divmod, seqlen_k_divmod) for wrapping
        seqlen_info: Sequence length info
        constant_q_idx: If provided, use this constant for all q_idx values
            If None, compute q_idx per-element
        qhead_per_kvhead: Pack-GQA replication factor. Divide q_idx by this
            when greater than 1 so score mods see logical heads.
        transpose_indices: If True, swap q_idx/kv_idx in index_tensor (for bwd kernel where S is transposed)
    """
    # Index positions in the index_tensor tuple
    # Forward: index_tensor[...][0] = q_idx, index_tensor[...][1] = kv_idx
    # Backward (transposed): index_tensor[...][0] = kv_idx, index_tensor[...][1] = q_idx
    if cutlass.const_expr(transpose_indices):
        q_idx_pos = cutlass.const_expr(1)
        kv_idx_pos = cutlass.const_expr(0)
    else:
        q_idx_pos = cutlass.const_expr(0)
        kv_idx_pos = cutlass.const_expr(1)

    n_vals = cutlass.const_expr(cute.size(score_tensor.shape))
    score_vec = cute.make_rmem_tensor(vec_size, qk_acc_dtype)
    kv_idx_vec = cute.make_rmem_tensor(vec_size, cutlass.Int32)

    # SSA values for batch (constant across all elements)
    batch_idx_ssa = utils.scalar_to_ssa(batch_idx, cutlass.Int32).broadcast_to((vec_size,))

    # Handle q_idx based on whether it's constant
    q_idx_vec = cute.make_rmem_tensor(vec_size, cutlass.Int32)

    # For Pack-GQA with non-constant q_idx, we need per-element head indices
    # since a thread may process multiple query head indices
    if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
        head_idx_vec = cute.make_rmem_tensor(vec_size, cutlass.Int32)

    for i in cutlass.range(0, n_vals, vec_size, unroll_full=True):
        for j in cutlass.range(vec_size, unroll_full=True):
            score_vec[j] = score_tensor[i + j] * softmax_scale

            # Extract head offset from packed q_idx for Pack-GQA
            if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
                q_idx_packed = index_tensor[i + j][q_idx_pos]
                # Building up the logical q_head idx: final_q_head = kv_head * qhead_per_kvhead + (q_physical % qhead_per_kvhead)
                q_idx_logical = q_idx_packed // qhead_per_kvhead
                head_offset = q_idx_packed - q_idx_logical * qhead_per_kvhead
                head_idx_vec[j] = head_idx * qhead_per_kvhead + head_offset

            # If we will do loads we mod, in order to not read OOB
            if cutlass.const_expr(aux_tensors is not None and fastdiv_mods is not None):
                if cutlass.const_expr(constant_q_idx is None):
                    seqlen_q_divmod, seqlen_k_divmod = fastdiv_mods
                    q_idx_floored = floor_if_packed(
                        index_tensor[i + j][q_idx_pos], qhead_per_kvhead
                    )
                    _, q_idx_wrapped = divmod(q_idx_floored, seqlen_q_divmod)
                    q_idx_vec[j] = q_idx_wrapped
                else:
                    _, seqlen_k_divmod = fastdiv_mods

                _, kv_idx_wrapped = divmod(index_tensor[i + j][kv_idx_pos], seqlen_k_divmod)
                kv_idx_vec[j] = kv_idx_wrapped
            else:
                # No bounds checking - direct indexing
                # NOTE(review): this branch tests constant_q_idx without
                # cutlass.const_expr, unlike every other use in this function.
                # constant_q_idx is a Constexpr so the plain Python `if` should
                # trace identically — confirm this is intentional.
                if constant_q_idx is None:
                    q_idx_vec[j] = floor_if_packed(index_tensor[i + j][q_idx_pos], qhead_per_kvhead)
                kv_idx_vec[j] = index_tensor[i + j][kv_idx_pos]

        # Convert to SSA for score_mod call
        score_ssa = score_vec.load()
        kv_idx_ssa = kv_idx_vec.load()
        if cutlass.const_expr(constant_q_idx is None):
            q_idx_ssa = q_idx_vec.load()
        else:
            # NB we do not apply Pack-GQA division here, as constant_q_idx is assumed to already be logical
            q_idx_const = constant_q_idx
            q_idx_ssa = utils.scalar_to_ssa(q_idx_const, cutlass.Int32).broadcast_to((vec_size,))

        # Compute head_idx_ssa: per-element for Pack-GQA with non-constant q_idx, constant otherwise
        if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
            head_idx_ssa = head_idx_vec.load()
        else:
            head_idx_ssa = utils.scalar_to_ssa(head_idx, cutlass.Int32).broadcast_to((vec_size,))

        aux_args = []
        if cutlass.const_expr(aux_tensors is not None):
            aux_args = aux_tensors

        post_mod_scores = score_mod(
            score_ssa,
            batch_idx_ssa,
            head_idx_ssa,
            q_idx=q_idx_ssa,
            kv_idx=kv_idx_ssa,
            seqlen_info=seqlen_info,
            aux_tensors=aux_args,
        )

        # Write back modified scores
        score_vec.store(post_mod_scores)
        for j in cutlass.range(vec_size, unroll_full=True):
            score_tensor[i + j] = score_vec[j]
471
+
472
+
473
@cute.jit
def apply_score_mod_bwd_inner(
    grad_tensor,
    score_tensor,
    index_tensor,
    score_mod_bwd: cutlass.Constexpr,
    batch_idx,
    head_idx,
    softmax_scale,
    vec_size: cutlass.Constexpr,
    qk_acc_dtype: cutlass.Constexpr,
    aux_tensors,
    fastdiv_mods,
    seqlen_info,
    constant_q_idx: cutlass.Constexpr,
    qhead_per_kvhead: cutlass.Constexpr[int] = 1,
    transpose_indices: cutlass.Constexpr[bool] = False,
):
    """Apply backward score modification (joint graph).

    Args:
        grad_tensor: in/out: dlogits rewritten in-place with d(scaled_scores)
        score_tensor: pre-mod scores (unscaled QK tile), scaled by softmax_scale internally
        index_tensor: Index positions (same as forward)
        score_mod_bwd: The backward score modification function (joint graph)
        batch_idx: Batch index
        head_idx: Head index
        softmax_scale: Scale to apply to score_tensor
        vec_size: Vector size for processing elements
        qk_acc_dtype: Data type for accumulator
        aux_tensors: Optional aux_tensors for FlexAttention
        fastdiv_mods: Tuple of (seqlen_q_divmod, seqlen_k_divmod) for wrapping
        seqlen_info: Sequence length info
        constant_q_idx: If provided, use this constant for all q_idx values
        qhead_per_kvhead: Pack-GQA replication factor
        transpose_indices: If True, swap q_idx/kv_idx in index_tensor
    """
    # Index positions in the index_tensor tuple
    # Forward: index_tensor[...][0] = q_idx, index_tensor[...][1] = kv_idx
    # Backward (transposed): index_tensor[...][0] = kv_idx, index_tensor[...][1] = q_idx
    if cutlass.const_expr(transpose_indices):
        q_idx_pos = cutlass.const_expr(1)
        kv_idx_pos = cutlass.const_expr(0)
    else:
        q_idx_pos = cutlass.const_expr(0)
        kv_idx_pos = cutlass.const_expr(1)
    n_vals = cutlass.const_expr(cute.size(grad_tensor.shape))
    grad_vec = cute.make_fragment(vec_size, qk_acc_dtype)
    score_vec = cute.make_fragment(vec_size, qk_acc_dtype)
    kv_idx_vec = cute.make_fragment(vec_size, cutlass.Int32)
    batch_idx_ssa = utils.scalar_to_ssa(batch_idx, cutlass.Int32).broadcast_to((vec_size,))
    q_idx_vec = cute.make_fragment(vec_size, cutlass.Int32)

    # For Pack-GQA with non-constant q_idx, we need per-element head indices
    if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
        head_idx_vec = cute.make_fragment(vec_size, cutlass.Int32)

    # Process the tile in vec_size-wide chunks so score_mod_bwd sees SSA vectors.
    for i in cutlass.range(0, n_vals, vec_size, unroll_full=True):
        for j in cutlass.range(vec_size, unroll_full=True):
            grad_vec[j] = grad_tensor[i + j]
            # Scale score so joint graph sees same value as forward score_mod
            score_vec[j] = score_tensor[i + j] * softmax_scale

            if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
                # Packed q index interleaves the query-head offset; recover
                # the logical q index and the head offset without a modulo.
                q_idx_packed = index_tensor[i + j][q_idx_pos]
                q_idx_logical = q_idx_packed // qhead_per_kvhead
                head_offset = q_idx_packed - q_idx_logical * qhead_per_kvhead
                head_idx_vec[j] = head_idx * qhead_per_kvhead + head_offset

            if cutlass.const_expr(aux_tensors is not None and fastdiv_mods is not None):
                # Aux tensors are indexed by wrapped positions: use fast
                # divmod to keep q/kv indices within [0, seqlen).
                if cutlass.const_expr(constant_q_idx is None):
                    seqlen_q_divmod, seqlen_k_divmod = fastdiv_mods
                    q_idx_floored = floor_if_packed(
                        index_tensor[i + j][q_idx_pos], qhead_per_kvhead
                    )
                    _, q_idx_wrapped = divmod(q_idx_floored, seqlen_q_divmod)
                    q_idx_vec[j] = q_idx_wrapped
                else:
                    _, seqlen_k_divmod = fastdiv_mods

                _, kv_idx_wrapped = divmod(index_tensor[i + j][kv_idx_pos], seqlen_k_divmod)
                kv_idx_vec[j] = kv_idx_wrapped
            else:
                # No bounds checking - direct indexing
                # NOTE: wrapped in cutlass.const_expr for consistency with every
                # other compile-time branch on constant_q_idx in this function.
                if cutlass.const_expr(constant_q_idx is None):
                    q_idx_vec[j] = floor_if_packed(index_tensor[i + j][q_idx_pos], qhead_per_kvhead)
                kv_idx_vec[j] = index_tensor[i + j][kv_idx_pos]

        grad_ssa = grad_vec.load()
        score_ssa = score_vec.load()
        kv_idx_ssa = kv_idx_vec.load()

        # q_idx is either a per-element vector or a broadcast constant.
        if cutlass.const_expr(constant_q_idx is None):
            q_idx_ssa = q_idx_vec.load()
        else:
            q_idx_ssa = utils.scalar_to_ssa(constant_q_idx, cutlass.Int32).broadcast_to((vec_size,))

        # Compute head_idx_ssa: per-element for Pack-GQA with non-constant q_idx,
        # constant otherwise.
        if cutlass.const_expr(qhead_per_kvhead > 1 and constant_q_idx is None):
            head_idx_ssa = head_idx_vec.load()
        else:
            head_idx_ssa = utils.scalar_to_ssa(head_idx, cutlass.Int32).broadcast_to((vec_size,))

        aux_args = []
        if cutlass.const_expr(aux_tensors is not None):
            aux_args = aux_tensors

        grad_out_ssa = score_mod_bwd(
            grad_ssa,
            score_ssa,
            batch_idx_ssa,
            head_idx_ssa,
            q_idx=q_idx_ssa,
            kv_idx=kv_idx_ssa,
            seqlen_info=seqlen_info,
            aux_tensors=aux_args,
        )

        # Write the modified gradients back in-place.
        grad_vec.store(grad_out_ssa)
        for j in cutlass.range(vec_size, unroll_full=True):
            grad_tensor[i + j] = grad_vec[j]
build/torch-cuda/testing.py ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from contextlib import nullcontext
3
+ from functools import wraps
4
+ from typing import Optional
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from einops import rearrange, repeat
9
+ from torch._guards import active_fake_mode
10
+ from torch._subclasses.fake_tensor import FakeTensorMode
11
+
12
+
13
class IndexFirstAxis(torch.autograd.Function):
    """Gather rows along the first axis with a custom autograd backward.

    Forward is equivalent to ``input[indices]`` for a 1-D ``indices`` tensor;
    backward scatters the incoming gradient into a zero tensor with the
    original first-axis size (rows not selected get zero gradient).

    Uses native torch ``reshape`` / ``unsqueeze().expand()`` (zero-copy views)
    instead of einops ``rearrange`` / ``repeat``, removing an avoidable
    third-party dependency from this helper.
    """

    @staticmethod
    def forward(ctx, input, indices):
        ctx.save_for_backward(indices)
        assert input.ndim >= 2
        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
        second_dim = other_shape.numel()
        # Flatten trailing dims so gather can work on a 2-D view, and
        # broadcast the 1-D indices across the flattened dim.
        flat_input = input.reshape(ctx.first_axis_dim, second_dim)
        gather_idx = indices.unsqueeze(-1).expand(indices.shape[0], second_dim)
        return torch.gather(flat_input, 0, gather_idx).reshape(-1, *other_shape)

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        assert grad_output.ndim >= 2
        other_shape = grad_output.shape[1:]
        flat_grad = grad_output.reshape(grad_output.shape[0], -1)
        grad_input = torch.zeros(
            [ctx.first_axis_dim, flat_grad.shape[1]],
            device=grad_output.device,
            dtype=grad_output.dtype,
        )
        # Scatter gathered-row gradients back to their original rows.
        scatter_idx = indices.unsqueeze(-1).expand(indices.shape[0], flat_grad.shape[1])
        grad_input.scatter_(0, scatter_idx, flat_grad)
        # No gradient w.r.t. the integer indices.
        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None


index_first_axis = IndexFirstAxis.apply
42
+
43
+
44
class IndexPutFirstAxis(torch.autograd.Function):
    """Inverse of first-axis gathering: write ``values`` into the rows given
    by ``indices`` of a fresh zero tensor with ``first_axis_dim`` rows.
    Backward gathers the written rows' gradients back out."""

    @staticmethod
    def forward(ctx, values, indices, first_axis_dim):
        ctx.save_for_backward(indices)
        assert indices.ndim == 1
        assert values.ndim >= 2
        result = torch.zeros(
            first_axis_dim,
            *values.shape[1:],
            device=values.device,
            dtype=values.dtype,
        )
        result[indices] = values
        return result

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        # Only the rows written in forward carry gradient; the index and
        # size arguments get none.
        return grad_output[indices], None, None


index_put_first_axis = IndexPutFirstAxis.apply
64
+
65
+
66
def unpad_input(hidden_states, attention_mask, unused_mask=None):
    """Strip padding tokens from a (batch, seqlen, ...) tensor.

    Returns a tuple of (unpadded hidden_states, flat kept-token indices,
    cumulative sequence lengths, max sequence length in batch, per-batch
    used sequence lengths). ``unused_mask`` marks extra positions to keep
    without counting them as used.
    """
    if unused_mask is not None:
        all_masks = attention_mask + unused_mask
    else:
        all_masks = attention_mask
    seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32)
    used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    if active_fake_mode() is not None:
        # torch.nonzero and .item() are not supported in FakeTensorMode,
        # so pretend every position is kept.
        batch_size, seqlen = attention_mask.shape
        indices = torch.arange(batch_size * seqlen, device=hidden_states.device)
        max_seqlen_in_batch = seqlen
    else:
        indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten()
        max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    flat_states = rearrange(hidden_states, "b s ... -> (b s) ...")
    return (
        index_first_axis(flat_states, indices),
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
        used_seqlens_in_batch,
    )
+ )
87
+
88
+
89
def pad_input(hidden_states, indices, batch, seqlen):
    """Scatter unpadded rows back into a zero-padded (batch, seqlen, ...) tensor."""
    flat = index_put_first_axis(hidden_states, indices, batch * seqlen)
    return rearrange(flat, "(b s) ... -> b s ...", b=batch)
92
+
93
+
94
def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random", zero_lengths=False):
    """Build a (batch_size, max_seqlen) boolean padding mask.

    mode="full": every row fully valid; "random": lengths drawn from
    [max_seqlen - 20, max_seqlen]; "third": lengths drawn from
    [max_seqlen // 3, max_seqlen]. With zero_lengths, lengths may be 0 and
    every 5th row plus the last row are forced to length 0.
    """
    assert mode in ["full", "random", "third"]
    min_len = 0 if zero_lengths else 1
    if mode == "full":
        lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32)
    else:
        lower = max_seqlen - 20 if mode == "random" else max_seqlen // 3
        lengths = torch.randint(
            max(min_len, lower),
            max_seqlen + 1,
            (batch_size, 1),
            device=device,
        )

    if zero_lengths:
        # Force some rows to be completely empty.
        lengths[::5] = 0
        lengths[-1] = 0
    positions = repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size)
    return positions < lengths
122
+
123
+
124
def generate_qkv(
    q,
    k,
    v,
    query_padding_mask=None,
    key_padding_mask=None,
    qv=None,
    kvpacked=False,
    qkvpacked=False,
    query_unused_mask=None,
    key_unused_mask=None,
):
    """Build unpadded (varlen) views of q/k/v plus the metadata needed by
    varlen attention kernels, together with re-padding closures for outputs
    and gradients.

    Arguments:
        q: (batch_size, seqlen_q, nheads, d)
        k: (batch_size, seqlen_k, nheads_k, d)
        v: (batch_size, seqlen_k, nheads_k, d_v)
        query_padding_mask / key_padding_mask: optional (batch, seqlen) bool masks
        qv: optional extra query tensor, unpadded alongside q
        kvpacked / qkvpacked: mutually exclusive packing modes that change the
            return tuple shape (see the three return branches below)
        query_unused_mask / key_unused_mask: forwarded to unpad_input; not
            allowed together with the packed modes

    Returns one of three tuples depending on packing mode; all returned
    tensors are detached and (where trainable) have requires_grad re-enabled.
    """
    assert not (kvpacked and qkvpacked)
    batch_size, seqlen_q, nheads, d = q.shape
    d_v = v.shape[-1]
    _, seqlen_k, nheads_k, _ = k.shape
    assert k.shape == (batch_size, seqlen_k, nheads_k, d)
    assert v.shape == (batch_size, seqlen_k, nheads_k, d_v)
    if query_unused_mask is not None or key_unused_mask is not None:
        assert not kvpacked
        assert not qkvpacked

    # --- Query side: unpad if a mask is given, otherwise just flatten and
    # synthesize uniform cu_seqlens. output_pad_fn re-pads kernel outputs.
    if query_padding_mask is not None:
        q_unpad, indices_q, cu_seqlens_q, max_seqlen_q, seqused_q = unpad_input(
            q, query_padding_mask, query_unused_mask
        )
        output_pad_fn = lambda output_unpad: pad_input(
            output_unpad, indices_q, batch_size, seqlen_q
        )
        qv_unpad = rearrange(qv, "b s ... -> (b s) ...")[indices_q] if qv is not None else None
    else:
        q_unpad = rearrange(q, "b s h d -> (b s) h d")
        cu_seqlens_q = torch.arange(
            0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device
        )
        seqused_q = None
        max_seqlen_q = seqlen_q
        output_pad_fn = lambda output_unpad: rearrange(
            output_unpad, "(b s) h d -> b s h d", b=batch_size
        )
        qv_unpad = rearrange(qv, "b s ... -> (b s) ...") if qv is not None else None

    # --- Key/value side: same treatment, both driven by key_padding_mask.
    if key_padding_mask is not None:
        k_unpad, indices_k, cu_seqlens_k, max_seqlen_k, seqused_k = unpad_input(
            k, key_padding_mask, key_unused_mask
        )
        v_unpad, *_ = unpad_input(v, key_padding_mask, key_unused_mask)
    else:
        k_unpad = rearrange(k, "b s h d -> (b s) h d")
        v_unpad = rearrange(v, "b s h d -> (b s) h d")
        cu_seqlens_k = torch.arange(
            0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device
        )
        seqused_k = None
        max_seqlen_k = seqlen_k

    if qkvpacked:
        # Single packed tensor (b s) 3 h d; requires identical q/k masks
        # and matching head counts.
        assert (query_padding_mask == key_padding_mask).all()
        assert nheads == nheads_k
        qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1)
        qkv = torch.stack([q, k, v], dim=2)
        if query_padding_mask is not None:
            dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q)
        else:
            dqkv_pad_fn = lambda dqkv_unpad: rearrange(
                dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
            )
        return (
            qkv_unpad.detach().requires_grad_(),
            cu_seqlens_q,
            max_seqlen_q,
            qkv.detach().requires_grad_(),
            output_pad_fn,
            dqkv_pad_fn,
        )
    elif kvpacked:
        # k and v packed together as (b s) 2 h d; q stays separate.
        kv_unpad = torch.stack([k_unpad, v_unpad], dim=1)
        kv = torch.stack([k, v], dim=2)
        dq_pad_fn = output_pad_fn
        if key_padding_mask is not None:
            dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k)
        else:
            dkv_pad_fn = lambda dkv_unpad: rearrange(
                dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
            )
        return (
            q_unpad.detach().requires_grad_(),
            kv_unpad.detach().requires_grad_(),
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q,
            max_seqlen_k,
            q.detach().requires_grad_(),
            kv.detach().requires_grad_(),
            output_pad_fn,
            dq_pad_fn,
            dkv_pad_fn,
        )
    else:
        # Fully separate q/k/v (plus optional qv) with all varlen metadata.
        dq_pad_fn = output_pad_fn
        if key_padding_mask is not None:
            dk_pad_fn = lambda dk_unpad: pad_input(dk_unpad, indices_k, batch_size, seqlen_k)
        else:
            dk_pad_fn = lambda dk_unpad: rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size)
        return (
            q_unpad.detach().requires_grad_(),
            k_unpad.detach().requires_grad_(),
            v_unpad.detach().requires_grad_(),
            qv_unpad.detach() if qv is not None else None,
            cu_seqlens_q,
            cu_seqlens_k,
            seqused_q,
            seqused_k,
            max_seqlen_q,
            max_seqlen_k,
            q.detach().requires_grad_(),
            k.detach().requires_grad_(),
            v.detach().requires_grad_(),
            qv.detach() if qv is not None else None,
            output_pad_fn,
            dq_pad_fn,
            dk_pad_fn,
        )
+ )
247
+
248
+
249
def construct_local_mask(
    seqlen_q,
    seqlen_k,
    window_size=(None, None),
    sink_token_length=0,
    query_padding_mask=None,
    key_padding_mask=None,
    key_leftpad=None,
    device=None,
):
    """Build a boolean mask for sliding-window attention.

    True marks positions that are masked OUT (callers fill them with -inf).
    window_size is (left, right) with None meaning unbounded on that side.
    The window is aligned to the bottom-right diagonal via the per-batch
    offset sk - sq. Columns below sink_token_length are never left-masked.
    """
    row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1")
    col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
    if key_leftpad is not None:
        # Shift columns left by the per-batch left padding; padded columns
        # get a huge index (2**32) so they compare as out-of-window.
        key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1")
        col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0])
        col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32)
    # Effective per-batch key/query lengths (scalars when no padding mask).
    sk = (
        seqlen_k
        if key_padding_mask is None
        else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
    )
    sq = (
        seqlen_q
        if query_padding_mask is None
        else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
    )
    if window_size[0] is None:
        # Unbounded left window: only the right boundary masks anything.
        return col_idx > row_idx + sk - sq + window_size[1]
    else:
        sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
        if window_size[1] is None:
            local_mask_left = col_idx > sk
        else:
            local_mask_left = col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk)
        # Mask right-of-window OR (left-of-window AND not a sink token).
        return torch.logical_or(
            local_mask_left,
            torch.logical_and(
                col_idx < row_idx + sk - sq - window_size[0], col_idx >= sink_token_length
            ),
        )
289
+
290
+
291
def construct_chunk_mask(
    seqlen_q,
    seqlen_k,
    attention_chunk,
    query_padding_mask=None,
    key_padding_mask=None,
    key_leftpad=None,
    device=None,
):
    """Build a boolean mask for chunked attention.

    True marks positions masked OUT: each (diagonal-aligned) query row may
    only attend to keys inside the attention_chunk-sized chunk that contains
    its own diagonal position.
    """
    row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1")
    col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
    if key_leftpad is not None:
        # Shift columns by per-batch left padding; padded columns get a huge
        # index (2**32) so they land outside every chunk.
        key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1")
        col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0])
        col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32)
    # Effective per-batch key/query lengths (scalars when no padding mask).
    sk = (
        seqlen_k
        if key_padding_mask is None
        else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
    )
    sq = (
        seqlen_q
        if query_padding_mask is None
        else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
    )
    sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
    # Left edge of the chunk containing the diagonal position row + sk - sq.
    col_limit_left_chunk = row_idx + sk - sq - (row_idx + sk - sq) % attention_chunk
    return torch.logical_or(
        col_idx < col_limit_left_chunk, col_idx >= col_limit_left_chunk + attention_chunk
    )
321
+
322
+
323
def attention_ref(
    q,
    k,
    v,
    query_padding_mask=None,
    key_padding_mask=None,
    key_leftpad=None,
    attn_bias=None,
    dropout_p=0.0,
    dropout_mask=None,
    causal=False,
    qv=None,
    q_descale=None,
    k_descale=None,
    v_descale=None,
    window_size=(None, None),
    attention_chunk=0,
    sink_token_length=0,
    learnable_sink: Optional[torch.Tensor] = None,
    softcap=0.0,
    upcast=True,
    reorder_ops=False,
    intermediate_dtype=None,
):
    """Pure-PyTorch reference attention used to validate the fused kernels.

    Supports padding masks, sliding windows, chunked attention, GQA/MQA
    (k/v heads are repeated to match q heads), per-head descales, softcap,
    learnable sink logits, and an externally supplied dropout mask.

    Returns:
        (output, attention): output is (batch, seqlen_q, nheads, d_v) and
        attention is the (pre-dropout) probability matrix, both cast back
        to q's original dtype.
    """
    if causal:
        # Causal is just a sliding window with right width 0.
        window_size = (window_size[0], 0)
    dtype_og = q.dtype
    if upcast:
        # Compute the reference in fp32 for accuracy comparisons.
        q, k, v = q.float(), k.float(), v.float()
        qv = qv.float() if qv is not None else None
    if q_descale is not None:
        q_descale = repeat(q_descale, "b h -> b 1 (h g) 1", g=q.shape[2] // k.shape[2])
        q = (q.float() * q_descale).to(q.dtype)
        qv = (qv.float() * q_descale).to(qv.dtype) if qv is not None else None
    if k_descale is not None:
        k = (k.float() * rearrange(k_descale, "b h -> b 1 h 1")).to(dtype=k.dtype)
    if v_descale is not None:
        v = (v.float() * rearrange(v_descale, "b h -> b 1 h 1")).to(dtype=v.dtype)
    seqlen_q, seqlen_k = q.shape[1], k.shape[1]
    # GQA/MQA: replicate k/v heads to one per query head.
    k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2])
    v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2])
    d = q.shape[-1]
    dv = v.shape[-1]
    softmax_scale = 1.0 / math.sqrt(d if qv is None else d + dv)
    # reorder_ops moves the scale onto k to mimic a different op order
    # (changes rounding, not the mathematical result).
    if not reorder_ops:
        scores = torch.einsum("bthd,bshd->bhts", q * softmax_scale, k)
    else:
        scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
    if qv is not None:
        scores = scores + torch.einsum("bthd,bshd->bhts", qv * softmax_scale, v)
    if softcap > 0:
        # Smoothly cap logits to (-softcap, softcap).
        scores = torch.tanh(scores / softcap) * softcap
    if key_padding_mask is not None:
        scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf"))
    local_mask = None
    if window_size[0] is not None or window_size[1] is not None:
        local_mask = construct_local_mask(
            seqlen_q,
            seqlen_k,
            window_size,
            sink_token_length,
            query_padding_mask,
            key_padding_mask,
            key_leftpad=key_leftpad,
            device=q.device,
        )
    if attention_chunk > 0:
        chunk_mask = construct_chunk_mask(
            seqlen_q,
            seqlen_k,
            attention_chunk,
            query_padding_mask,
            key_padding_mask,
            key_leftpad=key_leftpad,
            device=q.device,
        )
        # A position is masked if either the window or the chunk masks it.
        local_mask = (
            torch.logical_or(local_mask, chunk_mask) if local_mask is not None else chunk_mask
        )
    if local_mask is not None:
        scores.masked_fill_(local_mask, float("-inf"))
    if attn_bias is not None:
        scores = scores + attn_bias
    if learnable_sink is None:
        attention = torch.softmax(scores, dim=-1).to(v.dtype)
    else:
        # Softmax with an extra per-head "sink" logit that absorbs
        # probability mass but contributes nothing to the output.
        scores_fp32 = scores.to(torch.float32)
        logits_max = torch.amax(scores_fp32, dim=-1, keepdim=True)
        learnable_sink = rearrange(learnable_sink, "h -> h 1 1")
        logits_or_sinks_max = torch.maximum(learnable_sink, logits_max)
        unnormalized_scores = torch.exp(scores_fp32 - logits_or_sinks_max)
        normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + torch.exp(
            learnable_sink - logits_or_sinks_max
        )
        attention = (unnormalized_scores / normalizer).to(v.dtype)
    # Zero out probabilities for padded queries/keys and for rows whose
    # every key is masked (softmax over all -inf would yield NaNs).
    if query_padding_mask is not None:
        attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0)
    if key_padding_mask is not None:
        attention = attention.masked_fill(rearrange(~key_padding_mask, "b s -> b 1 1 s"), 0.0)
    if local_mask is not None:
        attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0)
    dropout_scaling = 1.0 / (1 - dropout_p)
    if dropout_mask is not None:
        attention_drop = attention.masked_fill(~dropout_mask, 0.0)
    else:
        attention_drop = attention
    if intermediate_dtype is not None:
        # Round-trip through a lower-precision dtype to mimic kernels that
        # store P in reduced precision.
        attention_drop = attention_drop.to(intermediate_dtype).to(attention_drop.dtype)
    output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling)
    if query_padding_mask is not None:
        output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0)
    return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)
435
+
436
+
437
def maybe_fake_tensor_mode(fake: bool = True):
    """Decorator factory that optionally runs the wrapped function under
    torch's FakeTensorMode.

    Fake tensors keep shape/dtype metadata without allocating real GPU
    memory, which is one way to populate/pre-compile the cute.compile cache.
    With ``fake=False`` the function runs unchanged (nullcontext).
    """

    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            ctx = FakeTensorMode() if fake else nullcontext()
            with ctx:
                return fn(*args, **kwargs)

        return wrapper

    return decorator
453
+
454
+
455
+ def is_fake_mode() -> bool:
456
+ return active_fake_mode() is not None
build/torch-cuda/tile_scheduler.py ADDED
@@ -0,0 +1,727 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ from typing import Optional, Tuple
4
+ from dataclasses import dataclass
5
+
6
+ try:
7
+ from typing import override
8
+ except ImportError: # Python < 3.12
9
+ from typing_extensions import override
10
+
11
+ import cutlass
12
+ from cutlass._mlir import ir
13
+ import cutlass.cute as cute
14
+ from cutlass import Int32, const_expr
15
+ from cutlass.cute import FastDivmodDivisor
16
+
17
+ from .quack.cute_dsl_utils import ParamsBase
18
+
19
+ from . import utils
20
+ from .fast_math import clz
21
+
22
+
23
class WorkTileInfo(cutlass.utils.WorkTileInfo):
    """WorkTileInfo variant whose tile index carries four axes:
    (block, head, batch, split)."""

    @override
    def __new_from_mlir_values__(self, values: list[ir.Value]) -> "WorkTileInfo":
        # Four tile-coordinate values plus one validity flag.
        assert len(values) == 5
        coord_vals, valid_vals = values[:-1], values[-1:]
        tile_idx = cutlass.new_from_mlir_values(self._tile_idx, coord_vals)
        is_valid = cutlass.new_from_mlir_values(self._is_valid_tile, valid_vals)
        return WorkTileInfo(tile_idx, is_valid)
32
+
33
+
34
@dataclass
class TileSchedulerArguments(ParamsBase):
    """Host-side argument bundle consumed by the tile schedulers'
    `to_underlying_arguments` / `Params.create` factories."""

    num_block: Int32  # number of tiles along the block (first grid) axis
    num_head: Int32
    num_batch: Int32
    num_splits: Int32  # split-KV factor (1 = no splitting)
    seqlen_k: Int32
    headdim: Int32
    headdim_v: Int32
    total_q: Int32
    tile_shape_mn: cutlass.Constexpr[Tuple[int, int]]
    cluster_shape_mn: cutlass.Constexpr[Tuple[int, int]] = (1, 1)
    mCuSeqlensQ: Optional[cute.Tensor] = None  # cumulative query seqlens (varlen) — assumed; confirm at call site
    mSeqUsedQ: Optional[cute.Tensor] = None  # per-batch used query lengths — assumed; confirm at call site
    qhead_per_kvhead_packgqa: cutlass.Constexpr[int] = 1  # Pack-GQA replication factor
    element_size: cutlass.Constexpr[int] = 2  # bytes per K/V element; used to size the L2 working set in the LPT scheduler
    is_persistent: cutlass.Constexpr[bool] = False
    lpt: cutlass.Constexpr[bool] = False  # presumably "longest processing time" scheduling — verify
    is_split_kv: cutlass.Constexpr[bool] = False
    head_swizzle: cutlass.Constexpr[bool] = False
54
+
55
+
56
class SingleTileScheduler:
    """Non-persistent scheduler: each CTA is launched for exactly one
    (block, head, batch, split) work tile and exits after processing it.
    `advance_to_next_work` simply invalidates the CTA's single tile."""

    @dataclass
    class Params(ParamsBase):
        # Device-side copy of the scheduling parameters.
        num_block: Int32
        num_head: Int32
        num_batch: Int32
        num_splits: Int32
        num_splits_divmod: FastDivmodDivisor  # fast divmod by num_splits
        is_split_kv: cutlass.Constexpr[bool] = False
        cluster_shape_mn: cutlass.Constexpr[Tuple[int, int]] = (1, 1)

        @staticmethod
        def create(
            args: TileSchedulerArguments, *, loc=None, ip=None
        ) -> "SingleTileScheduler.Params":
            return SingleTileScheduler.Params(
                args.num_block,
                args.num_head,
                args.num_batch,
                args.num_splits,
                FastDivmodDivisor(args.num_splits),
                args.is_split_kv,
                args.cluster_shape_mn,
            )

    def __init__(self, params: Params, blk_coord: cute.Coord, *, loc=None, ip=None):
        self.params = params
        self._blk_coord = blk_coord
        self._is_first_block = True  # a CTA owns exactly one tile; valid only once
        self._loc = loc
        self._ip = ip

    @staticmethod
    def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
        return SingleTileScheduler.Params.create(args, loc=loc, ip=ip)

    @staticmethod
    def create(params: Params, *, loc=None, ip=None) -> "SingleTileScheduler":
        # if const_expr(cute.size(params.cluster_shape_mn) == 1):
        #     blk_coord = cute.arch.block_idx()
        # else:
        #     # All CTAs in a cluster must get the same block coordinate
        #     blk_coord = cute.arch.cluster_idx()
        # Temporary set to block_idx until we sort out the best way to handle cluster
        blk_coord = cute.arch.block_idx()
        return SingleTileScheduler(params, blk_coord, loc=loc, ip=ip)

    # called by host
    @staticmethod
    def get_grid_shape(
        params: Params,
        *,
        loc=None,
        ip=None,
    ) -> Tuple[Int32, Int32, Int32]:
        # Grid: (blocks rounded to cluster width, head * split, batch).
        # TODO: this hard-codes the fact that we only use cluster = (1, 1) or (2, 1)
        assert params.cluster_shape_mn[1] == 1, "Only cluster_shape_mn[1] == 1 is supported"
        return (
            cute.round_up(params.num_block, params.cluster_shape_mn[0]),
            params.num_head * params.num_splits,
            params.num_batch,
        )

    def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
        # The y grid axis packs (head, split) together; unpack when splitting.
        block_idx, head_idx, batch_idx = self._blk_coord
        if const_expr(self.params.is_split_kv):
            head_idx, split_idx = divmod(head_idx, self.params.num_splits_divmod)
        else:
            split_idx = Int32(0)
        return WorkTileInfo(
            (block_idx, head_idx, batch_idx, split_idx),
            self._is_first_block,
        )

    def initial_work_tile_info(self, *, loc=None, ip=None):
        return self.get_current_work(loc=loc, ip=ip)

    def prefetch_next_work(self, *, loc=None, ip=None):
        # Nothing to prefetch: there is only one tile per CTA.
        pass

    def advance_to_next_work(self, *, loc=None, ip=None):
        self._is_first_block = False

    def __extract_mlir_values__(self):
        # DSL serialization hook; order must match __new_from_mlir_values__.
        values, self._values_pos = [], []
        for obj in [self.params, self._blk_coord]:
            obj_values = cutlass.extract_mlir_values(obj)
            values += obj_values
            self._values_pos.append(len(obj_values))
        return values

    def __new_from_mlir_values__(self, values):
        obj_list = []
        for obj, n_items in zip([self.params, self._blk_coord], self._values_pos):
            obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
            values = values[n_items:]
        return SingleTileScheduler(*(tuple(obj_list)), loc=self._loc)
153
+
154
+
155
class StaticPersistentTileScheduler:
    """Persistent scheduler: roughly one CTA (or cluster) per SM loops over a
    statically flattened (block, head, batch) tile space, striding by the
    grid size each iteration."""

    @dataclass
    class Params(ParamsBase):
        num_block_cluster_divmod: FastDivmodDivisor  # fast divmod by blocks-per-cluster
        num_head_divmod: FastDivmodDivisor
        total_blocks_cluster: Int32  # total number of cluster-tiles to process
        cluster_shape_m: cutlass.Constexpr[int] = 1

        @staticmethod
        def create(
            args: TileSchedulerArguments, *, loc=None, ip=None
        ) -> "StaticPersistentTileScheduler.Params":
            num_block_cluster = cute.ceil_div(args.num_block, cute.size(args.cluster_shape_mn))
            total_blocks_cluster = num_block_cluster * args.num_head * args.num_batch
            return StaticPersistentTileScheduler.Params(
                FastDivmodDivisor(num_block_cluster),
                FastDivmodDivisor(args.num_head),
                total_blocks_cluster,
                cluster_shape_m=args.cluster_shape_mn[0],
            )

    def __init__(self, params: Params, tile_idx: Int32, *, loc=None, ip=None):
        self.params = params
        self._tile_idx = tile_idx  # current position in the flattened tile space
        self._loc = loc
        self._ip = ip

    @staticmethod
    def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
        return StaticPersistentTileScheduler.Params.create(args, loc=loc, ip=ip)

    @staticmethod
    def create(params: Params, *, loc=None, ip=None) -> "StaticPersistentTileScheduler":
        # With clusters, all CTAs of a cluster share one tile index.
        if const_expr(cute.size(params.cluster_shape_m) == 1):
            tile_idx = cute.arch.block_idx()[0]
        else:
            tile_idx = cute.arch.cluster_idx()[0]
        return StaticPersistentTileScheduler(params, tile_idx, loc=loc, ip=ip)

    # called by host
    @staticmethod
    def get_grid_shape(
        params: Params,
        *,
        loc=None,
        ip=None,
    ) -> Tuple[Int32, Int32, Int32]:
        hardware_info = cutlass.utils.HardwareInfo()
        sm_count = hardware_info.get_device_multiprocessor_count()
        # Grid must be a multiple of cluster_shape_m for CUDA cluster launch.
        max_ctas = (sm_count // params.cluster_shape_m) * params.cluster_shape_m
        grid_x = cutlass.min(max_ctas, params.total_blocks_cluster * params.cluster_shape_m)
        return (grid_x, Int32(1), Int32(1))

    # @cute.jit
    def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
        # Unflatten tile_idx into (block, head, batch); split axis is always 0.
        hn_idx, block_idx = divmod(self._tile_idx, self.params.num_block_cluster_divmod)
        batch_idx, head_idx = divmod(hn_idx, self.params.num_head_divmod)
        is_valid = self._tile_idx < self.params.total_blocks_cluster
        # if cute.arch.thread_idx()[0] == 0:
        #     cute.printf("TileScheduler: tile_idx=%d, hn_idx=%d, block_idx=%d, batch_idx=%d, head_idx=%d, is_valid=%d", self._tile_idx, hn_idx, block_idx, batch_idx, head_idx, is_valid)
        return WorkTileInfo(
            (Int32(block_idx), Int32(head_idx), Int32(batch_idx), Int32(0)), is_valid
        )

    def initial_work_tile_info(self, *, loc=None, ip=None):
        return self.get_current_work(loc=loc, ip=ip)

    def prefetch_next_work(self, *, loc=None, ip=None):
        pass

    def advance_to_next_work(self, *, loc=None, ip=None):
        # Persistent stride: advance by the number of concurrent CTAs/clusters.
        if const_expr(self.params.cluster_shape_m == 1):
            self._tile_idx += cute.arch.grid_dim()[0]
        else:
            self._tile_idx += cute.arch.cluster_dim()[0]

    def __extract_mlir_values__(self):
        # DSL serialization hook; order must match __new_from_mlir_values__.
        values, self._values_pos = [], []
        for obj in [self.params, self._tile_idx]:
            obj_values = cutlass.extract_mlir_values(obj)
            values += obj_values
            self._values_pos.append(len(obj_values))
        return values

    def __new_from_mlir_values__(self, values):
        obj_list = []
        for obj, n_items in zip(
            [self.params, self._tile_idx],
            self._values_pos,
        ):
            obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
            values = values[n_items:]
        return StaticPersistentTileScheduler(*(tuple(obj_list)), loc=self._loc)
249
+
250
+
251
class SingleTileLPTScheduler:
    """One-tile-per-CTA scheduler with longest-processing-time-first block order
    and an L2-friendly head swizzle.

    Blocks within a swizzle "section" share the same group of heads so their
    K/V reads hit L2; within a head, blocks are visited in reverse (LPT) order.
    """

    @dataclass
    class Params(ParamsBase):
        total_blocks: Int32
        num_splits: Int32
        num_block: Int32
        l2_minor: Int32  # swizzle width: number of heads grouped per L2 section
        num_block_divmod: FastDivmodDivisor
        num_head_divmod: FastDivmodDivisor
        l2_minor_divmod: FastDivmodDivisor
        l2_major_divmod: FastDivmodDivisor
        l2_minor_residual_divmod: FastDivmodDivisor
        num_hb_quotient: Int32
        is_split_kv: cutlass.Constexpr[bool] = False

        @staticmethod
        @cute.jit
        def create(
            args: TileSchedulerArguments, *, loc=None, ip=None
        ) -> "SingleTileLPTScheduler.Params":
            # cute.printf(args.num_block, args.num_head, args.num_batch, args.seqlen_k, args.headdim, args.headdim_v, args.total_q, args.tile_shape_mn, args.qhead_per_kvhead_packgqa, args.element_size)
            size_one_kv_head = args.seqlen_k * (args.headdim + args.headdim_v) * args.element_size
            size_one_head = size_one_kv_head
            size_l2 = 50 * 1024 * 1024  # 50 MB budget for K & V
            # Swizzle is the size of each "section". Round swizzle to a power of 2
            # Need to be careful about the case where only one head will fit
            # swizzle is how many heads can fit in L2
            # swizzle = 1 if size_l2 < size_one_head else (size_l2 // size_one_head)
            # Seems faster if swizzle is a power of 2
            log2_floor = lambda n: 31 - clz(n)
            swizzle = 1 if size_l2 < size_one_head else (1 << log2_floor(size_l2 // size_one_head))
            # swizzle = 1 if size_l2 < size_one_head else (size_l2 // size_one_head)
            # If we're in the last section (called residual), we don't want to divide by
            # swizzle. Instead we want to divide by the remainder.
            num_hb_quotient = (args.num_head * args.num_batch) // swizzle
            num_hb_remainder = (args.num_head * args.num_batch) % swizzle
            return SingleTileLPTScheduler.Params(
                total_blocks=args.num_block * args.num_head * args.num_batch,
                num_block=args.num_block,
                l2_minor=Int32(swizzle),
                num_block_divmod=FastDivmodDivisor(args.num_block),
                num_head_divmod=FastDivmodDivisor(args.num_head),
                l2_minor_divmod=FastDivmodDivisor(swizzle),
                l2_major_divmod=FastDivmodDivisor(swizzle * args.num_block),
                l2_minor_residual_divmod=FastDivmodDivisor(
                    max(num_hb_remainder, 1)
                ),  # don't divide by 0
                num_hb_quotient=Int32(num_hb_quotient),
                num_splits=args.num_splits,
                is_split_kv=args.is_split_kv,
            )

    def __init__(self, params: Params, tile_idx: Int32, split_idx: Int32, *, loc=None, ip=None):
        self.params = params
        self._tile_idx = tile_idx
        self._split_idx = split_idx
        self._loc = loc
        self._ip = ip

    @staticmethod
    def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
        return SingleTileLPTScheduler.Params.create(args, loc=loc, ip=ip)

    @staticmethod
    @cute.jit
    def create(params: Params, *, loc=None, ip=None) -> "SingleTileLPTScheduler":
        # One tile per CTA: blockIdx.x is the tile, blockIdx.y the KV split.
        tile_idx, split_idx, _ = cute.arch.block_idx()
        return SingleTileLPTScheduler(params, tile_idx, split_idx, loc=loc, ip=ip)

    # called by host
    @staticmethod
    def get_grid_shape(
        params: Params,
        *,
        loc=None,
        ip=None,
    ) -> Tuple[Int32, Int32, Int32]:
        return (params.total_blocks, params.num_splits, Int32(1))

    @cute.jit
    def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
        """Map blockIdx.x through the L2 swizzle to a (block, head, batch, split) tile."""
        params = self.params
        # Implement LPT scheduling coordinate calculation
        bidhb, l2_mod = divmod(self._tile_idx, params.l2_major_divmod)
        # If we're in the last section (called residual), we don't want to divide by
        # swizzle. Instead we want to divide by the remainder.
        block, bidhb_residual = 0, 0
        if bidhb < params.num_hb_quotient:
            block, bidhb_residual = divmod(l2_mod, params.l2_minor_divmod)
        else:
            block, bidhb_residual = divmod(l2_mod, params.l2_minor_residual_divmod)
        bidhb_actual = bidhb * params.l2_minor + bidhb_residual
        batch_idx, head_idx = divmod(bidhb_actual, params.num_head_divmod)
        # Longest-processing-time-first
        block = params.num_block - 1 - block
        is_valid = self._tile_idx < params.total_blocks
        return WorkTileInfo(
            (Int32(block), Int32(head_idx), Int32(batch_idx), Int32(self._split_idx)), is_valid
        )

    def initial_work_tile_info(self, *, loc=None, ip=None):
        return self.get_current_work(loc=loc, ip=ip)

    def prefetch_next_work(self, *, loc=None, ip=None):
        # Nothing to prefetch for a single-tile scheduler.
        pass

    def advance_to_next_work(self, *, loc=None, ip=None):
        # Single tile scheduler - set to invalid tile_idx to indicate no more work
        self._tile_idx = self.params.total_blocks

    def __extract_mlir_values__(self):
        # Flatten params and the two mutable counters into one MLIR value list,
        # remembering per-object counts for reconstruction.
        values, self._values_pos = [], []
        for obj in [self.params, self._tile_idx, self._split_idx]:
            obj_values = cutlass.extract_mlir_values(obj)
            values += obj_values
            self._values_pos.append(len(obj_values))
        return values

    def __new_from_mlir_values__(self, values):
        # Inverse of __extract_mlir_values__: consume values in the same order.
        obj_list = []
        for obj, n_items in zip([self.params, self._tile_idx, self._split_idx], self._values_pos):
            obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
            values = values[n_items:]
        return self.__class__(*(tuple(obj_list)), loc=self._loc)
375
+
376
+
377
class SingleTileLPTBwdScheduler:
    """Backward-pass variant of the LPT scheduler.

    Like SingleTileLPTScheduler but cluster-aware along the block dimension:
    blocks are scheduled per cluster and then fanned out to the CTAs within
    the cluster. The reverse (LPT) block order is gated on ``spt``.
    """

    @dataclass
    class Params(ParamsBase):
        total_blocks: Int32
        num_block: Int32  # number of block-clusters (ceil-divided by cluster width)
        l2_minor: Int32
        num_head_divmod: FastDivmodDivisor
        l2_minor_divmod: FastDivmodDivisor
        l2_major_divmod: FastDivmodDivisor
        l2_minor_residual_divmod: FastDivmodDivisor
        num_hb_quotient: Int32
        cluster_shape_mn: cutlass.Constexpr[Tuple[int, int]] = (1, 1)
        spt: cutlass.Constexpr[bool] = True

        @staticmethod
        @cute.jit
        def create(
            args: TileSchedulerArguments, *, loc=None, ip=None
        ) -> "SingleTileLPTBwdScheduler.Params":
            size_l2 = 50 * 1024 * 1024
            # Working set per head for the backward pass (Q and dO reads).
            size_one_qdo_head = args.seqlen_k * (args.headdim + args.headdim_v) * args.element_size
            # size_one_dqaccum_head = args.seqlen_k * (args.headdim) * 4
            size_one_dqaccum_head = 0
            size_one_head = size_one_qdo_head + size_one_dqaccum_head
            log2_floor = lambda n: 31 - clz(n)
            # How many heads fit in L2, rounded down to a power of 2.
            swizzle = 1 if size_l2 < size_one_head else (1 << log2_floor(size_l2 // size_one_head))
            # swizzle = 8
            # If we're in the last section (called residual), we don't want to divide by
            # swizzle. Instead we want to divide by the remainder.
            num_hb_quotient = (args.num_head * args.num_batch) // swizzle
            num_hb_remainder = (args.num_head * args.num_batch) % swizzle
            num_block = cute.ceil_div(args.num_block, args.cluster_shape_mn[0])
            return SingleTileLPTBwdScheduler.Params(
                total_blocks=(num_block * args.cluster_shape_mn[0])
                * args.num_head
                * args.num_batch,
                num_block=num_block,
                l2_minor=Int32(swizzle),
                num_head_divmod=FastDivmodDivisor(args.num_head),
                l2_minor_divmod=FastDivmodDivisor(swizzle),
                l2_major_divmod=FastDivmodDivisor(swizzle * num_block),
                l2_minor_residual_divmod=FastDivmodDivisor(
                    max(num_hb_remainder, 1)
                ),  # don't divide by 0
                num_hb_quotient=Int32(num_hb_quotient),
                cluster_shape_mn=args.cluster_shape_mn,
                spt=args.lpt,
            )

    def __init__(self, params: Params, tile_idx: Int32, *, loc=None, ip=None):
        self.params = params
        self._tile_idx = tile_idx
        self._loc = loc
        self._ip = ip

    @staticmethod
    def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
        return SingleTileLPTBwdScheduler.Params.create(args, loc=loc, ip=ip)

    @staticmethod
    @cute.jit
    def create(params: Params, *, loc=None, ip=None) -> "SingleTileLPTBwdScheduler":
        tile_idx = cute.arch.block_idx()[0]
        return SingleTileLPTBwdScheduler(params, tile_idx, loc=loc, ip=ip)

    # called by host
    @staticmethod
    def get_grid_shape(
        params: Params,
        *,
        loc=None,
        ip=None,
    ) -> Tuple[Int32, Int32, Int32]:
        return (params.total_blocks, Int32(1), Int32(1))

    @cute.jit
    def get_current_work(self, *, loc=None, ip=None) -> cutlass.utils.WorkTileInfo:
        """Decode blockIdx.x (per-cluster) through the L2 swizzle into a work tile."""
        cluster_idx = self._tile_idx // self.params.cluster_shape_mn[0]
        params = self.params
        # Implement LPT scheduling coordinate calculation
        bidhb, l2_mod = divmod(cluster_idx, params.l2_major_divmod)
        # If we're in the last section (called residual), we don't want to divide by
        # swizzle. Instead we want to divide by the remainder.
        block, bidhb_residual = 0, 0
        if bidhb < params.num_hb_quotient:
            block, bidhb_residual = divmod(l2_mod, params.l2_minor_divmod)
        else:
            block, bidhb_residual = divmod(l2_mod, params.l2_minor_residual_divmod)
        bidhb_actual = bidhb * params.l2_minor + bidhb_residual
        batch_idx, head_idx = divmod(bidhb_actual, params.num_head_divmod)
        if cutlass.const_expr(params.spt):
            # Longest-processing-time-first: visit blocks in reverse order.
            block = params.num_block - 1 - block
        if cutlass.const_expr(params.cluster_shape_mn[0] > 1):
            # Fan the cluster-level block index out to this CTA's slot.
            bidx_in_cluster = cute.arch.block_in_cluster_idx()
            block = block * params.cluster_shape_mn[0] + bidx_in_cluster[0]
        is_valid = self._tile_idx < params.total_blocks
        return WorkTileInfo((Int32(block), Int32(head_idx), Int32(batch_idx), Int32(0)), is_valid)

    def initial_work_tile_info(self, *, loc=None, ip=None):
        return self.get_current_work(loc=loc, ip=ip)

    def prefetch_next_work(self, *, loc=None, ip=None):
        # Nothing to prefetch for a single-tile scheduler.
        pass

    def advance_to_next_work(self, *, loc=None, ip=None):
        # Single tile scheduler - set to invalid tile_idx to indicate no more work
        self._tile_idx = self.params.total_blocks

    def __extract_mlir_values__(self):
        # Flatten params and the tile counter, recording per-object counts.
        values, self._values_pos = [], []
        for obj in [self.params, self._tile_idx]:
            obj_values = cutlass.extract_mlir_values(obj)
            values += obj_values
            self._values_pos.append(len(obj_values))
        return values

    def __new_from_mlir_values__(self, values):
        # Inverse of __extract_mlir_values__: consume values in the same order.
        obj_list = []
        for obj, n_items in zip([self.params, self._tile_idx], self._values_pos):
            obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
            values = values[n_items:]
        return self.__class__(*(tuple(obj_list)), loc=self._loc)
499
+
500
+
501
class SingleTileVarlenScheduler:
    """One-tile-per-CTA scheduler for variable-length (varlen) batches.

    Per-batch sequence lengths come from mSeqUsedQ or mCuSeqlensQ, so the
    number of m-blocks differs per batch. A warp cooperatively scans 31
    batches at a time to locate which batch/head/block a given blockIdx.x
    corresponds to. Optional LPT/head-swizzle ordering improves K/V reuse
    in L2.
    """

    @dataclass
    class Params(ParamsBase):
        num_head: Int32
        num_batch: Int32
        total_q: Int32
        num_splits: Int32
        max_kvblock_in_l2: Int32  # how many KV tiles fit in the L2 budget
        tile_shape_mn: cutlass.Constexpr[Tuple[int, int]]
        mCuSeqlensQ: Optional[cute.Tensor] = None
        mSeqUsedQ: Optional[cute.Tensor] = None
        qhead_per_kvhead_packgqa: cutlass.Constexpr[int] = 1
        lpt: cutlass.Constexpr[bool] = False
        is_split_kv: cutlass.Constexpr[bool] = False
        head_swizzle: cutlass.Constexpr[bool] = False
        cluster_shape_m: cutlass.Constexpr[int] = 1

        @staticmethod
        @cute.jit
        def create(
            args: TileSchedulerArguments, *, loc=None, ip=None
        ) -> "SingleTileVarlenScheduler.Params":
            size_l2 = 50 * 1024 * 1024  # 50 MB for K & V
            max_kvblock_in_l2 = size_l2 // (
                (args.headdim + args.headdim_v) * args.element_size * args.tile_shape_mn[1]
            )
            assert args.mCuSeqlensQ is not None or args.mSeqUsedQ is not None, (
                "At least one of mCuSeqlensQ or mSeqUsedQ must be provided"
            )
            assert args.cluster_shape_mn[1] == 1, "Only cluster_shape_mn[1] == 1 is supported"
            return SingleTileVarlenScheduler.Params(
                num_head=args.num_head,
                num_batch=args.num_batch,
                total_q=args.total_q,
                num_splits=args.num_splits,
                max_kvblock_in_l2=max_kvblock_in_l2,
                tile_shape_mn=args.tile_shape_mn,
                mCuSeqlensQ=args.mCuSeqlensQ,
                mSeqUsedQ=args.mSeqUsedQ,
                qhead_per_kvhead_packgqa=args.qhead_per_kvhead_packgqa,
                lpt=args.lpt,
                is_split_kv=args.is_split_kv,
                head_swizzle=args.head_swizzle,
                cluster_shape_m=args.cluster_shape_mn[0],
            )

    def __init__(self, params: Params, tile_idx: Int32, split_idx: Int32, *, loc=None, ip=None):
        self.params = params
        self._tile_idx = tile_idx
        self._split_idx = split_idx
        # NOTE(review): _is_first_block is not included in __extract_mlir_values__,
        # so a round-trip through MLIR values resets it to True — confirm intended.
        self._is_first_block = True
        self._loc = loc
        self._ip = ip

    @staticmethod
    def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
        return SingleTileVarlenScheduler.Params.create(args, loc=loc, ip=ip)

    @staticmethod
    def create(params: Params, *, loc=None, ip=None) -> "SingleTileVarlenScheduler":
        tile_idx, split_idx, _ = cute.arch.block_idx()
        return SingleTileVarlenScheduler(params, tile_idx, split_idx, loc=loc, ip=ip)

    # called by host
    @staticmethod
    def get_grid_shape(
        params: Params,
        *,
        loc=None,
        ip=None,
    ) -> Tuple[Int32, Int32, Int32]:
        # Upper bound on m-blocks: each batch can waste at most one
        # (cluster * tile_m - 1)-row remainder tile.
        total_blocks_max = (
            params.total_q
            + params.num_batch * (params.cluster_shape_m * params.tile_shape_mn[0] - 1)
        ) // params.tile_shape_mn[0]
        # round down to nearest multiple of cluster since odd excess is always padding
        total_blocks_max = total_blocks_max // params.cluster_shape_m * params.cluster_shape_m
        return (total_blocks_max * params.num_head, params.num_splits, Int32(1))

    @cute.jit
    def _get_num_m_blocks(self, lane: Int32, bidb_start: Int32) -> Int32:
        """Per-lane m-block count for batch (bidb_start + lane); 0 for lanes past
        the batch count or for the last lane (which only feeds the cu_seqlen diff)."""
        params = self.params
        batch_idx = lane + bidb_start
        if cutlass.const_expr(params.mSeqUsedQ is not None):
            seqlen = Int32(0)
            if batch_idx < params.num_batch:
                seqlen = params.mSeqUsedQ[batch_idx]
        else:
            assert params.mCuSeqlensQ is not None
            cur_cu_seqlen = Int32(0)
            if batch_idx <= params.num_batch:
                cur_cu_seqlen = params.mCuSeqlensQ[batch_idx]
            # seqlen of batch i is cu_seqlens[i+1] - cu_seqlens[i]; fetch the
            # next entry from the neighboring lane.
            next_cu_seqlen = cute.arch.shuffle_sync_down(cur_cu_seqlen, offset=1)
            seqlen = next_cu_seqlen - cur_cu_seqlen
        if cutlass.const_expr(params.qhead_per_kvhead_packgqa > 1):
            seqlen *= params.qhead_per_kvhead_packgqa
        return (
            cute.ceil_div(cute.ceil_div(seqlen, params.tile_shape_mn[0]), params.cluster_shape_m)
            if batch_idx < params.num_batch and lane < cute.arch.WARP_SIZE - 1
            else Int32(0)
        )

    @cute.jit
    def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
        """Locate this CTA's (block, head, batch, split) by warp-scanning the
        variable per-batch block counts until the group containing tile_idx is found."""
        params = self.params
        lane_idx = cute.arch.lane_idx()
        num_m_blocks = self._get_num_m_blocks(lane_idx, bidb_start=0)
        num_m_blocks_cumulative = utils.warp_prefix_sum(num_m_blocks, lane_idx)
        # Total number of blocks for the next 31 batches
        m_blocks_in_group = cute.arch.shuffle_sync(num_m_blocks_cumulative, cute.arch.WARP_SIZE - 1)
        # Same for all lanes
        group_end_tile = m_blocks_in_group * params.num_head
        # if cute.arch.thread_idx()[0] == 128 + 31: cute.printf("SingleTileVarlenScheduler: tile_idx=%d, group_end_tile = %d, num_m_blocks=%d, num_m_blocks_cumulative = %d, m_blocks_in_group = %d", self._tile_idx, group_end_tile, num_m_blocks, num_m_blocks_cumulative, m_blocks_in_group)
        block, head_idx, batch_idx = Int32(0), Int32(0), Int32(0)
        next_tile_idx = self._tile_idx // params.cluster_shape_m
        # Walk forward 31 batches at a time until the group containing the tile.
        while group_end_tile <= next_tile_idx:
            batch_idx += cute.arch.WARP_SIZE - 1
            if batch_idx >= params.num_batch:
                batch_idx = Int32(params.num_batch)
                group_end_tile = next_tile_idx + 1
            else:
                num_m_blocks = self._get_num_m_blocks(lane_idx, bidb_start=batch_idx)
                num_m_blocks_cumulative = utils.warp_prefix_sum(num_m_blocks, lane_idx)
                m_blocks_in_group = cute.arch.shuffle_sync(
                    num_m_blocks_cumulative, cute.arch.WARP_SIZE - 1
                )
                group_end_tile += m_blocks_in_group * params.num_head
        is_valid = False
        if batch_idx >= params.num_batch:
            block, head_idx, batch_idx = Int32(0), Int32(0), Int32(params.num_batch)
        else:
            group_start_tile = group_end_tile - m_blocks_in_group * params.num_head
            # if cute.arch.thread_idx()[0] == 128 + 31: cute.printf("SingleTileVarlenScheduler: tile_idx=%d, group_end_tile = %d, num_m_blocks=%d, batch_idx = %d", self._tile_idx, group_end_tile, num_m_blocks, batch_idx)
            # The next problem to process is the first one that does not have ending tile position
            # that is greater than or equal to tile index.
            batch_idx_in_group = cute.arch.popc(
                cute.arch.vote_ballot_sync(
                    group_start_tile + num_m_blocks_cumulative * params.num_head <= next_tile_idx
                )
            )
            batch_idx += batch_idx_in_group
            num_m_blocks_prev_lane = (
                0
                if batch_idx_in_group == 0
                else cute.arch.shuffle_sync(num_m_blocks_cumulative, batch_idx_in_group - 1)
            )
            num_m_blocks = cute.arch.shuffle_sync(num_m_blocks, batch_idx_in_group)
            mh_block = next_tile_idx - group_start_tile - num_m_blocks_prev_lane * params.num_head
            if cutlass.const_expr(params.lpt or params.head_swizzle):
                # This is a version of the SingleTileLPTScheduler, complicated by the fact that
                # the seqlen can vary per batch.
                # TODO: is there any case where num_m_blocks is 0?
                # TODO: by right we should read the seqlen_kv but we're assuming seqlen_q == seqlen_k here
                num_n_blocks = (
                    num_m_blocks
                    * params.tile_shape_mn[0]
                    // params.qhead_per_kvhead_packgqa
                    // params.tile_shape_mn[1]
                )
                # nheads_in_l2 = min(max(self.max_kvblock_in_l2 // num_n_blocks, 1), self.num_head)
                # Seems faster to have this be a power of 2
                nheads_in_l2 = (
                    16
                    if num_n_blocks * 16 <= params.max_kvblock_in_l2
                    else (
                        8
                        if num_n_blocks * 8 <= params.max_kvblock_in_l2
                        else (
                            4
                            if num_n_blocks * 4 <= params.max_kvblock_in_l2
                            else (2 if num_n_blocks * 2 <= params.max_kvblock_in_l2 else 1)
                        )
                    )
                )
                nheads_in_l2 = min(nheads_in_l2, params.num_head)
                mh_in_l2 = nheads_in_l2 * num_m_blocks
                section_idx = mh_block // mh_in_l2
                l2_mod = mh_block - section_idx * mh_in_l2
                # Deal with tail section
                nheads_in_this_section = (
                    nheads_in_l2
                    if nheads_in_l2 * (section_idx + 1) <= params.num_head
                    else params.num_head - section_idx * nheads_in_l2
                )
                block = l2_mod // nheads_in_this_section
                head_idx_residual = l2_mod - block * nheads_in_this_section
                head_idx = section_idx * nheads_in_l2 + head_idx_residual
                if cutlass.const_expr(params.lpt):
                    block = num_m_blocks - 1 - block
            else:
                head_idx = mh_block // num_m_blocks
                block = mh_block - head_idx * num_m_blocks
            is_valid = self._is_first_block and batch_idx < params.num_batch
            if cutlass.const_expr(params.cluster_shape_m > 1):
                bidx_in_cluster = cute.arch.block_in_cluster_idx()
                block = block * params.cluster_shape_m + bidx_in_cluster[0]
            # if cute.arch.thread_idx()[0] == 128: cute.printf("SingleTileVarlenScheduler: tile_idx=%d, batch_idx=%d, head_idx=%d, block=%d, is_valid = %d", self._tile_idx, batch_idx, head_idx, block, is_valid)
        split_idx = self._split_idx if const_expr(params.is_split_kv) else Int32(0)
        return WorkTileInfo((Int32(block), Int32(head_idx), Int32(batch_idx), split_idx), is_valid)

    def initial_work_tile_info(self, *, loc=None, ip=None):
        return self.get_current_work(loc=loc, ip=ip)

    def prefetch_next_work(self, *, loc=None, ip=None):
        # Nothing to prefetch for a single-tile scheduler.
        pass

    def advance_to_next_work(self, *, loc=None, ip=None):
        # Single tile scheduler - set to invalid tile_idx to indicate no more work
        self._is_first_block = False

    def __extract_mlir_values__(self):
        # Flatten params and the two counters, recording per-object counts.
        values, self._values_pos = [], []
        for obj in [self.params, self._tile_idx, self._split_idx]:
            obj_values = cutlass.extract_mlir_values(obj)
            values += obj_values
            self._values_pos.append(len(obj_values))
        return values

    def __new_from_mlir_values__(self, values):
        # Inverse of __extract_mlir_values__: consume values in the same order.
        obj_list = []
        for obj, n_items in zip(
            [self.params, self._tile_idx, self._split_idx],
            self._values_pos,
        ):
            obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
            values = values[n_items:]
        return SingleTileVarlenScheduler(*(tuple(obj_list)), loc=self._loc)
build/torch-cuda/utils.py ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ import math
4
+ import hashlib
5
+ import inspect
6
+ from typing import Type, Callable, Optional, Tuple, overload
7
+
8
+ import cutlass
9
+ import cutlass.cute as cute
10
+
11
+ from cutlass import Float32, const_expr
12
+ from cutlass.cutlass_dsl import T, dsl_user_op
13
+ from cutlass._mlir.dialects import nvvm, llvm
14
+ from cutlass.cute.runtime import from_dlpack
15
+
16
+
17
+ from .quack import activation
18
+
19
# Attributes mixed into a callable's hash on top of its base (source) hash.
_MIXER_ATTRS = ("__vec_size__",)

# Obtained from sollya:
# fpminimax(exp(x * log(2.0)), 1, [|1,24...|],[0;1],relative);
# Minimax polynomial coefficients for 2**x on [0, 1], keyed by degree.
# NOTE(review): entry 0 is the scalar 1.0, not the 1-tuple (1.0,) — confirm
# consumers expect a bare float for degree 0.
POLY_EX2 = {
    0: (1.0),
    1: (
        1.0,
        0.922497093677520751953125,
    ),
    2: (
        1.0,
        0.6657850742340087890625,
        0.330107033252716064453125,
    ),
    3: (
        1.0,
        0.695146143436431884765625,
        0.227564394474029541015625,
        0.077119089663028717041015625,
    ),
    4: (
        1.0,
        0.693042695522308349609375,
        0.2412912547588348388671875,
        5.2225358784198760986328125e-2,
        1.3434938155114650726318359375e-2,
    ),
    5: (
        1.0,
        0.693151414394378662109375,
        0.24016360938549041748046875,
        5.5802188813686370849609375e-2,
        9.01452265679836273193359375e-3,
        1.86810153536498546600341796875e-3,
    ),
}


def _compute_base_hash(func: Callable) -> str:
    """Compute hash from source code or bytecode and closure values."""
    try:
        data = inspect.getsource(func).encode()
    except (OSError, TypeError):
        # No retrievable source (e.g. exec'd or built-in): fall back to
        # bytecode, then to repr as a last resort.
        if hasattr(func, "__code__") and func.__code__ is not None:
            data = func.__code__.co_code
        else:
            data = repr(func).encode()

    hasher = hashlib.sha256(data)

    # Closure cell contents affect behavior, so mix them in too.
    if hasattr(func, "__closure__") and func.__closure__ is not None:
        for cell in func.__closure__:
            hasher.update(repr(cell.cell_contents).encode())

    return hasher.hexdigest()


def hash_callable(
    func: Callable, mixer_attrs: Tuple[str, ...] = _MIXER_ATTRS, set_cute_hash: bool = True
) -> str:
    """Hash a callable based on the source code or bytecode and closure values.

    Fast-path: if the callable (or its __wrapped__ base) has a ``__cute_hash__``
    attribute, that value is returned immediately as the base hash, then
    metadata dunders are mixed in to produce the final dict-key hash.

    :param func: callable to hash.
    :param mixer_attrs: attribute names whose values are mixed into the hash
        on top of the base hash (defaults to ``_MIXER_ATTRS``).
    :param set_cute_hash: whether or not to set ``func.__cute_hash__``.
    """
    # Resolve base hash
    if hasattr(func, "__cute_hash__"):
        base_hash = func.__cute_hash__
    else:
        # Unwrap decorated functions (e.g., cute.jit wrappers).
        base_func = getattr(func, "__wrapped__", func)

        if hasattr(base_func, "__cute_hash__"):
            base_hash = base_func.__cute_hash__
        else:
            base_hash = _compute_base_hash(base_func)

            if set_cute_hash:
                base_func.__cute_hash__ = base_hash

    # Mix in mutable metadata dunders
    mixer_values = tuple(getattr(func, attr, None) for attr in mixer_attrs)

    if all(v is None for v in mixer_values):
        return base_hash

    hasher = hashlib.sha256(base_hash.encode())

    # BUG FIX: previously iterated the module-level _MIXER_ATTRS here, so a
    # caller-supplied `mixer_attrs` had its values mixed under the wrong
    # attribute names (distinct attrs with equal values collided).
    for attr, val in zip(mixer_attrs, mixer_values):
        hasher.update(f"{attr}={val!r}".encode())

    return hasher.hexdigest()
113
+
114
+
115
def create_softcap_scoremod(softcap_val):
    """Build a pre-mask score-mod callback that applies tanh softcapping.

    The returned jitted function rescales attention scores by 1/softcap_val
    and passes them through a fast-math tanh, bounding their magnitude.
    """
    scale = 1.0 / softcap_val

    @cute.jit
    def scoremod_premask_fn(acc_S_SSA, batch_idx, head_idx, q_idx, kv_idx, aux_tensors):
        capped = acc_S_SSA * scale
        return capped * cute.math.tanh(capped, fastmath=True)

    return scoremod_premask_fn
124
+
125
+
126
def convert_from_dlpack(x, leading_dim, alignment=16, divisibility=1) -> cute.Tensor:
    """Wrap a DLPack tensor as a cute.Tensor with a dynamic layout.

    Marks the layout dynamic along ``leading_dim`` and makes that mode's
    compact shape dynamic with the given divisibility constraint.
    """
    tensor = from_dlpack(x, assumed_align=alignment)
    tensor = tensor.mark_layout_dynamic(leading_dim=leading_dim)
    return tensor.mark_compact_shape_dynamic(
        mode=leading_dim, stride_order=x.dim_order(), divisibility=divisibility
    )
134
+
135
+
136
def convert_from_dlpack_leading_static(
    x, leading_dim, alignment=16, static_modes=None, stride_order=None
) -> cute.Tensor:
    """Wrap a DLPack tensor, keeping the leading dim (and any ``static_modes``)
    static while marking every other mode's compact shape dynamic."""
    order = x.dim_order() if stride_order is None else stride_order
    tensor = from_dlpack(x, assumed_align=alignment)
    for mode in range(x.ndim):
        if mode == leading_dim:
            continue
        if static_modes is not None and mode in static_modes:
            continue
        tensor = tensor.mark_compact_shape_dynamic(mode=mode, stride_order=order)
    return tensor
146
+
147
+
148
def make_tiled_copy_A(
    copy_atom: cute.CopyAtom, tiled_mma: cute.TiledMma, swapAB: cutlass.Constexpr[bool] = False
) -> cute.TiledCopy:
    """Make the tiled copy for operand A, honoring the swapAB convention.

    When swapAB is set the kernel treats A as B, so the B-side tiled copy
    is built instead.
    """
    maker = cute.make_tiled_copy_B if const_expr(swapAB) else cute.make_tiled_copy_A
    return maker(copy_atom, tiled_mma)
155
+
156
+
157
def make_tiled_copy_B(
    copy_atom: cute.CopyAtom, tiled_mma: cute.TiledMma, swapAB: cutlass.Constexpr[bool] = False
) -> cute.TiledCopy:
    """Make the tiled copy for operand B, honoring the swapAB convention.

    When swapAB is set the kernel treats B as A, so the A-side tiled copy
    is built instead.
    """
    maker = cute.make_tiled_copy_A if const_expr(swapAB) else cute.make_tiled_copy_B
    return maker(copy_atom, tiled_mma)
164
+
165
+
166
def mma_make_fragment_A(
    smem: cute.Tensor, thr_mma: cute.core.ThrMma, swapAB: cutlass.Constexpr[bool] = False
) -> cute.Tensor:
    """Partition shared memory and make the MMA fragment for operand A.

    With swapAB set, the roles of A and B are exchanged and the B-side
    fragment is produced instead.
    """
    if const_expr(not swapAB):
        return thr_mma.make_fragment_A(thr_mma.partition_A(smem))
    return mma_make_fragment_B(smem, thr_mma)
173
+
174
+
175
def mma_make_fragment_B(
    smem: cute.Tensor, thr_mma: cute.core.ThrMma, swapAB: cutlass.Constexpr[bool] = False
) -> cute.Tensor:
    """Partition shared memory and make the MMA fragment for operand B.

    With swapAB set, the roles of A and B are exchanged and the A-side
    fragment is produced instead.
    """
    if const_expr(not swapAB):
        return thr_mma.make_fragment_B(thr_mma.partition_B(smem))
    return mma_make_fragment_A(smem, thr_mma)
182
+
183
+
184
def get_smem_store_atom(
    arch: cutlass.Constexpr[int], element_type: Type[cute.Numeric], transpose: bool = False
) -> cute.CopyAtom:
    """Pick the copy atom used to store accumulator values to shared memory.

    On SM90+ with 16-bit elements, stmatrix (4 matrices) is used; everywhere
    else a universal copy moving two elements per operation is returned.
    """
    if const_expr(arch >= 90 and element_type.width == 16):
        return cute.make_copy_atom(
            cute.nvgpu.warp.StMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
            element_type,
        )
    return cute.make_copy_atom(
        cute.nvgpu.CopyUniversalOp(),
        element_type,
        num_bits_per_copy=2 * element_type.width,
    )
198
+
199
+
200
@cute.jit
def warp_reduce(
    val: cute.TensorSSA | cute.Numeric,
    op: Callable,
    width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
) -> cute.TensorSSA | cute.Numeric:
    """Reduce ``val`` across ``width`` lanes of a warp with binary ``op``.

    Tensor inputs are reduced elementwise: each element is independently
    combined across lanes. ``width`` must be a power of two (log2 rounds of
    butterfly shuffles are emitted).
    """
    if const_expr(isinstance(val, cute.TensorSSA)):
        # Spill the SSA tensor to a fragment, reduce each element, reload.
        res = cute.make_fragment(val.shape, val.dtype)
        res.store(val)
        for i in cutlass.range_constexpr(cute.size(val.shape)):
            res[i] = warp_reduce(res[i], op, width)
        return res.load()
    else:
        # log2(width) rounds of xor-butterfly shuffles combine all lanes.
        for i in cutlass.range_constexpr(int(math.log2(width))):
            val = op(val, cute.arch.shuffle_sync_bfly(val, offset=1 << i))
        return val
216
+
217
+
218
@dsl_user_op
def fmax(
    a: float | Float32, b: float | Float32, c: float | Float32 | None = None, *, loc=None, ip=None
) -> Float32:
    """Emit an NVVM f32 max of two operands, or a 3-input max when ``c`` is given.

    Dispatches between the two nvvm.fmax binding signatures based on the
    CUDA toolkit version.
    """
    from cutlass import CUDA_VERSION

    # Select the NVVM binding signature based on the CUDA version.
    if CUDA_VERSION.major == 12 and CUDA_VERSION.minor == 9:
        # Old API: requires explicit result type as first positional argument
        return Float32(
            nvvm.fmax(
                T.f32(),
                Float32(a).ir_value(loc=loc, ip=ip),
                Float32(b).ir_value(loc=loc, ip=ip),
                c=Float32(c).ir_value(loc=loc, ip=ip) if c is not None else None,
                loc=loc,
                ip=ip,
            )
        )
    else:
        # New API: infers result type automatically
        return Float32(
            nvvm.fmax(
                Float32(a).ir_value(loc=loc, ip=ip),
                Float32(b).ir_value(loc=loc, ip=ip),
                c=Float32(c).ir_value(loc=loc, ip=ip) if c is not None else None,
                loc=loc,
                ip=ip,
            )
        )
248
+
249
+
250
@cute.jit
def fmax_reduce(
    x: cute.TensorSSA, init_val: float | Float32 | None = None, arch: cutlass.Constexpr[int] = 80
) -> Float32:
    """Max-reduce the elements of ``x``, optionally seeded with ``init_val``.

    On SM100+ with an element count divisible by 8, a hand-unrolled tree of
    3-input fmax instructions is used; otherwise a 4-wide unrolled 2-input
    tree is used.
    NOTE(review): the generic branch indexes res[0..3] and strides by 4 —
    appears to assume cute.size(x.shape) is a multiple of 4 and >= 4; confirm
    callers guarantee this.
    """
    if const_expr(arch < 100 or cute.size(x.shape) % 8 != 0):
        # if const_expr(init_val is None):
        #     init_val = -cutlass.Float32.inf
        # return x.reduce(cute.ReductionOp.MAX, init_val, 0)
        res = cute.make_fragment(x.shape, Float32)
        res.store(x)
        # local_max = [res[0], res[1]]
        # for i in cutlass.range_constexpr(2, cute.size(x.shape), 2):
        #     local_max[0] = fmax(local_max[0], res[i + 0])
        #     local_max[1] = fmax(local_max[1], res[i + 1])
        # local_max[0] = fmax(local_max[0], local_max[1])
        # return local_max[0] if const_expr(init_val is None) else fmax(local_max[0], init_val)
        # Four independent accumulators keep the fmax chain ILP-friendly.
        local_max = [res[0], res[1], res[2], res[3]]
        for i in cutlass.range_constexpr(4, cute.size(x.shape), 4):
            local_max[0] = fmax(local_max[0], res[i + 0])
            local_max[1] = fmax(local_max[1], res[i + 1])
            local_max[2] = fmax(local_max[2], res[i + 2])
            local_max[3] = fmax(local_max[3], res[i + 3])
        local_max[0] = fmax(local_max[0], local_max[1])
        local_max[2] = fmax(local_max[2], local_max[3])
        local_max[0] = fmax(local_max[0], local_max[2])
        return local_max[0] if const_expr(init_val is None) else fmax(local_max[0], init_val)
    else:
        # [2025-06-15] x.reduce only seems to use 50% 3-input max and 50% 2-input max
        # We instead force the 3-input max.
        res = cute.make_fragment(x.shape, Float32)
        res.store(x)
        local_max_0 = (
            fmax(init_val, res[0], res[1])
            if const_expr(init_val is not None)
            else fmax(res[0], res[1])
        )
        local_max = [
            local_max_0,
            fmax(res[2], res[3]),
            fmax(res[4], res[5]),
            fmax(res[6], res[7]),
        ]
        for i in cutlass.range_constexpr(8, cute.size(x.shape), 8):
            local_max[0] = fmax(local_max[0], res[i], res[i + 1])
            local_max[1] = fmax(local_max[1], res[i + 2], res[i + 3])
            local_max[2] = fmax(local_max[2], res[i + 4], res[i + 5])
            local_max[3] = fmax(local_max[3], res[i + 6], res[i + 7])
        local_max[0] = fmax(local_max[0], local_max[1])
        return fmax(local_max[0], local_max[2], local_max[3])
299
+
300
+
301
@cute.jit
def fadd_reduce(
    x: cute.TensorSSA, init_val: float | Float32 | None = None, arch: cutlass.Constexpr[int] = 80
) -> Float32:
    """Sum-reduce the elements of ``x``, optionally seeded with ``init_val``.

    On SM100+ with an element count divisible by 8, packed f32x2 adds are
    used for a 4-accumulator unrolled tree; otherwise the generic
    ``TensorSSA.reduce`` path is taken.
    """
    if const_expr(arch < 100 or cute.size(x.shape) % 8 != 0):
        if const_expr(init_val is None):
            init_val = Float32.zero
        return x.reduce(cute.ReductionOp.ADD, init_val, 0)
        # res = cute.make_fragment(x.shape, Float32)
        # res.store(x)
        # local_sum = [res[0], res[1], res[2], res[3]]
        # for i in cutlass.range_constexpr(4, cute.size(x.shape), 4):
        #     local_sum[0] += res[i + 0]
        #     local_sum[1] += res[i + 1]
        #     local_sum[2] += res[i + 2]
        #     local_sum[3] += res[i + 3]
        # local_sum[0] += local_sum[1]
        # local_sum[2] += local_sum[3]
        # local_sum[0] += local_sum[2]
        # return local_sum[0] if const_expr(init_val is None) else local_sum[0] + init_val
    else:
        res = cute.make_fragment(x.shape, Float32)
        res.store(x)
        # Seed lane 0 of the first packed pair with init_val (other lane 0.0).
        local_sum_0 = (
            cute.arch.add_packed_f32x2((init_val, 0.0), (res[0], res[1]))
            # cute.arch.add_packed_f32x2((init_val / 2, init_val / 2), (res[0], res[1]))
            if const_expr(init_val is not None)
            else (res[0], res[1])
        )
        # Four packed accumulators, each folding a pair of elements per step.
        local_sum = [local_sum_0, (res[2], res[3]), (res[4], res[5]), (res[6], res[7])]
        for i in cutlass.range_constexpr(8, cute.size(x.shape), 8):
            local_sum[0] = cute.arch.add_packed_f32x2(local_sum[0], (res[i + 0], res[i + 1]))
            local_sum[1] = cute.arch.add_packed_f32x2(local_sum[1], (res[i + 2], res[i + 3]))
            local_sum[2] = cute.arch.add_packed_f32x2(local_sum[2], (res[i + 4], res[i + 5]))
            local_sum[3] = cute.arch.add_packed_f32x2(local_sum[3], (res[i + 6], res[i + 7]))
        local_sum[0] = cute.arch.add_packed_f32x2(local_sum[0], local_sum[1])
        local_sum[2] = cute.arch.add_packed_f32x2(local_sum[2], local_sum[3])
        local_sum[0] = cute.arch.add_packed_f32x2(local_sum[0], local_sum[2])
        # Final horizontal add of the two packed lanes.
        return local_sum[0][0] + local_sum[0][1]
340
+
341
+
342
@dsl_user_op
def atomic_add_fp32(a: float | Float32, gmem_ptr: cute.Pointer, *, loc=None, ip=None) -> None:
    """Atomically add ``a`` to the fp32 value at ``gmem_ptr``.

    The result of the atomic is discarded (reduction-style use). Implemented
    via NVVM ``atomicrmw`` with FADD; a hand-written PTX ``red.global.add``
    alternative (including an L2 cache-hint variant) is kept below for
    reference.
    """
    # gmem_ptr_i64 = gmem_ptr.toint(loc=loc, ip=ip).ir_value()
    # # cache_hint = cutlass.Int64(0x12F0000000000000)
    # llvm.inline_asm(
    #     None,
    #     [gmem_ptr_i64, Float32(a).ir_value(loc=loc, ip=ip)],
    #     # [gmem_ptr_i64, Float32(a).ir_value(loc=loc, ip=ip), cache_hint.ir_value()],
    #     "red.global.add.f32 [$0], $1;",
    #     # "red.global.add.L2::cache_hint.f32 [$0], $1, 0x12F0000000000000;",
    #     # "red.global.add.L2::cache_hint.f32 [$0], $1, $2;",
    #     "l,f",
    #     # "l,f,l",
    #     has_side_effects=True,
    #     is_align_stack=False,
    #     asm_dialect=llvm.AsmDialect.AD_ATT,
    # )
    nvvm.atomicrmw(
        res=T.f32(), op=nvvm.AtomicOpKind.FADD, ptr=gmem_ptr.llvm_ptr, a=Float32(a).ir_value()
    )
362
+
363
+
364
@dsl_user_op
def elem_pointer(x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None) -> cute.Pointer:
    """Return a pointer to the element of ``x`` addressed by ``coord``."""
    linear_idx = cute.crd2idx(coord, x.layout, loc=loc, ip=ip)
    return x.iterator + linear_idx
367
+
368
+
369
@cute.jit
def predicate_k(tAcA: cute.Tensor, limit: cutlass.Int32) -> cute.Tensor:
    """Build a Boolean predicate fragment for the k dimension of ``tAcA``.

    Only the k dimension is predicated against ``limit``; the mn dimension is
    expected to be guarded by an ``if`` at the use site. The middle mode has
    stride 0 so the single computed predicate per (rest_v, rest_k) is shared
    across it.
    """
    # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if"
    tApA = cute.make_fragment(
        cute.make_layout(
            (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
            stride=(cute.size(tAcA, mode=[2]), 0, 1),
        ),
        cutlass.Boolean,
    )
    for rest_v in cutlass.range_constexpr(tApA.shape[0]):
        for rest_k in cutlass.range_constexpr(tApA.shape[2]):
            # Compare the second coordinate component (the k coordinate)
            # against the limit.
            tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
    return tApA
383
+
384
+
385
def canonical_warp_group_idx(sync: bool = True) -> cutlass.Int32:
    """Return this thread's warp-group index (128 threads per warp group).

    When ``sync`` is set, the index is additionally made warp-uniform.
    """
    tidx = cute.arch.thread_idx()[0]
    wg_idx = tidx // 128
    if const_expr(sync):
        wg_idx = cute.arch.make_warp_uniform(wg_idx)
    return wg_idx
390
+
391
+
392
+ # @dsl_user_op
393
+ # def warp_vote_any_lt(a: float | Float32, b: float | Float32, *, loc=None, ip=None) -> cutlass.Boolean:
394
+ # mask = cutlass.Int32(-1)
395
+ # return cutlass.Boolean(
396
+ # llvm.inline_asm(
397
+ # T.i32(),
398
+ # [Float32(a).ir_value(loc=loc, ip=ip), Float32(b).ir_value(loc=loc, ip=ip), mask.ir_value(loc=loc, ip=ip)],
399
+ # ".pred p1, p2;\n"
400
+ # "setp.lt.f32 p1, $1, $2;\n"
401
+ # "vote.sync.any.pred p2, p1, $3;\n"
402
+ # "selp.u32 $0, 1, 0, p2;",
403
+ # # "selp.u32 $0, 1, 0, p1;",
404
+ # "=r,f,f,r",
405
+ # has_side_effects=False,
406
+ # is_align_stack=False,
407
+ # asm_dialect=llvm.AsmDialect.AD_ATT,
408
+ # )
409
+ # )
410
+
411
+
412
@cute.jit
def shuffle_sync(
    value: cute.Numeric,
    offset: cute.typing.Int,
    width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
) -> cute.Numeric:
    """Warp-shuffle an arbitrary (multiple-of-32-bit) value word by word.

    ``value`` is spilled to a register tensor, reinterpreted as Int32 words,
    and each word is shuffled with ``cute.arch.shuffle_sync`` using the same
    ``offset``. ``width`` restricts the shuffle to sub-segments of the warp
    via the PTX segmask/clamp encoding (segmask in bits [12:8], clamp in bits
    [4:0]).

    NOTE(review): the exact lane-addressing mode (idx/bfly/up/down) is
    whatever ``cute.arch.shuffle_sync`` emits — confirm against its docs.
    """
    assert value.width % 32 == 0, "value type must be a multiple of 32 bits"
    # 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100, 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
    mask = cute.arch.WARP_SIZE - width
    clamp = cute.arch.WARP_SIZE - 1
    mask_and_clamp = mask << 8 | clamp
    # important: need stride 1 and not 0 for recast_tensor to work
    val = cute.make_rmem_tensor(cute.make_layout((1,), stride=(1,)), type(value))
    val[0] = value
    val_i32 = cute.recast_tensor(val, cutlass.Int32)
    for i in cutlass.range_constexpr(cute.size(val_i32)):
        val_i32[i] = cute.arch.shuffle_sync(val_i32[i], offset, mask_and_clamp=mask_and_clamp)
    return val[0]
430
+
431
+
432
@dsl_user_op
def shr_u32(val: cutlass.Uint32, shift: cutlass.Uint32, *, loc=None, ip=None) -> cutlass.Uint32:
    """Right-shift ``val`` by ``shift`` via inline PTX.

    NOTE(review): despite the ``_u32`` name and Uint32 operands, the PTX
    opcode is ``shr.s32`` — an *arithmetic* (sign-propagating) shift, which
    differs from a logical shift for inputs with the top bit set. Confirm
    with callers whether this is intentional before changing to ``shr.u32``.
    """
    return cutlass.Uint32(
        llvm.inline_asm(
            T.i32(),
            [
                cutlass.Uint32(val).ir_value(loc=loc, ip=ip),
                cutlass.Uint32(shift).ir_value(loc=loc, ip=ip),
            ],
            "shr.s32 $0, $1, $2;",
            "=r,r,r",
            has_side_effects=False,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    )
448
+
449
+
450
@cute.jit
def warp_prefix_sum(val: cutlass.Int32, lane: Optional[cutlass.Int32] = None) -> cutlass.Int32:
    """Inclusive prefix sum of ``val`` across the warp (Hillis-Steele scan).

    Uses log2(WARP_SIZE) shuffle-up steps; at step ``i`` each lane adds the
    value from ``2**i`` lanes below it, guarded so lanes below the offset
    keep their partial sum.

    Args:
        val: Per-lane value to scan.
        lane: Caller-supplied lane index; computed from ``lane_idx()`` if None.
    """
    if const_expr(lane is None):
        lane = cute.arch.lane_idx()
    # if cute.arch.thread_idx()[0] >= 128 and cute.arch.thread_idx()[0] < 128 + 32 and cute.arch.block_idx()[0] == 0: cute.printf("tidx = %d, val = %d", cute.arch.thread_idx()[0] % 32, val)
    for i in cutlass.range_constexpr(int(math.log2(cute.arch.WARP_SIZE))):
        offset = 1 << i
        # Very important that we set mask_and_clamp to 0
        partial_sum = cute.arch.shuffle_sync_up(val, offset=offset, mask_and_clamp=0)
        if lane >= offset:
            val += partial_sum
        # if cute.arch.thread_idx()[0] >= 128 and cute.arch.thread_idx()[0] < 128 + 32 and cute.arch.block_idx()[0] == 0: cute.printf("tidx = %d, partial_sum = %d, val = %d", cute.arch.thread_idx()[0] % 32, partial_sum, val)
    return val
463
+
464
+
465
@dsl_user_op
def cvt_f16x2_f32(
    a: float | Float32, b: float | Float32, to_dtype: Type, *, loc=None, ip=None
) -> cutlass.Int32:
    """Convert two Float32 values to a packed f16x2 / bf16x2 in one Int32.

    The asm passes ``$2`` (b) as the first source, so per PTX ``cvt.*x2.f32``
    semantics ``b`` lands in the upper 16 bits and ``a`` in the lower 16 bits
    — i.e. little-endian pair order (a first in memory).
    """
    assert to_dtype in [cutlass.BFloat16, cutlass.Float16], "to_dtype must be BFloat16 or Float16"
    return cutlass.Int32(
        llvm.inline_asm(
            T.i32(),
            [Float32(a).ir_value(loc=loc, ip=ip), Float32(b).ir_value(loc=loc, ip=ip)],
            f"cvt.rn.{'bf16x2' if to_dtype is cutlass.BFloat16 else 'f16x2'}.f32 $0, $2, $1;",
            "=r,f,f",
            has_side_effects=False,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    )
481
+
482
+
483
@overload
def cvt_f16(src: cute.Tensor, dst: cute.Tensor) -> None: ...


@overload
def cvt_f16(src: cute.Tensor, dtype: Type[cute.Numeric]) -> cute.Tensor: ...


@cute.jit
def cvt_f16(src: cute.Tensor, dst_or_dtype):
    """Convert Float32 tensor to Float16/BFloat16.

    Conversion is done two elements at a time with ``cvt_f16x2_f32``, writing
    packed 32-bit words through an Int32 reinterpretation of ``dst`` — hence
    the even-size requirement on ``src``.

    Args:
        src: Source tensor with Float32 element type
        dst_or_dtype: Either a destination tensor or a dtype (Float16/BFloat16)

    Returns:
        None if dst is a tensor, or a new tensor if dtype is provided
    """
    if const_expr(isinstance(dst_or_dtype, type)):
        # dtype variant: create new tensor and call the tensor variant
        dtype = dst_or_dtype
        dst = cute.make_fragment(src.shape, dtype)
        cvt_f16(src, dst)
        return dst
    else:
        # tensor variant: write to dst
        dst = dst_or_dtype
        assert cute.size(dst.shape) == cute.size(src.shape), "dst and src must have the same size"
        assert cute.size(src.shape) % 2 == 0, "src must have an even number of elements"
        assert dst.element_type in [cutlass.BFloat16, cutlass.Float16], (
            "dst must be BFloat16 or Float16"
        )
        assert src.element_type is Float32, "src must be Float32"
        # View dst as Int32 words: each word holds one converted f16/bf16 pair.
        dst_i32 = cute.recast_tensor(dst, cutlass.Int32)
        assert cute.size(dst_i32.shape) * 2 == cute.size(src.shape)
        for i in cutlass.range_constexpr(cute.size(dst_i32)):
            dst_i32[i] = cvt_f16x2_f32(src[2 * i], src[2 * i + 1], dst.element_type)
521
+
522
+
523
@dsl_user_op
@cute.jit
def evaluate_polynomial(x: Float32, poly: Tuple[Float32, ...], *, loc=None, ip=None) -> Float32:
    """Evaluate a polynomial at ``x`` with Horner's scheme.

    ``poly[k]`` is the coefficient of ``x**k``; the loop walks coefficients
    from highest degree down to the constant term.
    """
    degree = len(poly) - 1
    acc = poly[degree]
    for k in cutlass.range_constexpr(degree - 1, -1, -1):
        acc = acc * x + poly[k]
    return acc
531
+
532
+
533
@dsl_user_op
@cute.jit
def evaluate_polynomial_2(
    x: Float32, y: Float32, poly: Tuple[Float32, ...], *, loc=None, ip=None
) -> Tuple[Float32, Float32]:
    """Evaluate the same polynomial at ``x`` and ``y`` simultaneously.

    Horner's scheme over packed f32x2 FMAs: lane 0 evaluates at ``x`` and
    lane 1 at ``y``, halving instruction count versus two scalar evaluations.
    ``poly[k]`` is the coefficient of the degree-k term.
    """
    deg = len(poly) - 1
    out = (poly[deg], poly[deg])
    for i in cutlass.range_constexpr(deg - 1, -1, -1):
        out = cute.arch.fma_packed_f32x2(out, (x, y), (poly[i], poly[i]))
    return out
543
+
544
+
545
@dsl_user_op
def add_round_down(x: float | Float32, y: float | Float32, *, loc=None, ip=None) -> Float32:
    """Return ``x + y`` rounded toward minus infinity (``add.rm.ftz.f32``).

    The ``rm`` rounding mode (round-to-minus-infinity, with flush-to-zero)
    is the whole point — the default IEEE add rounds to nearest even.
    """
    # There's probably a way to call llvm or nvvm to do this instead of ptx
    return cutlass.Float32(
        llvm.inline_asm(
            T.f32(),
            [Float32(x).ir_value(loc=loc, ip=ip), Float32(y).ir_value(loc=loc, ip=ip)],
            "add.rm.ftz.f32 $0, $1, $2;",
            "=f,f,f",
            has_side_effects=False,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    )
559
+
560
+
561
@dsl_user_op
def combine_int_frac_ex2(x_rounded: Float32, frac_ex2: Float32, *, loc=None, ip=None) -> Float32:
    """Scale ``frac_ex2`` by 2**n, where n sits in the low bits of ``x_rounded``.

    ``x_rounded`` is assumed to carry the integer part of the exponent in its
    low mantissa bits (from the round-to-int magic-constant trick in
    ``ex2_emulation``). Shifting its raw bits left by 23 places that integer
    in the IEEE-754 exponent field; integer-adding it to the raw bits of
    ``frac_ex2`` multiplies the float by 2**n without an FP multiply.
    """
    return cutlass.Float32(
        llvm.inline_asm(
            T.f32(),
            [
                Float32(x_rounded).ir_value(loc=loc, ip=ip),
                Float32(frac_ex2).ir_value(loc=loc, ip=ip),
            ],
            "{\n\t"
            ".reg .s32 x_rounded_i, frac_ex_i, x_rounded_e, out_i;\n\t"
            "mov.b32 x_rounded_i, $1;\n\t"
            "mov.b32 frac_ex_i, $2;\n\t"
            "shl.b32 x_rounded_e, x_rounded_i, 23;\n\t"
            # add.u32 generates IMAD instruction and add.s32 generates LEA instruction
            # IMAD uses the FMA pipeline and LEA uses the ALU pipeline, afaik
            "add.s32 out_i, x_rounded_e, frac_ex_i;\n\t"
            "mov.b32 $0, out_i;\n\t"
            "}\n",
            "=f,f,f",
            has_side_effects=False,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    )
586
+
587
+
588
@dsl_user_op
def ex2_emulation(x: Float32, *, poly_degree: int = 3, loc=None, ip=None) -> Float32:
    """Approximate 2**x without the MUFU ``ex2`` instruction.

    Splits x into integer and fractional parts using the classic
    add-magic-constant trick (2**23 + 2**22 forces the integer part into the
    low mantissa bits), evaluates a fitted polynomial of ``POLY_EX2`` on the
    fraction in [0, 1), then merges the integer power of two via exponent-bit
    arithmetic in ``combine_int_frac_ex2``.
    """
    assert poly_degree in POLY_EX2, f"Polynomial degree {poly_degree} not supported"
    # We assume x <= 127.0
    fp32_round_int = float(2**23 + 2**22)
    # Clamp below -127 so the biased exponent stays representable.
    x_clamped = cute.arch.fmax(x, -127.0)
    # We want to round down here, so that the fractional part is in [0, 1)
    x_rounded = add_round_down(x_clamped, fp32_round_int, loc=loc, ip=ip)
    # The integer floor of x is now in the last 8 bits of x_rounded
    # We assume the next 2 ops round to nearest even. The rounding mode is important.
    x_rounded_back = x_rounded - fp32_round_int
    x_frac = x_clamped - x_rounded_back
    x_frac_ex2 = evaluate_polynomial(x_frac, POLY_EX2[poly_degree], loc=loc, ip=ip)
    return combine_int_frac_ex2(x_rounded, x_frac_ex2, loc=loc, ip=ip)
602
+
603
+
604
# TODO: check that the ex2_emulation_2 produces the same SASS as the ptx version
@dsl_user_op
def ex2_emulation_2(
    x: Float32, y: Float32, *, poly_degree: int = 3, loc=None, ip=None
) -> Tuple[Float32, Float32]:
    """Approximate (2**x, 2**y) in one pass using packed f32x2 arithmetic.

    Same algorithm as ``ex2_emulation`` (magic-constant integer/fraction
    split, polynomial on the fraction, exponent-bit merge), but every FP step
    up to the final merge processes both inputs as one f32x2 pair.
    """
    # We assume x <= 127.0 and y <= 127.0
    fp32_round_int = float(2**23 + 2**22)
    xy_clamped = (cute.arch.fmax(x, -127.0), cute.arch.fmax(y, -127.0))
    # We want to round down here, so that the fractional part is in [0, 1)
    xy_rounded = cute.arch.add_packed_f32x2(xy_clamped, (fp32_round_int, fp32_round_int), rnd="rm")
    # The integer floor of x & y are now in the last 8 bits of xy_rounded
    # We want the next 2 ops to round to nearest even. The rounding mode is important.
    xy_rounded_back = activation.sub_packed_f32x2(
        xy_rounded, (fp32_round_int, fp32_round_int)
    )
    xy_frac = activation.sub_packed_f32x2(xy_clamped, xy_rounded_back)
    xy_frac_ex2 = evaluate_polynomial_2(*xy_frac, POLY_EX2[poly_degree], loc=loc, ip=ip)
    # The exponent merge is scalar per element.
    x_out = combine_int_frac_ex2(xy_rounded[0], xy_frac_ex2[0], loc=loc, ip=ip)
    y_out = combine_int_frac_ex2(xy_rounded[1], xy_frac_ex2[1], loc=loc, ip=ip)
    return x_out, y_out
624
+
625
+
626
@dsl_user_op
def e2e_asm2(x: Float32, y: Float32, *, loc=None, ip=None) -> Tuple[Float32, Float32]:
    """Hand-written PTX version of ``ex2_emulation_2`` for two inputs at once.

    The asm clamps both inputs to -127 (0fC2FE0000), applies the
    round-toward-minus-infinity magic-constant split (0f4B400000 =
    2**23 + 2**22), evaluates a degree-3 fitted polynomial on the fractional
    parts via packed f32x2 FMAs, and merges the integer exponent with
    shl-by-23 + integer add — mirroring ``combine_int_frac_ex2``.

    NOTE(review): the output struct is typed f32x2 but the constraints are
    "=r,=r" (integer regs) and the results are written with mov.b32 — this
    relies on bit-level reinterpretation; confirm it lowers correctly.
    """
    out_f32x2 = llvm.inline_asm(
        llvm.StructType.get_literal([T.f32(), T.f32()]),
        [Float32(x).ir_value(loc=loc, ip=ip), Float32(y, loc=loc, ip=ip).ir_value()],
        "{\n\t"
        ".reg .f32 f1, f2, f3, f4, f5, f6, f7;\n\t"
        ".reg .b64 l1, l2, l3, l4, l5, l6, l7, l8, l9, l10;\n\t"
        ".reg .s32 r1, r2, r3, r4, r5, r6, r7, r8;\n\t"
        "max.ftz.f32 f1, $2, 0fC2FE0000;\n\t"
        "max.ftz.f32 f2, $3, 0fC2FE0000;\n\t"
        "mov.b64 l1, {f1, f2};\n\t"
        "mov.f32 f3, 0f4B400000;\n\t"
        "mov.b64 l2, {f3, f3};\n\t"
        "add.rm.ftz.f32x2 l7, l1, l2;\n\t"
        "sub.rn.ftz.f32x2 l8, l7, l2;\n\t"
        "sub.rn.ftz.f32x2 l9, l1, l8;\n\t"
        "mov.f32 f7, 0f3D9DF09D;\n\t"
        "mov.b64 l6, {f7, f7};\n\t"
        "mov.f32 f6, 0f3E6906A4;\n\t"
        "mov.b64 l5, {f6, f6};\n\t"
        "mov.f32 f5, 0f3F31F519;\n\t"
        "mov.b64 l4, {f5, f5};\n\t"
        "mov.f32 f4, 0f3F800000;\n\t"
        "mov.b64 l3, {f4, f4};\n\t"
        "fma.rn.ftz.f32x2 l10, l9, l6, l5;\n\t"
        "fma.rn.ftz.f32x2 l10, l10, l9, l4;\n\t"
        "fma.rn.ftz.f32x2 l10, l10, l9, l3;\n\t"
        "mov.b64 {r1, r2}, l7;\n\t"
        "mov.b64 {r3, r4}, l10;\n\t"
        "shl.b32 r5, r1, 23;\n\t"
        "add.s32 r7, r5, r3;\n\t"
        "shl.b32 r6, r2, 23;\n\t"
        "add.s32 r8, r6, r4;\n\t"
        "mov.b32 $0, r7;\n\t"
        "mov.b32 $1, r8;\n\t"
        "}\n",
        "=r,=r,f,f",
        has_side_effects=False,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
    out0 = Float32(llvm.extractvalue(T.f32(), out_f32x2, [0], loc=loc, ip=ip))
    out1 = Float32(llvm.extractvalue(T.f32(), out_f32x2, [1], loc=loc, ip=ip))
    return out0, out1
671
+
672
+
673
@dsl_user_op
def domain_offset_aligned(
    coord: cute.Coord, tensor: cute.Tensor, *, loc=None, ip=None
) -> cute.Tensor:
    """Return ``tensor`` shifted by ``coord`` while keeping its assumed alignment.

    Rebuilds the pointer from the element address at ``coord``, explicitly
    reusing ``tensor.iterator.alignment`` as the assumed alignment of the new
    pointer; the layout is unchanged.
    """
    assert isinstance(tensor.iterator, cute.Pointer)
    # We assume that applying the offset does not change the pointer alignment
    new_ptr = cute.make_ptr(
        tensor.element_type,
        elem_pointer(tensor, coord).toint(),
        tensor.memspace,
        assumed_align=tensor.iterator.alignment,
    )
    return cute.make_tensor(new_ptr, tensor.layout)
686
+
687
+
688
@cute.jit
def scalar_to_ssa(a: cute.Numeric, dtype) -> cute.TensorSSA:
    """Wrap scalar ``a`` into a single-element TensorSSA of the given dtype."""
    frag = cute.make_fragment(1, dtype)
    frag[0] = a
    return frag.load()
694
+
695
+
696
def ssa_to_scalar(val):
    """Extract the single scalar held at index 0 (mirrors `scalar_to_ssa`)."""
    scalar = val[0]
    return scalar