Kernels:

kernels-community
/

flash-attn4

Trusted publisher

Kernel card Files Files and versions

xet

Community

kernels-bot committed on Apr 14

Commit

2b537bb

verified ·

1 Parent(s): 1890a47

Uploaded using `kernel-builder`.

Browse files

Files changed (35) hide show

build/torch-cuda/__init__.py +2 -6
build/torch-cuda/_ops.py +2 -2
build/torch-cuda/bench_utils.py +196 -0
build/torch-cuda/block_info.py +35 -4
build/torch-cuda/block_sparse_utils.py +19 -61
build/torch-cuda/block_sparsity.py +33 -10
build/torch-cuda/cache_utils.py +9 -15
build/torch-cuda/cute_dsl_utils.py +0 -38
build/torch-cuda/fa_logging.py +97 -0
build/torch-cuda/flash_bwd.py +25 -6
build/torch-cuda/flash_bwd_postprocess.py +31 -29
build/torch-cuda/flash_bwd_preprocess.py +146 -157
build/torch-cuda/flash_bwd_sm100.py +9 -6
build/torch-cuda/flash_bwd_sm120.py +55 -0
build/torch-cuda/flash_bwd_sm90.py +451 -151
build/torch-cuda/flash_fwd.py +132 -1363
build/torch-cuda/flash_fwd_combine.py +62 -56
build/torch-cuda/flash_fwd_sm100.py +406 -257
build/torch-cuda/flash_fwd_sm120.py +59 -0
build/torch-cuda/flash_fwd_sm90.py +1534 -0
build/torch-cuda/interface.py +734 -505
build/torch-cuda/mask.py +168 -110
build/torch-cuda/named_barrier.py +15 -0
build/torch-cuda/pack_gqa.py +110 -12
build/torch-cuda/paged_kv.py +35 -15
build/torch-cuda/pipeline.py +198 -236
build/torch-cuda/quack/copy_utils.py +186 -8
build/torch-cuda/quack/cute_dsl_utils.py +20 -26
build/torch-cuda/quack/layout_utils.py +34 -0
build/torch-cuda/quack/utils.py +324 -0
build/torch-cuda/seqlen_info.py +188 -39
build/torch-cuda/sm90_config_search.py +402 -0
build/torch-cuda/softmax.py +1 -1
build/torch-cuda/tile_scheduler.py +419 -59
build/torch-cuda/utils.py +104 -2

build/torch-cuda/__init__.py CHANGED Viewed

@@ -1,19 +1,15 @@
 """Flash Attention CUTE (CUDA Template Engine) implementation."""
-from importlib.metadata import PackageNotFoundError, version
-# Update when syncing again.
-__version__ = "4.0.0.beta4"
 import cutlass.cute as cute
 from .interface import (
     flash_attn_func,
     flash_attn_varlen_func,
 )
-from .cute_dsl_utils import cute_compile_patched
 # Patch cute.compile to optionally dump SASS
 cute.compile = cute_compile_patched

 """Flash Attention CUTE (CUDA Template Engine) implementation."""
+__version__ = "4.0.0.beta8"
 import cutlass.cute as cute
+from .cute_dsl_utils import cute_compile_patched
 from .interface import (
     flash_attn_func,
     flash_attn_varlen_func,
 )
 # Patch cute.compile to optionally dump SASS
 cute.compile = cute_compile_patched

build/torch-cuda/_ops.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import torch
-ops = torch.ops._flash_attn4_525b056
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn4_525b056::{op_name}"

 import torch
+ops = torch.ops._flash_attn4_c9a1374
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_flash_attn4_c9a1374::{op_name}"

build/torch-cuda/bench_utils.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""Shared benchmark utilities: attention_ref, cuDNN helpers, flops calculation."""
+import math
+import torch
+try:
+    import cudnn
+except ImportError:
+    cudnn = None
+# ── FLOPS calculation ────────────────────────────────────────────────────────
+def flops(
+    batch, nheads, seqlen_q, seqlen_k, headdim, headdim_v, causal=False, window_size=(None, None)
+):
+    """Estimate the floating-point operation count of one attention pass.
+
+    The count is ``batch * nheads * 2 * seqlen_q * avg_seqlen * (headdim + headdim_v)``,
+    where ``avg_seqlen`` is the mean number of key columns visited per query row.
+
+    Args:
+        batch, nheads: batch size and number of heads.
+        seqlen_q, seqlen_k: query and key sequence lengths.
+        headdim, headdim_v: head dimension of Q/K and of V.
+        causal: when true, ``avg_seqlen`` is ``(max(0, seqlen_k - seqlen_q) + seqlen_k) / 2``
+            and ``window_size`` is not consulted.
+        window_size: ``(left, right)`` window extents; a ``None`` entry leaves that
+            side unbounded. ``(None, None)`` means every key column is visited.
+    """
+    if causal:
+        avg_seqlen = (max(0, seqlen_k - seqlen_q) + seqlen_k) / 2
+    elif window_size == (None, None):
+        avg_seqlen = seqlen_k
+    else:
+        win_left, win_right = window_size[0], window_size[1]
+        # Per-row column bounds are materialised as tensors on the CUDA device.
+        row_idx = torch.arange(seqlen_q, device="cuda")
+        if win_left is None:
+            col_left = torch.zeros_like(row_idx)
+        else:
+            # Leftmost visible column, clamped at 0.
+            col_left = torch.maximum(
+                row_idx + seqlen_k - seqlen_q - win_left, torch.tensor(0)
+            )
+        if win_right is None:
+            col_right = torch.full_like(row_idx, seqlen_k - 1)
+        else:
+            # Rightmost visible column, clamped at the last key index.
+            col_right = torch.minimum(
+                row_idx + seqlen_k - seqlen_q + win_right, torch.tensor(seqlen_k - 1)
+            )
+        avg_seqlen = (col_right - col_left + 1).float().mean().item()
+    return batch * nheads * 2 * seqlen_q * avg_seqlen * (headdim + headdim_v)
+# ── Reference attention ─────────────────────────────────────────────────────
+# Cache of boolean causal masks, keyed by ((seqlen_q, seqlen_k), device) so that a
+# mask is only reused for score matrices of the same shape on the same device.
+_attention_ref_mask_cache = {}
+def attention_ref(q, k, v, causal=False):
+    """Standard attention reference implementation.
+    Computes ``softmax(q @ k^T / sqrt(headdim)) @ v`` with plain PyTorch ops.
+    Args:
+        q, k, v: (batch, seqlen, nheads, headdim) tensors.
+        causal: whether to apply causal mask. Key position ``s`` is hidden from
+            query position ``t`` whenever ``s > t`` (mask aligned to the top-left
+            corner of the score matrix).
+    Returns:
+        Tensor laid out as (batch, seqlen_q, nheads, headdim_v).
+    """
+    softmax_scale = 1.0 / math.sqrt(q.shape[-1])
+    scores = torch.einsum("bthd,bshd->bhts", q * softmax_scale, k)
+    if causal:
+        # Key on the full (seqlen_q, seqlen_k) shape and the device: a mask cached for
+        # one seqlen_k or device must not be reused for another.
+        cache_key = (tuple(scores.shape[-2:]), scores.device)
+        mask = _attention_ref_mask_cache.get(cache_key)
+        if mask is None:
+            # True marks positions to blank out: strictly above the diagonal, i.e.
+            # keys that lie in the future of the query. The diagonal stays visible so
+            # every row keeps at least one finite score and softmax never yields NaN.
+            mask = torch.triu(
+                torch.ones(scores.shape[-2:], device=scores.device, dtype=torch.bool), diagonal=1
+            )
+            _attention_ref_mask_cache[cache_key] = mask
+        scores = scores.masked_fill(mask, float("-inf"))
+    attn = torch.softmax(scores, dim=-1)
+    return torch.einsum("bhts,bshd->bthd", attn, v)
+# ── cuDNN graph helpers ─────────────────────────────────────────────────────
+# Maps a torch dtype to the attribute name looked up on ``cudnn.data_type``.
+_TORCH_TO_CUDNN_DTYPE = {
+    torch.float16: "HALF",
+    torch.bfloat16: "BFLOAT16",
+    torch.float32: "FLOAT",
+    torch.int32: "INT32",
+    torch.int64: "INT64",
+}
+def _build_cudnn_graph(io_dtype, tensors, build_fn):
+    """Build a cuDNN graph.  Returns (graph, variant_pack, workspace).
+    Args:
+        io_dtype: torch dtype used for graph I/O; must be a key of
+            ``_TORCH_TO_CUDNN_DTYPE`` (otherwise the lookup raises KeyError).
+        tensors: mapping of name -> torch tensor; each one is mirrored into the
+            graph via ``graph.tensor_like``.
+        build_fn: callable ``(graph, graph_tensors) -> variant_pack`` that adds the
+            operations to the graph; its return value is passed back unchanged.
+    Returns:
+        ``(graph, variant_pack, workspace)`` where ``workspace`` is a uint8 tensor on
+        the "cuda" device sized by ``graph.get_workspace_size()``.
+    Raises:
+        AssertionError: if the ``cudnn`` module could not be imported.
+    """
+    assert cudnn is not None, "cuDNN is not available"
+    cudnn_dtype = getattr(cudnn.data_type, _TORCH_TO_CUDNN_DTYPE[io_dtype])
+    # I/O uses the caller's dtype; intermediates and compute are fixed to FLOAT.
+    graph = cudnn.pygraph(
+        io_data_type=cudnn_dtype,
+        intermediate_data_type=cudnn.data_type.FLOAT,
+        compute_data_type=cudnn.data_type.FLOAT,
+    )
+    # Graph-side placeholders, keyed by the same names as ``tensors``.
+    graph_tensors = {name: graph.tensor_like(t.detach()) for name, t in tensors.items()}
+    variant_pack = build_fn(graph, graph_tensors)
+    # Finalisation sequence; the order of these calls is kept exactly as written.
+    graph.validate()
+    graph.build_operation_graph()
+    graph.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])
+    graph.check_support()
+    graph.build_plans()
+    workspace = torch.empty(graph.get_workspace_size(), device="cuda", dtype=torch.uint8)
+    return graph, variant_pack, workspace
+def cudnn_fwd_setup(q, k, v, causal=False, window_size_left=None):
+    """Build a cuDNN forward SDPA graph.
+    The graph, output buffers and workspace are created once here; the returned
+    callable only executes the prebuilt graph.
+    Args:
+        q, k, v: (batch, nheads, seqlen, headdim) tensors (cuDNN layout).
+        causal: whether to apply causal mask.
+        window_size_left: sliding window size (None for no window).
+    Returns:
+        (fwd_fn, o_gpu, stats_gpu) where fwd_fn is a zero-arg callable.
+    """
+    b, nheads, seqlen_q, headdim = q.shape
+    headdim_v = v.shape[-1]
+    # Output buffers are allocated up front and reused by every fwd_fn() call.
+    o_gpu = torch.empty(b, nheads, seqlen_q, headdim_v, dtype=q.dtype, device=q.device)
+    stats_gpu = torch.empty(b, nheads, seqlen_q, 1, dtype=torch.float32, device=q.device)
+    def build(graph, gt):
+        # The causal mask flag is also raised when a left window is requested; the
+        # window length itself is only forwarded when a window is set and causal is off.
+        o, stats = graph.sdpa(
+            name="sdpa",
+            q=gt["q"],
+            k=gt["k"],
+            v=gt["v"],
+            is_inference=False,
+            attn_scale=1.0 / math.sqrt(headdim),
+            use_causal_mask=causal or window_size_left is not None,
+            sliding_window_length=window_size_left
+            if window_size_left is not None and not causal
+            else None,
+        )
+        o.set_output(True).set_dim(o_gpu.shape).set_stride(o_gpu.stride())
+        stats.set_output(True).set_data_type(cudnn.data_type.FLOAT)
+        # Variant pack: graph tensor -> torch tensor backing it at execution time.
+        return {gt["q"]: q, gt["k"]: k, gt["v"]: v, o: o_gpu, stats: stats_gpu}
+    graph, variant_pack, workspace = _build_cudnn_graph(q.dtype, {"q": q, "k": k, "v": v}, build)
+    def fwd_fn():
+        # Runs the prebuilt graph; results land in o_gpu / stats_gpu.
+        graph.execute(variant_pack, workspace)
+        return o_gpu
+    return fwd_fn, o_gpu, stats_gpu
+def cudnn_bwd_setup(q, k, v, o, g, lse, causal=False, window_size_left=None):
+    """Build a cuDNN backward SDPA graph.
+    The graph, gradient buffers and workspace are created once here; the returned
+    callable only executes the prebuilt graph.
+    Args:
+        q, k, v, o, g, lse: (batch, nheads, seqlen, dim) tensors (cuDNN layout).
+            ``g`` is passed to cuDNN as ``dO`` and ``lse`` as ``stats``.
+        causal: whether to apply causal mask.
+        window_size_left: sliding window size (None for no window).
+    Returns:
+        bwd_fn: zero-arg callable that returns (dq, dk, dv).
+    """
+    headdim = q.shape[-1]
+    # Gradient buffers mirror q/k/v and are reused by every bwd_fn() call.
+    dq_gpu, dk_gpu, dv_gpu = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v)
+    def build(graph, gt):
+        # Mask arguments follow the same rule as the forward setup: the causal flag is
+        # raised for causal or windowed runs, and the window length is only forwarded
+        # when a window is set and causal is off.
+        dq, dk, dv = graph.sdpa_backward(
+            name="sdpa_backward",
+            q=gt["q"],
+            k=gt["k"],
+            v=gt["v"],
+            o=gt["o"],
+            dO=gt["g"],
+            stats=gt["lse"],
+            attn_scale=1.0 / math.sqrt(headdim),
+            use_causal_mask=causal or window_size_left is not None,
+            sliding_window_length=window_size_left
+            if window_size_left is not None and not causal
+            else None,
+            use_deterministic_algorithm=False,
+        )
+        dq.set_output(True).set_dim(dq_gpu.shape).set_stride(dq_gpu.stride())
+        dk.set_output(True).set_dim(dk_gpu.shape).set_stride(dk_gpu.stride())
+        dv.set_output(True).set_dim(dv_gpu.shape).set_stride(dv_gpu.stride())
+        # Variant pack: graph tensor -> torch tensor backing it at execution time.
+        return {
+            gt["q"]: q,
+            gt["k"]: k,
+            gt["v"]: v,
+            gt["o"]: o,
+            gt["g"]: g,
+            gt["lse"]: lse,
+            dq: dq_gpu,
+            dk: dk_gpu,
+            dv: dv_gpu,
+        }
+    graph, variant_pack, workspace = _build_cudnn_graph(
+        q.dtype,
+        {"q": q, "k": k, "v": v, "o": o, "g": g, "lse": lse},
+        build,
+    )
+    def bwd_fn():
+        # Runs the prebuilt graph; gradients land in the preallocated buffers.
+        graph.execute(variant_pack, workspace)
+        return dq_gpu, dk_gpu, dv_gpu
+    return bwd_fn

build/torch-cuda/block_info.py CHANGED Viewed

@@ -6,7 +6,7 @@ import cutlass
 import cutlass.cute as cute
 from cutlass import Int32, const_expr
-from .seqlen_info import SeqlenInfoQK
 @dataclass(frozen=True)
@@ -25,8 +25,8 @@ class BlockInfo:
         self,
         seqlen_info: SeqlenInfoQK,
         m_block: Int32,
-        split_idx: cutlass.Int32 = 0,
-        num_splits: cutlass.Int32 = 1,
     ) -> Tuple[Int32, Int32]:
         n_block_max = cute.ceil_div(seqlen_info.seqlen_k, self.tile_n)
         if const_expr(self.is_causal or (self.is_local and self.window_size_right is not None)):
@@ -46,7 +46,7 @@ class BlockInfo:
             n_block_min = cutlass.max(n_idx_left // self.tile_n, 0)
         if cutlass.const_expr(self.is_split_kv):
             num_n_blocks_per_split = (
-                cutlass.Int32(0)
                 if n_block_max <= n_block_min
                 else (n_block_max - n_block_min + num_splits - 1) // num_splits
             )
@@ -70,6 +70,37 @@ class BlockInfo:
             m_block_max = min(m_block_max, cute.ceil_div(m_idx_left, self.tile_m))
         return m_block_min, m_block_max
     @cute.jit
     def get_n_block_min_causal_local_mask(
         self,

 import cutlass.cute as cute
 from cutlass import Int32, const_expr
+from .seqlen_info import SeqlenInfoQK, SeqlenInfoQKNewK
 @dataclass(frozen=True)
         self,
         seqlen_info: SeqlenInfoQK,
         m_block: Int32,
+        split_idx: Int32 = 0,
+        num_splits: Int32 = 1,
     ) -> Tuple[Int32, Int32]:
         n_block_max = cute.ceil_div(seqlen_info.seqlen_k, self.tile_n)
         if const_expr(self.is_causal or (self.is_local and self.window_size_right is not None)):
             n_block_min = cutlass.max(n_idx_left // self.tile_n, 0)
         if cutlass.const_expr(self.is_split_kv):
             num_n_blocks_per_split = (
+                Int32(0)
                 if n_block_max <= n_block_min
                 else (n_block_max - n_block_min + num_splits - 1) // num_splits
             )
             m_block_max = min(m_block_max, cute.ceil_div(m_idx_left, self.tile_m))
         return m_block_min, m_block_max
+    @cute.jit
+    def get_n_block_k_new_min_max(
+        self,
+        seqlen_info: SeqlenInfoQKNewK,
+        m_block: Int32,
+        split_idx: Int32 = 0,
+        num_splits: Int32 = 1,
+    ) -> Tuple[Int32, Int32]:
+        """Get the block range for new K tokens (append KV).
+        First computes the full n_block range via get_n_block_min_max, then maps
+        those blocks into the new-K index space by subtracting seqlen_k_og.
+        Returns:
+            (n_block_new_min, n_block_new_max) in units of tile_n blocks; when the
+            mapped element range is empty both values equal n_block_new_min.
+        """
+        n_block_min, n_block_max = self.get_n_block_min_max(
+            seqlen_info,
+            m_block,
+            split_idx,
+            num_splits,
+        )
+        # Block bounds -> element offsets (block * tile_n), shifted by seqlen_k_og and
+        # clamped to [0, seqlen_k_new].
+        idx_k_new_min = cutlass.max(n_block_min * self.tile_n - seqlen_info.seqlen_k_og, 0)
+        idx_k_new_max = cutlass.min(
+            n_block_max * self.tile_n - seqlen_info.seqlen_k_og, seqlen_info.seqlen_k_new
+        )
+        # Back to block units: floor for the lower bound, ceil for the upper bound.
+        n_block_new_min = idx_k_new_min // self.tile_n
+        n_block_new_max = (
+            cute.ceil_div(idx_k_new_max, self.tile_n)
+            if idx_k_new_max > idx_k_new_min
+            else n_block_new_min
+        )
+        return n_block_new_min, n_block_new_max
     @cute.jit
     def get_n_block_min_causal_local_mask(
         self,

build/torch-cuda/block_sparse_utils.py CHANGED Viewed

@@ -72,24 +72,22 @@ from .named_barrier import NamedBarrierBwd
 def load_block_list(
     block_indices: cute.Tensor,
     block_count,
-    load_q_with_first: cutlass.Constexpr,
     first_block_preloaded: cutlass.Constexpr,
     kv_producer_state,
-    load_Q,
     load_K,
     load_V,
     pipeline_k,
     pipeline_v,
-    use_tma_q: cutlass.Constexpr,
-    tma_q_bytes: cutlass.Constexpr,
     intra_wg_overlap: cutlass.Constexpr,
 ):
-    """Iterate over the sparse blocks and load K, V (and Q) into the pipeline.
-    for the intra_wg_overlap case, we overlap the loads of K and V. And this
     means we need to pipeline the last V load from the partial block case,
     with the loads for the full blocks. Set first_block_preloaded when the
     caller has already issued the first K load for the list.
     Note:
         we iterate along the block_n indices in reverse.
@@ -99,21 +97,7 @@ def load_block_list(
     """
     if block_count > 0:
         if const_expr(not intra_wg_overlap):
-            # Peel first iteration: the first block may need to load Q alongside K,
-            # Parameters are already Constexpr, so no need to wrap in const_expr()
-            n_block_first = block_indices[block_count - 1]
-            extra_tx = tma_q_bytes if const_expr(load_q_with_first) and const_expr(use_tma_q) else 0
-            pipeline_k.producer_acquire(kv_producer_state, extra_tx_count=extra_tx)
-            if const_expr(load_q_with_first and use_tma_q):
-                load_Q(tma_bar_ptr=pipeline_k.producer_get_barrier(kv_producer_state))
-            load_K(src_idx=n_block_first, producer_state=kv_producer_state)
-            pipeline_v.producer_acquire(kv_producer_state)
-            load_V(src_idx=n_block_first, producer_state=kv_producer_state)
-            kv_producer_state.advance()
-            for offset in cutlass.range(1, block_count):
                 n_block = block_indices[block_count - 1 - offset]
                 pipeline_k.producer_acquire(kv_producer_state)
                 load_K(src_idx=n_block, producer_state=kv_producer_state)
@@ -123,14 +107,7 @@ def load_block_list(
         else:
             n_block_first = block_indices[block_count - 1]
             if const_expr(not first_block_preloaded):
-                extra_tx = (
-                    tma_q_bytes if const_expr(load_q_with_first) and const_expr(use_tma_q) else 0
-                )
-                pipeline_k.producer_acquire(kv_producer_state, extra_tx_count=extra_tx)
-                if const_expr(load_q_with_first and use_tma_q):
-                    load_Q(tma_bar_ptr=pipeline_k.producer_get_barrier(kv_producer_state))
                 load_K(src_idx=n_block_first, producer_state=kv_producer_state)
             for idx in cutlass.range(block_count - 1, unroll=1):
@@ -186,19 +163,18 @@ def produce_block_sparse_loads(
     head_idx,
     m_block,
     kv_producer_state,
-    load_Q,
     load_K,
     load_V,
     pipeline_k,
     pipeline_v,
-    use_tma_q: cutlass.Constexpr,
-    tma_q_bytes: cutlass.Constexpr,
     intra_wg_overlap: cutlass.Constexpr,
     qhead_per_kvhead: cutlass.Constexpr[int] = 1,
     q_subtile_factor: cutlass.Constexpr[int] = 1,
 ):
     """Iterate over the mask and full block lists for a single tile.
     The masked (partial) list may leave the last V load pending when intra-warp-group
     overlap is enabled. The first full block must consume that pending V while
     issuing its own K load on the next pipeline stage.
@@ -230,20 +206,16 @@ def produce_block_sparse_loads(
     full_empty = curr_full_block_cnt == 0
     if mask_empty:
-        # No masked blocks: the full list owns the initial Q+K load.
         kv_producer_state = load_block_list(
             curr_full_block_idx,
             curr_full_block_cnt,
-            load_q_with_first=True,
             first_block_preloaded=False,
             kv_producer_state=kv_producer_state,
-            load_Q=load_Q,
             load_K=load_K,
             load_V=load_V,
             pipeline_k=pipeline_k,
             pipeline_v=pipeline_v,
-            use_tma_q=use_tma_q,
-            tma_q_bytes=tma_q_bytes,
             intra_wg_overlap=intra_wg_overlap,
         )
@@ -256,21 +228,16 @@ def produce_block_sparse_loads(
                 kv_producer_state,
             )
     else:
-        # Masked blocks present: load Q together with the first masked K so consumers can
-        # start immediately. When overlap is disabled this fully drains the list.
         kv_producer_state = load_block_list(
             curr_mask_block_idx,
             curr_mask_block_cnt,
-            load_q_with_first=True,
             first_block_preloaded=False,
             kv_producer_state=kv_producer_state,
-            load_Q=load_Q,
             load_K=load_K,
             load_V=load_V,
             pipeline_k=pipeline_k,
             pipeline_v=pipeline_v,
-            use_tma_q=use_tma_q,
-            tma_q_bytes=tma_q_bytes,
             intra_wg_overlap=intra_wg_overlap,
         )
@@ -299,16 +266,12 @@ def produce_block_sparse_loads(
                 kv_producer_state = load_block_list(
                     curr_full_block_idx,
                     curr_full_block_cnt,
-                    load_q_with_first=False,
                     first_block_preloaded=True,
                     kv_producer_state=kv_producer_state,
-                    load_Q=load_Q,
                     load_K=load_K,
                     load_V=load_V,
                     pipeline_k=pipeline_k,
                     pipeline_v=pipeline_v,
-                    use_tma_q=use_tma_q,
-                    tma_q_bytes=tma_q_bytes,
                     intra_wg_overlap=intra_wg_overlap,
                 )
@@ -320,21 +283,16 @@ def produce_block_sparse_loads(
                     kv_producer_state,
                 )
             else:
-                # Non-overlap path with both lists: run the full list normally (skipping the Q
-                # reload because the masked list already issued it).
                 kv_producer_state = load_block_list(
                     curr_full_block_idx,
                     curr_full_block_cnt,
-                    load_q_with_first=False,
                     first_block_preloaded=False,
                     kv_producer_state=kv_producer_state,
-                    load_Q=load_Q,
                     load_K=load_K,
                     load_V=load_V,
                     pipeline_k=pipeline_k,
                     pipeline_v=pipeline_v,
-                    use_tma_q=use_tma_q,
-                    tma_q_bytes=tma_q_bytes,
                     intra_wg_overlap=intra_wg_overlap,
                 )
@@ -1390,18 +1348,18 @@ def _store_one_dQaccum_sm90(
     m_block,
     sdQaccum: cute.Tensor,
     gdQaccum: cute.Tensor,
-    num_mma_warp_groups: cutlass.Constexpr,
     num_threads_per_warp_group: cutlass.Constexpr,
     tma_copy_bytes_dQ,
 ):
     """Store dQaccum for a single m_block."""
-    for warp_group_idx in cutlass.range_constexpr(num_mma_warp_groups):
-        cute.arch.cp_async_bulk_wait_group(num_mma_warp_groups - 1 - warp_group_idx, read=True)
         cute.arch.barrier_arrive(
             barrier_id=int(NamedBarrierBwd.dQEmptyWG0) + warp_group_idx,
             number_of_threads=num_threads_per_warp_group + cute.arch.WARP_SIZE,
         )
-    for warp_group_idx in cutlass.range_constexpr(num_mma_warp_groups):
         cute.arch.barrier(
             barrier_id=int(NamedBarrierBwd.dQFullWG0) + warp_group_idx,
             number_of_threads=num_threads_per_warp_group + cute.arch.WARP_SIZE,
@@ -1409,7 +1367,7 @@ def _store_one_dQaccum_sm90(
         with cute.arch.elect_one():
             copy_utils.cpasync_reduce_bulk_add_f32(
                 sdQaccum[None, warp_group_idx].iterator,
-                gdQaccum[None, warp_group_idx, m_block].iterator,
                 tma_copy_bytes_dQ,
             )
         cute.arch.cp_async_bulk_commit_group()
@@ -1425,7 +1383,7 @@ def dQaccum_store_block_sparse_bwd_sm90(
     gdQaccum: cute.Tensor,
     subtile_factor: cutlass.Constexpr,
     m_block_max: int,
-    num_mma_warp_groups: cutlass.Constexpr,
     num_threads_per_warp_group: cutlass.Constexpr,
     tma_copy_bytes_dQ,
 ):
@@ -1454,7 +1412,7 @@ def dQaccum_store_block_sparse_bwd_sm90(
                 m_block,
                 sdQaccum,
                 gdQaccum,
-                num_mma_warp_groups,
                 num_threads_per_warp_group,
                 tma_copy_bytes_dQ,
             )
@@ -1470,7 +1428,7 @@ def dQaccum_store_block_sparse_bwd_sm90(
                     m_block,
                     sdQaccum,
                     gdQaccum,
-                    num_mma_warp_groups,
                     num_threads_per_warp_group,
                     tma_copy_bytes_dQ,
                 )

 def load_block_list(
     block_indices: cute.Tensor,
     block_count,
     first_block_preloaded: cutlass.Constexpr,
     kv_producer_state,
     load_K,
     load_V,
     pipeline_k,
     pipeline_v,
     intra_wg_overlap: cutlass.Constexpr,
 ):
+    """Iterate over the sparse blocks and load K, V into the pipeline.
+    For the intra_wg_overlap case, we overlap the loads of K and V. And this
     means we need to pipeline the last V load from the partial block case,
     with the loads for the full blocks. Set first_block_preloaded when the
     caller has already issued the first K load for the list.
+    Q is loaded separately on its own mbarrier before this function is called.
     Note:
         we iterate along the block_n indices in reverse.
     """
     if block_count > 0:
         if const_expr(not intra_wg_overlap):
+            for offset in cutlass.range(block_count):
                 n_block = block_indices[block_count - 1 - offset]
                 pipeline_k.producer_acquire(kv_producer_state)
                 load_K(src_idx=n_block, producer_state=kv_producer_state)
         else:
             n_block_first = block_indices[block_count - 1]
             if const_expr(not first_block_preloaded):
+                pipeline_k.producer_acquire(kv_producer_state)
                 load_K(src_idx=n_block_first, producer_state=kv_producer_state)
             for idx in cutlass.range(block_count - 1, unroll=1):
     head_idx,
     m_block,
     kv_producer_state,
     load_K,
     load_V,
     pipeline_k,
     pipeline_v,
     intra_wg_overlap: cutlass.Constexpr,
     qhead_per_kvhead: cutlass.Constexpr[int] = 1,
     q_subtile_factor: cutlass.Constexpr[int] = 1,
 ):
     """Iterate over the mask and full block lists for a single tile.
+    Q is loaded separately on its own mbarrier before this function is called.
     The masked (partial) list may leave the last V load pending when intra-warp-group
     overlap is enabled. The first full block must consume that pending V while
     issuing its own K load on the next pipeline stage.
     full_empty = curr_full_block_cnt == 0
     if mask_empty:
+        # No masked blocks: the full list owns the initial K load.
         kv_producer_state = load_block_list(
             curr_full_block_idx,
             curr_full_block_cnt,
             first_block_preloaded=False,
             kv_producer_state=kv_producer_state,
             load_K=load_K,
             load_V=load_V,
             pipeline_k=pipeline_k,
             pipeline_v=pipeline_v,
             intra_wg_overlap=intra_wg_overlap,
         )
                 kv_producer_state,
             )
     else:
+        # Masked blocks present. When overlap is disabled this fully drains the list.
         kv_producer_state = load_block_list(
             curr_mask_block_idx,
             curr_mask_block_cnt,
             first_block_preloaded=False,
             kv_producer_state=kv_producer_state,
             load_K=load_K,
             load_V=load_V,
             pipeline_k=pipeline_k,
             pipeline_v=pipeline_v,
             intra_wg_overlap=intra_wg_overlap,
         )
                 kv_producer_state = load_block_list(
                     curr_full_block_idx,
                     curr_full_block_cnt,
                     first_block_preloaded=True,
                     kv_producer_state=kv_producer_state,
                     load_K=load_K,
                     load_V=load_V,
                     pipeline_k=pipeline_k,
                     pipeline_v=pipeline_v,
                     intra_wg_overlap=intra_wg_overlap,
                 )
                     kv_producer_state,
                 )
             else:
+                # Non-overlap path with both lists: run the full list normally.
                 kv_producer_state = load_block_list(
                     curr_full_block_idx,
                     curr_full_block_cnt,
                     first_block_preloaded=False,
                     kv_producer_state=kv_producer_state,
                     load_K=load_K,
                     load_V=load_V,
                     pipeline_k=pipeline_k,
                     pipeline_v=pipeline_v,
                     intra_wg_overlap=intra_wg_overlap,
                 )
     m_block,
     sdQaccum: cute.Tensor,
     gdQaccum: cute.Tensor,
+    num_dQ_warp_groups: cutlass.Constexpr,
     num_threads_per_warp_group: cutlass.Constexpr,
     tma_copy_bytes_dQ,
 ):
     """Store dQaccum for a single m_block."""
+    for warp_group_idx in cutlass.range_constexpr(num_dQ_warp_groups):
+        cute.arch.cp_async_bulk_wait_group(num_dQ_warp_groups - 1 - warp_group_idx, read=True)
         cute.arch.barrier_arrive(
             barrier_id=int(NamedBarrierBwd.dQEmptyWG0) + warp_group_idx,
             number_of_threads=num_threads_per_warp_group + cute.arch.WARP_SIZE,
         )
+    for warp_group_idx in cutlass.range_constexpr(num_dQ_warp_groups):
         cute.arch.barrier(
             barrier_id=int(NamedBarrierBwd.dQFullWG0) + warp_group_idx,
             number_of_threads=num_threads_per_warp_group + cute.arch.WARP_SIZE,
         with cute.arch.elect_one():
             copy_utils.cpasync_reduce_bulk_add_f32(
                 sdQaccum[None, warp_group_idx].iterator,
+                gdQaccum[(None, warp_group_idx), m_block].iterator,
                 tma_copy_bytes_dQ,
             )
         cute.arch.cp_async_bulk_commit_group()
     gdQaccum: cute.Tensor,
     subtile_factor: cutlass.Constexpr,
     m_block_max: int,
+    num_dQ_warp_groups: cutlass.Constexpr,
     num_threads_per_warp_group: cutlass.Constexpr,
     tma_copy_bytes_dQ,
 ):
                 m_block,
                 sdQaccum,
                 gdQaccum,
+                num_dQ_warp_groups,
                 num_threads_per_warp_group,
                 tma_copy_bytes_dQ,
             )
                     m_block,
                     sdQaccum,
                     gdQaccum,
+                    num_dQ_warp_groups,
                     num_threads_per_warp_group,
                     tma_copy_bytes_dQ,
                 )

build/torch-cuda/block_sparsity.py CHANGED Viewed

@@ -34,6 +34,23 @@ class BlockSparseTensorsTorch(NamedTuple):
     block_size: tuple[int, int] | None = None
 def _expand_sparsity_tensor(
     tensor: torch.Tensor,
     expected_shape: Tuple[int, ...],
@@ -81,6 +98,12 @@ def _check_and_expand_block(
     expanded_cnt = _expand_sparsity_tensor(
         cnt, expected_count_shape, f"{name}_block_cnt", context, hint
     )
     expanded_idx = _expand_sparsity_tensor(
         idx, expected_index_shape, f"{name}_block_idx", context, hint
     )
@@ -140,17 +163,14 @@ def infer_block_sparse_expected_shapes(
     num_m_blocks = tensors.mask_block_idx.shape[2]
     if sparse_block_size_q is None:
-        min_block_size = ceildiv(seqlen_q, num_m_blocks)
-        if num_m_blocks == 1:
-            max_block_size = seqlen_q
-        else:
-            max_block_size = (seqlen_q - 1) // (num_m_blocks - 1)
-        if max_block_size != min_block_size and base_m_block != 1:
             raise ValueError(
                 f"Block sparse tensors{context} require explicit sparse_block_size[0] "
                 f"to disambiguate block size for seqlen_q={seqlen_q} and num_m_blocks={num_m_blocks}."
             )
-        sparse_block_size_q = min_block_size
     if sparse_block_size_q % base_m_block != 0:
         raise ValueError(
@@ -186,9 +206,11 @@ def infer_block_sparse_expected_shapes(
             raise ValueError(f"Block sparse tensors{context} {dim_name} dim must be {tgt} or 1.")
     if mask_block_cnt.shape[2] != mask_block_idx.shape[2]:
         raise ValueError(f"Block sparse tensors{context} must share the same m-block dimension.")
-    if mask_block_idx.shape[3] != expected_n_blocks:
         raise ValueError(
-            f"Block sparse tensors{context} n-block dimension must be {expected_n_blocks}."
         )
     if expected_m_blocks != num_m_blocks:
         raise ValueError(
@@ -314,7 +336,7 @@ def normalize_block_sparse_config(
 ) -> tuple[BlockSparseTensorsTorch, Tuple[Tuple[bool, ...], ...] | None, int]:
     m_block_size, n_block_size = block_size
     if tensors.block_size is None:
-        sparse_block_size_q, sparse_block_size_kv = q_stage * m_block_size, n_block_size
     else:
         sparse_block_size_q, sparse_block_size_kv = tensors.block_size
     if sparse_block_size_kv != n_block_size:
@@ -401,6 +423,7 @@ def to_cute_block_sparse_tensors(
     """Convert torch block sparsity tensors to CuTe tensors, optionally for tvm ffi"""
     if not is_block_sparsity_enabled(tensors):
         return None
     (
         mask_block_cnt,
         mask_block_idx,

     block_size: tuple[int, int] | None = None
+def get_sparse_q_block_size(
+    tensors: BlockSparseTensorsTorch | None,
+    seqlen_q: int,
+) -> int | None:
+    """Return the Q sparse block size, or None when sparsity is unset or ambiguous.
+    An explicit ``tensors.block_size`` takes precedence: its first entry is
+    returned unchanged. Otherwise the size is inferred from ``seqlen_q`` and the
+    number of m-blocks (``tensors.mask_block_idx.shape[2]``), and is returned
+    only when the smallest and largest candidate sizes coincide.
+    Args:
+        tensors: Block sparse tensors, or None when block sparsity is not in use.
+        seqlen_q: Query sequence length used to infer the block size.
+    Returns:
+        The Q block size, or None if ``tensors`` is None or the inference
+        admits more than one block size.
+    """
+    if tensors is None:
+        return None
+    if tensors.block_size is not None:
+        return tensors.block_size[0]
+    num_m_blocks = tensors.mask_block_idx.shape[2]
+    # Lower bound on the block size (ceildiv is presumably ceiling division -- defined outside this view).
+    min_block_size = ceildiv(seqlen_q, num_m_blocks)
+    # Upper bound: the largest B with (num_m_blocks - 1) * B <= seqlen_q - 1;
+    # with a single block the bound is seqlen_q itself.
+    max_block_size = seqlen_q if num_m_blocks == 1 else (seqlen_q - 1) // (num_m_blocks - 1)
+    # Differing bounds mean several block sizes fit, so the caller must disambiguate.
+    if min_block_size != max_block_size:
+        return None
+    return min_block_size
 def _expand_sparsity_tensor(
     tensor: torch.Tensor,
     expected_shape: Tuple[int, ...],
     expanded_cnt = _expand_sparsity_tensor(
         cnt, expected_count_shape, f"{name}_block_cnt", context, hint
     )
+    # [Note] Allow Compact block sparse indices
+    # Allow the last dimension (n_blocks) of idx to be <= expected, since
+    # FA4 only accesses indices 0..cnt-1 per query tile. This enables compact
+    # index tensors that avoid O(N^2) memory at long sequence lengths.
+    if idx.ndim == 4 and idx.shape[3] <= expected_index_shape[3]:
+        expected_index_shape = (*expected_index_shape[:3], idx.shape[3])
     expanded_idx = _expand_sparsity_tensor(
         idx, expected_index_shape, f"{name}_block_idx", context, hint
     )
     num_m_blocks = tensors.mask_block_idx.shape[2]
     if sparse_block_size_q is None:
+        sparse_block_size_q = get_sparse_q_block_size(tensors, seqlen_q)
+        if sparse_block_size_q is None and base_m_block != 1:
             raise ValueError(
                 f"Block sparse tensors{context} require explicit sparse_block_size[0] "
                 f"to disambiguate block size for seqlen_q={seqlen_q} and num_m_blocks={num_m_blocks}."
             )
+        if sparse_block_size_q is None:
+            sparse_block_size_q = ceildiv(seqlen_q, num_m_blocks)
     if sparse_block_size_q % base_m_block != 0:
         raise ValueError(
             raise ValueError(f"Block sparse tensors{context} {dim_name} dim must be {tgt} or 1.")
     if mask_block_cnt.shape[2] != mask_block_idx.shape[2]:
         raise ValueError(f"Block sparse tensors{context} must share the same m-block dimension.")
+    # [Note] Allow Compact block sparse indices: FA4 only accesses indices 0..cnt-1
+    # per query tile, so idx.shape[3] can be <= expected_n_blocks.
+    if mask_block_idx.shape[3] > expected_n_blocks:
         raise ValueError(
+            f"Block sparse tensors{context} n-block dimension must be <= {expected_n_blocks}."
         )
     if expected_m_blocks != num_m_blocks:
         raise ValueError(
 ) -> tuple[BlockSparseTensorsTorch, Tuple[Tuple[bool, ...], ...] | None, int]:
     m_block_size, n_block_size = block_size
     if tensors.block_size is None:
+        sparse_block_size_q, sparse_block_size_kv = None, n_block_size
     else:
         sparse_block_size_q, sparse_block_size_kv = tensors.block_size
     if sparse_block_size_kv != n_block_size:
     """Convert torch block sparsity tensors to CuTe tensors, optionally for tvm ffi"""
     if not is_block_sparsity_enabled(tensors):
         return None
     (
         mask_block_cnt,
         mask_block_idx,

build/torch-cuda/cache_utils.py CHANGED Viewed

@@ -1,7 +1,6 @@
 # Manage Ahead-of-Time (AOT) compiled kernels
 import fcntl
 import hashlib
-import logging
 import os
 import pickle
 import sys
@@ -18,6 +17,7 @@ import cutlass
 import cutlass.cute as cute
 import tvm_ffi
 from cutlass.cutlass_dsl import JitCompiledFunction
 # Pre-load cute DSL runtime libraries with RTLD_GLOBAL so that their symbols
 # (e.g. _cudaLibraryLoadData) are visible to .so modules loaded later via dlopen.
@@ -30,12 +30,6 @@ for _lib_path in cute.runtime.find_runtime_libraries(enable_tvm_ffi=False):
 CompileKeyType: TypeAlias = tuple[Hashable, ...]
 CallableFunction: TypeAlias = JitCompiledFunction | tvm_ffi.Function
-logger = logging.getLogger(__name__)
-_handler = logging.StreamHandler()
-_handler.setFormatter(logging.Formatter("%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"))
-logger.addHandler(_handler)
-logger.setLevel(logging.DEBUG)
 # Enable cache via `FLASH_ATTENTION_CUTE_DSL_CACHE_ENABLED=1`
 CUTE_DSL_CACHE_ENABLED: bool = os.getenv("FLASH_ATTENTION_CUTE_DSL_CACHE_ENABLED", "0") == "1"
@@ -222,13 +216,13 @@ class JITPersistentCache(JITCache):
             label=sha256_hex,
         ):
             if obj_path.exists():
-                logger.debug("Loading compiled function from disk: %s", obj_path)
                 m = cute.runtime.load_module(str(obj_path), enable_tvm_ffi=True)
                 fn = getattr(m, self.EXPORT_FUNCTION_PREFIX)
                 JITCache.__setitem__(self, key, fn)
                 return True
             else:
-                logger.debug("Cache miss on disk for key hash %s", sha256_hex)
         return False
     def _try_export_to_storage(self, key: CompileKeyType, fn: JitCompiledFunction) -> None:
@@ -243,14 +237,14 @@ class JITPersistentCache(JITCache):
             obj_path = self.cache_path / f"{sha256_hex}.o"
             if obj_path.exists():
                 # Another process already exported.
-                logger.debug("Skipping export, already on disk: %s", obj_path)
                 return
-            logger.debug("Exporting compiled function to disk: %s", obj_path)
             fn.export_to_c(
                 object_file_path=str(obj_path),
                 function_name=self.EXPORT_FUNCTION_PREFIX,
             )
-            logger.debug("Successfully exported compiled function to disk: %s", obj_path)
     def _key_to_hash(self, key: CompileKeyType) -> str:
         return hashlib.sha256(pickle.dumps(key)).hexdigest()
@@ -262,7 +256,7 @@ class JITPersistentCache(JITCache):
         """
         Not only clear the in-memory cache. Also purge persistent compilation cache.
         """
-        logger.debug("Clearing persistent cache at %s", self.cache_path)
         super().clear()
         for child in self.cache_path.iterdir():
             child.unlink()
@@ -281,8 +275,8 @@ def get_jit_cache(name: str | None = None) -> JITCache:
         path = get_cache_path() / _compute_source_fingerprint()
         if name:
             path = path / name
-        logger.debug("Creating persistent JIT cache at %s", path)
         return JITPersistentCache(path)
     else:
-        logger.debug("Persistent cache disabled, using in-memory JIT cache")
         return JITCache()

 # Manage Ahead-of-Time (AOT) compiled kernels
 import fcntl
 import hashlib
 import os
 import pickle
 import sys
 import cutlass.cute as cute
 import tvm_ffi
 from cutlass.cutlass_dsl import JitCompiledFunction
+from .fa_logging import fa_log
 # Pre-load cute DSL runtime libraries with RTLD_GLOBAL so that their symbols
 # (e.g. _cudaLibraryLoadData) are visible to .so modules loaded later via dlopen.
 CompileKeyType: TypeAlias = tuple[Hashable, ...]
 CallableFunction: TypeAlias = JitCompiledFunction | tvm_ffi.Function
 # Enable cache via `FLASH_ATTENTION_CUTE_DSL_CACHE_ENABLED=1`
 CUTE_DSL_CACHE_ENABLED: bool = os.getenv("FLASH_ATTENTION_CUTE_DSL_CACHE_ENABLED", "0") == "1"
             label=sha256_hex,
         ):
             if obj_path.exists():
+                fa_log(1, f"Loading compiled function from disk: {obj_path}")
                 m = cute.runtime.load_module(str(obj_path), enable_tvm_ffi=True)
                 fn = getattr(m, self.EXPORT_FUNCTION_PREFIX)
                 JITCache.__setitem__(self, key, fn)
                 return True
             else:
+                fa_log(1, f"Cache miss on disk for key hash {sha256_hex}")
         return False
     def _try_export_to_storage(self, key: CompileKeyType, fn: JitCompiledFunction) -> None:
             obj_path = self.cache_path / f"{sha256_hex}.o"
             if obj_path.exists():
                 # Another process already exported.
+                fa_log(1, f"Skipping export, already on disk: {obj_path}")
                 return
+            fa_log(1, f"Exporting compiled function to disk: {obj_path}")
             fn.export_to_c(
                 object_file_path=str(obj_path),
                 function_name=self.EXPORT_FUNCTION_PREFIX,
             )
+            fa_log(1, f"Successfully exported compiled function to disk: {obj_path}")
     def _key_to_hash(self, key: CompileKeyType) -> str:
         return hashlib.sha256(pickle.dumps(key)).hexdigest()
         """
         Not only clear the in-memory cache. Also purge persistent compilation cache.
         """
+        fa_log(1, f"Clearing persistent cache at {self.cache_path}")
         super().clear()
         for child in self.cache_path.iterdir():
             child.unlink()
         path = get_cache_path() / _compute_source_fingerprint()
         if name:
             path = path / name
+        fa_log(1, f"Creating persistent JIT cache at {path}")
         return JITPersistentCache(path)
     else:
+        fa_log(1, "Persistent cache disabled, using in-memory JIT cache")
         return JITCache()

build/torch-cuda/cute_dsl_utils.py CHANGED Viewed

@@ -4,7 +4,6 @@ import os
 import pathlib
 from typing import Tuple
 from functools import partial, lru_cache
-from dataclasses import dataclass, fields
 import torch
@@ -15,7 +14,6 @@ except ImportError:
 import cutlass
 import cutlass.cute as cute
-from cutlass.base_dsl.typing import JitArgument
 from cutlass.cutlass_dsl import NumericMeta
 from cutlass.cute.runtime import from_dlpack
@@ -43,42 +41,6 @@ def get_device_capacity(device: torch.device = None) -> Tuple[int, int]:
     return torch.cuda.get_device_capability(device)
-@dataclass
-class ArgumentsBase(JitArgument):
-    def __c_pointers__(self):
-        all_fields = [getattr(self, field.name) for field in fields(self)]
-        non_constexpr_fields = [f for f in all_fields if not isinstance(f, StaticTypes)]
-        c_ptrs = []
-        for obj in non_constexpr_fields:
-            if hasattr(obj, "__c_pointers__"):
-                c_ptrs.extend(obj.__c_pointers__())
-        return c_ptrs
-    def __get_mlir_types__(self):
-        all_fields = [getattr(self, field.name) for field in fields(self)]
-        non_constexpr_fields = [f for f in all_fields if not isinstance(f, StaticTypes)]
-        types, self._values_pos = [], []
-        for obj in non_constexpr_fields:
-            if hasattr(obj, "__get_mlir_types__"):
-                obj_types = obj.__get_mlir_types__()
-                types.extend(obj_types)
-                self._values_pos.append(len(obj_types))
-            else:
-                self._values_pos.append(0)
-        return types
-    def __new_from_mlir_values__(self, values):
-        all_fields = {field.name: getattr(self, field.name) for field in fields(self)}
-        constexpr_fields = {n: f for n, f in all_fields.items() if isinstance(f, StaticTypes)}
-        non_constexpr_fields = {
-            n: f for n, f in all_fields.items() if not isinstance(f, StaticTypes)
-        }
-        for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
-            non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, values[:n_items])
-            values = values[n_items:]
-        return self.__class__(**non_constexpr_fields, **constexpr_fields)
 def load_cubin_module_data_patched(cubin_data, filepath):
     pathlib.Path(filepath).write_bytes(cubin_data)
     return load_cubin_module_data_og(cubin_data)

 import pathlib
 from typing import Tuple
 from functools import partial, lru_cache
 import torch
 import cutlass
 import cutlass.cute as cute
 from cutlass.cutlass_dsl import NumericMeta
 from cutlass.cute.runtime import from_dlpack
     return torch.cuda.get_device_capability(device)
 def load_cubin_module_data_patched(cubin_data, filepath):
     pathlib.Path(filepath).write_bytes(cubin_data)
     return load_cubin_module_data_og(cubin_data)

build/torch-cuda/fa_logging.py ADDED Viewed

	@@ -0,0 +1,97 @@

+# Copyright (c) 2025, Tri Dao.
+"""Unified FlashAttention logging controlled by a single ``FA_LOG_LEVEL`` env var.
+Host-side messages go through Python ``logging`` (logger name ``flash_attn``).
+A default ``StreamHandler`` is attached automatically when ``FA_LOG_LEVEL >= 1``
+so that standalone scripts get output without extra setup; applications that
+configure their own logging can remove or replace it via the standard API.
+FA_LOG_LEVEL mapping::
+    0  off       nothing logged
+    1  host      host-side summaries only (no kernel printf)
+    2  kernel    host + curated kernel traces
+    3  max       host + all kernel traces (noisy, perf hit)
+Set via environment variable::
+    FA_LOG_LEVEL=1 python train.py
+Device-side ``cute.printf`` calls are compile-time eliminated via
+``cutlass.const_expr`` when the log level is below the callsite threshold,
+so there is zero performance cost when device logging is off.
+Changing the log level after kernel compilation requires a recompile
+(the level participates in the forward compile key).
+"""
+import logging
+import os
+import sys
+import cutlass.cute as cute
+from cutlass import const_expr
+# Symbolic names accepted for FA_LOG_LEVEL, mapped to their integer levels.
+_LOG_LEVEL_NAMES = {"off": 0, "host": 1, "kernel": 2, "max": 3}
+def _parse_log_level(raw: str) -> int:
+    """Convert an ``FA_LOG_LEVEL`` value into an integer level in ``[0, 3]``.
+    Accepts either a symbolic name from ``_LOG_LEVEL_NAMES`` or a decimal
+    integer. Names are matched case-insensitively and surrounding whitespace
+    is ignored, so ``"HOST"`` and ``" host "`` both select level 1 instead of
+    silently turning logging off.
+    Args:
+        raw: The raw setting, typically read from the environment.
+    Returns:
+        The level clamped to ``[0, 3]``; 0 when ``raw`` is neither a known
+        name nor an integer.
+    """
+    token = raw.strip().lower()
+    named_level = _LOG_LEVEL_NAMES.get(token)
+    if named_level is not None:
+        return named_level
+    try:
+        level = int(token)
+    except ValueError:
+        # Unrecognised input disables logging rather than raising at import time.
+        return 0
+    return max(0, min(level, 3))
+# Effective log level, read once at import time from FA_LOG_LEVEL ("0" when unset).
+_fa_log_level: int = _parse_log_level(os.environ.get("FA_LOG_LEVEL", "0"))
+# Logger that receives every host-side message emitted through this module.
+_logger = logging.getLogger("flash_attn")
+# A NullHandler is always attached, so the logger has a handler even when no default handler is installed.
+_logger.addHandler(logging.NullHandler())
+# StreamHandler managed by _configure_default_handler(); None while it is not attached.
+_default_handler: logging.Handler | None = None
+def _configure_default_handler() -> None:
+    """Attach or detach the default stdout handler to match ``_fa_log_level``.
+    With a level of 1 or higher, a ``StreamHandler`` writing to ``sys.stdout``
+    with the ``[FA] %(message)s`` format is added to the ``flash_attn`` logger
+    (once) and the logger level is set to DEBUG. Otherwise any previously
+    installed default handler is removed and the logger level is set to WARNING.
+    """
+    global _default_handler
+    if _fa_log_level >= 1:
+        # Create the handler only once so repeated calls do not stack duplicates.
+        if _default_handler is None:
+            _default_handler = logging.StreamHandler(sys.stdout)
+            _default_handler.setFormatter(logging.Formatter("[FA] %(message)s"))
+            _logger.addHandler(_default_handler)
+        _logger.setLevel(logging.DEBUG)
+    else:
+        # Logging is off: drop the handler this module installed, if any.
+        if _default_handler is not None:
+            _logger.removeHandler(_default_handler)
+            _default_handler = None
+        _logger.setLevel(logging.WARNING)
+# Apply the level read from the environment as soon as the module is imported.
+_configure_default_handler()
+def get_fa_log_level() -> int:
+    """Return the current FA log level (the module-level ``_fa_log_level``)."""
+    return _fa_log_level
+def set_fa_log_level(level: int | str) -> None:
+    """Set the FA log level programmatically.
+    Host logging takes effect immediately.  Device logging changes only
+    affect kernels compiled after this call (new compile-key selection).
+    Args:
+        level: An integer level, or a string that is first converted with
+            ``_parse_log_level``. The stored value is clamped to ``[0, 3]``.
+    """
+    global _fa_log_level
+    if isinstance(level, str):
+        level = _parse_log_level(level)
+    # Clamp here as well so integers passed directly are limited to [0, 3].
+    _fa_log_level = max(0, min(int(level), 3))
+    # Attach or detach the default handler to match the new level.
+    _configure_default_handler()
+def fa_log(level: int, msg: str):
+    """Log ``msg`` on the ``flash_attn`` logger if the FA log level is at least ``level``.
+    Messages are emitted at INFO severity; when the current level is lower
+    than ``level`` the call does nothing.
+    """
+    if _fa_log_level >= level:
+        _logger.info(msg)
+def fa_printf(level: int, fmt, *args):
+    """Call ``cute.printf(fmt, *args)`` only when the FA log level is at least ``level``.
+    The level comparison is wrapped in ``const_expr``; per this module's
+    docstring, that lets the printf be eliminated at compile time when the
+    level is below the callsite threshold.
+    """
+    if const_expr(_fa_log_level >= level):
+        cute.printf(fmt, *args)

build/torch-cuda/flash_bwd.py CHANGED Viewed

@@ -22,6 +22,7 @@ from .mask import AttentionMask
 from .seqlen_info import SeqlenInfoQK
 from .quack.cute_dsl_utils import ParamsBase
 from .tile_scheduler import SingleTileScheduler, SingleTileVarlenScheduler, TileSchedulerArguments
 class FlashAttentionBackwardSm80:
@@ -372,7 +373,6 @@ class FlashAttentionBackwardSm80:
         mdK: cute.Tensor,
         mdV: cute.Tensor,
         softmax_scale: cutlass.Float32,
-        stream: cuda.CUstream,
         mCuSeqlensQ: Optional[cute.Tensor] = None,
         mCuSeqlensK: Optional[cute.Tensor] = None,
         mSeqUsedQ: Optional[cute.Tensor] = None,
@@ -381,8 +381,16 @@ class FlashAttentionBackwardSm80:
         window_size_left: Int32 | int | None = None,
         window_size_right: Int32 | int | None = None,
         mdQ_semaphore: Optional[cute.Tensor] = None,
     ):
-        assert mdQ_semaphore is None, "semaphore not supported yet"
         # Get the data type and check if it is fp16 or bf16
         self._check_type(*(t.element_type if t is not None else None
                            for t in (mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV, mCuSeqlensQ, mCuSeqlensK, mSeqUsedQ, mSeqUsedK)))
@@ -512,7 +520,17 @@ class FlashAttentionBackwardSm80:
         n_block, head_idx, batch_idx, _ = work_tile.tile_idx
         if work_tile.is_valid_tile:
-            seqlen = SeqlenInfoQK.create(batch_idx, mQ.shape[1], mK.shape[1], mCuSeqlensQ=mCuSeqlensQ, mCuSeqlensK=mCuSeqlensK, mSeqUsedQ=mSeqUsedQ, mSeqUsedK=mSeqUsedK)
             m_block_max = cute.ceil_div(seqlen.seqlen_q, self.m_block_size)
             m_block_min = 0
@@ -538,7 +556,7 @@ class FlashAttentionBackwardSm80:
                 mdPsum_cur = mdPsum[batch_idx, head_idx, None]
                 mdQaccum_cur = mdQaccum[batch_idx, head_idx, None]
             else:
-                padded_offset_q = seqlen.offset_q + batch_idx * self.m_block_size
                 mQ_cur = cute.domain_offset((seqlen.offset_q, 0), mQ[None, head_idx, None])
                 mLSE_cur = cute.domain_offset((padded_offset_q,), mLSE[head_idx, None])
                 mdO_cur = cute.domain_offset((seqlen.offset_q, 0), mdO[None, head_idx, None])
@@ -794,9 +812,10 @@ class FlashAttentionBackwardSm80:
             # Mainloop
             # ///////////////////////////////////////////////////////////////////////////////
             # Start processing of the first n-block.
-            mask = AttentionMask(self.m_block_size, self.n_block_size, seqlen.seqlen_q, seqlen.seqlen_k)
             mask_fn = partial(
                 mask.apply_mask, n_block=n_block, thr_mma=thr_mma_sdp,
                 mask_seqlen=True, mask_causal=self.is_causal
             )
             smem_pipe_read_q = cutlass.Int32(0)
@@ -968,7 +987,7 @@ class FlashAttentionBackwardSm80:
         # MMA dK
         if cutlass.const_expr(self.Mma_dKV_is_RS):
-            tdVrP = layout_utils.reshape_acc_to_frgA(rdS)
         else:
             tdKrdS = mma_params.tdKrdS
         sm80_utils.gemm(

 from .seqlen_info import SeqlenInfoQK
 from .quack.cute_dsl_utils import ParamsBase
 from .tile_scheduler import SingleTileScheduler, SingleTileVarlenScheduler, TileSchedulerArguments
+from .block_sparsity import BlockSparseTensors
 class FlashAttentionBackwardSm80:
         mdK: cute.Tensor,
         mdV: cute.Tensor,
         softmax_scale: cutlass.Float32,
         mCuSeqlensQ: Optional[cute.Tensor] = None,
         mCuSeqlensK: Optional[cute.Tensor] = None,
         mSeqUsedQ: Optional[cute.Tensor] = None,
         window_size_left: Int32 | int | None = None,
         window_size_right: Int32 | int | None = None,
         mdQ_semaphore: Optional[cute.Tensor] = None,
+        mdK_semaphore: Optional[cute.Tensor] = None,
+        mdV_semaphore: Optional[cute.Tensor] = None,
+        aux_tensors: Optional[list] = None,
+        blocksparse_tensors: Optional[BlockSparseTensors] = None,
+        # Always keep stream as the last parameter (EnvStream: obtained implicitly via TVM FFI).
+        stream: cuda.CUstream = None,
     ):
+        assert mdQ_semaphore is None and mdK_semaphore is None and mdV_semaphore is None, (
+            "determinism not supported yet for Sm80"
+        )
         # Get the data type and check if it is fp16 or bf16
         self._check_type(*(t.element_type if t is not None else None
                            for t in (mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV, mCuSeqlensQ, mCuSeqlensK, mSeqUsedQ, mSeqUsedK)))
         n_block, head_idx, batch_idx, _ = work_tile.tile_idx
         if work_tile.is_valid_tile:
+            seqlen = SeqlenInfoQK.create(
+                batch_idx,
+                mQ.shape[1],
+                mK.shape[1],
+                mCuSeqlensQ=mCuSeqlensQ,
+                mCuSeqlensK=mCuSeqlensK,
+                mSeqUsedQ=mSeqUsedQ,
+                mSeqUsedK=mSeqUsedK,
+                tile_m=self.m_block_size,
+                tile_n=self.n_block_size,
+            )
             m_block_max = cute.ceil_div(seqlen.seqlen_q, self.m_block_size)
             m_block_min = 0
                 mdPsum_cur = mdPsum[batch_idx, head_idx, None]
                 mdQaccum_cur = mdQaccum[batch_idx, head_idx, None]
             else:
+                padded_offset_q = seqlen.padded_offset_q
                 mQ_cur = cute.domain_offset((seqlen.offset_q, 0), mQ[None, head_idx, None])
                 mLSE_cur = cute.domain_offset((padded_offset_q,), mLSE[head_idx, None])
                 mdO_cur = cute.domain_offset((seqlen.offset_q, 0), mdO[None, head_idx, None])
             # Mainloop
             # ///////////////////////////////////////////////////////////////////////////////
             # Start processing of the first n-block.
+            mask = AttentionMask(self.m_block_size, self.n_block_size, seqlen)
             mask_fn = partial(
                 mask.apply_mask, n_block=n_block, thr_mma=thr_mma_sdp,
+                batch_idx=batch_idx, head_idx=head_idx,
                 mask_seqlen=True, mask_causal=self.is_causal
             )
             smem_pipe_read_q = cutlass.Int32(0)
         # MMA dK
         if cutlass.const_expr(self.Mma_dKV_is_RS):
+            tdKrdS = layout_utils.reshape_acc_to_frgA(rdS)
         else:
             tdKrdS = mma_params.tdKrdS
         sm80_utils.gemm(

build/torch-cuda/flash_bwd_postprocess.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # A reimplementation of https://github.com/Dao-AILab/flash-attention/blob/main/hopper/flash_bwd_postprocess_kernel.h
 # from Cutlass C++ to Cute-DSL.
 import math
-from typing import Callable, Optional, Type, Literal
 import cuda.bindings.driver as cuda
@@ -36,7 +36,7 @@ class FlashAttentionBackwardPostprocess:
         self,
         dtype: Type[cutlass.Numeric],
         head_dim: int,
-        arch: Literal[80, 90, 100],
         tile_m: int = 128,
         num_threads: int = 256,
         AtomLayoutMdQ: int = 1,
@@ -52,8 +52,8 @@ class FlashAttentionBackwardPostprocess:
         """
         self.dtype = dtype
         self.tile_m = tile_m
-        assert arch // 10 in [8, 9, 10, 11], (
-            "Only Ampere (8.x), Hopper (9.x), and Blackwell (10.x, 11.x) are supported"
         )
         self.arch = arch
         # padding head_dim to a multiple of 32 as k_block_size
@@ -63,7 +63,7 @@ class FlashAttentionBackwardPostprocess:
         self.num_threads = num_threads
         self.AtomLayoutMdQ = AtomLayoutMdQ
         self.dQ_swapAB = dQ_swapAB
-        self.use_2cta_instrs = use_2cta_instrs and arch == 100 and head_dim != 64
         self.cluster_size = cluster_size
     @staticmethod
@@ -89,7 +89,7 @@ class FlashAttentionBackwardPostprocess:
         return True
     def _get_tiled_mma(self):
-        if const_expr(self.arch == 80):
             num_mma_warps = self.num_threads // 32
             atom_layout_dQ = (
                 (self.AtomLayoutMdQ, num_mma_warps // self.AtomLayoutMdQ, 1)
@@ -101,9 +101,9 @@ class FlashAttentionBackwardPostprocess:
                 atom_layout_dQ,
                 permutation_mnk=(atom_layout_dQ[0] * 16, atom_layout_dQ[1] * 16, 16),
             )
-        elif const_expr(self.arch == 90):
-            num_mma_warp_groups = self.num_threads // 128
-            atom_layout_dQ = (self.AtomLayoutMdQ, num_mma_warp_groups // self.AtomLayoutMdQ)
             tiler_mn_dQ = (self.tile_m // atom_layout_dQ[0], self.tile_hdim // atom_layout_dQ[1])
             tiled_mma = sm90_utils_basic.make_trivial_tiled_mma(
                 self.dtype,
@@ -125,7 +125,7 @@ class FlashAttentionBackwardPostprocess:
                 cta_group,
                 (self.tile_m, self.tile_hdim),
             )
-        if const_expr(self.arch in [80, 90]):
             assert self.num_threads == tiled_mma.size
         return tiled_mma
@@ -148,22 +148,22 @@ class FlashAttentionBackwardPostprocess:
             cute.make_layout(self.num_threads),
             cute.make_layout(async_copy_elems_accum),
         )
-        num_s2r_copy_elems = 1 if const_expr(self.arch == 80) else 4
-        if const_expr(self.arch == 80):
             self.s2r_tiled_copy_dQaccum = copy_utils.tiled_copy_1d(
                 Float32, self.num_threads, num_s2r_copy_elems
             )
             self.sdQaccum_layout = cute.make_layout(self.tile_m * self.tile_hdim)
-        elif const_expr(self.arch == 90):
             num_threads_per_warp_group = 128
-            num_mma_warp_groups = self.num_threads // 128
             self.s2r_tiled_copy_dQaccum = cute.make_tiled_copy_tv(
                 cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Float32, num_bits_per_copy=128),
-                cute.make_layout((num_threads_per_warp_group, num_mma_warp_groups)),  # thr_layout
                 cute.make_layout(128 // Float32.width),  # val_layout
             )
             self.sdQaccum_layout = cute.make_layout(
-                (self.tile_m * self.tile_hdim // num_mma_warp_groups, num_mma_warp_groups)
             )
         else:
             self.dQ_reduce_ncol = 32
@@ -188,14 +188,18 @@ class FlashAttentionBackwardPostprocess:
         # then setting kBlockKSmem to 32 will cause "Static shape_div failure".
         # We want to treat it as 64 x 48, so kBlockKSmem should be 16.
         mma_shape_n = self.tiled_mma.get_tile_size(1)
-        if const_expr(self.arch == 80):
             sdQ_layout_atom = sm80_utils.get_smem_layout_atom(self.dtype, mma_shape_n)
             self.sdQ_layout = cute.tile_to_shape(
                 sdQ_layout_atom, (self.tile_m, self.tile_hdim), (0, 1)
             )
-        elif const_expr(self.arch == 90):
             self.sdQ_layout = sm90_utils.make_smem_layout(
-                self.dtype, LayoutEnum.ROW_MAJOR, (self.tile_m, self.tile_hdim)
             )
         else:
             # TODO: this is hard-coded for hdim 128
@@ -211,7 +215,8 @@ class FlashAttentionBackwardPostprocess:
         scale: cutlass.Float32,
         mCuSeqlensQ: Optional[cute.Tensor],
         mSeqUsedQ: Optional[cute.Tensor],
-        stream: cuda.CUstream,
     ):
         # Get the data type and check if it is fp16 or bf16
         if const_expr(mdQ.element_type not in [cutlass.Float16, cutlass.BFloat16]):
@@ -305,7 +310,7 @@ class FlashAttentionBackwardPostprocess:
         smem = cutlass.utils.SmemAllocator()
         sdQaccum = smem.allocate_tensor(cutlass.Float32, sdQaccum_layout, byte_alignment=1024)
         sdQaccum_flat = cute.make_tensor(sdQaccum.iterator, cute.make_layout(cute.size(sdQaccum)))
-        if const_expr(self.arch in [80, 90]):
             sdQ = cute.make_tensor(cute.recast_ptr(sdQaccum.iterator, dtype=self.dtype), sdQ_layout)
         else:
             # extra stage dimension
@@ -343,10 +348,7 @@ class FlashAttentionBackwardPostprocess:
                 mdQaccum_cur = mdQaccum[batch_idx, head_idx, None]
                 head_dim = mdQ.shape[3]
             else:
-                if cutlass.const_expr(self.arch >= 90):
-                    padded_offset_q = seqlen.padded_offset_q
-                else:
-                    padded_offset_q = seqlen.offset_q + batch_idx * self.tile_m
                 mdQ_cur = cute.domain_offset((seqlen.offset_q, 0), mdQ[None, head_idx, None])
                 mdQaccum_cur = cute.domain_offset(
                     (padded_offset_q * self.tile_hdim,), mdQaccum[head_idx, None]
@@ -371,7 +373,7 @@ class FlashAttentionBackwardPostprocess:
             seqlen_q = seqlen.seqlen_q
             seqlen_q_rounded = cute.round_up(seqlen_q, self.tile_m)
-            if const_expr(self.arch == 100 and self.use_2cta_instrs):
                 # 2-CTA: remap dQaccum layout into TMEM view before writing sdQ
                 num_reduce_threads = self.num_threads
                 thr_mma_dsk = tiled_mma.get_slice(tidx)
@@ -502,7 +504,7 @@ class FlashAttentionBackwardPostprocess:
                 tile_shape = (self.tile_m, self.tile_hdim)
                 acc = None
                 tiled_copy_t2r = None
-                if const_expr(self.arch in [80, 90]):
                     acc_shape = tiled_mma.partition_shape_C(
                         tile_shape if const_expr(not dQ_swapAB) else tile_shape[::-1]
                     )
@@ -531,7 +533,7 @@ class FlashAttentionBackwardPostprocess:
                 # Step 3: Copy dQ from register to smem
                 cute.arch.barrier()  # make sure all threads have finished loading dQaccum
-                if const_expr(self.arch in [80, 90]):
                     copy_atom_r2s_dQ = utils.get_smem_store_atom(
                         self.arch, self.dtype, transpose=self.dQ_swapAB
                     )
@@ -553,7 +555,7 @@ class FlashAttentionBackwardPostprocess:
                     )
                 thr_copy_r2s_dQ = tiled_copy_r2s_dQ.get_slice(tidx)
                 cdQ = cute.make_identity_tensor((self.tile_m, self.tile_hdim))
-                if const_expr(self.arch in [80, 90]):
                     taccdQrdQ = thr_copy_r2s_dQ.retile(rdQ)
                 else:
                     taccdQcdQ_shape = thr_copy_r2s_dQ.partition_S(cdQ).shape

 # A reimplementation of https://github.com/Dao-AILab/flash-attention/blob/main/hopper/flash_bwd_postprocess_kernel.h
 # from Cutlass C++ to Cute-DSL.
 import math
+from typing import Callable, Optional, Type
 import cuda.bindings.driver as cuda
         self,
         dtype: Type[cutlass.Numeric],
         head_dim: int,
+        arch: int,
         tile_m: int = 128,
         num_threads: int = 256,
         AtomLayoutMdQ: int = 1,
         """
         self.dtype = dtype
         self.tile_m = tile_m
+        assert arch // 10 in [8, 9, 10, 11, 12], (
+            "Only Ampere (8.x), Hopper (9.x), and Blackwell (10.x, 11.x, 12.x) are supported"
         )
         self.arch = arch
         # padding head_dim to a multiple of 32 as k_block_size
         self.num_threads = num_threads
         self.AtomLayoutMdQ = AtomLayoutMdQ
         self.dQ_swapAB = dQ_swapAB
+        self.use_2cta_instrs = use_2cta_instrs and arch // 10 == 10 and head_dim != 64
         self.cluster_size = cluster_size
     @staticmethod
         return True
     def _get_tiled_mma(self):
+        if const_expr(self.arch // 10 in [8, 12]):
             num_mma_warps = self.num_threads // 32
             atom_layout_dQ = (
                 (self.AtomLayoutMdQ, num_mma_warps // self.AtomLayoutMdQ, 1)
                 atom_layout_dQ,
                 permutation_mnk=(atom_layout_dQ[0] * 16, atom_layout_dQ[1] * 16, 16),
             )
+        elif const_expr(self.arch // 10 == 9):
+            num_wg_mma = self.num_threads // 128
+            atom_layout_dQ = (self.AtomLayoutMdQ, num_wg_mma // self.AtomLayoutMdQ)
             tiler_mn_dQ = (self.tile_m // atom_layout_dQ[0], self.tile_hdim // atom_layout_dQ[1])
             tiled_mma = sm90_utils_basic.make_trivial_tiled_mma(
                 self.dtype,
                 cta_group,
                 (self.tile_m, self.tile_hdim),
             )
+        if const_expr(self.arch // 10 in [8, 9, 12]):
             assert self.num_threads == tiled_mma.size
         return tiled_mma
             cute.make_layout(self.num_threads),
             cute.make_layout(async_copy_elems_accum),
         )
+        num_s2r_copy_elems = 1 if const_expr(self.arch // 10 in [8, 12]) else 4
+        if const_expr(self.arch // 10 in [8, 12]):
             self.s2r_tiled_copy_dQaccum = copy_utils.tiled_copy_1d(
                 Float32, self.num_threads, num_s2r_copy_elems
             )
             self.sdQaccum_layout = cute.make_layout(self.tile_m * self.tile_hdim)
+        elif const_expr(self.arch // 10 == 9):
             num_threads_per_warp_group = 128
+            num_wg_mma = self.num_threads // 128
             self.s2r_tiled_copy_dQaccum = cute.make_tiled_copy_tv(
                 cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Float32, num_bits_per_copy=128),
+                cute.make_layout((num_threads_per_warp_group, num_wg_mma)),  # thr_layout
                 cute.make_layout(128 // Float32.width),  # val_layout
             )
             self.sdQaccum_layout = cute.make_layout(
+                (self.tile_m * self.tile_hdim // num_wg_mma, num_wg_mma)
             )
         else:
             self.dQ_reduce_ncol = 32
         # then setting kBlockKSmem to 32 will cause "Static shape_div failure".
         # We want to treat it as 64 x 48, so kBlockKSmem should be 16.
         mma_shape_n = self.tiled_mma.get_tile_size(1)
+        if const_expr(self.arch // 10 in [8, 12]):
             sdQ_layout_atom = sm80_utils.get_smem_layout_atom(self.dtype, mma_shape_n)
             self.sdQ_layout = cute.tile_to_shape(
                 sdQ_layout_atom, (self.tile_m, self.tile_hdim), (0, 1)
             )
+        elif const_expr(self.arch // 10 == 9):
+            wg_d_dQ = num_wg_mma // self.AtomLayoutMdQ
             self.sdQ_layout = sm90_utils.make_smem_layout(
+                self.dtype,
+                LayoutEnum.ROW_MAJOR,
+                (self.tile_m, self.tile_hdim),
+                major_mode_size=self.tile_hdim // wg_d_dQ,
             )
         else:
             # TODO: this is hard-coded for hdim 128
         scale: cutlass.Float32,
         mCuSeqlensQ: Optional[cute.Tensor],
         mSeqUsedQ: Optional[cute.Tensor],
+        # Always keep stream as the last parameter (EnvStream: obtained implicitly via TVM FFI).
+        stream: cuda.CUstream = None,
     ):
         # Get the data type and check if it is fp16 or bf16
         if const_expr(mdQ.element_type not in [cutlass.Float16, cutlass.BFloat16]):
         smem = cutlass.utils.SmemAllocator()
         sdQaccum = smem.allocate_tensor(cutlass.Float32, sdQaccum_layout, byte_alignment=1024)
         sdQaccum_flat = cute.make_tensor(sdQaccum.iterator, cute.make_layout(cute.size(sdQaccum)))
+        if const_expr(self.arch // 10 in [8, 9, 12]):
             sdQ = cute.make_tensor(cute.recast_ptr(sdQaccum.iterator, dtype=self.dtype), sdQ_layout)
         else:
             # extra stage dimension
                 mdQaccum_cur = mdQaccum[batch_idx, head_idx, None]
                 head_dim = mdQ.shape[3]
             else:
+                padded_offset_q = seqlen.padded_offset_q
                 mdQ_cur = cute.domain_offset((seqlen.offset_q, 0), mdQ[None, head_idx, None])
                 mdQaccum_cur = cute.domain_offset(
                     (padded_offset_q * self.tile_hdim,), mdQaccum[head_idx, None]
             seqlen_q = seqlen.seqlen_q
             seqlen_q_rounded = cute.round_up(seqlen_q, self.tile_m)
+            if const_expr(self.arch // 10 == 10 and self.use_2cta_instrs):
                 # 2-CTA: remap dQaccum layout into TMEM view before writing sdQ
                 num_reduce_threads = self.num_threads
                 thr_mma_dsk = tiled_mma.get_slice(tidx)
                 tile_shape = (self.tile_m, self.tile_hdim)
                 acc = None
                 tiled_copy_t2r = None
+                if const_expr(self.arch // 10 in [8, 9, 12]):
                     acc_shape = tiled_mma.partition_shape_C(
                         tile_shape if const_expr(not dQ_swapAB) else tile_shape[::-1]
                     )
                 # Step 3: Copy dQ from register to smem
                 cute.arch.barrier()  # make sure all threads have finished loading dQaccum
+                if const_expr(self.arch // 10 in [8, 9, 12]):
                     copy_atom_r2s_dQ = utils.get_smem_store_atom(
                         self.arch, self.dtype, transpose=self.dQ_swapAB
                     )
                     )
                 thr_copy_r2s_dQ = tiled_copy_r2s_dQ.get_slice(tidx)
                 cdQ = cute.make_identity_tensor((self.tile_m, self.tile_hdim))
+                if const_expr(self.arch // 10 in [8, 9, 12]):
                     taccdQrdQ = thr_copy_r2s_dQ.retile(rdQ)
                 else:
                     taccdQcdQ_shape = thr_copy_r2s_dQ.partition_S(cdQ).shape

build/torch-cuda/flash_bwd_preprocess.py CHANGED Viewed

@@ -1,21 +1,32 @@
 # Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
 # A reimplementation of https://github.com/Dao-AILab/flash-attention/blob/main/hopper/flash_bwd_preprocess_kernel.h
 # from Cutlass C++ to Cute-DSL.
 import math
 import operator
-from typing import Callable, Type, Optional, Literal
 import cuda.bindings.driver as cuda
 import cutlass
 import cutlass.cute as cute
-from cutlass import Float32
-from .quack import copy_utils
 from . import utils
-from .cute_dsl_utils import assume_tensor_aligned
-from .seqlen_info import SeqlenInfoQK
 from .quack.cute_dsl_utils import ParamsBase
 from .tile_scheduler import (
     SingleTileScheduler,
@@ -30,9 +41,8 @@ class FlashAttentionBackwardPreprocess:
         dtype: Type[cutlass.Numeric],
         head_dim: int,
         head_dim_v: int,
-        arch: Literal[80, 90, 100],
-        m_block_size: int = 128,
-        num_threads: int = 128,
     ):
         """
         All contiguous dimensions must be at least 16 bytes aligned which indicates the head dimension
@@ -40,14 +50,14 @@ class FlashAttentionBackwardPreprocess:
         :param head_dim: head dimension
         :type head_dim: int
-        :param m_block_size: m block size
-        :type m_block_size: int
         :param num_threads: number of threads
         :type num_threads: int
         """
         self.dtype = dtype
-        self.m_block_size = m_block_size
-        self.arch = arch
         # padding head_dim to a multiple of 32 as k_block_size
         hdim_multiple_of = 32
         self.head_dim_padded = int(math.ceil(head_dim / hdim_multiple_of) * hdim_multiple_of)
@@ -56,15 +66,15 @@ class FlashAttentionBackwardPreprocess:
         self.num_threads = num_threads
     @staticmethod
-    def can_implement(dtype, head_dim, m_block_size, num_threads) -> bool:
         """Check if the kernel can be implemented with the given parameters.
         :param dtype: data type
         :type dtype: cutlass.Numeric
         :param head_dim: head dimension
         :type head_dim: int
-        :param m_block_size: m block size
-        :type m_block_size: int
         :param num_threads: number of threads
         :type num_threads: int
@@ -77,7 +87,7 @@ class FlashAttentionBackwardPreprocess:
             return False
         if num_threads % 32 != 0:
             return False
-        if num_threads < m_block_size:  # For multiplying lse with log2
             return False
         return True
@@ -105,7 +115,7 @@ class FlashAttentionBackwardPreprocess:
         universal_copy_bits = 128
         num_copy_elems_dQaccum = universal_copy_bits // Float32.width
         assert (
-            self.m_block_size * self.head_dim_padded // num_copy_elems_dQaccum
         ) % self.num_threads == 0
         self.gmem_tiled_copy_dQaccum = copy_utils.tiled_copy_1d(
             Float32, self.num_threads, num_copy_elems_dQaccum
@@ -114,38 +124,53 @@ class FlashAttentionBackwardPreprocess:
     @cute.jit
     def __call__(
         self,
-        mO: cute.Tensor,
-        mdO: cute.Tensor,
-        mdPsum: cute.Tensor,
-        mLSE: Optional[cute.Tensor],
-        mLSElog2: Optional[cute.Tensor],
         mdQaccum: Optional[cute.Tensor],
-        mCuSeqlensQ: Optional[cute.Tensor],
-        mSeqUsedQ: Optional[cute.Tensor],
-        stream: cuda.CUstream,
     ):
         # Get the data type and check if it is fp16 or bf16
-        if cutlass.const_expr(not (mO.element_type == mdO.element_type)):
             raise TypeError("All tensors must have the same data type")
-        if cutlass.const_expr(mO.element_type not in [cutlass.Float16, cutlass.BFloat16]):
             raise TypeError("Only Float16 or BFloat16 is supported")
-        if cutlass.const_expr(mdPsum.element_type not in [Float32]):
-            raise TypeError("dPsum tensor must be Float32")
-        if cutlass.const_expr(mdQaccum is not None):
-            if cutlass.const_expr(mdQaccum.element_type not in [Float32]):
                 raise TypeError("dQaccum tensor must be Float32")
-        if cutlass.const_expr(mLSE is not None):
             assert mLSElog2 is not None, "If mLSE is provided, mLSElog2 must also be provided"
-            if cutlass.const_expr(mLSE.element_type not in [Float32]):
                 raise TypeError("LSE tensor must be Float32")
-            if cutlass.const_expr(mLSElog2.element_type not in [Float32]):
                 raise TypeError("LSElog2 tensor must be Float32")
-        mO, mdO, mdQaccum = [assume_tensor_aligned(t) for t in (mO, mdO, mdQaccum)]
         self._setup_attributes()
-        if cutlass.const_expr(mCuSeqlensQ is not None):
             TileScheduler = SingleTileVarlenScheduler
             num_head = mO.shape[1]
             num_batch = mCuSeqlensQ.shape[0] - 1
@@ -155,7 +180,7 @@ class FlashAttentionBackwardPreprocess:
             num_batch = mO.shape[0]
         tile_sched_args = TileSchedulerArguments(
-            num_block=cute.ceil_div(mO.shape[1], self.m_block_size),
             num_head=num_head,
             num_batch=num_batch,
             num_splits=1,
@@ -163,7 +188,7 @@ class FlashAttentionBackwardPreprocess:
             headdim=0,
             headdim_v=mO.shape[2],
             total_q=mO.shape[0],
-            tile_shape_mn=(self.m_block_size, 1),
             mCuSeqlensQ=mCuSeqlensQ,
             mSeqUsedQ=mSeqUsedQ,
         )
@@ -174,12 +199,13 @@ class FlashAttentionBackwardPreprocess:
         self.kernel(
             mO,
             mdO,
-            mdPsum,
             mLSE,
             mLSElog2,
             mdQaccum,
             mCuSeqlensQ,
             mSeqUsedQ,
             self.gmem_tiled_copy_O,
             self.gmem_tiled_copy_dQaccum,
             tile_sched_params,
@@ -188,6 +214,7 @@ class FlashAttentionBackwardPreprocess:
             grid=grid_dim,
             block=[self.num_threads, 1, 1],
             stream=stream,
         )
     @cute.kernel
@@ -195,12 +222,13 @@ class FlashAttentionBackwardPreprocess:
         self,
         mO: cute.Tensor,
         mdO: cute.Tensor,
-        mdPsum: cute.Tensor,
         mLSE: Optional[cute.Tensor],
         mLSElog2: Optional[cute.Tensor],
         mdQaccum: Optional[cute.Tensor],
         mCuSeqlensQ: Optional[cute.Tensor],
         mSeqUsedQ: Optional[cute.Tensor],
         gmem_tiled_copy_O: cute.TiledCopy,
         gmem_tiled_copy_dQaccum: cute.TiledCopy,
         tile_sched_params: ParamsBase,
@@ -217,145 +245,106 @@ class FlashAttentionBackwardPreprocess:
             # ///////////////////////////////////////////////////////////////////////////////
             # Get the appropriate tiles for this thread block.
             # ///////////////////////////////////////////////////////////////////////////////
-            seqlen = SeqlenInfoQK.create(
-                batch_idx,
-                mO.shape[1],
-                0,
-                mCuSeqlensQ=mCuSeqlensQ,
-                mCuSeqlensK=None,
-                mSeqUsedQ=mSeqUsedQ,
-                mSeqUsedK=None,
             )
-            if cutlass.const_expr(not seqlen.has_cu_seqlens_q):
-                mO_cur = mO[batch_idx, None, head_idx, None]
-                mdO_cur = mdO[batch_idx, None, head_idx, None]
-                mdPsum_cur = mdPsum[batch_idx, head_idx, None]
-                headdim_v = mO.shape[3]
-            else:
-                mO_cur = cute.domain_offset((seqlen.offset_q, 0), mO[None, head_idx, None])
-                mdO_cur = cute.domain_offset((seqlen.offset_q, 0), mdO[None, head_idx, None])
-                padded_offset_q = seqlen.offset_q + batch_idx * self.m_block_size
-                if cutlass.const_expr(self.arch >= 90):
-                    padded_offset_q = padded_offset_q // self.m_block_size * self.m_block_size
-                mdPsum_cur = cute.domain_offset((padded_offset_q,), mdPsum[head_idx, None])
-                headdim_v = mO.shape[2]
-            blkOdO_shape = (self.m_block_size, self.head_dim_v_padded)
-            # (m_block_size, head_dim_v)
-            gO = cute.local_tile(mO_cur, blkOdO_shape, (m_block, 0))
-            gdO = cute.local_tile(mdO_cur, blkOdO_shape, (m_block, 0))
             gmem_thr_copy_O = gmem_tiled_copy_O.get_slice(tidx)
             # (CPY_Atom, CPY_M, CPY_K)
             tOgO = gmem_thr_copy_O.partition_S(gO)
             tOgdO = gmem_thr_copy_O.partition_S(gdO)
-            # ///////////////////////////////////////////////////////////////////////////////
-            # Predicate: Mark indices that need to copy when problem_shape isn't a multiple
-            # of tile_shape
-            # ///////////////////////////////////////////////////////////////////////////////
-            # Construct identity layout for KV
-            cO = cute.make_identity_tensor((self.m_block_size, self.head_dim_v_padded))
             tOcO = gmem_thr_copy_O.partition_S(cO)
             t0OcO = gmem_thr_copy_O.get_slice(0).partition_S(cO)
-            tOpO = utils.predicate_k(tOcO, limit=headdim_v)
-            tOpdO = utils.predicate_k(tOcO, limit=headdim_v)
-            seqlen_q = seqlen.seqlen_q
-            seqlen_q_rounded = cute.round_up(seqlen_q, self.m_block_size)
-            if cutlass.const_expr(mLSE is not None):
-                if cutlass.const_expr(not seqlen.has_cu_seqlens_q):
-                    mLSE_cur = mLSE[batch_idx, head_idx, None]
-                else:
-                    mLSE_cur = cute.domain_offset((seqlen.offset_q,), mLSE[head_idx, None])
-                gLSE = cute.local_tile(mLSE_cur, (self.m_block_size,), (m_block,))
-                lse = Float32.inf
-                if tidx < seqlen_q - m_block * self.m_block_size:
-                    lse = gLSE[tidx]
-            tOrO = cute.make_fragment_like(tOgO)
-            tOrdO = cute.make_fragment_like(tOgdO)
-            assert cute.size(tOgO, mode=[0]) == cute.size(tOgdO, mode=[0])
-            assert cute.size(tOgO, mode=[1]) == cute.size(tOgdO, mode=[1])
-            assert cute.size(tOgO, mode=[2]) == cute.size(tOgdO, mode=[2])
             for m in cutlass.range(cute.size(tOrO.shape[1]), unroll_full=True):
-                # Instead of using tOcO, we using t0OcO and subtract the offset from the limit
-                # (seqlen_q - m_block * kBlockM). This is because the entries of t0OcO are known at compile time.
-                if t0OcO[0, m, 0][0] < seqlen_q - m_block * self.m_block_size - tOcO[0][0]:
-                    cute.copy(
-                        gmem_thr_copy_O,
-                        tOgO[None, m, None],
-                        tOrO[None, m, None],
-                        pred=tOpO[None, m, None]
-                        if cutlass.const_expr(self.check_hdim_v_oob)
-                        else None,
-                    )
-                    cute.copy(
-                        gmem_thr_copy_O,
-                        tOgdO[None, m, None],
-                        tOrdO[None, m, None],
-                        pred=tOpdO[None, m, None]
-                        if cutlass.const_expr(self.check_hdim_v_oob)
-                        else None,
-                    )
             # Sum across the "k" dimension
-            dpsum = (tOrO.load().to(Float32) * tOrdO.load().to(Float32)).reduce(
                 cute.ReductionOp.ADD, init_val=0.0, reduction_profile=(0, None, 1)
             )
             threads_per_row = gmem_tiled_copy_O.layout_src_tv_tiled[0].shape[0]
             assert cute.arch.WARP_SIZE % threads_per_row == 0
-            dpsum = utils.warp_reduce(dpsum, operator.add, width=threads_per_row)
-            dP_sum = cute.make_fragment(cute.size(tOrO, mode=[1]), Float32)
-            dP_sum.store(dpsum)
-            # Write dPsum from rmem -> gmem
-            gdPsum = cute.local_tile(mdPsum_cur, (self.m_block_size,), (m_block,))
-            # Only the thread corresponding to column 0 writes out the dPsum to gmem
             if tOcO[0, 0, 0][1] == 0:
-                for m in cutlass.range(cute.size(dP_sum), unroll_full=True):
                     row = tOcO[0, m, 0][0]
-                    gdPsum[row] = dP_sum[m] if row < seqlen_q - m_block * self.m_block_size else 0.0
             # Clear dQaccum
-            if cutlass.const_expr(mdQaccum is not None):
-                if cutlass.const_expr(not seqlen.has_cu_seqlens_q):
-                    mdQaccum_cur = mdQaccum[batch_idx, head_idx, None]
-                else:
-                    mdQaccum_cur = cute.domain_offset(
-                        (padded_offset_q * self.head_dim_padded,), mdQaccum[head_idx, None]
-                    )
-                    # HACK: Compiler doesn't seem to recognize that padding
-                    # by padded_offset_q * self.head_dim_padded keeps alignment
-                    # since statically divisible by 4
-                    mdQaccum_cur_ptr = cute.make_ptr(
-                        dtype=mdQaccum_cur.element_type,
-                        value=mdQaccum_cur.iterator.toint(),
-                        mem_space=mdQaccum_cur.iterator.memspace,
-                        assumed_align=mdQaccum.iterator.alignment,
-                    )
-                    mdQaccum_cur = cute.make_tensor(mdQaccum_cur_ptr, mdQaccum_cur.layout)
-                blkdQaccum_shape = (self.m_block_size * self.head_dim_padded,)
                 gdQaccum = cute.local_tile(mdQaccum_cur, blkdQaccum_shape, (m_block,))
                 gmem_thr_copy_dQaccum = gmem_tiled_copy_dQaccum.get_slice(tidx)
                 tdQgdQaccum = gmem_thr_copy_dQaccum.partition_S(gdQaccum)
-                zero = cute.make_fragment_like(tdQgdQaccum)
                 zero.fill(0.0)
                 cute.copy(gmem_tiled_copy_dQaccum, zero, tdQgdQaccum)
-            if cutlass.const_expr(mLSE is not None):
-                if cutlass.const_expr(not seqlen.has_cu_seqlens_q):
-                    mLSElog2_cur = mLSElog2[batch_idx, head_idx, None]
-                else:
-                    mLSElog2_cur = cute.domain_offset((padded_offset_q,), mLSElog2[head_idx, None])
-                gLSElog2 = cute.local_tile(mLSElog2_cur, (self.m_block_size,), (m_block,))
                 LOG2_E = math.log2(math.e)
-                if tidx < seqlen_q_rounded - m_block * self.m_block_size:
                     gLSElog2[tidx] = lse * LOG2_E if lse != -Float32.inf else 0.0

 # Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
 # A reimplementation of https://github.com/Dao-AILab/flash-attention/blob/main/hopper/flash_bwd_preprocess_kernel.h
 # from Cutlass C++ to Cute-DSL.
+#
+# Computes D_i = (dO_i * O_i).sum(dim=-1), optionally adjusted for LSE gradient:
+#   D'_i = D_i - dLSE_i
+# This works because in the backward pass:
+#   dS_ij = P_ij * (dP_ij - D_i)                     [standard]
+# When LSE is differentiable, d(loss)/d(S_ij) gets an extra term dLSE_i * P_ij
+# (since d(LSE_i)/d(S_ij) = P_ij), giving:
+#   dS_ij = P_ij * (dP_ij - D_i) + dLSE_i * P_ij
+#         = P_ij * (dP_ij - (D_i - dLSE_i))
+# So the main backward kernel is unchanged; we just replace D with D' = D - dLSE here.
 import math
 import operator
+from functools import partial
+from typing import Callable, Type, Optional
 import cuda.bindings.driver as cuda
 import cutlass
 import cutlass.cute as cute
+from cutlass import Float32, const_expr
+from cutlass.cutlass_dsl import Arch, BaseDSL
+from .quack import copy_utils, layout_utils
 from . import utils
+from .seqlen_info import SeqlenInfo
 from .quack.cute_dsl_utils import ParamsBase
 from .tile_scheduler import (
     SingleTileScheduler,
         dtype: Type[cutlass.Numeric],
         head_dim: int,
         head_dim_v: int,
+        tile_m: int = 128,
+        num_threads: int = 256,
     ):
         """
         All contiguous dimensions must be at least 16 bytes aligned which indicates the head dimension
         :param head_dim: head dimension
         :type head_dim: int
+        :param tile_m: m block size
+        :type tile_m: int
         :param num_threads: number of threads
         :type num_threads: int
         """
+        self.use_pdl = BaseDSL._get_dsl().get_arch_enum() >= Arch.sm_90a
         self.dtype = dtype
+        self.tile_m = tile_m
         # padding head_dim to a multiple of 32 as k_block_size
         hdim_multiple_of = 32
         self.head_dim_padded = int(math.ceil(head_dim / hdim_multiple_of) * hdim_multiple_of)
         self.num_threads = num_threads
     @staticmethod
+    def can_implement(dtype, head_dim, tile_m, num_threads) -> bool:
         """Check if the kernel can be implemented with the given parameters.
         :param dtype: data type
         :type dtype: cutlass.Numeric
         :param head_dim: head dimension
         :type head_dim: int
+        :param tile_m: m block size
+        :type tile_m: int
         :param num_threads: number of threads
         :type num_threads: int
             return False
         if num_threads % 32 != 0:
             return False
+        if num_threads < tile_m:  # For multiplying lse with log2
             return False
         return True
         universal_copy_bits = 128
         num_copy_elems_dQaccum = universal_copy_bits // Float32.width
         assert (
+            self.tile_m * self.head_dim_padded // num_copy_elems_dQaccum
         ) % self.num_threads == 0
         self.gmem_tiled_copy_dQaccum = copy_utils.tiled_copy_1d(
             Float32, self.num_threads, num_copy_elems_dQaccum
     @cute.jit
     def __call__(
         self,
+        mO: cute.Tensor,  # (batch, seqlen, nheads, head_dim_v) or (total_q, nheads, head_dim_v)
+        mdO: cute.Tensor,  # same shape as mO
+        mPdPsum: cute.Tensor,  # (batch, nheads, seqlen_padded) or (nheads, total_q_padded)
+        mLSE: Optional[cute.Tensor],  # (batch, nheads, seqlen) or (nheads, total_q)
+        mLSElog2: Optional[cute.Tensor],  # same shape as mPdPsum
+        # (batch, nheads, seqlen_padded * head_dim_v) or (nheads, total_q_padded * head_dim_v)
         mdQaccum: Optional[cute.Tensor],
+        mCuSeqlensQ: Optional[cute.Tensor],  # (batch + 1,)
+        mSeqUsedQ: Optional[cute.Tensor],  # (batch,)
+        mdLSE: Optional[cute.Tensor],  # (batch, nheads, seqlen) or (nheads, total_q)
+        # Always keep stream as the last parameter (EnvStream: obtained implicitly via TVM FFI).
+        stream: cuda.CUstream = None,
     ):
         # Get the data type and check if it is fp16 or bf16
+        if const_expr(not (mO.element_type == mdO.element_type)):
             raise TypeError("All tensors must have the same data type")
+        if const_expr(mO.element_type not in [cutlass.Float16, cutlass.BFloat16]):
             raise TypeError("Only Float16 or BFloat16 is supported")
+        if const_expr(mPdPsum.element_type not in [Float32]):
+            raise TypeError("PdPsum tensor must be Float32")
+        if const_expr(mdQaccum is not None):
+            if const_expr(mdQaccum.element_type not in [Float32]):
                 raise TypeError("dQaccum tensor must be Float32")
+        if const_expr(mLSE is not None):
             assert mLSElog2 is not None, "If mLSE is provided, mLSElog2 must also be provided"
+            if const_expr(mLSE.element_type not in [Float32]):
                 raise TypeError("LSE tensor must be Float32")
+            if const_expr(mLSElog2.element_type not in [Float32]):
                 raise TypeError("LSElog2 tensor must be Float32")
+        if const_expr(mdLSE is not None):
+            if const_expr(mdLSE.element_type not in [Float32]):
+                raise TypeError("dLSE tensor must be Float32")
         self._setup_attributes()
+        # (batch, nheads, seqlen) -> (seqlen, nheads, batch) or (nheads, total_q) -> (total_q, nheads)
+        transpose = [2, 1, 0] if const_expr(mCuSeqlensQ is None) else [1, 0]
+        mPdPsum = layout_utils.select(mPdPsum, transpose)
+        if const_expr(mLSE is not None):
+            mLSE = layout_utils.select(mLSE, transpose)
+            mLSElog2 = layout_utils.select(mLSElog2, transpose)
+        if const_expr(mdLSE is not None):
+            mdLSE = layout_utils.select(mdLSE, transpose)
+        if const_expr(mdQaccum is not None):
+            mdQaccum = layout_utils.select(mdQaccum, transpose)
+        if const_expr(mCuSeqlensQ is not None):
             TileScheduler = SingleTileVarlenScheduler
             num_head = mO.shape[1]
             num_batch = mCuSeqlensQ.shape[0] - 1
             num_batch = mO.shape[0]
         tile_sched_args = TileSchedulerArguments(
+            num_block=cute.ceil_div(mO.shape[1], self.tile_m),
             num_head=num_head,
             num_batch=num_batch,
             num_splits=1,
             headdim=0,
             headdim_v=mO.shape[2],
             total_q=mO.shape[0],
+            tile_shape_mn=(self.tile_m, 1),
             mCuSeqlensQ=mCuSeqlensQ,
             mSeqUsedQ=mSeqUsedQ,
         )
         self.kernel(
             mO,
             mdO,
+            mPdPsum,
             mLSE,
             mLSElog2,
             mdQaccum,
             mCuSeqlensQ,
             mSeqUsedQ,
+            mdLSE,
             self.gmem_tiled_copy_O,
             self.gmem_tiled_copy_dQaccum,
             tile_sched_params,
             grid=grid_dim,
             block=[self.num_threads, 1, 1],
             stream=stream,
+            use_pdl=self.use_pdl,
         )
     @cute.kernel
         self,
         mO: cute.Tensor,
         mdO: cute.Tensor,
+        mPdPsum: cute.Tensor,
         mLSE: Optional[cute.Tensor],
         mLSElog2: Optional[cute.Tensor],
         mdQaccum: Optional[cute.Tensor],
         mCuSeqlensQ: Optional[cute.Tensor],
         mSeqUsedQ: Optional[cute.Tensor],
+        mdLSE: Optional[cute.Tensor],
         gmem_tiled_copy_O: cute.TiledCopy,
         gmem_tiled_copy_dQaccum: cute.TiledCopy,
         tile_sched_params: ParamsBase,
             # ///////////////////////////////////////////////////////////////////////////////
             # Get the appropriate tiles for this thread block.
             # ///////////////////////////////////////////////////////////////////////////////
+            seqlen = SeqlenInfo.create(
+                batch_idx, mO.shape[1], mCuSeqlensQ, mSeqUsedQ, tile=self.tile_m
             )
+            mO_cur = seqlen.offset_batch(mO, batch_idx, dim=0)[None, head_idx, None]
+            mdO_cur = seqlen.offset_batch(mdO, batch_idx, dim=0)[None, head_idx, None]
+            mPdPsum_cur = seqlen.offset_batch(mPdPsum, batch_idx, dim=2, padded=True)[
+                None, head_idx
+            ]
+            headdim_v = mO_cur.shape[cute.rank(mO_cur) - 1]
+            seqlen_q = seqlen.seqlen
+            seqlen_q_rounded = cute.round_up(seqlen_q, self.tile_m)
+            seqlen_limit = seqlen_q - m_block * self.tile_m
+            lse = None
+            if const_expr(mLSE is not None):
+                mLSE_cur = seqlen.offset_batch(mLSE, batch_idx, dim=2)[None, head_idx]
+                gLSE = cute.local_tile(mLSE_cur, (self.tile_m,), (m_block,))
+                lse = Float32.inf
+                if tidx < seqlen_limit:
+                    lse = gLSE[tidx]
+            blk_shape = (self.tile_m, self.head_dim_v_padded)
+            gO = cute.local_tile(mO_cur, blk_shape, (m_block, 0))
+            gdO = cute.local_tile(mdO_cur, blk_shape, (m_block, 0))
             gmem_thr_copy_O = gmem_tiled_copy_O.get_slice(tidx)
             # (CPY_Atom, CPY_M, CPY_K)
             tOgO = gmem_thr_copy_O.partition_S(gO)
             tOgdO = gmem_thr_copy_O.partition_S(gdO)
+            cO = cute.make_identity_tensor(blk_shape)
             tOcO = gmem_thr_copy_O.partition_S(cO)
             t0OcO = gmem_thr_copy_O.get_slice(0).partition_S(cO)
+            tOpO = None
+            if const_expr(self.check_hdim_v_oob):
+                tOpO = copy_utils.predicate_k(tOcO, limit=headdim_v)
+            # Each copy will use the same predicate
+            copy = partial(copy_utils.copy, pred=tOpO)
+            tOrO = cute.make_rmem_tensor_like(tOgO)
+            tOrdO = cute.make_rmem_tensor_like(tOgdO)
+            if const_expr(self.check_hdim_v_oob):
+                tOrO.fill(0.0)
+                tOrdO.fill(0.0)
+            assert tOgO.shape == tOgdO.shape
             for m in cutlass.range(cute.size(tOrO.shape[1]), unroll_full=True):
+                # Instead of using tOcO, we use t0OcO and subtract the offset from the limit.
+                # This is because the entries of t0OcO are known at compile time.
+                if t0OcO[0, m, 0][0] < seqlen_limit - tOcO[0][0]:
+                    copy(tOgO[None, m, None], tOrO[None, m, None])
+                    copy(tOgdO[None, m, None], tOrdO[None, m, None])
+            # O and dO loads are done; signal that the next kernel can start.
+            # Correctness is ensured by griddepcontrol_wait() in bwd_sm90 before it reads our outputs.
+            if const_expr(self.use_pdl):
+                cute.arch.griddepcontrol_launch_dependents()
             # Sum across the "k" dimension
+            pdpsum = (tOrO.load().to(Float32) * tOrdO.load().to(Float32)).reduce(
                 cute.ReductionOp.ADD, init_val=0.0, reduction_profile=(0, None, 1)
             )
             threads_per_row = gmem_tiled_copy_O.layout_src_tv_tiled[0].shape[0]
             assert cute.arch.WARP_SIZE % threads_per_row == 0
+            pdpsum = utils.warp_reduce(pdpsum, operator.add, width=threads_per_row)
+            PdP_sum = cute.make_rmem_tensor(cute.size(tOrO, mode=[1]), Float32)
+            PdP_sum.store(pdpsum)
+            # If dLSE is provided, compute D' = D - dLSE (see the header comment at the top of this file for the derivation).
+            gdLSE = None
+            if const_expr(mdLSE is not None):
+                mdLSE_cur = seqlen.offset_batch(mdLSE, batch_idx, dim=2)[None, head_idx]
+                gdLSE = cute.local_tile(mdLSE_cur, (self.tile_m,), (m_block,))
+            # Write PdPsum from rmem -> gmem
+            gPdPsum = cute.local_tile(mPdPsum_cur, (self.tile_m,), (m_block,))
+            # Only the thread corresponding to column 0 writes out the PdPsum to gmem
             if tOcO[0, 0, 0][1] == 0:
+                for m in cutlass.range(cute.size(PdP_sum), unroll_full=True):
                     row = tOcO[0, m, 0][0]
+                    PdPsum_val = 0.0
+                    if row < seqlen_limit:
+                        PdPsum_val = PdP_sum[m]
+                        if const_expr(mdLSE is not None):
+                            PdPsum_val -= gdLSE[row]
+                    gPdPsum[row] = PdPsum_val
             # Clear dQaccum
+            if const_expr(mdQaccum is not None):
+                mdQaccum_cur = seqlen.offset_batch(
+                    mdQaccum, batch_idx, dim=2, padded=True, multiple=self.head_dim_padded
+                )[None, head_idx]
+                blkdQaccum_shape = (self.tile_m * self.head_dim_padded,)
                 gdQaccum = cute.local_tile(mdQaccum_cur, blkdQaccum_shape, (m_block,))
                 gmem_thr_copy_dQaccum = gmem_tiled_copy_dQaccum.get_slice(tidx)
                 tdQgdQaccum = gmem_thr_copy_dQaccum.partition_S(gdQaccum)
+                zero = cute.make_rmem_tensor_like(tdQgdQaccum)
                 zero.fill(0.0)
                 cute.copy(gmem_tiled_copy_dQaccum, zero, tdQgdQaccum)
+            if const_expr(mLSE is not None):
+                mLSElog2_cur = seqlen.offset_batch(mLSElog2, batch_idx, dim=2, padded=True)[
+                    None, head_idx
+                ]
+                gLSElog2 = cute.local_tile(mLSElog2_cur, (self.tile_m,), (m_block,))
                 LOG2_E = math.log2(math.e)
+                if tidx < seqlen_q_rounded - m_block * self.tile_m:
                     gLSElog2[tidx] = lse * LOG2_E if lse != -Float32.inf else 0.0

build/torch-cuda/flash_bwd_sm100.py CHANGED Viewed

@@ -84,7 +84,6 @@ class FlashAttentionBackwardSm100:
         self.use_2cta_instrs = bool(
             use_2cta_instrs
             and cluster_size == 2
-            and not is_local
             and score_mod is None
             and score_mod_bwd is None
             and mask_mod is None
@@ -453,7 +452,6 @@ class FlashAttentionBackwardSm100:
         mdK: cute.Tensor,
         mdV: cute.Tensor,
         softmax_scale: Float32,
-        stream: cuda.CUstream,
         mCuSeqlensQ: Optional[cute.Tensor] = None,
         mCuSeqlensK: Optional[cute.Tensor] = None,
         mSeqUsedQ: Optional[cute.Tensor] = None,
@@ -467,6 +465,8 @@ class FlashAttentionBackwardSm100:
         aux_tensors: Optional[list] = None,
         # Block-sparse tensors (Q direction - for iterating m_blocks per n_block):
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
     ):
         self.q_dtype = mQ.element_type
         self.k_dtype = mK.element_type
@@ -927,10 +927,6 @@ class FlashAttentionBackwardSm100:
                 "2-CTA mode does not support block sparsity. "
                 "Please create kernel with use_2cta_instrs=False for block sparse attention."
             )
-            assert window_size_left is None and window_size_right is None, (
-                "2-CTA mode does not support window attention. "
-                "Please create kernel with use_2cta_instrs=False for window attention."
-            )
         # 2-CTA: 231424 and 1-CTA: 232448
         # print("SMEM: ", self.shared_storage.size_in_bytes())
         if const_expr(self.use_block_sparsity or aux_tensors is not None):
@@ -3143,6 +3139,8 @@ class FlashAttentionBackwardSm100:
                     with cute.arch.elect_one():
                         pipeline_S_P.consumer_release(consumer_state_S_P_dP)
                         # pipeline_S_P.sync_object_empty.arrive(0, pipeline_S_P.consumer_mask)
                 pipeline_LSE.consumer_release(consumer_state_LSE)
                 consumer_state_LSE.advance()
                 # ---------------------------------------------
@@ -3253,6 +3251,8 @@ class FlashAttentionBackwardSm100:
                 cute.arch.fence_view_async_shared()
                 self.compute_sync_barrier.arrive_and_wait()
                 pipeline_dPsum.consumer_release(consumer_state_dPsum)
                 consumer_state_dPsum.advance()
                 # when 2cta hdim 128, pipeline_dS also signals S tmem load completion so is deferred
@@ -3650,6 +3650,9 @@ class FlashAttentionBackwardSm100:
             tile_scheduler.advance_to_next_work()
             work_tile = tile_scheduler.get_current_work()
     @cute.jit
     def epilogue_dKV(
         self,

         self.use_2cta_instrs = bool(
             use_2cta_instrs
             and cluster_size == 2
             and score_mod is None
             and score_mod_bwd is None
             and mask_mod is None
         mdK: cute.Tensor,
         mdV: cute.Tensor,
         softmax_scale: Float32,
         mCuSeqlensQ: Optional[cute.Tensor] = None,
         mCuSeqlensK: Optional[cute.Tensor] = None,
         mSeqUsedQ: Optional[cute.Tensor] = None,
         aux_tensors: Optional[list] = None,
         # Block-sparse tensors (Q direction - for iterating m_blocks per n_block):
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
+        # Always keep stream as the last parameter (EnvStream: obtained implicitly via TVM FFI).
+        stream: cuda.CUstream = None,
     ):
         self.q_dtype = mQ.element_type
         self.k_dtype = mK.element_type
                 "2-CTA mode does not support block sparsity. "
                 "Please create kernel with use_2cta_instrs=False for block sparse attention."
             )
         # 2-CTA: 231424 and 1-CTA: 232448
         # print("SMEM: ", self.shared_storage.size_in_bytes())
         if const_expr(self.use_block_sparsity or aux_tensors is not None):
                     with cute.arch.elect_one():
                         pipeline_S_P.consumer_release(consumer_state_S_P_dP)
                         # pipeline_S_P.sync_object_empty.arrive(0, pipeline_S_P.consumer_mask)
+                # Normally we'd need syncwarp here since only 1 thread will signal in
+                # consumer_release, but we already have the self.compute_sync_barrier before this
                 pipeline_LSE.consumer_release(consumer_state_LSE)
                 consumer_state_LSE.advance()
                 # ---------------------------------------------
                 cute.arch.fence_view_async_shared()
                 self.compute_sync_barrier.arrive_and_wait()
+                # Normally we'd need syncwarp here since only 1 thread will signal in
+                # consumer_release, but we already have the self.compute_sync_barrier before this
                 pipeline_dPsum.consumer_release(consumer_state_dPsum)
                 consumer_state_dPsum.advance()
                 # when 2cta hdim 128, pipeline_dS also signals S tmem load completion so is deferred
             tile_scheduler.advance_to_next_work()
             work_tile = tile_scheduler.get_current_work()
+        if const_expr(not self.deterministic):
+            cute.arch.cp_async_bulk_wait_group(0, read=True)
     @cute.jit
     def epilogue_dKV(
         self,

build/torch-cuda/flash_bwd_sm120.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+# SM120 (Blackwell GeForce / DGX Spark) backward pass.
+#
+# SM120 uses the same SM80-era MMA instructions (mma.sync.aligned.m16n8k16) but has
+# a smaller shared memory capacity (99 KB vs 163 KB on SM80). This module subclasses
+# FlashAttentionBackwardSm80 and overrides the SMEM capacity check accordingly.
+import cutlass
+import cutlass.utils as utils_basic
+from .flash_bwd import FlashAttentionBackwardSm80
+class FlashAttentionBackwardSm120(FlashAttentionBackwardSm80):
+    @staticmethod
+    def can_implement(
+        dtype,
+        head_dim,
+        head_dim_v,
+        m_block_size,
+        n_block_size,
+        num_stages_Q,
+        num_stages_dO,
+        num_threads,
+        is_causal,
+        V_in_regs=False,
+    ) -> bool:
+        """Check if the kernel can be implemented on SM120.
+        Same logic as SM80 but uses SM120's shared memory capacity (99 KB).
+        """
+        if dtype not in [cutlass.Float16, cutlass.BFloat16]:
+            return False
+        if head_dim % 8 != 0:
+            return False
+        if head_dim_v % 8 != 0:
+            return False
+        if n_block_size % 16 != 0:
+            return False
+        if num_threads % 32 != 0:
+            return False
+        # Shared memory usage: Q tile + dO tile + K tile + V tile
+        smem_usage_Q = m_block_size * head_dim * num_stages_Q * 2
+        smem_usage_dO = m_block_size * head_dim_v * num_stages_dO * 2
+        smem_usage_K = n_block_size * head_dim * 2
+        smem_usage_V = n_block_size * head_dim_v * 2
+        smem_usage_QV = (
+            (smem_usage_Q + smem_usage_V) if not V_in_regs else max(smem_usage_Q, smem_usage_V)
+        )
+        smem_usage = smem_usage_QV + smem_usage_dO + smem_usage_K
+        # SM120 has 99 KB shared memory (vs 163 KB on SM80)
+        smem_capacity = utils_basic.get_smem_capacity_in_bytes("sm_120")
+        if smem_usage > smem_capacity:
+            return False
+        return True

build/torch-cuda/flash_bwd_sm90.py CHANGED Viewed

@@ -24,7 +24,13 @@ from .seqlen_info import SeqlenInfoQK
 from .block_info import BlockInfo
 from . import pipeline
 from .quack.cute_dsl_utils import ParamsBase
-from .tile_scheduler import TileSchedulerArguments, SingleTileScheduler
 from .named_barrier import NamedBarrierBwd
 from .softmax import apply_score_mod_inner, apply_score_mod_bwd_inner
 from .block_sparsity import BlockSparseTensors
@@ -46,6 +52,8 @@ class FlashAttentionBackwardSm90:
         head_dim_v: Optional[int] = None,
         qhead_per_kvhead: int = 1,
         is_causal: bool = False,
         tile_m: int = 64,
         tile_n: int = 128,
         Q_stage: int = 2,
@@ -64,6 +72,7 @@ class FlashAttentionBackwardSm90:
         mask_mod: cutlass.Constexpr | None = None,
         has_aux_tensors: cutlass.Constexpr = False,
         subtile_factor: cutlass.Constexpr[int] = 1,
     ):
         self.dtype = dtype
         # padding head_dim to a multiple of 16 as k_block_size
@@ -77,7 +86,8 @@ class FlashAttentionBackwardSm90:
         self.check_hdim_v_oob = head_dim_v != self.tile_hdimv
         self.qhead_per_kvhead = qhead_per_kvhead
         self.is_causal = is_causal
-        self.is_local = False
         self.tile_m = tile_m
         self.tile_n = tile_n
         self.num_threads = num_threads
@@ -92,23 +102,23 @@ class FlashAttentionBackwardSm90:
         self.AtomLayoutMSdP = AtomLayoutMSdP
         self.AtomLayoutNdKV = AtomLayoutNdKV
         self.AtomLayoutMdQ = AtomLayoutMdQ
-        self.num_mma_warp_groups = (self.num_threads // 128) - 1
         self.mma_dkv_is_rs = (
             AtomLayoutMSdP == 1
-            and AtomLayoutNdKV == self.num_mma_warp_groups
             and SdP_swapAB
             and not dKV_swapAB
         )
         self.V_in_regs = V_in_regs
         if qhead_per_kvhead > 1:
             assert self.same_hdim_kv, "GQA backward requires head_dim == head_dim_v"
-            assert self.num_mma_warp_groups == 2, "GQA backward assumes 2 warp groups"
         # These are tuned for speed
         # Do we keep the LSE and dPsum in each thread, or split them across 8 threads that share
         # them and then shuffle to get the value whenever we need? This can reduce register
         # pressure when SdP_swapAB, where each thread needs to keep statistics for (kBlockM / 4)
         # rows. If !SdP_swapAB, each thread only needs to keep statistics for 2 rows.
-        # TODO: impl these for hdim 64
         self.shuffle_LSE = self.SdP_swapAB and self.tile_hdim <= 64
         self.shuffle_dPsum = self.SdP_swapAB and self.tile_hdim <= 64
@@ -124,6 +134,12 @@ class FlashAttentionBackwardSm90:
         else:
             self.vec_size: cutlass.Constexpr = 4
         self.qk_acc_dtype = Float32
     @staticmethod
     def can_implement(
@@ -182,32 +198,58 @@ class FlashAttentionBackwardSm90:
         assert mQ_type == self.dtype
     def _setup_attributes(self):
-        self.sQ_layout, self.sK_layout, self.sV_layout, self.sdO_layout, self.sPdS_layout = [
-            sm90_utils.make_smem_layout(self.dtype, LayoutEnum.ROW_MAJOR, shape, stage)
-            for shape, stage in [
-                ((self.tile_m, self.tile_hdim), self.Q_stage),
-                ((self.tile_n, self.tile_hdim), None),
-                ((self.tile_n, self.tile_hdimv), None),
-                ((self.tile_m, self.tile_hdimv), self.dO_stage),
-                ((self.tile_m, self.tile_n), self.PdS_stage),
             ]
         ]
         self.sdQaccum_layout = cute.make_layout(
-            (self.tile_m * self.tile_hdim // self.num_mma_warp_groups, self.num_mma_warp_groups)
         )
         # dQaccum R->S
         self.r2s_tiled_copy_dQaccum = cute.make_tiled_copy_tv(
             cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Float32, num_bits_per_copy=128),
             # thr_layout
-            cute.make_layout((self.num_threads_per_warp_group, self.num_mma_warp_groups)),
             cute.make_layout(128 // Float32.width),  # val_layout
         )
         # dKVaccum for GQA epilogue - reuses sV+sK memory recast as f32
         # TODO: assert that sVaccum and sKaccum don't overflow smem
     def _get_tiled_mma(self):
         # S = Q @ K.T, dP = dO @ V.T
-        atom_layout_SdP = (self.AtomLayoutMSdP, self.num_mma_warp_groups // self.AtomLayoutMSdP)
         tiler_mn_SdP = (self.tile_m // atom_layout_SdP[0], self.tile_n // atom_layout_SdP[1])
         tiled_mma_SdP = sm90_utils_basic.make_trivial_tiled_mma(
             self.dtype,
@@ -215,12 +257,11 @@ class FlashAttentionBackwardSm90:
             warpgroup.OperandMajorMode.K,
             warpgroup.OperandMajorMode.K,
             Float32,
-            atom_layout_mnk=(atom_layout_SdP if not self.SdP_swapAB else atom_layout_SdP[::-1])
-            + (1,),
-            tiler_mn=tiler_mn_SdP if not self.SdP_swapAB else tiler_mn_SdP[::-1],
         )
         # dV = P.T @ dO, dK = dS.T @ Q
-        atom_layout_dKV = (self.AtomLayoutNdKV, self.num_mma_warp_groups // self.AtomLayoutNdKV)
         tiler_mn_dK = (self.tile_n // atom_layout_dKV[0], self.tile_hdim // atom_layout_dKV[1])
         tiler_mn_dV = (self.tile_n // atom_layout_dKV[0], self.tile_hdimv // atom_layout_dKV[1])
         tiled_mma_dK, tiled_mma_dV = [
@@ -232,9 +273,8 @@ class FlashAttentionBackwardSm90:
                 else warpgroup.OperandMajorMode.K,
                 warpgroup.OperandMajorMode.MN,
                 Float32,
-                atom_layout_mnk=(atom_layout_dKV if not self.dKV_swapAB else atom_layout_dKV[::-1])
-                + (1,),
-                tiler_mn=tiler_mn_d if not self.dKV_swapAB else tiler_mn_d[::-1],
                 a_source=warpgroup.OperandSource.RMEM
                 if self.mma_dkv_is_rs
                 else warpgroup.OperandSource.SMEM,
@@ -242,7 +282,8 @@ class FlashAttentionBackwardSm90:
             for tiler_mn_d in (tiler_mn_dK, tiler_mn_dV)
         ]
         # dQ = dS @ K
-        atom_layout_dQ = (self.AtomLayoutMdQ, self.num_mma_warp_groups // self.AtomLayoutMdQ)
         tiler_mn_dQ = (self.tile_m // atom_layout_dQ[0], self.tile_hdim // atom_layout_dQ[1])
         tiled_mma_dQ = sm90_utils_basic.make_trivial_tiled_mma(
             self.dtype,
@@ -250,8 +291,8 @@ class FlashAttentionBackwardSm90:
             warpgroup.OperandMajorMode.K if not self.dQ_swapAB else warpgroup.OperandMajorMode.MN,
             warpgroup.OperandMajorMode.MN if not self.dQ_swapAB else warpgroup.OperandMajorMode.K,
             Float32,
-            atom_layout_mnk=(atom_layout_dQ if not self.dQ_swapAB else atom_layout_dQ[::-1]) + (1,),
-            tiler_mn=tiler_mn_dQ if not self.dQ_swapAB else tiler_mn_dQ[::-1],
         )
         return tiled_mma_SdP, tiled_mma_dK, tiled_mma_dV, tiled_mma_dQ
@@ -305,7 +346,6 @@ class FlashAttentionBackwardSm90:
         mdK: cute.Tensor,
         mdV: cute.Tensor,
         softmax_scale: Float32,
-        stream: cuda.CUstream,
         mCuSeqlensQ: Optional[cute.Tensor] = None,
         mCuSeqlensK: Optional[cute.Tensor] = None,
         mSeqUsedQ: Optional[cute.Tensor] = None,
@@ -318,10 +358,13 @@ class FlashAttentionBackwardSm90:
         mdV_semaphore: Optional[cute.Tensor] = None,
         aux_tensors: Optional[list] = None,
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
     ):
-        assert mdQ_semaphore is None and mdK_semaphore is None and mdV_semaphore is None, (
-            "determinism not supported yet for Sm90"
-        )
         self._check_type(
             *(
@@ -330,23 +373,36 @@ class FlashAttentionBackwardSm90:
             )
         )
         mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV = [
             assume_tensor_aligned(t) for t in (mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV)
         ]
-        layout_transpose = [1, 3, 2, 0]  # (b, s, n, h) --> (s, h, n, b)
-        mQ, mK, mV, mdO = [layout_utils.select(t, layout_transpose) for t in (mQ, mK, mV, mdO)]
         if const_expr(self.qhead_per_kvhead == 1):
-            mdK, mdV = [layout_utils.select(t, layout_transpose) for t in (mdK, mdV)]
         else:
-            accum_transpose = [2, 1, 0]  # (b, n, s*h) -> (s*h, n, b)
             mdK, mdV = [layout_utils.select(t, accum_transpose) for t in (mdK, mdV)]
-        LSE_dPsum_dQaccum_transpose = [2, 1, 0]  # (b, n, s) -> (s, n, b)
         mLSE, mdPsum, mdQaccum = [
             layout_utils.select(t, LSE_dPsum_dQaccum_transpose) for t in (mLSE, mdPsum, mdQaccum)
         ]
         tiled_mma_SdP, tiled_mma_dK, tiled_mma_dV, tiled_mma_dQ = self._get_tiled_mma()
         self.num_mma_threads = tiled_mma_SdP.size
         assert self.num_mma_threads + 128 == self.num_threads
@@ -354,10 +410,25 @@ class FlashAttentionBackwardSm90:
         self.num_threads_per_warp_group = 128
         self.num_producer_threads = 32
-        self.num_mma_regs = 240
-        self.num_producer_regs = 24
-        # self.num_mma_regs = 232
-        # self.num_producer_regs = 40
         self._setup_attributes()
         SharedStorage = self._get_shared_storage_cls()
@@ -374,7 +445,7 @@ class FlashAttentionBackwardSm90:
         self.tma_copy_bytes["LSE"] = self.tile_m * Float32.width // 8
         self.tma_copy_bytes["dPsum"] = self.tile_m * Float32.width // 8
         self.tma_copy_bytes["dQ"] = (
-            self.tile_m * self.tile_hdim * Float32.width // 8 // self.num_mma_warp_groups
         )
         self.tma_copy_bytes["dKacc"] = self.tile_n * self.tile_hdim * Float32.width // 8
         self.tma_copy_bytes["dVacc"] = self.tile_n * self.tile_hdimv * Float32.width // 8
@@ -404,38 +475,59 @@ class FlashAttentionBackwardSm90:
             (self.tile_m, self.tile_hdimv),
         )
         if const_expr(self.qhead_per_kvhead == 1):
             tma_atom_dK, tma_tensor_dK = cpasync.make_tiled_tma_atom(
                 cpasync.CopyBulkTensorTileS2GOp(),
-                mdK,
                 cute.select(self.sK_layout, mode=[0, 1]),
                 (self.tile_n, self.tile_hdim),
             )
             tma_atom_dV, tma_tensor_dV = cpasync.make_tiled_tma_atom(
                 cpasync.CopyBulkTensorTileS2GOp(),
-                mdV,
                 cute.select(self.sV_layout, mode=[0, 1]),
                 (self.tile_n, self.tile_hdimv),
             )
         else:
             tma_atom_dK = tma_atom_dV = tma_tensor_dK = tma_tensor_dV = None
-        TileScheduler = SingleTileScheduler
         tile_sched_args = TileSchedulerArguments(
             cute.ceil_div(cute.size(mK.shape[0]), self.tile_n),
             cute.size(mQ.shape[2]),
-            cute.size(mQ.shape[3]),
             1,  # num_splits
-            cute.size(mK.shape[0]),
-            mQ.shape[1],
-            mV.shape[1],
-            total_q=cute.size(mQ.shape[0]) * cute.size(mQ.shape[3]),
-            tile_shape_mn=(self.tile_m, self.tile_n),
-            mCuSeqlensQ=None,
-            mSeqUsedQ=None,
             qhead_per_kvhead_packgqa=1,
             element_size=self.dtype.width // 8,
             is_persistent=False,
-            lpt=False,
         )
         tile_sched_params = TileScheduler.to_underlying_arguments(tile_sched_args)
@@ -461,6 +553,11 @@ class FlashAttentionBackwardSm90:
         self.use_block_sparsity = cutlass.const_expr(blocksparse_tensors is not None)
         self.kernel(
             tma_tensor_Q,
             tma_tensor_K,
@@ -477,6 +574,10 @@ class FlashAttentionBackwardSm90:
             mLSE,
             mdPsum,
             mdQaccum,
             self.sQ_layout,
             self.sK_layout,
             self.sV_layout,
@@ -497,11 +598,15 @@ class FlashAttentionBackwardSm90:
             fastdiv_mods,
             blocksparse_tensors,
             qhead_per_kvhead_divmod,
         ).launch(
             grid=grid_dim,
             block=[self.num_threads, 1, 1],
             stream=stream,
             min_blocks_per_mp=1,
         )
     @cute.kernel
@@ -522,6 +627,10 @@ class FlashAttentionBackwardSm90:
         mLSE: cute.Tensor,
         mdPsum: cute.Tensor,
         mdQaccum: cute.Tensor,
         sQ_layout: cute.ComposedLayout,
         sK_layout: cute.ComposedLayout,
         sV_layout: cute.ComposedLayout,
@@ -542,15 +651,17 @@ class FlashAttentionBackwardSm90:
         fastdiv_mods=(None, None),
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
         qhead_per_kvhead_divmod: Optional[FastDivmodDivisor] = None,
     ):
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
         # prefetch TMA descriptors
         if warp_idx == 0:
-            cpasync.prefetch_descriptor(tma_atom_Q)
-            cpasync.prefetch_descriptor(tma_atom_K)
-            cpasync.prefetch_descriptor(tma_atom_V)
-            cpasync.prefetch_descriptor(tma_atom_dO)
         smem = cutlass.utils.SmemAllocator()
         storage = smem.allocate(SharedStorage)
@@ -604,25 +715,27 @@ class FlashAttentionBackwardSm90:
             self.is_causal,
             self.is_local,
             False,  # is_split_kv
-            None,
-            None,
             qhead_per_kvhead_packgqa=1,
         )
         SeqlenInfoCls = partial(
             SeqlenInfoQK.create,
             seqlen_q_static=mQ.shape[0],
             seqlen_k_static=mK.shape[0],
-            mCuSeqlensQ=None,
-            mCuSeqlensK=None,
-            mSeqUsedQ=None,
-            mSeqUsedK=None,
         )
         AttentionMaskCls = partial(
             AttentionMask,
             self.tile_m,
             self.tile_n,
-            window_size_left=None,
-            window_size_right=None,
             swap_AB=self.SdP_swapAB,
         )
         TileSchedulerCls = partial(TileScheduler.create, tile_sched_params)
@@ -663,12 +776,12 @@ class FlashAttentionBackwardSm90:
                     TileSchedulerCls,
                     SeqlenInfoCls,
                     blocksparse_tensors,
                 )
         else:
-            cute.arch.setmaxregister_increase(self.num_mma_regs)
             tidx, _, _ = cute.arch.thread_idx()
             tidx = tidx - 128
-            self.mma(
                 tiled_mma_SdP,
                 tiled_mma_dK,
                 tiled_mma_dV,
@@ -702,6 +815,19 @@ class FlashAttentionBackwardSm90:
                 blocksparse_tensors,
                 qhead_per_kvhead_divmod,
             )
     @cute.jit
     def load(
@@ -749,18 +875,22 @@ class FlashAttentionBackwardSm90:
                     if const_expr(self.qhead_per_kvhead == 1)
                     else head_idx // qhead_per_kvhead_divmod
                 )
-                mK_cur = mK[None, None, head_idx_kv, batch_idx]
                 gK = cute.local_tile(mK_cur, (self.tile_n, self.tile_hdim), (n_block, 0))
-                mV_cur = mV[None, None, head_idx_kv, batch_idx]
                 gV = cute.local_tile(mV_cur, (self.tile_n, self.tile_hdimv), (n_block, 0))
-                mQ_cur = mQ[None, None, head_idx, batch_idx]
                 gQ = cute.local_tile(mQ_cur, (self.tile_m, self.tile_hdim), (None, 0))
-                mdO_cur = mdO[None, None, head_idx, batch_idx]
                 gdO = cute.local_tile(mdO_cur, (self.tile_m, self.tile_hdimv), (None, 0))
-                mLSE_cur = mLSE[None, head_idx, batch_idx]
                 gLSE = cute.local_tile(mLSE_cur, (self.tile_m,), (None,))
-                mdPsum_cur = mdPsum[None, head_idx, batch_idx]
                 gdPsum = cute.local_tile(mdPsum_cur, (self.tile_m,), (None,))
                 load_K, _, _ = copy_utils.tma_get_copy_fn(
@@ -786,7 +916,10 @@ class FlashAttentionBackwardSm90:
                 if const_expr(not self.use_block_sparsity):
                     total_m_block_cnt = m_block_max - m_block_min
-                    process_tile = const_expr(not self.is_local) or m_block_min < m_block_max
                 else:
                     total_m_block_cnt = get_total_q_block_count_bwd(
                         blocksparse_tensors,
@@ -806,6 +939,8 @@ class FlashAttentionBackwardSm90:
                         )
                         load_K(tma_bar_ptr=pipeline_Q.producer_get_barrier(producer_state_Q))
                         load_Q(first_m_block, producer_state=producer_state_Q)
                         load_LSE(first_m_block, producer_state=producer_state_Q)
                         producer_state_dO_cur = (
                             producer_state_dO
@@ -984,16 +1119,20 @@ class FlashAttentionBackwardSm90:
         fastdiv_mods=(None, None),
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
         qhead_per_kvhead_divmod: Optional[FastDivmodDivisor] = None,
     ):
         warp_group_idx = cute.arch.make_warp_uniform(tidx // self.num_threads_per_warp_group)
         warp_group_thread_layout = cute.make_layout(
-            self.num_mma_warp_groups, stride=self.num_threads_per_warp_group
         )
         thr_mma_SdP = tiled_mma_SdP.get_slice(tidx)
         wg_mma_SdP = tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx))
         wg_mma_dK = tiled_mma_dK.get_slice(warp_group_thread_layout(warp_group_idx))
         wg_mma_dV = tiled_mma_dV.get_slice(warp_group_thread_layout(warp_group_idx))
-        wg_mma_dQ = tiled_mma_dQ.get_slice(warp_group_thread_layout(warp_group_idx))
         # S = Q @ K.T
         shape_mnk_S = (self.tile_m, self.tile_n, self.tile_hdim)
         _, tSrQ, tSrK = sm90_utils.partition_fragment_ABC(
@@ -1039,23 +1178,43 @@ class FlashAttentionBackwardSm90:
         # dQ = dS @ K
         sKt = layout_utils.transpose_view(sK)
         shape_mnk_dQ = (self.tile_m, self.tile_hdim, self.tile_n)
-        _, tdQrdS, tdQrKt = sm90_utils.partition_fragment_ABC(
-            wg_mma_dQ, shape_mnk_dQ, sdS, sKt, swap_AB=self.dQ_swapAB
-        )
-        mma_dsk_fn = partial(
-            gemm_zero_init, tiled_mma_dQ, shape_mnk_dQ[:2], tdQrdS, tdQrKt, swap_AB=self.dQ_swapAB
-        )
-        # Smem copy atom tiling
         copy_P_r2s = None
         if const_expr(sP is not None):
             sP_cpy = sP if const_expr(not self.SdP_swapAB) else sPt
             copy_P_r2s, _, _ = copy_utils.get_smem_store_C(
-                tiled_mma_SdP, sP_cpy, tidx, self.arch, transpose=self.SdP_swapAB
             )
         sdS_cpy = sdS if const_expr(not self.SdP_swapAB) else sdSt
         copy_dS_r2s, _, _ = copy_utils.get_smem_store_C(
-            tiled_mma_SdP, sdS_cpy, tidx, self.arch, transpose=self.SdP_swapAB
         )
         tLSEsLSE = layout_utils.mma_partition_C_vec(
@@ -1064,9 +1223,21 @@ class FlashAttentionBackwardSm90:
         tLSEsdPsum = layout_utils.mma_partition_C_vec(
             sdPsum, thr_mma_SdP, expand_shape=self.tile_n, is_colvec=not self.SdP_swapAB
         )
-        smem_thr_copy_dQaccum = r2s_tiled_copy_dQaccum.get_slice(tidx)
-        tdQsdQaccum = smem_thr_copy_dQaccum.partition_D(sdQaccum)
         PdS_barrier = cutlass.pipeline.NamedBarrier(
             barrier_id=int(NamedBarrierBwd.PdS), num_threads=self.num_mma_threads
@@ -1105,6 +1276,7 @@ class FlashAttentionBackwardSm90:
             PdS_barrier=PdS_barrier,
             # acc_dV=acc_dV,
             # acc_dK=acc_dK,
         )
         consumer_state_Q = cutlass.pipeline.make_pipeline_state(
@@ -1136,7 +1308,10 @@ class FlashAttentionBackwardSm90:
             m_block_min, m_block_max = block_info.get_m_block_min_max(seqlen, n_block)
             if const_expr(not self.use_block_sparsity):
-                process_tile = const_expr(not self.is_local) or m_block_min < m_block_max
             else:
                 total_m_block_cnt = get_total_q_block_count_bwd(
                     blocksparse_tensors,
@@ -1218,8 +1393,8 @@ class FlashAttentionBackwardSm90:
                     qhead_per_kvhead_divmod,
                 )
             else:
-                # Block sparsity: KV tile with zero Q blocks produces no dK/dV; write zeros.
-                if const_expr(self.use_block_sparsity):
                     acc_dK.fill(0.0)
                     acc_dV.fill(0.0)
                     self.epilogue_dKV(
@@ -1248,6 +1423,22 @@ class FlashAttentionBackwardSm90:
         if warp_idx == 4:
             cute.arch.cp_async_bulk_wait_group(0, read=True)
     @cute.jit
     def mma_one_m_block(
         self,
@@ -1266,16 +1457,17 @@ class FlashAttentionBackwardSm90:
         pipeline_dO: cutlass.pipeline.PipelineAsync,
         tLSEsLSE: cute.Tensor,
         tLSEsdPsum: cute.Tensor,
-        tdQsdQaccum: cute.Tensor,
         softmax_scale_log2: Float32,
         PdS_barrier: cutlass.pipeline.NamedBarrier,
         mask_fn: Optional[Callable] = None,
         score_mod_fn: Optional[Callable] = None,
         score_mod_bwd_fn: Optional[Callable] = None,
         dKV_accumulate: Boolean = True,
     ):
         consumer_state_dO_cur = (
-            consumer_state_dO if const_expr(self.Q_stage == self.dO_stage) else consumer_state_Q
         )
         smem_idx_Q = consumer_state_Q.index
         smem_idx_dO = consumer_state_dO_cur.index if const_expr(self.dO_stage > 1) else 0
@@ -1283,6 +1475,7 @@ class FlashAttentionBackwardSm90:
         # (1) [GEMM 1] S = Q @ K^T
         pipeline_Q.consumer_wait(consumer_state_Q, pipeline_Q.consumer_try_wait(consumer_state_Q))
         acc_S = mma_qk_fn(A_idx=smem_idx_Q, wg_wait=-1)
         tLSErLSE = copy_utils.load_s2r(tLSEsLSE[None, smem_idx_Q])
         # (2) [GEMM 2] dP = dO @ V.T
         pipeline_dO.consumer_wait(
@@ -1301,10 +1494,12 @@ class FlashAttentionBackwardSm90:
         if cutlass.const_expr(mask_fn is not None):
             mask_fn(acc_S, m_block=m_block)
         acc_S_mn = layout_utils.reshape_acc_to_mn(acc_S, transpose=self.SdP_swapAB)
         for r in cutlass.range_constexpr(cute.size(acc_S_mn, mode=[0])):
             for c in cutlass.range(cute.size(acc_S_mn, mode=[1]), unroll_full=True):
                 acc_S_mn[r, c] = cute.math.exp2(
-                    acc_S_mn[r, c] * softmax_scale_log2 - tLSErLSE[r], fastmath=True
                 )
         tLSErdPsum = copy_utils.load_s2r(tLSEsdPsum[None, smem_idx_dO])
@@ -1321,8 +1516,9 @@ class FlashAttentionBackwardSm90:
         warpgroup.wait_group(0)
         acc_dP_mn = layout_utils.reshape_acc_to_mn(acc_dP, transpose=self.SdP_swapAB)
         for r in cutlass.range_constexpr(cute.size(acc_dP_mn, mode=[0])):
             for c in cutlass.range(cute.size(acc_dP_mn, mode=[1]), unroll_full=True):
-                acc_dP_mn[r, c] = acc_S_mn[r, c] * (acc_dP_mn[r, c] - tLSErdPsum[r])
         if const_expr(self.score_mod_bwd is not None):
             score_mod_bwd_fn(acc_dP, acc_S_pre, m_block=m_block)
@@ -1354,36 +1550,50 @@ class FlashAttentionBackwardSm90:
         # smem fence to make sure sdS is written before it's read by WGMMA
         cute.arch.fence_view_async_shared()
         PdS_barrier.arrive_and_wait()
-        # (6) [GEMM 4] dQ = dS @ K
-        acc_dQ = mma_dsk_fn(A_idx=smem_idx_PdS, wg_wait=1)
-        # if cute.arch.thread_idx()[0] == 128: cute.print_tensor(acc_dV)
-        pipeline_dO.consumer_release(consumer_state_dO_cur)  # release dO as dV mma is done
-        # (7) [GEMM 5] dK += dS.T @ Q
-        if const_expr(not self.mma_dkv_is_rs):
-            mma_dsq_fn(
-                A_idx=smem_idx_PdS, B_idx=smem_idx_Q, zero_init=not dKV_accumulate, wg_wait=1
-            )
-        else:
-            mma_dsq_fn(tCrA=tdKrdS, B_idx=smem_idx_Q, zero_init=not dKV_accumulate, wg_wait=1)
-        # if cute.arch.thread_idx()[0] == 128: cute.print_tensor(acc_dQ)
-        cute.arch.barrier(
-            barrier_id=int(NamedBarrierBwd.dQEmptyWG0) + warp_group_idx,
-            number_of_threads=self.num_threads_per_warp_group + cute.arch.WARP_SIZE,
-        )
-        tdQrdQaccum_flat = cute.make_tensor(acc_dQ.iterator, cute.make_layout(tdQsdQaccum.shape))
-        cute.autovec_copy(tdQrdQaccum_flat, tdQsdQaccum)
-        cute.arch.fence_view_async_shared()
-        cute.arch.barrier_arrive(
-            barrier_id=int(NamedBarrierBwd.dQFullWG0) + warp_group_idx,
-            number_of_threads=self.num_threads_per_warp_group + cute.arch.WARP_SIZE,
-        )
-        warpgroup.wait_group(0)
-        # if cute.arch.thread_idx()[0] == 128: cute.print_tensor(acc_dK)
-        pipeline_Q.consumer_release(consumer_state_Q)
-        # if cute.arch.thread_idx()[0] % 32 == 0: cute.printf("tidx = {}, m_block = {}, after pipeline_Q consumer release", cute.arch.thread_idx()[0], m_block)
         consumer_state_Q.advance()
         consumer_state_dO.advance()
@@ -1415,8 +1625,12 @@ class FlashAttentionBackwardSm90:
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
         if const_expr(self.qhead_per_kvhead == 1):
-            mdV_cur = mdV[None, None, head_idx, batch_idx]
-            mdK_cur = mdK[None, None, head_idx, batch_idx]
             gdK = cute.local_tile(mdK_cur, (self.tile_n, self.tile_hdim), (n_block, 0))
             gdV = cute.local_tile(mdV_cur, (self.tile_n, self.tile_hdimv), (n_block, 0))
             store_dK, _, _ = copy_utils.tma_get_copy_fn(
@@ -1428,10 +1642,20 @@ class FlashAttentionBackwardSm90:
             sdV = sV if const_expr(not self.dKV_swapAB) else layout_utils.transpose_view(sV)
             sdK = sK if const_expr(not self.dKV_swapAB) else layout_utils.transpose_view(sK)
             copy_dV_r2s, _, _ = copy_utils.get_smem_store_C(
-                tiled_mma_dV, sdV, tidx, self.arch, transpose=self.dKV_swapAB
             )
             copy_dK_r2s, _, _ = copy_utils.get_smem_store_C(
-                tiled_mma_dK, sdK, tidx, self.arch, transpose=self.dKV_swapAB
             )
             cute.arch.cp_async_bulk_wait_group(1, read=True)
             epi_barrier.arrive_and_wait()
@@ -1450,15 +1674,19 @@ class FlashAttentionBackwardSm90:
                 store_dK()
                 cute.arch.cp_async_bulk_commit_group()
         else:
-            sdKaccum_shape0 = self.tile_n * self.tile_hdim // self.num_mma_warp_groups
-            sdVaccum_shape0 = self.tile_n * self.tile_hdimv // self.num_mma_warp_groups
-            sdKaccum_layout = cute.make_layout((sdKaccum_shape0, self.num_mma_warp_groups))
-            sdVaccum_layout = cute.make_layout((sdVaccum_shape0, self.num_mma_warp_groups))
             head_idx_kv = head_idx // qhead_per_kvhead_divmod
-            mdKaccum_cur = mdK[None, head_idx_kv, batch_idx]
             gdKaccum_ = cute.local_tile(mdKaccum_cur, (self.tile_n * self.tile_hdim,), (n_block,))
             gdKaccum = cute.flat_divide(gdKaccum_, (sdKaccum_shape0,))
-            mdVaccum_cur = mdV[None, head_idx_kv, batch_idx]
             gdVaccum_ = cute.local_tile(mdVaccum_cur, (self.tile_n * self.tile_hdimv,), (n_block,))
             gdVaccum = cute.flat_divide(gdVaccum_, (sdVaccum_shape0,))
             # These two overlap each other
@@ -1467,7 +1695,7 @@ class FlashAttentionBackwardSm90:
             sdVaccum = cute.make_tensor(sVaccum_ptr, sdVaccum_layout)
             tiled_copy_dKVaccum_r2s = cute.make_tiled_copy_tv(
                 cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Float32, num_bits_per_copy=128),
-                cute.make_layout((self.num_threads_per_warp_group, self.num_mma_warp_groups)),
                 cute.make_layout(128 // Float32.width),
             )
             thr_copy_dKVaccum_r2s = tiled_copy_dKVaccum_r2s.get_slice(tidx)
@@ -1482,11 +1710,11 @@ class FlashAttentionBackwardSm90:
             epi_barrier.arrive_and_wait()
             if warp_idx == 4:
                 with cute.arch.elect_one():
-                    for wg_idx in cutlass.range_constexpr(self.num_mma_warp_groups):
                         copy_utils.cpasync_reduce_bulk_add_f32(
                             sdKaccum[None, wg_idx].iterator,
                             gdKaccum[None, wg_idx].iterator,
-                            self.tma_copy_bytes["dKacc"] // self.num_mma_warp_groups,
                         )
                 cute.arch.cp_async_bulk_commit_group()
@@ -1498,11 +1726,11 @@ class FlashAttentionBackwardSm90:
             epi_barrier.arrive_and_wait()
             if warp_idx == 4:
                 with cute.arch.elect_one():
-                    for wg_idx in cutlass.range_constexpr(self.num_mma_warp_groups):
                         copy_utils.cpasync_reduce_bulk_add_f32(
                             sdVaccum[None, wg_idx].iterator,
                             gdVaccum[None, wg_idx].iterator,
-                            self.tma_copy_bytes["dVacc"] // self.num_mma_warp_groups,
                         )
                 cute.arch.cp_async_bulk_commit_group()
@@ -1515,21 +1743,45 @@ class FlashAttentionBackwardSm90:
         TileSchedulerCls: cutlass.Constexpr[Callable],
         SeqlenInfoCls: cutlass.Constexpr[Callable],
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
     ):
         tile_scheduler = TileSchedulerCls()
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             n_block, head_idx, batch_idx, _ = work_tile.tile_idx
             seqlen = SeqlenInfoCls(batch_idx)
-            mdQaccum_cur = mdQaccum[None, head_idx, batch_idx]
-            gdQaccum_ = cute.local_tile(mdQaccum_cur, (self.tile_m * self.tile_hdim,), (None,))
-            # (M * K / WG, WG, _)
-            gdQaccum = cute.flat_divide(
-                gdQaccum_, (self.tile_m * self.tile_hdim // self.num_mma_warp_groups,)
             )
             m_block_min, m_block_max = block_info.get_m_block_min_max(seqlen, n_block)
             if const_expr(not self.use_block_sparsity):
-                process_tile = const_expr(not self.is_local) or m_block_min < m_block_max
                 loop_count = m_block_max - m_block_min
             else:
                 total_block_cnt = get_total_q_block_count_bwd(
@@ -1548,17 +1800,36 @@ class FlashAttentionBackwardSm90:
                         m_block = m_block_min + iter_idx
                         m_block_safe = m_block
-                        for warp_group_idx in cutlass.range_constexpr(self.num_mma_warp_groups):
-                            cute.arch.cp_async_bulk_wait_group(
-                                self.num_mma_warp_groups - 1 - warp_group_idx, read=True
-                            )
                             cute.arch.barrier_arrive(
                                 barrier_id=int(NamedBarrierBwd.dQEmptyWG0) + warp_group_idx,
                                 number_of_threads=self.num_threads_per_warp_group
                                 + cute.arch.WARP_SIZE,
                             )
-                        for warp_group_idx in cutlass.range_constexpr(self.num_mma_warp_groups):
                             cute.arch.barrier(
                                 barrier_id=int(NamedBarrierBwd.dQFullWG0) + warp_group_idx,
                                 number_of_threads=self.num_threads_per_warp_group
@@ -1567,11 +1838,24 @@ class FlashAttentionBackwardSm90:
                             with cute.arch.elect_one():
                                 copy_utils.cpasync_reduce_bulk_add_f32(
                                     sdQaccum[None, warp_group_idx].iterator,
-                                    gdQaccum[None, warp_group_idx, m_block_safe].iterator,
                                     self.tma_copy_bytes["dQ"],
                                 )
                             cute.arch.cp_async_bulk_commit_group()
                 else:
                     dQaccum_store_block_sparse_bwd_sm90(
                         blocksparse_tensors,
                         batch_idx,
@@ -1581,11 +1865,27 @@ class FlashAttentionBackwardSm90:
                         gdQaccum,
                         subtile_factor=self.subtile_factor,
                         m_block_max=m_block_max,
-                        num_mma_warp_groups=self.num_mma_warp_groups,
                         num_threads_per_warp_group=self.num_threads_per_warp_group,
                         tma_copy_bytes_dQ=self.tma_copy_bytes["dQ"],
                     )
             tile_scheduler.advance_to_next_work()
             work_tile = tile_scheduler.get_current_work()
-        cute.arch.cp_async_bulk_wait_group(0, read=True)

 from .block_info import BlockInfo
 from . import pipeline
 from .quack.cute_dsl_utils import ParamsBase
+from .tile_scheduler import (
+    TileSchedulerArguments,
+    SingleTileScheduler,
+    SingleTileLPTBwdScheduler,
+    SingleTileVarlenScheduler,
+)
+from . import barrier
 from .named_barrier import NamedBarrierBwd
 from .softmax import apply_score_mod_inner, apply_score_mod_bwd_inner
 from .block_sparsity import BlockSparseTensors
         head_dim_v: Optional[int] = None,
         qhead_per_kvhead: int = 1,
         is_causal: bool = False,
+        is_local: bool = False,
+        deterministic: bool = False,
         tile_m: int = 64,
         tile_n: int = 128,
         Q_stage: int = 2,
         mask_mod: cutlass.Constexpr | None = None,
         has_aux_tensors: cutlass.Constexpr = False,
         subtile_factor: cutlass.Constexpr[int] = 1,
+        dQ_single_wg: bool = False,
     ):
         self.dtype = dtype
         # padding head_dim to a multiple of 16 as k_block_size
         self.check_hdim_v_oob = head_dim_v != self.tile_hdimv
         self.qhead_per_kvhead = qhead_per_kvhead
         self.is_causal = is_causal
+        self.is_local = is_local
+        self.deterministic = deterministic
         self.tile_m = tile_m
         self.tile_n = tile_n
         self.num_threads = num_threads
         self.AtomLayoutMSdP = AtomLayoutMSdP
         self.AtomLayoutNdKV = AtomLayoutNdKV
         self.AtomLayoutMdQ = AtomLayoutMdQ
+        self.num_wg_mma = (self.num_threads // 128) - 1
         self.mma_dkv_is_rs = (
             AtomLayoutMSdP == 1
+            and AtomLayoutNdKV == self.num_wg_mma
             and SdP_swapAB
             and not dKV_swapAB
         )
         self.V_in_regs = V_in_regs
+        # May be overridden in __call__ for varlen inputs.
         if qhead_per_kvhead > 1:
             assert self.same_hdim_kv, "GQA backward requires head_dim == head_dim_v"
+            assert self.num_wg_mma == 2, "GQA backward assumes 2 warp groups"
         # These are tuned for speed
         # Do we keep the LSE and dPsum in each thread, or split them across 8 threads that share
         # them and then shuffle to get the value whenever we need? This can reduce register
         # pressure when SdP_swapAB, where each thread needs to keep statistics for (kBlockM / 4)
         # rows. If !SdP_swapAB, each thread only needs to keep statistics for 2 rows.
         self.shuffle_LSE = self.SdP_swapAB and self.tile_hdim <= 64
         self.shuffle_dPsum = self.SdP_swapAB and self.tile_hdim <= 64
         else:
             self.vec_size: cutlass.Constexpr = 4
         self.qk_acc_dtype = Float32
+        # dQ_single_wg: WG0 computes the full dQ GEMM, WG1 skips it.
+        # Only valid for 2 MMA warp groups.
+        # Credit: Ben Spector
+        if dQ_single_wg:
+            assert self.num_wg_mma == 2, "dQ_single_wg only supports 2 warp groups"
+        self.num_wg_dQ = 1 if dQ_single_wg else self.num_wg_mma
     @staticmethod
     def can_implement(
         assert mQ_type == self.dtype
     def _setup_attributes(self):
+        # We need to accommodate both Q and Q^T (and dO and dO^T) in shared memory.
+        # Q & dO are used in the SdP Mma and Q^T and dO^T are used in the dKV Mma.
+        # The M dimension (tile_m) doesn't matter for the layout, only the K dimension
+        wg_d_dKV = self.num_wg_mma // self.AtomLayoutNdKV
+        self.sQ_layout, self.sdO_layout = [
+            # Need to set major_mode_size (mms) to accommodate Q and Q.T
+            sm90_utils.make_smem_layout(self.dtype, LayoutEnum.ROW_MAJOR, shape, stage, mms)
+            for shape, stage, mms in [
+                ((self.tile_m, self.tile_hdim), self.Q_stage, self.tile_hdim // wg_d_dKV),
+                ((self.tile_m, self.tile_hdimv), self.dO_stage, self.tile_hdim // wg_d_dKV),
             ]
         ]
+        wg_d_dQ = self.num_wg_dQ // self.AtomLayoutMdQ
+        # Accommodate both K and K.T
+        self.sK_layout = sm90_utils.make_smem_layout(
+            self.dtype,
+            LayoutEnum.ROW_MAJOR,
+            (self.tile_n, self.tile_hdim),
+            stage=None,
+            major_mode_size=self.tile_hdim // wg_d_dQ,
+        )
+        # There's only V, no V.T, so layout is normal
+        self.sV_layout = sm90_utils.make_smem_layout(
+            self.dtype, LayoutEnum.ROW_MAJOR, (self.tile_n, self.tile_hdimv), None
+        )
+        # Accommodate both S and S.T
+        wg_n_SdP = self.num_wg_mma // self.AtomLayoutMSdP
+        wg_n_dKV = self.AtomLayoutNdKV
+        self.sPdS_layout = sm90_utils.make_smem_layout(
+            self.dtype,
+            LayoutEnum.ROW_MAJOR,
+            (self.tile_m, self.tile_n),
+            stage=self.PdS_stage,
+            major_mode_size=math.gcd(self.tile_n // wg_n_SdP, self.tile_n // wg_n_dKV),
+        )
         self.sdQaccum_layout = cute.make_layout(
+            (self.tile_m * self.tile_hdim // self.num_wg_dQ, self.num_wg_dQ)
         )
         # dQaccum R->S
         self.r2s_tiled_copy_dQaccum = cute.make_tiled_copy_tv(
             cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Float32, num_bits_per_copy=128),
             # thr_layout
+            cute.make_layout((self.num_threads_per_warp_group, self.num_wg_dQ)),
             cute.make_layout(128 // Float32.width),  # val_layout
         )
         # dKVaccum for GQA epilogue - reuses sV+sK memory recast as f32
         # TODO: assert that sVaccum and sKaccum don't overflow smem
     def _get_tiled_mma(self):
+        maybe_swap_mn = lambda shape, swap: (shape[1], shape[0], *shape[2:]) if swap else shape
         # S = Q @ K.T, dP = dO @ V.T
+        atom_layout_SdP = (self.AtomLayoutMSdP, self.num_wg_mma // self.AtomLayoutMSdP, 1)
         tiler_mn_SdP = (self.tile_m // atom_layout_SdP[0], self.tile_n // atom_layout_SdP[1])
         tiled_mma_SdP = sm90_utils_basic.make_trivial_tiled_mma(
             self.dtype,
             warpgroup.OperandMajorMode.K,
             warpgroup.OperandMajorMode.K,
             Float32,
+            atom_layout_mnk=maybe_swap_mn(atom_layout_SdP, self.SdP_swapAB),
+            tiler_mn=(64, tiler_mn_SdP[1] if not self.SdP_swapAB else tiler_mn_SdP[0]),
         )
         # dV = P.T @ dO, dK = dS.T @ Q
+        atom_layout_dKV = (self.AtomLayoutNdKV, self.num_wg_mma // self.AtomLayoutNdKV, 1)
         tiler_mn_dK = (self.tile_n // atom_layout_dKV[0], self.tile_hdim // atom_layout_dKV[1])
         tiler_mn_dV = (self.tile_n // atom_layout_dKV[0], self.tile_hdimv // atom_layout_dKV[1])
         tiled_mma_dK, tiled_mma_dV = [
                 else warpgroup.OperandMajorMode.K,
                 warpgroup.OperandMajorMode.MN,
                 Float32,
+                atom_layout_mnk=maybe_swap_mn(atom_layout_dKV, self.dKV_swapAB),
+                tiler_mn=(64, tiler_mn_d[1] if not self.dKV_swapAB else tiler_mn_d[0]),
                 a_source=warpgroup.OperandSource.RMEM
                 if self.mma_dkv_is_rs
                 else warpgroup.OperandSource.SMEM,
             for tiler_mn_d in (tiler_mn_dK, tiler_mn_dV)
         ]
         # dQ = dS @ K
+        assert self.num_wg_dQ % self.AtomLayoutMdQ == 0
+        atom_layout_dQ = (self.AtomLayoutMdQ, self.num_wg_dQ // self.AtomLayoutMdQ, 1)
         tiler_mn_dQ = (self.tile_m // atom_layout_dQ[0], self.tile_hdim // atom_layout_dQ[1])
         tiled_mma_dQ = sm90_utils_basic.make_trivial_tiled_mma(
             self.dtype,
             warpgroup.OperandMajorMode.K if not self.dQ_swapAB else warpgroup.OperandMajorMode.MN,
             warpgroup.OperandMajorMode.MN if not self.dQ_swapAB else warpgroup.OperandMajorMode.K,
             Float32,
+            atom_layout_mnk=maybe_swap_mn(atom_layout_dQ, self.dQ_swapAB),
+            tiler_mn=(64, tiler_mn_dQ[1] if not self.dQ_swapAB else tiler_mn_dQ[0]),
         )
         return tiled_mma_SdP, tiled_mma_dK, tiled_mma_dV, tiled_mma_dQ
         mdK: cute.Tensor,
         mdV: cute.Tensor,
         softmax_scale: Float32,
         mCuSeqlensQ: Optional[cute.Tensor] = None,
         mCuSeqlensK: Optional[cute.Tensor] = None,
         mSeqUsedQ: Optional[cute.Tensor] = None,
         mdV_semaphore: Optional[cute.Tensor] = None,
         aux_tensors: Optional[list] = None,
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
+        # Always keep stream as the last parameter (EnvStream: obtained implicitly via TVM FFI).
+        stream: cuda.CUstream = None,
     ):
+        # For GQA (qhead_per_kvhead > 1), multiple Q heads accumulate into the same dK/dV,
+        # so we need the float32 accum path + postprocess.
+        # For varlen_k with qhead_per_kvhead == 1, we use ragged TMA tensors.
+        self.varlen_k = mCuSeqlensK is not None or mSeqUsedK is not None
         self._check_type(
             *(
             )
         )
+        self.is_varlen_q = mCuSeqlensQ is not None or mSeqUsedQ is not None
         mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV = [
             assume_tensor_aligned(t) for t in (mQ, mK, mV, mdO, mLSE, mdPsum, mdQaccum, mdK, mdV)
         ]
+        # Non-varlen inputs are (b, s, n, h), varlen inputs are (s, n, h).
+        # We convert both to a seqlen-major view with head-dim second.
+        # Each tensor may have different rank when Q is padded (seqused_q) but K/V are unpadded (cu_seqlens_k).
+        def _qkv_transpose(t):
+            return layout_utils.select(t, [1, 3, 2, 0] if cute.rank(t.shape) == 4 else [0, 2, 1])
+        mQ, mK, mV, mdO = [_qkv_transpose(t) for t in (mQ, mK, mV, mdO)]
         if const_expr(self.qhead_per_kvhead == 1):
+            mdK, mdV = [_qkv_transpose(t) for t in (mdK, mdV)]
         else:
+            # Accum tensors are (b, n, s*h) for non-varlen and (n, s*h) for varlen.
+            accum_transpose = [2, 1, 0] if cute.rank(mdK.shape) == 3 else [1, 0]
             mdK, mdV = [layout_utils.select(t, accum_transpose) for t in (mdK, mdV)]
+        # Non-varlen stats are (b, n, s), varlen stats are (n, s).
+        LSE_dPsum_dQaccum_transpose = [2, 1, 0] if cute.rank(mLSE.shape) == 3 else [1, 0]
         mLSE, mdPsum, mdQaccum = [
             layout_utils.select(t, LSE_dPsum_dQaccum_transpose) for t in (mLSE, mdPsum, mdQaccum)
         ]
         tiled_mma_SdP, tiled_mma_dK, tiled_mma_dV, tiled_mma_dQ = self._get_tiled_mma()
+        # (batch, num_head, num_m_blocks, cluster_size) -> (num_m_blocks, cluster_size, num_head, batch)
+        if const_expr(self.deterministic):
+            assert mdQ_semaphore is not None
+            mdQ_semaphore = layout_utils.select(mdQ_semaphore, mode=[2, 3, 1, 0])
         self.num_mma_threads = tiled_mma_SdP.size
         assert self.num_mma_threads + 128 == self.num_threads
         self.num_threads_per_warp_group = 128
         self.num_producer_threads = 32
+        REG_LIMIT = 504 if self.num_wg_mma == 2 else 512
+        if const_expr(self.num_wg_mma == 2):
+            if const_expr(self.num_wg_dQ == 1):
+                self.num_mma_regs_wg0 = 256
+                self.num_mma_regs_wg1 = 224
+            else:
+                self.num_mma_regs_wg0 = 240
+                self.num_mma_regs_wg1 = 240
+            self.num_mma_regs = self.num_mma_regs_wg0  # for backward compat
+            self.num_producer_regs = 24
+            assert (
+                self.num_mma_regs_wg0 + self.num_mma_regs_wg1 + self.num_producer_regs <= REG_LIMIT
+            )
+        else:  # 3 warp groups
+            self.num_mma_regs_wg0 = 160
+            self.num_mma_regs_wg1 = 160
+            self.num_mma_regs = 160
+            self.num_producer_regs = 32
+            assert self.num_mma_regs_wg0 * self.num_wg_mma + self.num_producer_regs <= REG_LIMIT
         self._setup_attributes()
         SharedStorage = self._get_shared_storage_cls()
         self.tma_copy_bytes["LSE"] = self.tile_m * Float32.width // 8
         self.tma_copy_bytes["dPsum"] = self.tile_m * Float32.width // 8
         self.tma_copy_bytes["dQ"] = (
+            self.tile_m * self.tile_hdim * Float32.width // 8 // self.num_wg_dQ
         )
         self.tma_copy_bytes["dKacc"] = self.tile_n * self.tile_hdim * Float32.width // 8
         self.tma_copy_bytes["dVacc"] = self.tile_n * self.tile_hdimv * Float32.width // 8
             (self.tile_m, self.tile_hdimv),
         )
         if const_expr(self.qhead_per_kvhead == 1):
+            mdK_tma = (
+                copy_utils.create_ragged_tensor_for_tma(mdK, ragged_dim=0, ptr_shift=True)
+                if self.varlen_k
+                else mdK
+            )
+            mdV_tma = (
+                copy_utils.create_ragged_tensor_for_tma(mdV, ragged_dim=0, ptr_shift=True)
+                if self.varlen_k
+                else mdV
+            )
             tma_atom_dK, tma_tensor_dK = cpasync.make_tiled_tma_atom(
                 cpasync.CopyBulkTensorTileS2GOp(),
+                mdK_tma,
                 cute.select(self.sK_layout, mode=[0, 1]),
                 (self.tile_n, self.tile_hdim),
             )
             tma_atom_dV, tma_tensor_dV = cpasync.make_tiled_tma_atom(
                 cpasync.CopyBulkTensorTileS2GOp(),
+                mdV_tma,
                 cute.select(self.sV_layout, mode=[0, 1]),
                 (self.tile_n, self.tile_hdimv),
             )
         else:
             tma_atom_dK = tma_atom_dV = tma_tensor_dK = tma_tensor_dV = None
+        if const_expr(mCuSeqlensK is not None or mSeqUsedK is not None):
+            TileScheduler = SingleTileVarlenScheduler
+        elif const_expr(self.deterministic):
+            TileScheduler = SingleTileLPTBwdScheduler
+        else:
+            TileScheduler = SingleTileScheduler
+        self.spt = (self.is_causal or self.is_local) and self.deterministic
         tile_sched_args = TileSchedulerArguments(
             cute.ceil_div(cute.size(mK.shape[0]), self.tile_n),
             cute.size(mQ.shape[2]),
+            cute.size(mK.shape[3])
+            if const_expr(mCuSeqlensK is None)
+            else cute.size(mCuSeqlensK.shape[0] - 1),  # num_batch
             1,  # num_splits
+            cute.size(mQ.shape[0]),  # pass seqlen_q or total_q for seqlen_k
+            mQ.shape[1],  # headdim
+            mV.shape[1],  # headdim_v
+            total_q=cute.size(mK.shape[0])
+            if const_expr(mCuSeqlensK is not None)
+            else cute.size(mK.shape[0]) * cute.size(mK.shape[3]),
+            tile_shape_mn=(self.tile_n, self.tile_m),  # Swapping the role of Q & K
+            mCuSeqlensQ=mCuSeqlensK,
+            mSeqUsedQ=mSeqUsedK,
             qhead_per_kvhead_packgqa=1,
             element_size=self.dtype.width // 8,
             is_persistent=False,
+            lpt=self.spt,
+            head_swizzle=self.deterministic,
         )
         tile_sched_params = TileScheduler.to_underlying_arguments(tile_sched_args)
         self.use_block_sparsity = cutlass.const_expr(blocksparse_tensors is not None)
+        if const_expr(window_size_left is not None):
+            window_size_left = Int32(window_size_left)
+        if const_expr(window_size_right is not None):
+            window_size_right = Int32(window_size_right)
         self.kernel(
             tma_tensor_Q,
             tma_tensor_K,
             mLSE,
             mdPsum,
             mdQaccum,
+            mCuSeqlensQ,
+            mCuSeqlensK,
+            mSeqUsedQ,
+            mSeqUsedK,
             self.sQ_layout,
             self.sK_layout,
             self.sV_layout,
             fastdiv_mods,
             blocksparse_tensors,
             qhead_per_kvhead_divmod,
+            mdQ_semaphore,
+            window_size_left,
+            window_size_right,
         ).launch(
             grid=grid_dim,
             block=[self.num_threads, 1, 1],
             stream=stream,
             min_blocks_per_mp=1,
+            use_pdl=True,
         )
     @cute.kernel
         mLSE: cute.Tensor,
         mdPsum: cute.Tensor,
         mdQaccum: cute.Tensor,
+        mCuSeqlensQ: Optional[cute.Tensor],
+        mCuSeqlensK: Optional[cute.Tensor],
+        mSeqUsedQ: Optional[cute.Tensor],
+        mSeqUsedK: Optional[cute.Tensor],
         sQ_layout: cute.ComposedLayout,
         sK_layout: cute.ComposedLayout,
         sV_layout: cute.ComposedLayout,
         fastdiv_mods=(None, None),
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
         qhead_per_kvhead_divmod: Optional[FastDivmodDivisor] = None,
+        mdQ_semaphore: Optional[cute.Tensor] = None,
+        window_size_left: Optional[Int32] = None,
+        window_size_right: Optional[Int32] = None,
     ):
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
         # prefetch TMA descriptors
         if warp_idx == 0:
+            for atom in [tma_atom_Q, tma_atom_K, tma_atom_V, tma_atom_dO, tma_atom_dK, tma_atom_dV]:
+                if const_expr(atom is not None):
+                    cpasync.prefetch_descriptor(atom)
         smem = cutlass.utils.SmemAllocator()
         storage = smem.allocate(SharedStorage)
             self.is_causal,
             self.is_local,
             False,  # is_split_kv
+            window_size_left,
+            window_size_right,
             qhead_per_kvhead_packgqa=1,
         )
         SeqlenInfoCls = partial(
             SeqlenInfoQK.create,
             seqlen_q_static=mQ.shape[0],
             seqlen_k_static=mK.shape[0],
+            mCuSeqlensQ=mCuSeqlensQ,
+            mCuSeqlensK=mCuSeqlensK,
+            mSeqUsedQ=mSeqUsedQ,
+            mSeqUsedK=mSeqUsedK,
+            tile_m=self.tile_m,
+            tile_n=self.tile_n,
         )
         AttentionMaskCls = partial(
             AttentionMask,
             self.tile_m,
             self.tile_n,
+            window_size_left=window_size_left,
+            window_size_right=window_size_right,
             swap_AB=self.SdP_swapAB,
         )
         TileSchedulerCls = partial(TileScheduler.create, tile_sched_params)
                     TileSchedulerCls,
                     SeqlenInfoCls,
                     blocksparse_tensors,
+                    mdQ_semaphore,
                 )
         else:
             tidx, _, _ = cute.arch.thread_idx()
             tidx = tidx - 128
+            mma_args = (
                 tiled_mma_SdP,
                 tiled_mma_dK,
                 tiled_mma_dV,
                 blocksparse_tensors,
                 qhead_per_kvhead_divmod,
             )
+            if const_expr(self.num_wg_dQ == self.num_wg_mma):
+                # Both WGs compute dQ
+                cute.arch.setmaxregister_increase(self.num_mma_regs_wg0)
+                self.mma(*mma_args, is_dQ_wg=True)
+            else:
+                # WG0 computes dQ, WG1 skips it
+                warp_idx_in_mma = cute.arch.make_warp_uniform(cute.arch.warp_idx()) - 4
+                if warp_idx_in_mma < 4:
+                    cute.arch.setmaxregister_increase(self.num_mma_regs_wg0)
+                    self.mma(*mma_args, is_dQ_wg=True)
+                else:
+                    cute.arch.setmaxregister_increase(self.num_mma_regs_wg1)
+                    self.mma(*mma_args, is_dQ_wg=False)
     @cute.jit
     def load(
                     if const_expr(self.qhead_per_kvhead == 1)
                     else head_idx // qhead_per_kvhead_divmod
                 )
+                mK_cur = seqlen.offset_batch_K(mK, batch_idx, dim=3)[None, None, head_idx_kv]
+                mV_cur = seqlen.offset_batch_K(mV, batch_idx, dim=3)[None, None, head_idx_kv]
                 gK = cute.local_tile(mK_cur, (self.tile_n, self.tile_hdim), (n_block, 0))
                 gV = cute.local_tile(mV_cur, (self.tile_n, self.tile_hdimv), (n_block, 0))
+                mQ_cur = seqlen.offset_batch_Q(mQ, batch_idx, dim=3)[None, None, head_idx]
+                mLSE_cur = seqlen.offset_batch_Q(mLSE, batch_idx, dim=2, padded=True)[
+                    None, head_idx
+                ]
+                mdO_cur = seqlen.offset_batch_Q(mdO, batch_idx, dim=3)[None, None, head_idx]
+                mdPsum_cur = seqlen.offset_batch_Q(mdPsum, batch_idx, dim=2, padded=True)[
+                    None, head_idx
+                ]
                 gQ = cute.local_tile(mQ_cur, (self.tile_m, self.tile_hdim), (None, 0))
                 gdO = cute.local_tile(mdO_cur, (self.tile_m, self.tile_hdimv), (None, 0))
                 gLSE = cute.local_tile(mLSE_cur, (self.tile_m,), (None,))
                 gdPsum = cute.local_tile(mdPsum_cur, (self.tile_m,), (None,))
                 load_K, _, _ = copy_utils.tma_get_copy_fn(
                 if const_expr(not self.use_block_sparsity):
                     total_m_block_cnt = m_block_max - m_block_min
+                    process_tile = (
+                        const_expr(not self.is_local and not self.is_varlen_q)
+                        or m_block_min < m_block_max
+                    )
                 else:
                     total_m_block_cnt = get_total_q_block_count_bwd(
                         blocksparse_tensors,
                         )
                         load_K(tma_bar_ptr=pipeline_Q.producer_get_barrier(producer_state_Q))
                         load_Q(first_m_block, producer_state=producer_state_Q)
+                        # Wait for bwd preprocess to finish writing LSE and dPsum
+                        cute.arch.griddepcontrol_wait()
                         load_LSE(first_m_block, producer_state=producer_state_Q)
                         producer_state_dO_cur = (
                             producer_state_dO
         fastdiv_mods=(None, None),
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
         qhead_per_kvhead_divmod: Optional[FastDivmodDivisor] = None,
+        is_dQ_wg: cutlass.Constexpr[bool] = True,
     ):
         warp_group_idx = cute.arch.make_warp_uniform(tidx // self.num_threads_per_warp_group)
         warp_group_thread_layout = cute.make_layout(
+            self.num_wg_mma, stride=self.num_threads_per_warp_group
         )
         thr_mma_SdP = tiled_mma_SdP.get_slice(tidx)
         wg_mma_SdP = tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx))
         wg_mma_dK = tiled_mma_dK.get_slice(warp_group_thread_layout(warp_group_idx))
         wg_mma_dV = tiled_mma_dV.get_slice(warp_group_thread_layout(warp_group_idx))
+        wg_mma_dQ = None
+        if const_expr(is_dQ_wg):
+            wg_idx_dQ = warp_group_idx if const_expr(self.num_wg_dQ > 1) else 0
+            wg_mma_dQ = tiled_mma_dQ.get_slice(warp_group_thread_layout(wg_idx_dQ))
         # S = Q @ K.T
         shape_mnk_S = (self.tile_m, self.tile_n, self.tile_hdim)
         _, tSrQ, tSrK = sm90_utils.partition_fragment_ABC(
         # dQ = dS @ K
         sKt = layout_utils.transpose_view(sK)
         shape_mnk_dQ = (self.tile_m, self.tile_hdim, self.tile_n)
+        mma_dsk_fn = None
+        if const_expr(is_dQ_wg):
+            _, tdQrdS, tdQrKt = sm90_utils.partition_fragment_ABC(
+                wg_mma_dQ, shape_mnk_dQ, sdS, sKt, swap_AB=self.dQ_swapAB
+            )
+            mma_dsk_fn = partial(
+                gemm_zero_init,
+                tiled_mma_dQ,
+                shape_mnk_dQ[:2],
+                tdQrdS,
+                tdQrKt,
+                swap_AB=self.dQ_swapAB,
+            )
+        # Smem copy atom tiling for P/dS R2S
         copy_P_r2s = None
+        mms_PdS = self.tile_n // (self.num_wg_mma // self.AtomLayoutMSdP)
         if const_expr(sP is not None):
             sP_cpy = sP if const_expr(not self.SdP_swapAB) else sPt
             copy_P_r2s, _, _ = copy_utils.get_smem_store_C(
+                tiled_mma_SdP,
+                sP_cpy,
+                tidx,
+                self.arch,
+                transpose=self.SdP_swapAB,
+                position_independent=True,
+                major_mode_size=mms_PdS,
             )
         sdS_cpy = sdS if const_expr(not self.SdP_swapAB) else sdSt
         copy_dS_r2s, _, _ = copy_utils.get_smem_store_C(
+            tiled_mma_SdP,
+            sdS_cpy,
+            tidx,
+            self.arch,
+            transpose=self.SdP_swapAB,
+            position_independent=True,
+            major_mode_size=mms_PdS,
         )
         tLSEsLSE = layout_utils.mma_partition_C_vec(
         tLSEsdPsum = layout_utils.mma_partition_C_vec(
             sdPsum, thr_mma_SdP, expand_shape=self.tile_n, is_colvec=not self.SdP_swapAB
         )
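+        # When shuffle=True, rows are distributed across 8 quads (4 threads each) within a warp.
+        # Each thread loads only ceil(num_rows/8) values; the remaining rows are fetched from
+        # the thread group that holds them via warp shuffle (see _get_stat).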
+        shfl_copy = copy_utils.tiled_copy_1d(sLSE.element_type, num_threads=8, num_copy_elems=2)
+        if const_expr(self.shuffle_LSE):
+            tLSEsLSE = shfl_copy.get_slice(cute.arch.lane_idx() // 4).partition_S(tLSEsLSE)
+            # ((2, 1), 1, 2) -> (((2, 1), 1), 2)
+            tLSEsLSE = cute.group_modes(tLSEsLSE, 0, 2)
+        if const_expr(self.shuffle_dPsum):
+            tLSEsdPsum = shfl_copy.get_slice(cute.arch.lane_idx() // 4).partition_S(tLSEsdPsum)
+            tLSEsdPsum = cute.group_modes(tLSEsdPsum, 0, 2)
+        tdQsdQaccum = None
+        if const_expr(is_dQ_wg):
+            smem_thr_copy_dQaccum = r2s_tiled_copy_dQaccum.get_slice(tidx)
+            tdQsdQaccum = smem_thr_copy_dQaccum.partition_D(sdQaccum)
         PdS_barrier = cutlass.pipeline.NamedBarrier(
             barrier_id=int(NamedBarrierBwd.PdS), num_threads=self.num_mma_threads
             PdS_barrier=PdS_barrier,
             # acc_dV=acc_dV,
             # acc_dK=acc_dK,
+            is_dQ_wg=is_dQ_wg,
         )
         consumer_state_Q = cutlass.pipeline.make_pipeline_state(
             m_block_min, m_block_max = block_info.get_m_block_min_max(seqlen, n_block)
             if const_expr(not self.use_block_sparsity):
+                process_tile = (
+                    const_expr(not self.is_local and not self.is_varlen_q)
+                    or m_block_min < m_block_max
+                )
             else:
                 total_m_block_cnt = get_total_q_block_count_bwd(
                     blocksparse_tensors,
                     qhead_per_kvhead_divmod,
                 )
             else:
+                # KV tile with zero Q blocks produces no dK/dV; write zeros.
+                if const_expr(self.use_block_sparsity or self.is_local or self.is_varlen_q):
                     acc_dK.fill(0.0)
                     acc_dV.fill(0.0)
                     self.epilogue_dKV(
         if warp_idx == 4:
             cute.arch.cp_async_bulk_wait_group(0, read=True)
+    @staticmethod
+    @cute.jit
+    def _get_stat(tSrS: cute.Tensor, row: Int32, lane: Int32, shuffle: bool) -> Float32:
+        """Retrieve the statistic for a given accumulator row.
+        When shuffle=False, direct register indexing.
+        When shuffle=True, warp shuffle from the thread group that holds the value.
+        """
+        if const_expr(not shuffle):  # shuffle is tested through const_expr, so it selects one path
+            return tSrS[row]
+        # tSrS: (((2, 1), 1), 1) (presumably; TODO confirm), distributed across 8 threads in the warp
+        vecsize = cute.size(tSrS, mode=[0, 0])  # 2
+        idx0, off, idx1 = cute.idx2crd(row, (vecsize, 8, cute.shape(tSrS, mode=[0, 1])))  # split row over (vecsize, 8, n)
+        # register index: 0, 1, 0, 1, ..., 2, 3, 2, 3, ... (i.e. idx0 + idx1 * vecsize)
+        return utils.shuffle_sync(tSrS[idx0 + idx1 * vecsize], offset=off * 4 + (lane % 4))  # NOTE(review): offset presumably names the source lane (group off, same position in quad) - confirm in utils.shuffle_sync
     @cute.jit
     def mma_one_m_block(
         self,
         pipeline_dO: cutlass.pipeline.PipelineAsync,
         tLSEsLSE: cute.Tensor,
         tLSEsdPsum: cute.Tensor,
+        tdQsdQaccum: Optional[cute.Tensor],
         softmax_scale_log2: Float32,
         PdS_barrier: cutlass.pipeline.NamedBarrier,
+        is_dQ_wg: cutlass.Constexpr[bool] = True,
         mask_fn: Optional[Callable] = None,
         score_mod_fn: Optional[Callable] = None,
         score_mod_bwd_fn: Optional[Callable] = None,
         dKV_accumulate: Boolean = True,
     ):
         consumer_state_dO_cur = (
+            consumer_state_Q if const_expr(self.Q_stage == self.dO_stage) else consumer_state_dO
         )
         smem_idx_Q = consumer_state_Q.index
         smem_idx_dO = consumer_state_dO_cur.index if const_expr(self.dO_stage > 1) else 0
         # (1) [GEMM 1] S = Q @ K^T
         pipeline_Q.consumer_wait(consumer_state_Q, pipeline_Q.consumer_try_wait(consumer_state_Q))
         acc_S = mma_qk_fn(A_idx=smem_idx_Q, wg_wait=-1)
+        # If shuffle_LSE, OOB reads are OK since sLSE is already padded
         tLSErLSE = copy_utils.load_s2r(tLSEsLSE[None, smem_idx_Q])
         # (2) [GEMM 2] dP = dO @ V.T
         pipeline_dO.consumer_wait(
         if cutlass.const_expr(mask_fn is not None):
             mask_fn(acc_S, m_block=m_block)
         acc_S_mn = layout_utils.reshape_acc_to_mn(acc_S, transpose=self.SdP_swapAB)
+        lane_idx = cute.arch.lane_idx()
         for r in cutlass.range_constexpr(cute.size(acc_S_mn, mode=[0])):
+            lse_val = self._get_stat(tLSErLSE, r, lane_idx, shuffle=self.shuffle_LSE)
             for c in cutlass.range(cute.size(acc_S_mn, mode=[1]), unroll_full=True):
                 acc_S_mn[r, c] = cute.math.exp2(
+                    acc_S_mn[r, c] * softmax_scale_log2 - lse_val, fastmath=True
                 )
         tLSErdPsum = copy_utils.load_s2r(tLSEsdPsum[None, smem_idx_dO])
         warpgroup.wait_group(0)
         acc_dP_mn = layout_utils.reshape_acc_to_mn(acc_dP, transpose=self.SdP_swapAB)
         for r in cutlass.range_constexpr(cute.size(acc_dP_mn, mode=[0])):
+            dpsum_val = self._get_stat(tLSErdPsum, r, lane_idx, shuffle=self.shuffle_dPsum)
             for c in cutlass.range(cute.size(acc_dP_mn, mode=[1]), unroll_full=True):
+                acc_dP_mn[r, c] = acc_S_mn[r, c] * (acc_dP_mn[r, c] - dpsum_val)
         if const_expr(self.score_mod_bwd is not None):
             score_mod_bwd_fn(acc_dP, acc_S_pre, m_block=m_block)
         # smem fence to make sure sdS is written before it's read by WGMMA
         cute.arch.fence_view_async_shared()
         PdS_barrier.arrive_and_wait()
+        if const_expr(is_dQ_wg):
+            # (6) [GEMM 4] dQ = dS @ K
+            acc_dQ = mma_dsk_fn(A_idx=smem_idx_PdS, wg_wait=1)
+            pipeline_dO.consumer_release(consumer_state_dO_cur)  # release dO as dV mma is done
+            # (7) [GEMM 5] dK += dS.T @ Q
+            if const_expr(not self.mma_dkv_is_rs):
+                mma_dsq_fn(
+                    A_idx=smem_idx_PdS, B_idx=smem_idx_Q, zero_init=not dKV_accumulate, wg_wait=1
+                )
+            else:
+                mma_dsq_fn(tCrA=tdKrdS, B_idx=smem_idx_Q, zero_init=not dKV_accumulate, wg_wait=1)
+            # dQ R2S: wait for dQaccum_store to free the smem buffer, then write dQ to smem
+            # When dQ_single_wg, only WG0 enters here so warp_group_idx == 0
+            cute.arch.barrier(
+                barrier_id=int(NamedBarrierBwd.dQEmptyWG0) + warp_group_idx,
+                number_of_threads=self.num_threads_per_warp_group + cute.arch.WARP_SIZE,
+            )
+            tdQrdQaccum_flat = cute.make_tensor(
+                acc_dQ.iterator, cute.make_layout(tdQsdQaccum.shape)
+            )
+            cute.autovec_copy(tdQrdQaccum_flat, tdQsdQaccum)
+            cute.arch.fence_view_async_shared()
+            cute.arch.barrier_arrive(
+                barrier_id=int(NamedBarrierBwd.dQFullWG0) + warp_group_idx,
+                number_of_threads=self.num_threads_per_warp_group + cute.arch.WARP_SIZE,
+            )
+            warpgroup.wait_group(0)
+            pipeline_Q.consumer_release(consumer_state_Q)
+        else:
+            # dQ_single_wg: WG1 skips dQ, only does dV wait + dK
+            # (7) [GEMM 5] dK += dS.T @ Q
+            if const_expr(not self.mma_dkv_is_rs):
+                mma_dsq_fn(
+                    A_idx=smem_idx_PdS, B_idx=smem_idx_Q, zero_init=not dKV_accumulate, wg_wait=1
+                )
+            else:
+                mma_dsq_fn(tCrA=tdKrdS, B_idx=smem_idx_Q, zero_init=not dKV_accumulate, wg_wait=1)
+            pipeline_dO.consumer_release(consumer_state_dO_cur)
+            warpgroup.wait_group(0)
+            pipeline_Q.consumer_release(consumer_state_Q)
         consumer_state_Q.advance()
         consumer_state_dO.advance()
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
         if const_expr(self.qhead_per_kvhead == 1):
+            mdK_cur = seqlen.offset_batch_K(mdK, batch_idx, dim=3, ragged=self.varlen_k)[
+                None, None, head_idx
+            ]
+            mdV_cur = seqlen.offset_batch_K(mdV, batch_idx, dim=3, ragged=self.varlen_k)[
+                None, None, head_idx
+            ]
             gdK = cute.local_tile(mdK_cur, (self.tile_n, self.tile_hdim), (n_block, 0))
             gdV = cute.local_tile(mdV_cur, (self.tile_n, self.tile_hdimv), (n_block, 0))
             store_dK, _, _ = copy_utils.tma_get_copy_fn(
             sdV = sV if const_expr(not self.dKV_swapAB) else layout_utils.transpose_view(sV)
             sdK = sK if const_expr(not self.dKV_swapAB) else layout_utils.transpose_view(sK)
             copy_dV_r2s, _, _ = copy_utils.get_smem_store_C(
+                tiled_mma_dV,
+                sdV,
+                tidx,
+                self.arch,
+                transpose=self.dKV_swapAB,
+                position_independent=True,
             )
             copy_dK_r2s, _, _ = copy_utils.get_smem_store_C(
+                tiled_mma_dK,
+                sdK,
+                tidx,
+                self.arch,
+                transpose=self.dKV_swapAB,
+                position_independent=True,
             )
             cute.arch.cp_async_bulk_wait_group(1, read=True)
             epi_barrier.arrive_and_wait()
                 store_dK()
                 cute.arch.cp_async_bulk_commit_group()
         else:
+            sdKaccum_shape0 = self.tile_n * self.tile_hdim // self.num_wg_mma
+            sdVaccum_shape0 = self.tile_n * self.tile_hdimv // self.num_wg_mma
+            sdKaccum_layout = cute.make_layout((sdKaccum_shape0, self.num_wg_mma))
+            sdVaccum_layout = cute.make_layout((sdVaccum_shape0, self.num_wg_mma))
             head_idx_kv = head_idx // qhead_per_kvhead_divmod
+            mdKaccum_cur = seqlen.offset_batch_K(
+                mdK, batch_idx, dim=2, padded=True, multiple=self.tile_hdim
+            )[None, head_idx_kv]
+            mdVaccum_cur = seqlen.offset_batch_K(
+                mdV, batch_idx, dim=2, padded=True, multiple=self.tile_hdimv
+            )[None, head_idx_kv]
             gdKaccum_ = cute.local_tile(mdKaccum_cur, (self.tile_n * self.tile_hdim,), (n_block,))
             gdKaccum = cute.flat_divide(gdKaccum_, (sdKaccum_shape0,))
             gdVaccum_ = cute.local_tile(mdVaccum_cur, (self.tile_n * self.tile_hdimv,), (n_block,))
             gdVaccum = cute.flat_divide(gdVaccum_, (sdVaccum_shape0,))
             # These two overlap each other
             sdVaccum = cute.make_tensor(sVaccum_ptr, sdVaccum_layout)
             tiled_copy_dKVaccum_r2s = cute.make_tiled_copy_tv(
                 cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Float32, num_bits_per_copy=128),
+                cute.make_layout((self.num_threads_per_warp_group, self.num_wg_mma)),
                 cute.make_layout(128 // Float32.width),
             )
             thr_copy_dKVaccum_r2s = tiled_copy_dKVaccum_r2s.get_slice(tidx)
             epi_barrier.arrive_and_wait()
             if warp_idx == 4:
                 with cute.arch.elect_one():
+                    for wg_idx in cutlass.range_constexpr(self.num_wg_mma):
                         copy_utils.cpasync_reduce_bulk_add_f32(
                             sdKaccum[None, wg_idx].iterator,
                             gdKaccum[None, wg_idx].iterator,
+                            self.tma_copy_bytes["dKacc"] // self.num_wg_mma,
                         )
                 cute.arch.cp_async_bulk_commit_group()
             epi_barrier.arrive_and_wait()
             if warp_idx == 4:
                 with cute.arch.elect_one():
+                    for wg_idx in cutlass.range_constexpr(self.num_wg_mma):
                         copy_utils.cpasync_reduce_bulk_add_f32(
                             sdVaccum[None, wg_idx].iterator,
                             gdVaccum[None, wg_idx].iterator,
+                            self.tma_copy_bytes["dVacc"] // self.num_wg_mma,
                         )
                 cute.arch.cp_async_bulk_commit_group()
         TileSchedulerCls: cutlass.Constexpr[Callable],
         SeqlenInfoCls: cutlass.Constexpr[Callable],
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
+        mdQ_semaphore: Optional[cute.Tensor] = None,
     ):
+        tidx, _, _ = cute.arch.thread_idx()
+        # warp-local thread index (dQaccum_store runs on warp 1, global tidx 32-63)
+        warp_local_tidx = tidx % cute.arch.WARP_SIZE
+        read_flag = const_expr(not self.deterministic)
         tile_scheduler = TileSchedulerCls()
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             n_block, head_idx, batch_idx, _ = work_tile.tile_idx
             seqlen = SeqlenInfoCls(batch_idx)
+            if const_expr(not seqlen.has_cu_seqlens_q):
+                mdQaccum_cur = mdQaccum[None, head_idx, batch_idx]
+            else:
+                mdQaccum_cur = cute.domain_offset(
+                    (seqlen.padded_offset_q * self.tile_hdim,), mdQaccum[None, head_idx]
+                )
+            # ((M * K / num_wg_dQ, num_wg_dQ), num_m_blocks)
+            gdQaccum = cute.local_tile(
+                mdQaccum_cur,
+                (
+                    cute.make_layout(
+                        (self.tile_m * self.tile_hdim // self.num_wg_dQ, self.num_wg_dQ)
+                    ),
+                ),
+                (None,),
             )
+            if const_expr(mdQ_semaphore is not None):
+                # mdQ_semaphore is (num_m_blocks, cluster_size, num_head, batch) after transpose
+                mdQ_semaphore_cur = mdQ_semaphore[None, None, head_idx, batch_idx]
             m_block_min, m_block_max = block_info.get_m_block_min_max(seqlen, n_block)
             if const_expr(not self.use_block_sparsity):
+                process_tile = (
+                    const_expr(not self.is_local and not self.is_varlen_q)
+                    or m_block_min < m_block_max
+                )
                 loop_count = m_block_max - m_block_min
             else:
                 total_block_cnt = get_total_q_block_count_bwd(
                         m_block = m_block_min + iter_idx
                         m_block_safe = m_block
+                        num_dQ_chunks = self.num_wg_dQ
+                        for warp_group_idx in cutlass.range_constexpr(num_dQ_chunks):
+                            if const_expr(not self.deterministic):
+                                # If deterministic, we already waited at the end of the prev iter
+                                cute.arch.cp_async_bulk_wait_group(
+                                    num_dQ_chunks - 1 - warp_group_idx, read=read_flag
+                                )
                             cute.arch.barrier_arrive(
                                 barrier_id=int(NamedBarrierBwd.dQEmptyWG0) + warp_group_idx,
                                 number_of_threads=self.num_threads_per_warp_group
                                 + cute.arch.WARP_SIZE,
                             )
+                        # Semaphore acquire: wait for prior n_blocks to finish writing this m_block
+                        if const_expr(self.deterministic):
+                            if const_expr(self.spt):
+                                _, n_block_max_for_m_block = block_info.get_n_block_min_max(
+                                    seqlen, m_block_safe
+                                )
+                                lock_value = n_block_max_for_m_block - 1 - n_block
+                            else:
+                                lock_value = n_block
+                            barrier.wait_eq(
+                                mdQ_semaphore_cur[(m_block_safe, None)].iterator,
+                                warp_local_tidx,
+                                0,  # flag_offset
+                                lock_value,
+                            )
+                        for warp_group_idx in cutlass.range_constexpr(num_dQ_chunks):
                             cute.arch.barrier(
                                 barrier_id=int(NamedBarrierBwd.dQFullWG0) + warp_group_idx,
                                 number_of_threads=self.num_threads_per_warp_group
                             with cute.arch.elect_one():
                                 copy_utils.cpasync_reduce_bulk_add_f32(
                                     sdQaccum[None, warp_group_idx].iterator,
+                                    gdQaccum[(None, warp_group_idx), m_block_safe].iterator,
                                     self.tma_copy_bytes["dQ"],
                                 )
                             cute.arch.cp_async_bulk_commit_group()
+                        # Semaphore release: signal that this n_block is done with this m_block
+                        if const_expr(self.deterministic):
+                            cute.arch.cp_async_bulk_wait_group(0, read=read_flag)
+                            barrier.arrive_inc(
+                                mdQ_semaphore_cur[(m_block_safe, None)].iterator,
+                                warp_local_tidx,
+                                0,  # flag_offset
+                                1,
+                            )
                 else:
+                    assert not self.deterministic, (
+                        "Deterministic not implemented for block-sparse backward"
+                    )
                     dQaccum_store_block_sparse_bwd_sm90(
                         blocksparse_tensors,
                         batch_idx,
                         gdQaccum,
                         subtile_factor=self.subtile_factor,
                         m_block_max=m_block_max,
+                        num_dQ_warp_groups=self.num_wg_dQ,
                         num_threads_per_warp_group=self.num_threads_per_warp_group,
                         tma_copy_bytes_dQ=self.tma_copy_bytes["dQ"],
                     )
+            # For local masking + deterministic (non-spt): signal remaining m_blocks
+            # that this n_block won't visit, so they don't deadlock waiting.
+            if const_expr(
+                self.deterministic and not self.spt and block_info.window_size_left is not None
+            ):
+                m_block_global_max = cute.ceil_div(seqlen.seqlen_q, self.tile_m)
+                for m_block in cutlass.range(m_block_max, m_block_global_max, unroll=1):
+                    barrier.arrive_inc(
+                        mdQ_semaphore_cur[(m_block, None)].iterator,
+                        warp_local_tidx,
+                        0,  # flag_offset
+                        1,
+                    )
             tile_scheduler.advance_to_next_work()
             work_tile = tile_scheduler.get_current_work()
+        if const_expr(not self.deterministic):
+            cute.arch.cp_async_bulk_wait_group(0, read=True)

build/torch-cuda/flash_fwd.py CHANGED Viewed

@@ -15,42 +15,28 @@ import cuda.bindings.driver as cuda
 import cutlass
 import cutlass.cute as cute
 from cutlass import Constexpr, Float32, Int32, const_expr, Boolean
-from cutlass.cute.nvgpu import cpasync, warp, warpgroup
 import cutlass.utils as utils_basic
-from cutlass.utils import LayoutEnum
-import cutlass.utils.hopper_helpers as sm90_utils_basic
 from .quack import copy_utils
 from .quack import layout_utils
-from .quack import sm90_utils
 from . import ampere_helpers as sm80_utils
 from .cute_dsl_utils import assume_tensor_aligned
 from . import utils
 from .mask import AttentionMask
-from .softmax import Softmax, apply_score_mod_inner
 from .seqlen_info import SeqlenInfoQK
 from .block_info import BlockInfo
-from .block_sparsity import BlockSparseTensors
-from .block_sparse_utils import (
-    produce_block_sparse_loads,
-    consume_block_sparse_loads,
-)
-from . import pipeline
 from .pack_gqa import PackGQA
 from .named_barrier import NamedBarrierFwd
-from .quack.cute_dsl_utils import ParamsBase
-from .tile_scheduler import (
-    TileSchedulerArguments,
-    SingleTileScheduler,
-    SingleTileLPTScheduler,
-    SingleTileVarlenScheduler,
-)
-from cutlass.cute import FastDivmodDivisor
 class FlashAttentionForwardBase:
-    arch: int = 80
     def __init__(
         self,
@@ -116,6 +102,12 @@ class FlashAttentionForwardBase:
         self.vec_size: cutlass.Constexpr = getattr(
             score_mod, "__vec_size__", 1 if cutlass.const_expr(has_aux_tensors) else 2
         )
     @staticmethod
     def can_implement(
@@ -318,7 +310,8 @@ class FlashAttentionForwardBase:
         mO: cute.Tensor,
         mLSE: Optional[cute.Tensor],
         softmax_scale: Float32,
-        stream: cuda.CUstream,
     ):
         """Configures and launches the flash attention kernel.
@@ -351,7 +344,7 @@ class FlashAttentionForwardBase:
         cute.arch.barrier(
             barrier_id=int(NamedBarrierFwd.Epilogue), number_of_threads=self.num_epilogue_threads
         )
-        smem_copy_atom_O = utils.get_smem_store_atom(self.arch, self.dtype)
         smem_thr_copy_O = cute.make_tiled_copy_C(smem_copy_atom_O, tiled_mma).get_slice(tidx)
         taccOrO = smem_thr_copy_O.retile(rO)
         taccOsO = smem_thr_copy_O.partition_D(sO)
@@ -366,11 +359,7 @@ class FlashAttentionForwardBase:
         # Write LSE from rmem -> gmem
         if const_expr(mLSE is not None):
-            if const_expr(not seqlen.has_cu_seqlens_q):
-                mLSE_cur = mLSE[None, head_idx, batch_idx]
-            else:
-                offset = seqlen.offset_q if const_expr(not self.pack_gqa) else (0, seqlen.offset_q)
-                mLSE_cur = cute.domain_offset((offset,), mLSE[None, head_idx])
             if const_expr(not self.pack_gqa):
                 gLSE = cute.local_tile(mLSE_cur, (self.tile_m,), (m_block,))
                 gLSE_expanded_layout = cute.append(
@@ -384,7 +373,7 @@ class FlashAttentionForwardBase:
                 t0accOcO = layout_utils.reshape_acc_to_mn(thr_mma.get_slice(0).partition_C(cO))
                 # Only the thread corresponding to column 0 writes out the lse to gmem
                 if taccOcO[0][1] == 0:
-                    for m in cutlass.range_constexpr(cute.size(taccOgLSE.shape[1])):
                         if (
                             t0accOcO[m, 0][0]
                             < seqlen.seqlen_q - m_block * self.tile_m - taccOcO[0][0]
@@ -393,11 +382,8 @@ class FlashAttentionForwardBase:
             else:
                 pack_gqa.store_LSE(mLSE_cur, lse, tiled_mma, tidx, m_block, seqlen.seqlen_q)
-        if const_expr(not seqlen.has_cu_seqlens_q):
-            mO_cur = mO[None, None, head_idx, batch_idx]
-        else:
-            offset = seqlen.offset_q if const_expr(not self.pack_gqa) else (0, seqlen.offset_q)
-            mO_cur = cute.domain_offset((offset, 0), mO[None, None, head_idx])
         # thr_mma = tiled_mma.get_slice(tidx)
         # taccOgO = thr_mma.partition_C(gO)
         # cute.autovec_copy(rO, taccOgO)
@@ -634,12 +620,19 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
         mV: cute.Tensor,
         mO: cute.Tensor,
         mLSE: Optional[cute.Tensor],
-        stream: cuda.CUstream,
-        softmax_scale: Optional[Float32] = None,
         window_size_left: Optional[Int32] = None,
         window_size_right: Optional[Int32] = None,
         learnable_sink: Optional[cute.Tensor] = None,
         aux_tensors=None,
     ):
         """Configures and launches the flash attention kernel.
@@ -648,7 +641,7 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
         """
         assert learnable_sink is None, "Learnable sink is not supported in this kernel"
         self._check_type(
-            *(t.element_type if t is not None else None for t in (mQ, mK, mV, mO, mLSE))
         )
         tiled_mma_qk, tiled_mma_pv = self._get_tiled_mma()
         self.num_mma_threads = tiled_mma_pv.size
@@ -656,41 +649,54 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
         self.num_Q_load_threads = self.num_threads
         self.num_epilogue_threads = self.num_threads
         # self.use_tma_O = self.arch >= 90 and mCuSeqlensQ is None
-        self.use_tma_O = self.arch >= 90
         self._setup_attributes()
         SharedStorage = self._get_shared_storage_cls()
         mQ, mK, mV, mO = [assume_tensor_aligned(t) for t in (mQ, mK, mV, mO)]
-        mQ, mK, mV, mO = [
-            cute.make_tensor(t.iterator, cute.select(t.layout, mode=[1, 3, 2, 0]))
-            for t in (mQ, mK, mV, mO)
         ]
-        mLSE = cute.make_tensor(mLSE.iterator, cute.select(mLSE.layout, mode=[2, 1, 0]))
-        # grid_dim: (m_block, num_head, batch_size)
-        grid_dim = (
-            cute.ceil_div(mQ.shape[0], self.tile_m),
-            cute.size(mQ.shape[2]),
-            cute.size(mQ.shape[3]),
-        )
-        LOG2_E = math.log2(math.e)
-        if const_expr(self.score_mod is None):
-            softmax_scale_log2 = Float32(softmax_scale * LOG2_E)
-            softmax_scale = None
         else:
-            # NB: If a user passes in a score mod, we want to apply the score-mod in the sm_scaled qk
-            # But in the original base 10. We hijack softmax_scale_log2 to just be the change of base
-            # and correctly apply the softmax_scale prior to score_mod in the softmax step
-            softmax_scale_log2 = Float32(LOG2_E)
-            softmax_scale = Float32(softmax_scale)
-        fastdiv_mods = None
-        if const_expr(aux_tensors is not None):
-            seqlen_q = cute.size(mQ.shape[0]) // (
-                self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1
-            )
-            seqlen_k = cute.size(mK.shape[0])
-            seqlen_q_divmod = FastDivmodDivisor(seqlen_q)
-            seqlen_k_divmod = FastDivmodDivisor(seqlen_k)
-            fastdiv_mods = (seqlen_q_divmod, seqlen_k_divmod)
         self.kernel(
             mQ,
@@ -698,6 +704,10 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
             mV,
             mO,
             mLSE,
             softmax_scale_log2,
             softmax_scale,
             window_size_left,
@@ -714,6 +724,8 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
             tiled_mma_qk,
             tiled_mma_pv,
             SharedStorage,
             aux_tensors,
             fastdiv_mods,
         ).launch(
@@ -731,6 +743,10 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
         mV: cute.Tensor,
         mO: cute.Tensor,
         mLSE: Optional[cute.Tensor],
         softmax_scale_log2: Float32,
         softmax_scale: Optional[Float32],
         window_size_left: Optional[Int32],
@@ -747,12 +763,17 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
         tiled_mma_qk: cute.TiledMma,
         tiled_mma_pv: cute.TiledMma,
         SharedStorage: cutlass.Constexpr,
         aux_tensors=None,
         fastdiv_mods=None,
     ):
         # Thread index, block index
         tidx, _, _ = cute.arch.thread_idx()
-        m_block, num_head, batch_size = cute.arch.block_idx()
         block_info = BlockInfo(
             self.tile_m,
@@ -764,13 +785,21 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
             window_size_right,
             qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
         )
-        seqlen = SeqlenInfoQK.create(seqlen_q_static=mQ.shape[0], seqlen_k_static=mK.shape[0])
         n_block_min, n_block_max = block_info.get_n_block_min_max(seqlen, m_block)
-        # TODO: return early if n_block_max == 0
-        # if self.is_causal:
-        #     if n_block_max <= 0:
-        #         return
-        n_block = n_block_max - 1
         # ///////////////////////////////////////////////////////////////////////////////
         # Get the appropriate tiles for this thread block.
@@ -778,10 +807,20 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
         blkQ_shape = (self.tile_m, self.tile_hdim)
         blkK_shape = (self.tile_n, self.tile_hdim)
         blkV_shape = (self.tile_n, self.tile_hdimv)
-        gQ = cute.local_tile(mQ[None, None, num_head, batch_size], blkQ_shape, (m_block, 0))
         num_head_kv = num_head // self.qhead_per_kvhead
-        gK = cute.local_tile(mK[None, None, num_head_kv, batch_size], blkK_shape, (None, 0))
-        gV = cute.local_tile(mV[None, None, num_head_kv, batch_size], blkV_shape, (None, 0))
         # ///////////////////////////////////////////////////////////////////////////////
         # Get shared memory buffer
@@ -953,18 +992,20 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
         mask = AttentionMask(
             self.tile_m,
             self.tile_n,
-            seqlen.seqlen_q,
-            seqlen.seqlen_k,
             window_size_left,
             window_size_right,
             self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
         )
         mask_fn = partial(
             mask.apply_mask,
             m_block=m_block,
             thr_mma=thr_mma_qk,
             mask_causal=self.is_causal,
             mask_local=self.is_local,
             fastdiv_mods=fastdiv_mods if const_expr(self.mask_mod is not None) else None,
         )
@@ -976,8 +1017,8 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
             smem_pipe_read,
             smem_pipe_write,
             is_first_n_block=True,
-            check_inf=True,
-            mask_fn=partial(mask_fn, mask_seqlen=True),
         )
         smem_pipe_read = self.advance_pipeline(smem_pipe_read)
         smem_pipe_write = self.advance_pipeline(smem_pipe_write)
@@ -992,15 +1033,17 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
                     n_block,
                     smem_pipe_read,
                     smem_pipe_write,
-                    check_inf=True,
-                    mask_fn=partial(mask_fn, mask_seqlen=False),
                 )
                 smem_pipe_read = self.advance_pipeline(smem_pipe_read)
                 smem_pipe_write = self.advance_pipeline(smem_pipe_write)
         # The remaining iterations have no masking
         for n_tile in cutlass.range(n_block, unroll=1):
             compute_one_n_block(
-                n_block - n_tile - 1, smem_pipe_read, smem_pipe_write, check_inf=True
             )
             smem_pipe_read = self.advance_pipeline(smem_pipe_read)
             smem_pipe_write = self.advance_pipeline(smem_pipe_write)
@@ -1144,1283 +1187,9 @@ class FlashAttentionForwardSm80(FlashAttentionForwardBase):
         #     load_K_next()
-class FlashAttentionForwardSm90(FlashAttentionForwardBase):
-    arch = 90
-    def __init__(
-        self,
-        *args,
-        intra_wg_overlap: bool = True,
-        mma_pv_is_rs: bool = True,
-        **kwargs,
-    ):
-        super().__init__(*args, **kwargs)
-        self.intra_wg_overlap = intra_wg_overlap
-        self.mma_pv_is_rs = mma_pv_is_rs
-        self.buffer_align_bytes = 1024
-    def _get_smem_layout_atom(self):
-        sQ_layout_atom = warpgroup.make_smem_layout_atom(
-            sm90_utils_basic.get_smem_layout_atom(LayoutEnum.ROW_MAJOR, self.dtype, self.tile_hdim),
-            self.dtype,
-        )
-        sK_layout_atom = sQ_layout_atom
-        sV_layout_atom = warpgroup.make_smem_layout_atom(
-            sm90_utils_basic.get_smem_layout_atom(
-                LayoutEnum.ROW_MAJOR, self.dtype, self.tile_hdimv
-            ),
-            self.dtype,
-        )
-        sO_layout_atom = sV_layout_atom
-        if not self.mma_pv_is_rs:
-            sP_layout_atom = warpgroup.make_smem_layout_atom(
-                sm90_utils_basic.get_smem_layout_atom(
-                    LayoutEnum.ROW_MAJOR, self.dtype, self.tile_n
-                ),
-                self.dtype,
-            )
-        else:
-            sP_layout_atom = None
-        return sQ_layout_atom, sK_layout_atom, sV_layout_atom, sO_layout_atom, sP_layout_atom
-    def _get_tiled_mma(self):
-        tiled_mma_qk = sm90_utils_basic.make_trivial_tiled_mma(
-            self.dtype,
-            self.dtype,
-            warpgroup.OperandMajorMode.K,
-            warpgroup.OperandMajorMode.K,
-            Float32,
-            atom_layout_mnk=(self.tile_m // 64, 1, 1),  # Might need (1, 2, 1) for hdim 512
-            tiler_mn=(64, self.tile_n),
-        )
-        tiled_mma_pv = sm90_utils_basic.make_trivial_tiled_mma(
-            self.dtype,
-            self.dtype,
-            warpgroup.OperandMajorMode.K,
-            warpgroup.OperandMajorMode.MN,
-            Float32,
-            atom_layout_mnk=(self.tile_m // 64, 1, 1),  # Might need (1, 2, 1) for hdim 512
-            tiler_mn=(64, self.tile_hdimv),
-            a_source=warpgroup.OperandSource.RMEM
-            if self.mma_pv_is_rs
-            else warpgroup.OperandSource.SMEM,
-        )
-        return tiled_mma_qk, tiled_mma_pv
-    def _get_shared_storage_cls(self):
-        sQ_struct, sK_struct, sV_struct = [
-            cute.struct.Align[cute.struct.MemRange[self.dtype, cute.cosize(layout)], self.buffer_align_bytes]
-            for layout in (self.sQ_layout, self.sK_layout, self.sV_layout)
-        ]
-        cosize_sQV = max(cute.cosize(self.sQ_layout), cute.cosize(self.sV_layout))
-        sQV_struct = cute.struct.Align[cute.struct.MemRange[self.dtype, cosize_sQV], 1024]
-        cosize_sP = cute.cosize(self.sP_layout) if const_expr(self.sP_layout is not None) else 0
-        sP_struct = cute.struct.Align[cute.struct.MemRange[self.dtype, cosize_sP], 1024]
-        # 1 for Q, 1 for O, self.num_stages*2 for K, self.num_stages*2 for V,
-        mbar_ptr_QO_struct = cute.struct.MemRange[cutlass.Int64, 2]
-        mbar_ptr_K_struct = cute.struct.MemRange[cutlass.Int64, self.num_stages * 2]
-        mbar_ptr_V_struct = cute.struct.MemRange[cutlass.Int64, self.num_stages * 2]
-        @cute.struct
-        class SharedStorageQKV:
-            mbar_ptr: mbar_ptr_QO_struct
-            mbar_ptr_K: mbar_ptr_K_struct
-            mbar_ptr_V: mbar_ptr_V_struct
-            sV: sV_struct
-            sQ: sQ_struct
-            sK: sK_struct
-            sP: sP_struct
-        @cute.struct
-        class SharedStorageSharedQV:
-            mbar_ptr: mbar_ptr_QO_struct
-            mbar_ptr_K: mbar_ptr_K_struct
-            mbar_ptr_V: mbar_ptr_V_struct
-            sQ: sQV_struct
-            sK: sK_struct
-            sP: sP_struct
-        return SharedStorageQKV if const_expr(not self.Q_in_regs) else SharedStorageSharedQV
-    @cute.jit
-    def __call__(
-        self,
-        mQ: cute.Tensor,  # (b, s_q, h, d) or (total_q, h, d) if there is cu_seqlens_q
-        mK: cute.Tensor,  # (b_k, s_k, h_k, d) or (total_k, h_k, d) if there is cu_seqlens_k or (num_pages, page_size, h_k, d) if there is page_table
-        mV: cute.Tensor,  # (b_k, s_k, h_k, dv) or (total_k, h_k, dv) if there is cu_seqlens_k or (num_pages, page_size, h_k, dv) if there is page_table
-        mO: cute.Tensor,  # (b, s_q, h, dv) or (total_q, h, dv) if there is cu_seqlens_q
-        mLSE: Optional[cute.Tensor],
-        softmax_scale: Float32,
-        stream: cuda.CUstream,
-        mCuSeqlensQ: Optional[cute.Tensor] = None,
-        mCuSeqlensK: Optional[cute.Tensor] = None,
-        mSeqUsedQ: Optional[cute.Tensor] = None,
-        mSeqUsedK: Optional[cute.Tensor] = None,
-        mPageTable: Optional[cute.Tensor] = None,  # (b_k, max_num_pages_per_seq)
-        window_size_left: Int32 | int | None = None,
-        window_size_right: Int32 | int | None = None,
-        learnable_sink: Optional[cute.Tensor] = None,
-        blocksparse_tensors: Optional[BlockSparseTensors] = None,
-        aux_tensors: Optional[list] = None,
-    ):
-        """Configures and launches the flash attention kernel.
-        mQ/mK/mV/mO has same data types(supports fp16 and bf16) and same layout:
-        (batch_size, seqlen_q, num_head, head_dim):(_, _, _, 1)
-        """
-        self._check_type(
-            *(
-                t.element_type if t is not None else None
-                for t in (mQ, mK, mV, mO, mLSE, mCuSeqlensQ, mCuSeqlensK, mSeqUsedQ, mSeqUsedK)
-            )
-        )
-        mQ, mK, mV, mO = [assume_tensor_aligned(t) for t in (mQ, mK, mV, mO)]
-        QO_layout_transpose = [1, 3, 2, 0] if const_expr(mCuSeqlensQ is None) else [0, 2, 1]
-        mQ, mO = [layout_utils.select(t, QO_layout_transpose) for t in (mQ, mO)]
-        KV_layout_transpose = [1, 3, 2, 0] if const_expr(mCuSeqlensK is None) else [0, 2, 1]
-        mK, mV = [layout_utils.select(t, KV_layout_transpose) for t in (mK, mV)]
-        LSE_layout_transpose = [2, 1, 0] if const_expr(mCuSeqlensQ is None) else [1, 0]
-        mLSE = layout_utils.select(mLSE, LSE_layout_transpose) if const_expr(mLSE is not None) else None
-        tiled_mma_qk, tiled_mma_pv = self._get_tiled_mma()
-        self.num_mma_threads = tiled_mma_qk.size
-        self.num_threads_per_warp_group = 128
-        self.num_mma_warp_groups = self.num_mma_threads // self.num_threads_per_warp_group
-        self.num_threads = self.num_threads_per_warp_group * (self.num_mma_warp_groups + 1)
-        self.num_producer_threads = 32
-        self.num_Q_load_threads = self.num_mma_threads  # If not TMA_Q, MMA threads load Q
-        self.num_epilogue_threads = self.num_mma_threads
-        self.num_mma_regs = (
-            256
-            if self.num_mma_warp_groups == 1
-            else (240 if self.num_mma_warp_groups == 2 else 160)
-        )
-        self.num_producer_regs = (
-            56 if self.num_mma_warp_groups == 1 else (24 if self.num_mma_warp_groups == 2 else 32)
-        )
-        # self.num_mma_regs = 232
-        # self.num_producer_regs = 40
-        self.use_block_sparsity = cutlass.const_expr(blocksparse_tensors is not None)
-        self.use_scheduler_barrier = (
-            (self.num_mma_warp_groups >= 2 and self.tile_hdim <= 128)
-            if const_expr(self.intra_wg_overlap)
-            else (self.num_mma_warp_groups == 2)
-        )
-        self.use_tma_Q = self.arch >= 90 and not (
-            self.pack_gqa and self.tile_m % self.qhead_per_kvhead != 0
-        )
-        self.use_tma_O = (
-            self.arch >= 90 and mCuSeqlensQ is None and mSeqUsedQ is None and not self.pack_gqa
-        )
-        # TODO: rescale_O_before_gemm
-        self._setup_attributes()
-        # TODO: we prob don't need most of what's in _setup_attributes
-        self.sQ_layout, self.sK_layout, self.sV_layout, self.sO_layout = [
-            sm90_utils.make_smem_layout(mX.element_type, LayoutEnum.ROW_MAJOR, shape, stage)
-            for mX, shape, stage in [
-                (mQ, (self.tile_m, self.tile_hdim), None),
-                (mK, (self.tile_n, self.tile_hdim), self.num_stages),
-                (mV, (self.tile_n, self.tile_hdimv), self.num_stages),
-                (mO, (self.tile_m, self.tile_hdimv), None),
-            ]
-        ]
-        self.sP_layout = None
-        if const_expr(not self.mma_pv_is_rs):
-            self.sP_layout = sm90_utils.make_smem_layout(
-                mV.element_type, LayoutEnum.ROW_MAJOR, (self.tile_m, self.tile_n)
-            )
-        SharedStorage = self._get_shared_storage_cls()
-        if const_expr(self.pack_gqa):
-            shape_Q_packed = (
-                (self.qhead_per_kvhead, mQ.shape[0]),
-                mQ.shape[1],
-                mK.shape[2],
-                *mQ.shape[3:],
-            )
-            stride_Q_packed = (
-                (mQ.stride[2], mQ.stride[0]),
-                mQ.stride[1],
-                mQ.stride[2] * self.qhead_per_kvhead,
-                *mQ.stride[3:],
-            )
-            mQ = cute.make_tensor(
-                mQ.iterator, cute.make_layout(shape_Q_packed, stride=stride_Q_packed)
-            )
-            shape_O_packed = (
-                (self.qhead_per_kvhead, mO.shape[0]),
-                mK.shape[1],
-                mK.shape[2],
-                *mO.shape[3:],
-            )
-            stride_O_packed = (
-                (mO.stride[2], mO.stride[0]),
-                mO.stride[1],
-                mO.stride[2] * self.qhead_per_kvhead,
-                *mO.stride[3:],
-            )
-            mO = cute.make_tensor(
-                mO.iterator, cute.make_layout(shape_O_packed, stride=stride_O_packed)
-            )
-            if const_expr(mLSE is not None):
-                shape_LSE_packed = (
-                    (self.qhead_per_kvhead, mLSE.shape[0]),
-                    mK.shape[2],
-                    *mLSE.shape[2:],
-                )
-                stride_LSE_packed = (
-                    (mLSE.stride[1], mLSE.stride[0]),
-                    mLSE.stride[1] * self.qhead_per_kvhead,
-                    *mLSE.stride[2:],
-                )
-                mLSE = cute.make_tensor(
-                    mLSE.iterator, cute.make_layout(shape_LSE_packed, stride=stride_LSE_packed)
-                )
-        # TMA
-        gmem_tiled_copy_Q = cpasync.CopyBulkTensorTileG2SOp()
-        gmem_tiled_copy_KV = cpasync.CopyBulkTensorTileG2SOp()  # Might multicast
-        gmem_tiled_copy_O = cpasync.CopyBulkTensorTileS2GOp()
-        self.tma_copy_bytes = {
-            name: cute.size_in_bytes(mX.element_type, cute.select(layout, mode=[0, 1]))
-            for name, mX, layout in [
-                ("Q", mQ, self.sQ_layout),
-                ("K", mK, self.sK_layout),
-                ("V", mV, self.sV_layout),
-            ]
-        }
-        tma_atom_Q, tma_tensor_Q = None, None
-        if const_expr(self.use_tma_Q):
-            tma_atom_Q, tma_tensor_Q = cpasync.make_tiled_tma_atom(
-                gmem_tiled_copy_Q,
-                mQ,
-                self.sQ_layout,
-                (self.tile_m, self.tile_hdim),  # No mcast
-            )
-        tma_atom_K, tma_tensor_K = cpasync.make_tiled_tma_atom(
-            gmem_tiled_copy_KV,
-            mK,
-            cute.select(self.sK_layout, mode=[0, 1]),
-            (self.tile_n, self.tile_hdim),
-            1,  # No mcast for now
-        )
-        tma_atom_V, tma_tensor_V = cpasync.make_tiled_tma_atom(
-            gmem_tiled_copy_KV,
-            mV,
-            cute.select(self.sV_layout, mode=[0, 1]),
-            (self.tile_n, self.tile_hdimv),
-            1,  # No mcast for now
-        )
-        tma_atom_O, tma_tensor_O = None, None
-        if const_expr(self.use_tma_O):
-            tma_atom_O, tma_tensor_O = cpasync.make_tiled_tma_atom(
-                gmem_tiled_copy_O,
-                mO,
-                self.sO_layout,
-                (self.tile_m, self.tile_hdimv),  # No mcast
-            )
-        if const_expr(mCuSeqlensQ is not None or mSeqUsedQ is not None):
-            TileScheduler = SingleTileVarlenScheduler
-        else:
-            TileScheduler = (
-                SingleTileScheduler
-                if const_expr(not self.is_causal or self.is_local)
-                else SingleTileLPTScheduler
-            )
-        tile_sched_args = TileSchedulerArguments(
-            cute.ceil_div(cute.size(mQ.shape[0]), self.tile_m),
-            cute.size(mQ.shape[2]),
-            cute.size(mQ.shape[3])
-            if const_expr(mCuSeqlensQ is None)
-            else cute.size(mCuSeqlensQ.shape[0] - 1),
-            1,  # num_splits
-            cute.size(mK.shape[0]),
-            mQ.shape[1],
-            mV.shape[1],
-            total_q=cute.size(mQ.shape[0])
-            if const_expr(mCuSeqlensQ is not None)
-            else cute.size(mQ.shape[0]) * cute.size(mQ.shape[3]),
-            tile_shape_mn=(self.tile_m, self.tile_n),
-            mCuSeqlensQ=mCuSeqlensQ,
-            mSeqUsedQ=mSeqUsedQ,
-            qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
-            element_size=self.dtype.width // 8,
-            is_persistent=False,
-            lpt=self.is_causal or self.is_local,
-        )
-        tile_sched_params = TileScheduler.to_underlying_arguments(tile_sched_args)
-        grid_dim = TileScheduler.get_grid_shape(tile_sched_params)
-        LOG2_E = math.log2(math.e)
-        if const_expr(self.score_mod is None):
-            softmax_scale_log2 = softmax_scale * LOG2_E
-            softmax_scale = None
-        else:
-            # NB: If a user passes in a score mod, we want to apply the score-mod in the sm_scaled qk
-            # But in the original base 10. We hijack softmax_scale_log2 to just be the change of base
-            # and correctly apply the softmax_scale prior to score_mod in the softmax step
-            softmax_scale_log2 = LOG2_E
-            softmax_scale = softmax_scale
-        if const_expr(window_size_left is not None):
-            window_size_left = Int32(window_size_left)
-        if const_expr(window_size_right is not None):
-            window_size_right = Int32(window_size_right)
-        fastdiv_mods = None
-        if const_expr(aux_tensors is not None):
-            seqlen_q = cute.size(mQ.shape[0]) // (
-                self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1
-            )
-            seqlen_k = (
-                cute.size(mK.shape[0])
-                if const_expr(mPageTable is None)
-                else mK.shape[0] * mPageTable.shape[1]
-            )
-            seqlen_q_divmod = FastDivmodDivisor(seqlen_q)
-            seqlen_k_divmod = FastDivmodDivisor(seqlen_k)
-            fastdiv_mods = (seqlen_q_divmod, seqlen_k_divmod)
-        self.kernel(
-            tma_tensor_Q if const_expr(self.use_tma_Q) else mQ,
-            tma_tensor_K,
-            tma_tensor_V,
-            tma_tensor_O if const_expr(self.use_tma_O) else mO,
-            mLSE,
-            mCuSeqlensQ,
-            mCuSeqlensK,
-            mSeqUsedQ,
-            mSeqUsedK,
-            tma_atom_Q,
-            tma_atom_K,
-            tma_atom_V,
-            tma_atom_O,
-            softmax_scale_log2,
-            softmax_scale,
-            window_size_left,
-            window_size_right,
-            learnable_sink,
-            blocksparse_tensors,
-            self.sQ_layout,
-            self.sK_layout,
-            self.sV_layout,
-            self.sO_layout,
-            self.sP_layout,
-            self.gmem_tiled_copy_Q,
-            self.gmem_tiled_copy_K,
-            self.gmem_tiled_copy_V,
-            self.gmem_tiled_copy_O,
-            tiled_mma_qk,
-            tiled_mma_pv,
-            tile_sched_params,
-            TileScheduler,
-            SharedStorage,
-            aux_tensors,
-            fastdiv_mods,
-        ).launch(
-            grid=grid_dim,
-            block=[self.num_threads, 1, 1],
-            stream=stream,
-            min_blocks_per_mp=1,
-        )
-    @cute.kernel
-    def kernel(
-        self,
-        mQ: cute.Tensor,
-        mK: cute.Tensor,
-        mV: cute.Tensor,
-        mO: cute.Tensor,
-        mLSE: Optional[cute.Tensor],
-        mCuSeqlensQ: Optional[cute.Tensor],
-        mCuSeqlensK: Optional[cute.Tensor],
-        mSeqUsedQ: Optional[cute.Tensor],
-        mSeqUsedK: Optional[cute.Tensor],
-        tma_atom_Q: Optional[cute.CopyAtom],
-        tma_atom_K: Optional[cute.CopyAtom],
-        tma_atom_V: Optional[cute.CopyAtom],
-        tma_atom_O: Optional[cute.CopyAtom],
-        softmax_scale_log2: Float32,
-        softmax_scale: Optional[Float32],
-        window_size_left: Optional[Int32],
-        window_size_right: Optional[Int32],
-        learnable_sink: Optional[cute.Tensor],
-        blocksparse_tensors: Optional[BlockSparseTensors],
-        sQ_layout: cute.ComposedLayout,
-        sK_layout: cute.ComposedLayout,
-        sV_layout: cute.ComposedLayout,
-        sO_layout: cute.ComposedLayout,
-        sP_layout: cute.ComposedLayout | None,
-        gmem_tiled_copy_Q: cute.TiledCopy,
-        gmem_tiled_copy_K: cute.TiledCopy,
-        gmem_tiled_copy_V: cute.TiledCopy,
-        gmem_tiled_copy_O: cute.TiledCopy,
-        tiled_mma_qk: cute.TiledMma,
-        tiled_mma_pv: cute.TiledMma,
-        tile_sched_params: ParamsBase,
-        TileScheduler: cutlass.Constexpr[Callable],
-        SharedStorage: cutlass.Constexpr[Callable],
-        aux_tensors=Optional[list[cute.Tensor]],
-        fastdiv_mods=None,
-    ):
-        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
-        # Prefetch tma descriptor
-        if warp_idx == 0:
-            for tma_atom in (tma_atom_Q, tma_atom_K, tma_atom_V, tma_atom_O):
-                if const_expr(tma_atom is not None):
-                    cpasync.prefetch_descriptor(tma_atom)
-        smem = cutlass.utils.SmemAllocator()
-        storage = smem.allocate(SharedStorage)
-        # Mbarrier init
-        mbar_ptr_Q = storage.mbar_ptr.data_ptr()
-        if warp_idx == 1:
-            # if tidx < 2:
-            #     # barrierO num threads should be self.num_mma_threads
-            #     cute.arch.mbarrier_init(mbar_ptr_Q + tidx, 1 if tidx == 0 else self.num_mma_threads)
-            if const_expr(not self.use_tma_Q):
-                cute.arch.mbarrier_init(mbar_ptr_Q, self.num_Q_load_threads)
-            # cute.arch.mbarrier_init(mbar_ptr_Q + 1, self.num_mma_threads)
-        # We rely on pipeline_k and pipeline_v to initialize the mbarrier fence and sync
-        pipeline_kv_producer_group = cutlass.pipeline.CooperativeGroup(
-            cutlass.pipeline.Agent.Thread
-        )
-        pipeline_kv_consumer_group = cutlass.pipeline.CooperativeGroup(
-            cutlass.pipeline.Agent.Thread, self.num_mma_threads // cute.arch.WARP_SIZE
-        )
-        pipeline_k = pipeline.PipelineTmaAsync.create(
-            barrier_storage=storage.mbar_ptr_K.data_ptr(),
-            num_stages=self.num_stages,
-            producer_group=pipeline_kv_producer_group,
-            consumer_group=pipeline_kv_consumer_group,
-            tx_count=self.tma_copy_bytes["K"],
-            defer_sync=True,
-        )
-        pipeline_v = pipeline.PipelineTmaAsync.create(
-            barrier_storage=storage.mbar_ptr_V.data_ptr(),
-            num_stages=self.num_stages,
-            producer_group=pipeline_kv_producer_group,
-            consumer_group=pipeline_kv_consumer_group,
-            tx_count=self.tma_copy_bytes["V"],
-            defer_sync=False
-        )
-        # ///////////////////////////////////////////////////////////////////////////////
-        # Get shared memory buffer
-        # ///////////////////////////////////////////////////////////////////////////////
-        sQ = storage.sQ.get_tensor(sQ_layout.outer, swizzle=sQ_layout.inner)
-        sK = storage.sK.get_tensor(sK_layout.outer, swizzle=sK_layout.inner)
-        if const_expr(not self.Q_in_regs):
-            sV = storage.sV.get_tensor(sV_layout.outer, swizzle=sV_layout.inner)
-        else:
-            sV = storage.sQ.get_tensor(
-                sV_layout.outer, swizzle=sV_layout.inner, dtype=mV.element_type
-            )
-        # Transpose view of V to tensor with layout (head_dim_v, tile_n) for tiled mma
-        sVt = layout_utils.transpose_view(sV)
-        sP = None
-        if const_expr(sP_layout is not None):
-            sP = storage.sP.get_tensor(sP_layout.outer, swizzle=sP_layout.inner)
-        # reuse sQ's data iterator
-        sO = storage.sQ.get_tensor(sO_layout.outer, swizzle=sO_layout.inner, dtype=self.dtype)
-        block_info = BlockInfo(
-            self.tile_m,
-            self.tile_n,
-            self.is_causal,
-            self.is_local,
-            False,  # is_split_kv
-            window_size_left,
-            window_size_right,
-            qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
-        )
-        SeqlenInfoCls = partial(
-            SeqlenInfoQK.create,
-            seqlen_q_static=mQ.shape[0] if const_expr(not self.pack_gqa) else mQ.shape[0][1],
-            seqlen_k_static=mK.shape[0],
-            mCuSeqlensQ=mCuSeqlensQ,
-            mCuSeqlensK=mCuSeqlensK,
-            mSeqUsedQ=mSeqUsedQ,
-            mSeqUsedK=mSeqUsedK,
-        )
-        AttentionMaskCls = partial(
-            AttentionMask,
-            self.tile_m,
-            self.tile_n,
-            window_size_left=window_size_left,
-            window_size_right=window_size_right,
-            qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
-        )
-        TileSchedulerCls = partial(TileScheduler.create, tile_sched_params)
-        if warp_idx < 4:  # Producer
-            cute.arch.setmaxregister_decrease(self.num_producer_regs)
-            self.load(
-                mQ,
-                mK,
-                mV,
-                sQ,
-                sK,
-                sV,
-                tma_atom_Q,
-                tma_atom_K,
-                tma_atom_V,
-                pipeline_k,
-                pipeline_v,
-                mbar_ptr_Q,
-                blocksparse_tensors,
-                block_info,
-                SeqlenInfoCls,
-                TileSchedulerCls,
-            )
-        else:  # Consumer
-            cute.arch.setmaxregister_increase(self.num_mma_regs)
-            # ///////////////////////////////////////////////////////////////////////////////
-            # Tile MMA compute thread partitions and allocate accumulators
-            # ///////////////////////////////////////////////////////////////////////////////
-            tidx, _, _ = cute.arch.thread_idx()
-            tidx = tidx - 128
-            self.mma(
-                tiled_mma_qk,
-                tiled_mma_pv,
-                mQ,
-                mO,
-                mLSE,
-                sQ,
-                sK,
-                sVt,
-                sP,
-                sO,
-                learnable_sink,
-                pipeline_k,
-                pipeline_v,
-                mbar_ptr_Q,
-                gmem_tiled_copy_Q,
-                gmem_tiled_copy_O,
-                tma_atom_O,
-                tidx,
-                softmax_scale_log2,
-                softmax_scale,
-                block_info,
-                SeqlenInfoCls,
-                AttentionMaskCls,
-                TileSchedulerCls,
-                blocksparse_tensors,
-                aux_tensors,
-                fastdiv_mods,
-            )
-    @cute.jit
-    def load(
-        self,
-        mQ: cute.Tensor,
-        mK: cute.Tensor,
-        mV: cute.Tensor,
-        sQ: cute.Tensor,
-        sK: cute.Tensor,
-        sV: cute.Tensor,
-        tma_atom_Q: cute.CopyAtom,
-        tma_atom_K: cute.CopyAtom,
-        tma_atom_V: cute.CopyAtom,
-        pipeline_k: cutlass.pipeline.PipelineAsync,
-        pipeline_v: cutlass.pipeline.PipelineAsync,
-        mbar_ptr_Q: cutlass.Pointer,
-        blocksparse_tensors: Optional[BlockSparseTensors],
-        block_info: BlockInfo,
-        SeqlenInfoCls: Callable,
-        TileSchedulerCls: Callable,
-    ):
-        warp_idx_in_wg = cute.arch.make_warp_uniform(cute.arch.warp_idx()) % 4
-        if warp_idx_in_wg == 0:
-            q_producer_phase = Int32(1)
-            kv_producer_state = pipeline.make_pipeline_state(
-                cutlass.pipeline.PipelineUserType.Producer, self.num_stages
-            )
-            tile_scheduler = TileSchedulerCls()
-            work_tile = tile_scheduler.initial_work_tile_info()
-            while work_tile.is_valid_tile:
-                # if work_tile.is_valid_tile:
-                m_block, head_idx, batch_idx, _ = work_tile.tile_idx
-                seqlen = SeqlenInfoCls(batch_idx)
-                mQ_cur = seqlen.offset_batch_Q(mQ, batch_idx, dim=3)[None, None, head_idx]
-                head_idx_kv = (
-                    head_idx // self.qhead_per_kvhead if const_expr(not self.pack_gqa) else head_idx
-                )
-                mK_cur = seqlen.offset_batch_K(mK, batch_idx, dim=3)[None, None, head_idx_kv]
-                mV_cur = seqlen.offset_batch_K(mV, batch_idx, dim=3)[None, None, head_idx_kv]
-                gK = cute.local_tile(mK_cur, (self.tile_n, self.tile_hdim), (None, 0))
-                gV = cute.local_tile(mV_cur, (self.tile_n, self.tile_hdimv), (None, 0))
-                load_Q = None
-                if const_expr(self.use_tma_Q):
-                    gQ = cute.local_tile(mQ_cur, (self.tile_m, self.tile_hdim), (m_block, 0))
-                    load_Q, _, _ = copy_utils.tma_get_copy_fn(
-                        tma_atom_Q, 0, cute.make_layout(1), gQ, sQ, single_stage=True
-                    )
-                # TODO: mcast
-                # TODO check warp_idx if we have 128 producer threads
-                load_K, _, _ = copy_utils.tma_get_copy_fn(
-                    tma_atom_K, 0, cute.make_layout(1), gK, sK
-                )
-                load_K = copy_utils.tma_producer_copy_fn(load_K, pipeline_k)
-                load_V, _, _ = copy_utils.tma_get_copy_fn(
-                    tma_atom_V, 0, cute.make_layout(1), gV, sV
-                )
-                load_V = copy_utils.tma_producer_copy_fn(load_V, pipeline_v)
-                if const_expr(not self.use_block_sparsity):
-                    n_block_min, n_block_max = block_info.get_n_block_min_max(seqlen, m_block)
-                    # if cute.arch.thread_idx()[0] == 0:
-                    #     cute.printf("m_block = %d, n_block_min: %d, n_block_max: %d", m_block, n_block_min, n_block_max)
-                    # First iteration: load both Q & K with the same mbarrier
-                    n_block = n_block_max - 1
-                    pipeline_k.producer_acquire(
-                        kv_producer_state,
-                        extra_tx_count=self.tma_copy_bytes["Q"]
-                        if const_expr(self.use_tma_Q)
-                        else 0,
-                    )
-                    if const_expr(self.use_tma_Q):
-                        load_Q(tma_bar_ptr=pipeline_k.producer_get_barrier(kv_producer_state))
-                    load_K(src_idx=n_block, producer_state=kv_producer_state)
-                    if const_expr(not self.intra_wg_overlap):
-                        pipeline_v.producer_acquire(kv_producer_state)
-                        load_V(src_idx=n_block, producer_state=kv_producer_state)
-                        kv_producer_state.advance()
-                        for i in cutlass.range(n_block_max - 1 - n_block_min, unroll=1):
-                            n_block = n_block_max - 1 - i - 1
-                            pipeline_k.producer_acquire(kv_producer_state)
-                            load_K(src_idx=n_block, producer_state=kv_producer_state)
-                            pipeline_v.producer_acquire(kv_producer_state)
-                            load_V(src_idx=n_block, producer_state=kv_producer_state)
-                            kv_producer_state.advance()
-                    else:
-                        for i in cutlass.range(n_block_max - 1 - n_block_min, unroll=1):
-                            n_block_prev = n_block_max - i - 1
-                            n_block = n_block_prev - 1
-                            kv_producer_state_prev = kv_producer_state.clone()
-                            kv_producer_state.advance()
-                            pipeline_k.producer_acquire(kv_producer_state)
-                            load_K(src_idx=n_block, producer_state=kv_producer_state)
-                            pipeline_v.producer_acquire(kv_producer_state_prev)
-                            load_V(src_idx=n_block_prev, producer_state=kv_producer_state_prev)
-                        n_block = n_block_min
-                        pipeline_v.producer_acquire(kv_producer_state)
-                        load_V(src_idx=n_block, producer_state=kv_producer_state)
-                        kv_producer_state.advance()
-                else:
-                    kv_producer_state = produce_block_sparse_loads(
-                        blocksparse_tensors,
-                        batch_idx,
-                        head_idx,
-                        m_block,
-                        kv_producer_state,
-                        load_Q,
-                        load_K,
-                        load_V,
-                        pipeline_k,
-                        pipeline_v,
-                        self.use_tma_Q,
-                        self.tma_copy_bytes["Q"],
-                        self.intra_wg_overlap,
-                        self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
-                        self.q_subtile_factor if self.q_subtile_factor is not None else 1,
-                    )
-                tile_scheduler.prefetch_next_work()
-                tile_scheduler.advance_to_next_work()
-                work_tile = tile_scheduler.get_current_work()
-                # End of persistent scheduler loop
-    @cute.jit
-    def mma(
-        self,
-        tiled_mma_qk: cute.TiledMma,
-        tiled_mma_pv: cute.TiledMma,
-        # softmax: Softmax,
-        # acc_O: cute.Tensor,
-        mQ: cute.Tensor,
-        mO: cute.Tensor,
-        mLSE: Optional[cute.Tensor],
-        sQ: cute.Tensor,
-        sK: cute.Tensor,
-        sVt: cute.Tensor,
-        sP: Optional[cute.Tensor],
-        sO: cute.Tensor,
-        learnable_sink: Optional[cute.Tensor],
-        pipeline_k: cutlass.pipeline.PipelineAsync,
-        pipeline_v: cutlass.pipeline.PipelineAsync,
-        mbar_ptr_Q: cutlass.Pointer,
-        gmem_tiled_copy_Q: cute.TiledCopy,
-        gmem_tiled_copy_O: cute.TiledCopy,
-        tma_atom_O: Optional[cute.CopyAtom],
-        tidx: Int32,
-        softmax_scale_log2: Float32,
-        softmax_scale: Optional[Float32],
-        block_info: BlockInfo,
-        SeqlenInfoCls: Callable,
-        AttentionMaskCls: Callable,
-        TileSchedulerCls: Callable,
-        blocksparse_tensors: Optional[BlockSparseTensors],
-        aux_tensors: Optional[list],
-        fastdiv_mods=None,
-    ):
-        """Consumer-side main loop: compute the attention output for every scheduled work tile.
-
-        Setup (done once): partitions the QK and PV MMA fragments, builds the smem
-        store for P, creates the K/V consumer pipeline state and the Softmax object,
-        and pre-binds the per-n-block helpers with functools.partial.
-
-        Per valid work tile: builds the mask / score_mod callables, loads Q when
-        use_tma_Q is off, walks the n-blocks from n_block_max - 1 downwards (dense
-        path) or delegates to consume_block_sparse_loads (block-sparse path),
-        finalizes the softmax, rescales acc_O and calls self.epilogue.
-
-        Returns nothing; results are written by self.epilogue.
-        """
-        warp_group_idx = cute.arch.make_warp_uniform(tidx // self.num_threads_per_warp_group)
-        # Layout call below evaluates to warp_group_idx * num_threads_per_warp_group
-        warp_group_thread_layout = cute.make_layout(
-            self.num_mma_warp_groups, stride=self.num_threads_per_warp_group
-        )
-        thr_mma_qk = tiled_mma_qk.get_slice(tidx)
-        wg_mma_qk = tiled_mma_qk.get_slice(warp_group_thread_layout(warp_group_idx))
-        wg_mma_pv = tiled_mma_pv.get_slice(warp_group_thread_layout(warp_group_idx))
-        _, tSrQ, tSrK = sm90_utils.partition_fragment_ABC(
-            wg_mma_qk, (self.tile_m, self.tile_n, self.tile_hdim), sQ, sK
-        )
-        mma_qk_fn = partial(
-            sm90_utils.gemm_zero_init, tiled_mma_qk, (self.tile_m, self.tile_n), tSrQ, tSrK
-        )
-        acc_O, tOrP, tOrVt = sm90_utils.partition_fragment_ABC(
-            wg_mma_pv, (self.tile_m, self.tile_hdimv, self.tile_n), sP, sVt
-        )
-        mma_pv_fn = partial(sm90_utils.gemm_w_idx, tiled_mma_pv, acc_O, tOrP, tOrVt)
-        # ///////////////////////////////////////////////////////////////////////////////
-        # Smem copy atom tiling
-        # ///////////////////////////////////////////////////////////////////////////////
-        smem_copy_atom_P = utils.get_smem_store_atom(self.arch, self.dtype)
-        smem_thr_copy_P = cute.make_tiled_copy_C(smem_copy_atom_P, tiled_mma_qk).get_slice(tidx)
-        # tPsP is only available when an sP buffer was passed in
-        tPsP = smem_thr_copy_P.partition_D(sP) if const_expr(sP is not None) else None
-        smem_copy_params = SimpleNamespace(smem_thr_copy_P=smem_thr_copy_P, tPsP=tPsP)
-        self.mma_init()
-        # Pick the per-n-block routine once, then bind the arguments that never change
-        mma_one_n_block_all = partial(
-            self.mma_one_n_block_intrawg_overlap
-            if const_expr(self.intra_wg_overlap)
-            else self.mma_one_n_block,
-            mma_qk_fn=mma_qk_fn,
-            pipeline_k=pipeline_k,
-            pipeline_v=pipeline_v,
-            acc_O=acc_O,
-            tOrP=tOrP,
-            smem_copy_params=smem_copy_params,
-            check_inf=True,
-        )
-        # Phase used for the wait on mbar_ptr_Q; toggled once per work tile below
-        q_consumer_phase = Int32(0)
-        kv_consumer_state = pipeline.make_pipeline_state(
-            cutlass.pipeline.PipelineUserType.Consumer, self.num_stages
-        )
-        tile_scheduler = TileSchedulerCls()
-        work_tile = tile_scheduler.initial_work_tile_info()
-        softmax = Softmax.create(
-            softmax_scale_log2,
-            num_rows=acc_O.shape[0][0] * acc_O.shape[1],
-            softmax_scale=softmax_scale,
-        )
-        # Helpers for the split first/last "half" iterations of the intra_wg_overlap schedule
-        process_first_half_block = partial(
-            self.first_half_block_overlap,
-            mma_qk_fn=mma_qk_fn,
-            pipeline_k=pipeline_k,
-            tOrP=tOrP,
-            smem_copy_params=smem_copy_params,
-            softmax=softmax,
-        )
-        process_last_half_block = partial(
-            self.last_half_block_overlap,
-            pipeline_v=pipeline_v,
-            mma_pv_fn=mma_pv_fn,
-        )
-        while work_tile.is_valid_tile:
-            # if work_tile.is_valid_tile:
-            # shape: (atom_v_m * rest_m)
-            m_block, head_idx, batch_idx, _ = work_tile.tile_idx
-            seqlen = SeqlenInfoCls(batch_idx)
-            # Recompute fastdiv_mods if necessary for varlen with aux_tensors
-            recompute_fastdiv_mods_q = cutlass.const_expr(
-                aux_tensors is not None and (seqlen.has_cu_seqlens_q or seqlen.has_seqused_q)
-            )
-            recompute_fastdiv_mods_k = cutlass.const_expr(
-                aux_tensors is not None and (seqlen.has_cu_seqlens_k or seqlen.has_seqused_k)
-            )
-            if cutlass.const_expr(fastdiv_mods is not None):
-                seqlen_q_divmod, seqlen_k_divmod = fastdiv_mods
-                fastdiv_mods = (
-                    seqlen_q_divmod
-                    if not recompute_fastdiv_mods_q
-                    else FastDivmodDivisor(seqlen.seqlen_q),
-                    seqlen_k_divmod
-                    if not recompute_fastdiv_mods_k
-                    else FastDivmodDivisor(seqlen.seqlen_k),
-                )
-            mask = AttentionMaskCls(seqlen)
-            # mask_mod and mask_seqlen are supplied later, at each call site
-            mask_fn = partial(
-                mask.apply_mask,
-                batch_idx=batch_idx,
-                head_idx=head_idx,
-                m_block=m_block,
-                thr_mma=thr_mma_qk,
-                mask_causal=self.is_causal,
-                mask_local=self.is_local,
-                aux_tensors=aux_tensors,
-                fastdiv_mods=fastdiv_mods,
-            )
-            score_mod_fn = None
-            if const_expr(self.score_mod is not None):
-                score_mod_fn = partial(
-                    self.apply_score_mod,
-                    thr_mma_qk,
-                    batch_idx,
-                    head_idx,
-                    m_block,
-                    softmax_scale=softmax_scale,
-                    aux_tensors=aux_tensors,
-                    fastdiv_mods=fastdiv_mods,
-                )
-            # Per-tile specialization of the n-block routine selected above
-            mma_one_n_block = partial(
-                mma_one_n_block_all,
-                seqlen=seqlen,
-                softmax=softmax,
-                score_mod_fn=score_mod_fn,
-            )
-            # Load Q if not TMA_Q
-            if const_expr(not self.use_tma_Q):
-                pack_gqa = PackGQA(
-                    self.tile_m, self.tile_hdim, self.check_hdim_oob, self.qhead_per_kvhead
-                )
-                mQ_cur = seqlen.offset_batch_Q(mQ, batch_idx, dim=3)[None, None, head_idx]
-                # gmem_thr_copy_Q = gmem_tiled_copy_Q.get_slice(tidx)
-                # gQ = cute.local_tile(mQ_cur, (self.tile_m, self.tile_hdim), (m_block, 0))
-                # self.load_Q(gmem_thr_copy_Q, gQ, sQ, m_block, seqlen=seqlen.seqlen_q,
-                #             headdim=mQ.shape[1])
-                pack_gqa.load_Q(mQ_cur, sQ, gmem_tiled_copy_Q, tidx, m_block, seqlen.seqlen_q)
-                cute.arch.cp_async_mbarrier_arrive_noinc(mbar_ptr_Q)
-            n_block_min, n_block_max = block_info.get_n_block_min_max(seqlen, m_block)
-            # The Q mbarrier is only waited on here when Q was loaded by this method (non-TMA path)
-            if const_expr(not self.use_tma_Q):
-                cute.arch.mbarrier_wait(mbar_ptr_Q, phase=q_consumer_phase)
-            # The phase flips every work tile, whether or not the wait above was emitted
-            q_consumer_phase ^= 1
-            # For performance reasons, we separate out two kinds of iterations:
-            # those that need masking on S, and those that don't.
-            # We need masking on S for the very last block when the length of K and V is not
-            # a multiple of tile_n.
-            # We also need masking on S if it's causal, for the last several blocks.
-            # softmax.reset()  # Don't need reset as we explicitly call softmax w is_first=True
-            O_should_accumulate = False
-            # ==========================================
-            # MAINLOOP
-            # ==========================================
-            if const_expr(not self.use_block_sparsity):
-                # ==========================================
-                # No block-sparsity (original path)
-                # ==========================================
-                # First iteration with seqlen masking
-                if const_expr(self.intra_wg_overlap):
-                    kv_consumer_state = process_first_half_block(
-                        n_block=n_block_max - 1,
-                        seqlen=seqlen,
-                        kv_consumer_state=kv_consumer_state,
-                        mask_fn=partial(mask_fn, mask_mod=self.mask_mod),
-                        score_mod_fn=score_mod_fn,
-                        is_first_block=True,
-                    )
-                    # Need to initialize tOrO in the case of RescaleOBeforeGemm where we will scale tOrO even in the 1st iter
-                    # acc_O.fill(0.0)
-                else:
-                    self.warp_scheduler_barrier_sync()
-                    kv_consumer_state = mma_one_n_block(
-                        kv_consumer_state,
-                        n_block=n_block_max - 1,
-                        seqlen=seqlen,
-                        mma_pv_fn=partial(mma_pv_fn, zero_init=True),
-                        is_first_n_block=True,
-                        mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=True),
-                    )
-                    O_should_accumulate = True
-                # if cute.arch.thread_idx()[0] == 128: cute.printf("m_block = {}, n_block_max = {}, n_block_min = {}", m_block, n_block_max, n_block_min)
-                # Block n_block_max - 1 was handled by the first iteration above
-                n_block_max -= 1
-                # Next couple of iterations with causal masking
-                if const_expr(self.is_causal or self.is_local):
-                    n_block_min_causal_local_mask = block_info.get_n_block_min_causal_local_mask(
-                        seqlen, m_block, n_block_min
-                    )
-                    # if cute.arch.thread_idx()[0] == 128: cute.printf("n_block_min_causal_local_mask = {}", n_block_min_causal_local_mask)
-                    for n_tile in cutlass.range(
-                        n_block_max - n_block_min_causal_local_mask, unroll=1
-                    ):
-                        kv_consumer_state = mma_one_n_block(
-                            kv_consumer_state,
-                            n_block=n_block_max - 1 - n_tile,
-                            seqlen=seqlen,
-                            mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
-                            mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=False),
-                        )
-                        O_should_accumulate = True
-                    n_block_max = cutlass.min(n_block_max, n_block_min_causal_local_mask)
-                # The remaining iterations have no masking
-                # NOTE(review): mask_fn is still passed (with mask_seqlen=False) in the loop below;
-                # whether it is a no-op there depends on mask.apply_mask, which is not visible here.
-                n_block_min_before_local_mask = block_info.get_n_block_min_before_local_mask(
-                    seqlen, m_block, n_block_min
-                )
-                # if cute.arch.thread_idx()[0] == 128: cute.printf("n_block_min_before_local_mask = {}, n_block_min = {}", n_block_min_before_local_mask, n_block_min)
-                for n_tile in cutlass.range(n_block_max - n_block_min_before_local_mask, unroll=1):
-                    kv_consumer_state = mma_one_n_block(
-                        kv_consumer_state,
-                        n_block=n_block_max - 1 - n_tile,
-                        seqlen=seqlen,
-                        mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
-                        mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=False),
-                    )
-                    O_should_accumulate = True
-                # Separate iterations with local masking on the left
-                if const_expr(self.is_local and block_info.window_size_left is not None):
-                    n_block_max = cutlass.min(n_block_max, n_block_min_before_local_mask)
-                    for n_tile in cutlass.range(n_block_max - n_block_min, unroll=1):
-                        kv_consumer_state = mma_one_n_block(
-                            kv_consumer_state,
-                            n_block=n_block_max - 1 - n_tile,
-                            seqlen=seqlen,
-                            mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
-                            mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=False),
-                        )
-                        O_should_accumulate = True
-                # Last "half" iteration
-                if const_expr(self.intra_wg_overlap):
-                    kv_consumer_state = process_last_half_block(
-                        kv_consumer_state=kv_consumer_state,
-                        zero_init=not O_should_accumulate,
-                    )
-                    O_should_accumulate = True
-                else:
-                    self.warp_scheduler_barrier_arrive()
-            else:
-                # ==========================================
-                # Block sparsity
-                # ==========================================
-                kv_consumer_state, O_should_accumulate, processed_any = consume_block_sparse_loads(
-                    blocksparse_tensors,
-                    batch_idx,
-                    head_idx,
-                    m_block,
-                    seqlen,
-                    kv_consumer_state,
-                    mma_pv_fn,
-                    mma_one_n_block,
-                    process_first_half_block,
-                    process_last_half_block,
-                    mask_fn,
-                    score_mod_fn,
-                    O_should_accumulate,
-                    self.mask_mod,
-                    fastdiv_mods,
-                    self.intra_wg_overlap,
-                    self.warp_scheduler_barrier_sync,
-                    self.warp_scheduler_barrier_arrive,
-                    self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
-                    self.q_subtile_factor if self.q_subtile_factor is not None else 1,
-                )
-                # Handle empty case (when no blocks to process)
-                if not processed_any:
-                    softmax.reset()
-                    acc_O.fill(0.0)
-            sink_val = None
-            if const_expr(learnable_sink is not None):
-                if const_expr(not self.pack_gqa):
-                    sink_val = Float32(learnable_sink[head_idx])
-                else:  # Each thread might have a different sink value due to different q_head
-                    sink_val = cute.make_fragment_like(softmax.row_max, Float32)
-                    cS = cute.make_identity_tensor((self.tile_m, self.tile_n))
-                    tScS_mn = layout_utils.reshape_acc_to_mn(thr_mma_qk.partition_C(cS))
-                    for r in cutlass.range(cute.size(sink_val), unroll_full=True):
-                        # Packed row index -> query head: the low part (row % qhead_per_kvhead)
-                        # selects the q head inside the kv-head group given by head_idx
-                        row = m_block * self.tile_m + tScS_mn[r][0]
-                        q_head_idx = row % self.qhead_per_kvhead + head_idx * self.qhead_per_kvhead
-                        sink_val[r] = Float32(learnable_sink[q_head_idx])
-            # normalize acc_O by row_sum and calculate the lse
-            row_scale = softmax.finalize(sink_val=sink_val)
-            softmax.rescale_O(acc_O, row_scale)
-            # ///////////////////////////////////////////////////////////////////////////////
-            # Epilogue
-            # ///////////////////////////////////////////////////////////////////////////////
-            self.epilogue(
-                acc_O,
-                softmax.row_sum,
-                mO,
-                mLSE,
-                sO,
-                seqlen,
-                gmem_tiled_copy_O,
-                tma_atom_O,
-                tiled_mma_pv,
-                tidx,
-                m_block,
-                head_idx,
-                batch_idx,
-            )
-            # Move on to the next work tile (loop exits once the scheduler reports an invalid tile)
-            tile_scheduler.advance_to_next_work()
-            work_tile = tile_scheduler.get_current_work()
-    @cute.jit
-    def first_half_block_overlap(
-        self,
-        n_block: Int32,
-        mma_qk_fn: Callable,
-        kv_consumer_state,
-        pipeline_k,
-        tOrP: cute.Tensor,
-        smem_copy_params: SimpleNamespace,
-        softmax: Softmax,
-        seqlen: SeqlenInfoQK,
-        mask_fn: Callable = None,
-        score_mod_fn: Optional[Callable] = None,
-        is_first_block: bool = False,
-    ):
-        """Processes the first half block when using intra-warpgroup-overlap.
-
-        Only the QK side of an iteration is done here: wait for the K stage, compute
-        S, apply score_mod / mask, run the online softmax and materialize P (in tOrP
-        when mma_pv_is_rs, otherwise through shared memory). No PV GEMM is issued.
-
-        Returns kv_consumer_state as received; it is not advanced in this method.
-
-        NOTE(review): mask_fn defaults to None but is called unconditionally below,
-        so callers have to supply it.
-        """
-        pipeline_k.consumer_wait(kv_consumer_state, pipeline_k.consumer_try_wait(kv_consumer_state))
-        acc_S = mma_qk_fn(B_idx=kv_consumer_state.index, wg_wait=0)
-        # K stage can be handed back as soon as S has been computed
-        pipeline_k.consumer_release(kv_consumer_state)
-        # Apply score modification if present
-        if const_expr(score_mod_fn is not None):
-            score_mod_fn(acc_S, n_block=n_block, seqlen=seqlen)
-        # Apply mask; mask_seqlen always True for first block
-        # Caveat: if full block further right than mask block, seqlen masking is redundant;
-        # however, masking is being applied anyway, so essentially no perf hit
-        mask_fn(acc_S, n_block=n_block, mask_seqlen=True)
-        softmax.online_softmax(acc_S, is_first=is_first_block)
-        tOrP_acc = layout_utils.reshape_acc_to_frgA(acc_S)
-        # Write P straight into tOrP when mma_pv_is_rs, else into a temporary fragment
-        tOrP_cur = (
-            tOrP if const_expr(self.mma_pv_is_rs) else cute.make_fragment_like(tOrP_acc, self.dtype)
-        )
-        tOrP_cur.store(tOrP_acc.load().to(self.dtype))
-        # if pv gemm not rs
-        if const_expr(not self.mma_pv_is_rs):
-            tPrP = smem_copy_params.smem_thr_copy_P.retile(tOrP_cur)
-            cute.copy(smem_copy_params.smem_thr_copy_P, tPrP, smem_copy_params.tPsP)
-            # Fence and barrier to make smem store visible to WGMMA
-            cute.arch.fence_view_async_shared()
-            cute.arch.sync_warp()
-        return kv_consumer_state
-    @cute.jit
-    def last_half_block_overlap(
-        self,
-        kv_consumer_state,
-        pipeline_v,
-        mma_pv_fn: Callable,
-        zero_init: bool,
-    ):
-        """Processes the final PV GEMM when using intra-warpgroup-overlap.
-
-        Waits for the V stage referenced by kv_consumer_state, issues the PV GEMM
-        with wg_wait=0, releases the stage and advances the state.
-
-        zero_init is forwarded to mma_pv_fn. Returns the advanced kv_consumer_state.
-        """
-        pipeline_v.consumer_wait(kv_consumer_state, pipeline_v.consumer_try_wait(kv_consumer_state))
-        mma_pv_fn(B_idx=kv_consumer_state.index, zero_init=zero_init, wg_wait=0)
-        pipeline_v.consumer_release(kv_consumer_state)
-        kv_consumer_state.advance()
-        return kv_consumer_state
-    @cute.jit
-    def mma_one_n_block(
-        self,
-        smem_pipe_read: cutlass.pipeline.PipelineState | pipeline.PipelineStateSimple,
-        n_block: Int32,
-        mma_qk_fn: Callable,
-        mma_pv_fn: Callable,
-        pipeline_k: cutlass.pipeline.PipelineAsync,
-        pipeline_v: cutlass.pipeline.PipelineAsync,
-        acc_O: cute.Tensor,
-        tOrP: cute.Tensor,
-        smem_copy_params: SimpleNamespace,
-        softmax: Softmax,
-        seqlen: SeqlenInfoQK,
-        score_mod_fn: Optional[Callable] = None,
-        mask_fn: Optional[Callable] = None,
-        is_first_n_block: cutlass.Constexpr = False,
-        check_inf: cutlass.Constexpr = True,
-    ):
-        """Run one complete n-block iteration without intra-warpgroup overlap.
-
-        Sequence: wait K -> S = Q @ K.T -> release K -> score_mod / mask ->
-        online softmax -> convert P -> rescale acc_O -> wait V -> O += P @ V ->
-        release V. K and V are taken from the same pipeline stage
-        (smem_pipe_read.index).
-
-        Returns smem_pipe_read advanced by one stage.
-        """
-        pipeline_k.consumer_wait(smem_pipe_read, pipeline_k.consumer_try_wait(smem_pipe_read))
-        # S = Q @ K.T
-        acc_S = mma_qk_fn(B_idx=smem_pipe_read.index, wg_wait=-1)
-        # The QK GEMM was issued with wg_wait=-1; arrive on the scheduler barrier first,
-        # then wait for the GEMM explicitly
-        self.warp_scheduler_barrier_arrive()
-        warpgroup.wait_group(0)
-        pipeline_k.consumer_release(smem_pipe_read)
-        # handle score mods and masking
-        if const_expr(score_mod_fn is not None):
-            score_mod_fn(acc_S, n_block=n_block, seqlen=seqlen)
-        if const_expr(mask_fn is not None):
-            mask_fn(acc_S=acc_S, n_block=n_block)
-        row_scale = softmax.online_softmax(acc_S, is_first=is_first_n_block, check_inf=check_inf)
-        # if cute.arch.thread_idx()[0] == 0: cute.print_tensor(layout_utils.reshape_acc_to_mn(acc_S))
-        tOrP_acc = layout_utils.reshape_acc_to_frgA(acc_S)
-        # Write P straight into tOrP when mma_pv_is_rs, else into a temporary fragment
-        tOrP_cur = (
-            tOrP if const_expr(self.mma_pv_is_rs) else cute.make_fragment_like(tOrP_acc, self.dtype)
-        )
-        # tOrP.store(tOrP_acc.load().to(self.dtype))
-        # the "to(self.dtype)" conversion fails to vectorize for block sizes other
-        # than 128 x 128, i.e. it calls convert on 1 fp32 element at a time instead of
-        # 2 elements. So we just call ptx directly.
-        utils.cvt_f16(tOrP_acc, tOrP_cur)
-        if const_expr(not self.mma_pv_is_rs):
-            tPrP = smem_copy_params.smem_thr_copy_P.retile(tOrP_cur)
-            cute.copy(smem_copy_params.smem_thr_copy_P, tPrP, smem_copy_params.tPsP)
-        # Rescale the running output with the factor returned by this block's online softmax
-        softmax.rescale_O(acc_O, row_scale)
-        if const_expr(not self.mma_pv_is_rs):
-            # Fence and barrier to make sure smem store is visible to WGMMA
-            cute.arch.fence_view_async_shared()
-            cute.arch.sync_warp()  # Only need syncwarp since each warp is using its own P values for MmaPV
-        pipeline_v.consumer_wait(smem_pipe_read, pipeline_v.consumer_try_wait(smem_pipe_read))
-        self.warp_scheduler_barrier_sync()
-        # O += P @ V
-        mma_pv_fn(B_idx=smem_pipe_read.index, wg_wait=0)
-        pipeline_v.consumer_release(smem_pipe_read)
-        smem_pipe_read.advance()
-        return smem_pipe_read
-    @cute.jit
-    def mma_one_n_block_intrawg_overlap(
-        self,
-        smem_pipe_read: cutlass.pipeline.PipelineState | pipeline.PipelineStateSimple,
-        n_block: Int32,
-        mma_qk_fn: Callable,
-        mma_pv_fn: Callable,
-        pipeline_k: cutlass.pipeline.PipelineAsync,
-        pipeline_v: cutlass.pipeline.PipelineAsync,
-        acc_O: cute.Tensor,
-        tOrP: cute.Tensor,
-        smem_copy_params: SimpleNamespace,
-        softmax: Softmax,
-        seqlen: SeqlenInfoQK,
-        score_mod_fn: Optional[Callable] = None,
-        mask_fn: Optional[Callable] = None,
-        check_inf: cutlass.Constexpr = True,
-    ):
-        """Run one n-block iteration of the intra-warpgroup-overlap schedule.
-
-        The incoming state still points at the stage whose V has not been consumed.
-        A clone keeps that stage for the PV GEMM while the state itself is advanced
-        to the next stage for K. Both GEMMs are issued back to back with wg_wait=-1
-        and waited on later with warpgroup.wait_group, so the softmax of the new S
-        sits between the two waits.
-
-        Returns smem_pipe_read, which now refers to the stage whose K was consumed
-        here; its V is left for the next call (or for last_half_block_overlap).
-        """
-        # V comes from the current stage, K from the next one
-        smem_pipe_read_v = smem_pipe_read.clone()
-        smem_pipe_read.advance()
-        pipeline_k.consumer_wait(smem_pipe_read, pipeline_k.consumer_try_wait(smem_pipe_read))
-        self.warp_scheduler_barrier_sync()
-        # S = Q @ K.T
-        acc_S = mma_qk_fn(B_idx=smem_pipe_read.index, wg_wait=-1)
-        pipeline_v.consumer_wait(smem_pipe_read_v, pipeline_v.consumer_try_wait(smem_pipe_read_v))
-        # O += P @ V
-        mma_pv_fn(B_idx=smem_pipe_read_v.index, wg_wait=-1)
-        self.warp_scheduler_barrier_arrive()
-        # presumably leaves only the PV GEMM in flight, i.e. S is ready - TODO confirm
-        # against the wait_group semantics
-        warpgroup.wait_group(1)
-        pipeline_k.consumer_release(smem_pipe_read)
-        # handle score mods and masking
-        if const_expr(score_mod_fn is not None):
-            score_mod_fn(acc_S, n_block=n_block, seqlen=seqlen)
-        if const_expr(mask_fn is not None):
-            mask_fn(acc_S=acc_S, n_block=n_block)
-        # if cute.arch.thread_idx()[0] == 128: cute.print_tensor(layout_utils.reshape_acc_to_mn(acc_S))
-        row_scale = softmax.online_softmax(acc_S, check_inf=check_inf)
-        # All outstanding GEMMs (including PV) must be done before V is released and
-        # before tOrP / acc_O are touched below
-        warpgroup.wait_group(0)
-        pipeline_v.consumer_release(smem_pipe_read_v)
-        tOrP_acc = layout_utils.reshape_acc_to_frgA(acc_S)
-        # Write P straight into tOrP when mma_pv_is_rs, else into a temporary fragment
-        tOrP_cur = (
-            tOrP if const_expr(self.mma_pv_is_rs) else cute.make_fragment_like(tOrP_acc, self.dtype)
-        )
-        # tOrP_cur.store(tOrP_acc.load().to(self.dtype))
-        # the "to(self.dtype)" conversion fails to vectorize for block sizes other
-        # than 128 x 128, i.e. it calls convert on 1 fp32 element at a time instead of
-        # 2 elements. So we just call ptx directly.
-        utils.cvt_f16(tOrP_acc, tOrP_cur)
-        if const_expr(not self.mma_pv_is_rs):
-            tPrP = smem_copy_params.smem_thr_copy_P.retile(tOrP_cur)
-            cute.copy(smem_copy_params.smem_thr_copy_P, tPrP, smem_copy_params.tPsP)
-        softmax.rescale_O(acc_O, row_scale)
-        if const_expr(not self.mma_pv_is_rs):
-            # Fence and barrier to make sure smem store is visible to WGMMA
-            cute.arch.fence_view_async_shared()
-            cute.arch.sync_warp()  # Only need syncwarp since each warp is using its own P values for MmaPV
-        return smem_pipe_read
-    @cute.jit
-    def mma_init(self):
-        """Prime the warp-scheduler barrier before the main loop.
-
-        When use_scheduler_barrier is set, the warp group whose canonical index is 1
-        arrives once on the WarpSchedulerWG1 named barrier. Does nothing otherwise.
-        """
-        warp_group_idx = utils.canonical_warp_group_idx(sync=False)
-        if const_expr(self.use_scheduler_barrier):
-            if warp_group_idx == 1:
-                # Same barrier id that warp_scheduler_barrier_sync uses for canonical warp group 1
-                cute.arch.barrier_arrive(
-                    barrier_id=int(NamedBarrierFwd.WarpSchedulerWG1),
-                    number_of_threads=2 * self.num_threads_per_warp_group,
-                )
-    @cute.jit
-    def apply_score_mod(
-        self,
-        thr_mma_qk,
-        batch_idx,
-        head_idx,
-        m_block,
-        acc_S,
-        n_block,
-        softmax_scale,
-        seqlen,
-        aux_tensors: Optional[list] = None,
-        fastdiv_mods=None,
-    ):
-        """Apply self.score_mod to the score accumulator of one (m_block, n_block) tile.
-
-        Builds a coordinate tensor for the tile, partitions it the same way as the
-        accumulator (thr_mma_qk.partition_C) and forwards everything to
-        apply_score_mod_inner. Returns nothing.
-        """
-        # Prepare index tensor
-        cS = cute.make_identity_tensor((self.tile_m, self.tile_n))
-        # Shift tile-local coordinates by the tile origin (m_block * tile_m, n_block * tile_n)
-        cS = cute.domain_offset((m_block * self.tile_m, n_block * self.tile_n), cS)
-        tScS = thr_mma_qk.partition_C(cS)
-        apply_score_mod_inner(
-            acc_S,
-            tScS,
-            self.score_mod,
-            batch_idx,
-            head_idx,
-            softmax_scale,
-            self.vec_size,
-            self.qk_acc_dtype,
-            aux_tensors,
-            fastdiv_mods,
-            seqlen_info=seqlen,
-            constant_q_idx=None,
-            qhead_per_kvhead=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
-        )
-    def warp_scheduler_barrier_sync(self):
-        """Wait on the calling warp group's scheduler barrier.
-
-        Does nothing unless use_scheduler_barrier is set. The barrier id is
-        WarpSchedulerWG1 - 1 + canonical warp group index, so canonical warp
-        group 1 uses WarpSchedulerWG1 itself.
-        """
-        if const_expr(self.use_scheduler_barrier):
-            cute.arch.barrier(
-                barrier_id=int(NamedBarrierFwd.WarpSchedulerWG1)
-                - 1
-                + utils.canonical_warp_group_idx(sync=False),
-                number_of_threads=2 * self.num_threads_per_warp_group,
-            )
-    def warp_scheduler_barrier_arrive(self):
-        """Arrive on the scheduler barrier of the next MMA warp group (cyclic order).
-
-        Does nothing unless use_scheduler_barrier is set. Only 2 or 3 MMA warp
-        groups are handled; any other count trips the assert.
-        """
-        if const_expr(self.use_scheduler_barrier):
-            assert self.num_mma_warp_groups in [2, 3]
-            # Zero-based index of this warp group relative to canonical warp group 1
-            cur_wg = utils.canonical_warp_group_idx(sync=False) - 1
-            if const_expr(self.num_mma_warp_groups == 2):
-                next_wg = 1 - cur_wg
-            else:
-                t = cur_wg + 1
-                next_wg = t % self.num_mma_warp_groups
-            cute.arch.barrier_arrive(
-                barrier_id=int(NamedBarrierFwd.WarpSchedulerWG1) + next_wg,
-                number_of_threads=2 * self.num_threads_per_warp_group,
-            )

 import cutlass
 import cutlass.cute as cute
 from cutlass import Constexpr, Float32, Int32, const_expr, Boolean
+from cutlass.cute.nvgpu import cpasync, warp
 import cutlass.utils as utils_basic
+from cutlass.base_dsl.arch import Arch
+from cutlass.cutlass_dsl import BaseDSL
 from .quack import copy_utils
 from .quack import layout_utils
 from . import ampere_helpers as sm80_utils
 from .cute_dsl_utils import assume_tensor_aligned
 from . import utils
 from .mask import AttentionMask
+from .softmax import Softmax
 from .seqlen_info import SeqlenInfoQK
 from .block_info import BlockInfo
 from .pack_gqa import PackGQA
 from .named_barrier import NamedBarrierFwd
+from .block_sparsity import BlockSparseTensors
+from .tile_scheduler import SingleTileScheduler, SingleTileVarlenScheduler, TileSchedulerArguments
 class FlashAttentionForwardBase:
     def __init__(
         self,
         self.vec_size: cutlass.Constexpr = getattr(
             score_mod, "__vec_size__", 1 if cutlass.const_expr(has_aux_tensors) else 2
         )
+        if self.vec_size > 2:
+            raise ValueError(
+                f"score_mod vec_size {self.vec_size} not supported on Sm80/90/120 "
+                "due to accumulator thread ownership pattern."
+            )
+        self.arch = BaseDSL._get_dsl().get_arch_enum()
     @staticmethod
     def can_implement(
         mO: cute.Tensor,
         mLSE: Optional[cute.Tensor],
         softmax_scale: Float32,
+        # Always keep stream as the last parameter (EnvStream: obtained implicitly via TVM FFI).
+        stream: cuda.CUstream = None,
     ):
         """Configures and launches the flash attention kernel.
         cute.arch.barrier(
             barrier_id=int(NamedBarrierFwd.Epilogue), number_of_threads=self.num_epilogue_threads
         )
+        smem_copy_atom_O = utils.get_smem_store_atom(self.arch.major * 10 + self.arch.minor, self.dtype)
         smem_thr_copy_O = cute.make_tiled_copy_C(smem_copy_atom_O, tiled_mma).get_slice(tidx)
         taccOrO = smem_thr_copy_O.retile(rO)
         taccOsO = smem_thr_copy_O.partition_D(sO)
         # Write LSE from rmem -> gmem
         if const_expr(mLSE is not None):
+            mLSE_cur = seqlen.offset_batch_Q(mLSE, batch_idx, dim=2)[None, head_idx]
             if const_expr(not self.pack_gqa):
                 gLSE = cute.local_tile(mLSE_cur, (self.tile_m,), (m_block,))
                 gLSE_expanded_layout = cute.append(
                 t0accOcO = layout_utils.reshape_acc_to_mn(thr_mma.get_slice(0).partition_C(cO))
                 # Only the thread corresponding to column 0 writes out the lse to gmem
                 if taccOcO[0][1] == 0:
+                    for m in cutlass.range(cute.size(taccOgLSE.shape[1]), unroll_full=True):
                         if (
                             t0accOcO[m, 0][0]
                             < seqlen.seqlen_q - m_block * self.tile_m - taccOcO[0][0]
             else:
                 pack_gqa.store_LSE(mLSE_cur, lse, tiled_mma, tidx, m_block, seqlen.seqlen_q)
+        ragged = self.use_tma_O and (seqlen.has_cu_seqlens_q or seqlen.has_seqused_q)
+        mO_cur = seqlen.offset_batch_Q(mO, batch_idx, dim=3, ragged=ragged)[None, None, head_idx]
         # thr_mma = tiled_mma.get_slice(tidx)
         # taccOgO = thr_mma.partition_C(gO)
         # cute.autovec_copy(rO, taccOgO)
         mV: cute.Tensor,
         mO: cute.Tensor,
         mLSE: Optional[cute.Tensor],
+        softmax_scale: Float32,
+        mCuSeqlensQ: Optional[cute.Tensor] = None,
+        mCuSeqlensK: Optional[cute.Tensor] = None,
+        mSeqUsedQ: Optional[cute.Tensor] = None,
+        mSeqUsedK: Optional[cute.Tensor] = None,
+        mPageTable: Optional[cute.Tensor] = None,
         window_size_left: Optional[Int32] = None,
         window_size_right: Optional[Int32] = None,
         learnable_sink: Optional[cute.Tensor] = None,
+        blocksparse_tensors: Optional[BlockSparseTensors] = None,
         aux_tensors=None,
+        # Always keep stream as the last parameter (EnvStream: obtained implicitly via TVM FFI).
+        stream: cuda.CUstream = None,
     ):
         """Configures and launches the flash attention kernel.
         """
         assert learnable_sink is None, "Learnable sink is not supported in this kernel"
         self._check_type(
+            *(t.element_type if t is not None else None for t in (mQ, mK, mV, mO, mLSE, mCuSeqlensQ, mCuSeqlensK, mSeqUsedQ, mSeqUsedK))
         )
         tiled_mma_qk, tiled_mma_pv = self._get_tiled_mma()
         self.num_mma_threads = tiled_mma_pv.size
         self.num_Q_load_threads = self.num_threads
         self.num_epilogue_threads = self.num_threads
         # self.use_tma_O = self.arch >= 90 and mCuSeqlensQ is None
+        self.use_tma_O = self.arch >= Arch.sm_90
         self._setup_attributes()
         SharedStorage = self._get_shared_storage_cls()
         mQ, mK, mV, mO = [assume_tensor_aligned(t) for t in (mQ, mK, mV, mO)]
+        # Layout permutation: 4D non-varlen vs 3D varlen
+        QO_layout_transpose = [1, 3, 2, 0] if const_expr(mCuSeqlensQ is None) else [0, 2, 1]
+        KV_layout_transpose = [1, 3, 2, 0] if const_expr(mCuSeqlensK is None) else [0, 2, 1]
+        mQ, mO = [
+            cute.make_tensor(t.iterator, cute.select(t.layout, mode=QO_layout_transpose))
+            for t in (mQ, mO)
         ]
+        mK, mV = [
+            cute.make_tensor(t.iterator, cute.select(t.layout, mode=KV_layout_transpose))
+            for t in (mK, mV)
+        ]
+        if const_expr(mLSE is not None):
+            LSE_layout_transpose = [2, 1, 0] if const_expr(mCuSeqlensQ is None) else [1, 0]
+            mLSE = cute.make_tensor(mLSE.iterator, cute.select(mLSE.layout, mode=LSE_layout_transpose))
+        # TileScheduler for varlen, simple grid for non-varlen
+        if const_expr(mCuSeqlensQ is not None or mSeqUsedQ is not None):
+            TileScheduler = SingleTileVarlenScheduler
         else:
+            TileScheduler = SingleTileScheduler
+        num_batch = (
+            mCuSeqlensQ.shape[0] - 1
+            if const_expr(mCuSeqlensQ is not None)
+            else mQ.shape[3]
+        )
+        tile_sched_args = TileSchedulerArguments(
+            num_block=cute.ceil_div(mQ.shape[0], self.tile_m),
+            num_head=cute.size(mQ.shape[2]),
+            num_batch=num_batch,
+            num_splits=1,
+            seqlen_k=0,
+            headdim=mQ.shape[1],
+            headdim_v=mV.shape[1],
+            total_q=cute.size(mQ.shape[0])
+            if const_expr(mCuSeqlensQ is not None)
+            else cute.size(mQ.shape[0]) * cute.size(mQ.shape[3]),
+            tile_shape_mn=(self.tile_m, self.tile_n),
+            qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
+            mCuSeqlensQ=mCuSeqlensQ,
+            mSeqUsedQ=mSeqUsedQ,
+        )
+        tile_sched_params = TileScheduler.to_underlying_arguments(tile_sched_args)
+        grid_dim = TileScheduler.get_grid_shape(tile_sched_params)
+        softmax_scale_log2, softmax_scale = utils.compute_softmax_scale_log2(softmax_scale, self.score_mod)
+        fastdiv_mods = utils.compute_fastdiv_mods(mQ, mK, self.qhead_per_kvhead, self.pack_gqa, aux_tensors)
         self.kernel(
             mQ,
             mV,
             mO,
             mLSE,
+            mCuSeqlensQ,
+            mCuSeqlensK,
+            mSeqUsedQ,
+            mSeqUsedK,
             softmax_scale_log2,
             softmax_scale,
             window_size_left,
             tiled_mma_qk,
             tiled_mma_pv,
             SharedStorage,
+            tile_sched_params,
+            TileScheduler,
             aux_tensors,
             fastdiv_mods,
         ).launch(
         mV: cute.Tensor,
         mO: cute.Tensor,
         mLSE: Optional[cute.Tensor],
+        mCuSeqlensQ: Optional[cute.Tensor],
+        mCuSeqlensK: Optional[cute.Tensor],
+        mSeqUsedQ: Optional[cute.Tensor],
+        mSeqUsedK: Optional[cute.Tensor],
         softmax_scale_log2: Float32,
         softmax_scale: Optional[Float32],
         window_size_left: Optional[Int32],
         tiled_mma_qk: cute.TiledMma,
         tiled_mma_pv: cute.TiledMma,
         SharedStorage: cutlass.Constexpr,
+        tile_sched_params,
+        TileScheduler: cutlass.Constexpr[Callable],
         aux_tensors=None,
         fastdiv_mods=None,
     ):
         # Thread index, block index
         tidx, _, _ = cute.arch.thread_idx()
+        tile_scheduler = TileScheduler.create(tile_sched_params)
+        work_tile = tile_scheduler.initial_work_tile_info()
+        m_block, num_head, batch_size, _ = work_tile.tile_idx
         block_info = BlockInfo(
             self.tile_m,
             window_size_right,
             qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
         )
+        seqlen = SeqlenInfoQK.create(
+            batch_idx=batch_size,
+            seqlen_q_static=mQ.shape[0],
+            seqlen_k_static=mK.shape[0],
+            mCuSeqlensQ=mCuSeqlensQ,
+            mCuSeqlensK=mCuSeqlensK,
+            mSeqUsedQ=mSeqUsedQ,
+            mSeqUsedK=mSeqUsedK,
+        )
         n_block_min, n_block_max = block_info.get_n_block_min_max(seqlen, m_block)
+        # For varlen, wasted grid tiles (where batch_idx >= num_batch) will have
+        # seqlen_q=seqlen_k=0 and n_block_max=0.  Clamp to 0 so we don't use a
+        # negative block index for K/V loads; the load/store predicates already
+        # guard all memory accesses when seqlen is 0.
+        n_block = cutlass.max(n_block_max - 1, 0)
         # ///////////////////////////////////////////////////////////////////////////////
         # Get the appropriate tiles for this thread block.
         blkQ_shape = (self.tile_m, self.tile_hdim)
         blkK_shape = (self.tile_n, self.tile_hdim)
         blkV_shape = (self.tile_n, self.tile_hdimv)
         num_head_kv = num_head // self.qhead_per_kvhead
+        if const_expr(not seqlen.has_cu_seqlens_q):
+            mQ_cur = mQ[None, None, num_head, batch_size]
+        else:
+            mQ_cur = cute.domain_offset((seqlen.offset_q, 0), mQ[None, None, num_head])
+        if const_expr(not seqlen.has_cu_seqlens_k):
+            mK_cur = mK[None, None, num_head_kv, batch_size]
+            mV_cur = mV[None, None, num_head_kv, batch_size]
+        else:
+            mK_cur = cute.domain_offset((seqlen.offset_k, 0), mK[None, None, num_head_kv])
+            mV_cur = cute.domain_offset((seqlen.offset_k, 0), mV[None, None, num_head_kv])
+        gQ = cute.local_tile(mQ_cur, blkQ_shape, (m_block, 0))
+        gK = cute.local_tile(mK_cur, blkK_shape, (None, 0))
+        gV = cute.local_tile(mV_cur, blkV_shape, (None, 0))
         # ///////////////////////////////////////////////////////////////////////////////
         # Get shared memory buffer
         mask = AttentionMask(
             self.tile_m,
             self.tile_n,
+            seqlen,
             window_size_left,
             window_size_right,
             self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
         )
         mask_fn = partial(
             mask.apply_mask,
+            batch_idx=batch_size,
+            head_idx=num_head,
             m_block=m_block,
             thr_mma=thr_mma_qk,
             mask_causal=self.is_causal,
             mask_local=self.is_local,
+            aux_tensors=aux_tensors,
             fastdiv_mods=fastdiv_mods if const_expr(self.mask_mod is not None) else None,
         )
             smem_pipe_read,
             smem_pipe_write,
             is_first_n_block=True,
+            seqlen=seqlen,
+            mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=True),
         )
         smem_pipe_read = self.advance_pipeline(smem_pipe_read)
         smem_pipe_write = self.advance_pipeline(smem_pipe_write)
                     n_block,
                     smem_pipe_read,
                     smem_pipe_write,
+                    seqlen=seqlen,
+                    mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=True),
                 )
                 smem_pipe_read = self.advance_pipeline(smem_pipe_read)
                 smem_pipe_write = self.advance_pipeline(smem_pipe_write)
         # The remaining iterations have no masking
         for n_tile in cutlass.range(n_block, unroll=1):
             compute_one_n_block(
+                n_block - n_tile - 1, smem_pipe_read, smem_pipe_write,
+                seqlen=seqlen, is_first_n_block=False,
+                mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=False)
             )
             smem_pipe_read = self.advance_pipeline(smem_pipe_read)
             smem_pipe_write = self.advance_pipeline(smem_pipe_write)
         #     load_K_next()
+# SM90 forward pass moved to flash_fwd_sm90.py; this module-level __getattr__ (PEP 562) re-exports FlashAttentionForwardSm90 from its new location for backward compatibility
+def __getattr__(name):  # called by Python only when normal attribute lookup on this module fails for `name`
+    if name == "FlashAttentionForwardSm90":
+        from .flash_fwd_sm90 import FlashAttentionForwardSm90  # deferred import: flash_fwd_sm90 is loaded only when this legacy name is requested
+        return FlashAttentionForwardSm90
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")  # any other missing name is reported as a regular AttributeError

build/torch-cuda/flash_fwd_combine.py CHANGED Viewed

@@ -10,7 +10,7 @@ import cuda.bindings.driver as cuda
 import cutlass
 import cutlass.cute as cute
 from cutlass.cute.nvgpu import cpasync
-from cutlass import Float32, Int32, const_expr
 from . import utils
 from .cute_dsl_utils import assume_tensor_aligned
@@ -24,7 +24,7 @@ class FlashAttentionForwardCombine:
         dtype: Type[cutlass.Numeric],
         dtype_partial: Type[cutlass.Numeric],
         head_dim: int,
-        m_block_size: int = 8,
         k_block_size: int = 64,
         log_max_splits: int = 4,
         num_threads: int = 256,
@@ -36,7 +36,7 @@ class FlashAttentionForwardCombine:
         :param dtype: output data type
         :param dtype_partial: partial accumulation data type
         :param head_dim: head dimension
-        :param m_block_size: m block size
         :param k_block_size: k block size
         :param log_max_splits: log2 of maximum splits
         :param num_threads: number of threads
@@ -46,7 +46,7 @@ class FlashAttentionForwardCombine:
         self.dtype = dtype
         self.dtype_partial = dtype_partial
         self.head_dim = head_dim
-        self.m_block_size = m_block_size
         self.k_block_size = k_block_size
         self.max_splits = 1 << log_max_splits
         self.num_threads = num_threads
@@ -58,7 +58,7 @@ class FlashAttentionForwardCombine:
         dtype,
         dtype_partial,
         head_dim,
-        m_block_size,
         k_block_size,
         log_max_splits,
         num_threads,
@@ -72,12 +72,12 @@ class FlashAttentionForwardCombine:
             return False
         if num_threads % 32 != 0:
             return False
-        if m_block_size % 8 != 0:
             return False
         max_splits = 1 << log_max_splits
         if max_splits > 256:
             return False
-        if (m_block_size * max_splits) % num_threads != 0:
             return False
         return True
@@ -124,15 +124,11 @@ class FlashAttentionForwardCombine:
         lse_copy_bits = Float32.width  # 1 element per copy, width is in bits
         m_block_smem = (
             128
-            if self.m_block_size % 128 == 0
             else (
                 64
-                if self.m_block_size % 64 == 0
-                else (
-                    32
-                    if self.m_block_size % 32 == 0
-                    else (16 if self.m_block_size % 16 == 0 else 8)
-                )
             )
         )
         gmem_threads_per_row_lse = m_block_smem
@@ -183,12 +179,12 @@ class FlashAttentionForwardCombine:
             smem_lse_swizzle, 0, cute.make_ordered_layout((8, m_block_smem), order=(1, 0))
         )
         self.smem_layout_lse = cute.tile_to_shape(
-            smem_layout_atom_lse, (self.max_splits, self.m_block_size), (0, 1)
         )
         # O partial shared memory layout (simple layout for pipeline stages)
         self.smem_layout_o = cute.make_ordered_layout(
-            (self.m_block_size, self.k_block_size, self.stages), order=(1, 0, 2)
         )
     @cute.jit
@@ -201,7 +197,9 @@ class FlashAttentionForwardCombine:
         cu_seqlens: Optional[cute.Tensor] = None,
         seqused: Optional[cute.Tensor] = None,
         num_splits_dynamic_ptr: Optional[cute.Tensor] = None,
         semaphore_to_reset: Optional[cute.Tensor] = None,
         stream: cuda.CUstream = None,
     ):
         # Type checking
@@ -269,7 +267,7 @@ class FlashAttentionForwardCombine:
             sLSE: cute.struct.Align[
                 cute.struct.MemRange[Float32, cute.cosize(self.smem_layout_lse)], 128
             ]
-            sMaxValidSplit: cute.struct.Align[cute.struct.MemRange[Int32, self.m_block_size], 128]
             sO: cute.struct.Align[
                 cute.struct.MemRange[self.dtype_partial, cute.cosize(self.smem_layout_o)], 128
             ]
@@ -290,7 +288,7 @@ class FlashAttentionForwardCombine:
         head_divmod = FastDivmodDivisor(num_head)
         grid_dim = (
-            cute.ceil_div(seqlen * num_head, self.m_block_size),
             cute.ceil_div(self.head_dim, self.k_block_size),
             batch_size,
         )
@@ -303,6 +301,7 @@ class FlashAttentionForwardCombine:
             cu_seqlens,
             seqused,
             num_splits_dynamic_ptr,
             semaphore_to_reset,
             SharedStorage,
             self.smem_layout_lse,
@@ -331,6 +330,7 @@ class FlashAttentionForwardCombine:
         cu_seqlens: Optional[cute.Tensor],
         seqused: Optional[cute.Tensor],
         num_splits_dynamic_ptr: Optional[cute.Tensor],
         semaphore_to_reset: Optional[cute.Tensor],
         SharedStorage: cutlass.Constexpr,
         smem_layout_lse: cute.Layout | cute.ComposedLayout,
@@ -345,7 +345,14 @@ class FlashAttentionForwardCombine:
     ):
         # Thread and block indices
         tidx, _, _ = cute.arch.thread_idx()
-        m_block, k_block, batch_idx = cute.arch.block_idx()
         # ///////////////////////////////////////////////////////////////////////////////
         # Get shared memory buffer
@@ -353,22 +360,23 @@ class FlashAttentionForwardCombine:
         smem = cutlass.utils.SmemAllocator()
         storage = smem.allocate(SharedStorage)
         sLSE = storage.sLSE.get_tensor(smem_layout_lse)
-        sMaxValidSplit = storage.sMaxValidSplit.get_tensor((self.m_block_size,))
         sO = storage.sO.get_tensor(smem_layout_o)
-        # Handle semaphore reset
         if const_expr(semaphore_to_reset is not None):
             if (
                 tidx == 0
                 and m_block == cute.arch.grid_dim()[0] - 1
                 and k_block == cute.arch.grid_dim()[1] - 1
-                and batch_idx == cute.arch.grid_dim()[2] - 1
             ):
                 semaphore_to_reset[0] = 0
-        # Get number of splits
         num_splits = (
-            num_splits_dynamic_ptr[batch_idx]
             if const_expr(num_splits_dynamic_ptr is not None)
             else mLSE_partial.shape[1]
         )
@@ -378,6 +386,7 @@ class FlashAttentionForwardCombine:
             seqlen_static=mO_partial.shape[0],
             cu_seqlens=cu_seqlens,
             seqused=seqused,
         )
         seqlen, offset = seqlen_info.seqlen, seqlen_info.offset
@@ -387,29 +396,27 @@ class FlashAttentionForwardCombine:
         # Early exit for single split if dynamic
         if (const_expr(num_splits_dynamic_ptr is None) or num_splits > 1) and (
-            const_expr(not varlen) or m_block * self.m_block_size < max_idx
         ):
             # ===============================
             # Step 1: Load LSE_partial from gmem to shared memory
             # ===============================
-            if const_expr(cu_seqlens is None):
-                mLSE_partial_cur = mLSE_partial[None, None, None, batch_idx]
-            else:
-                mLSE_partial_cur = cute.domain_offset((offset, 0, 0), mLSE_partial)
             mLSE_partial_copy = cute.tiled_divide(mLSE_partial_cur, (1,))
             gmem_thr_copy_LSE = gmem_tiled_copy_LSE.get_slice(tidx)
             tLSEsLSE = gmem_thr_copy_LSE.partition_D(sLSE)
             # Create identity tensor for coordinate tracking
-            cLSE = cute.make_identity_tensor((self.max_splits, self.m_block_size))
             tLSEcLSE = gmem_thr_copy_LSE.partition_S(cLSE)
             # Load LSE partial values
             for m in cutlass.range(cute.size(tLSEcLSE, mode=[2]), unroll_full=True):
                 mi = tLSEcLSE[0, 0, m][1]  # Get m coordinate
-                idx = m_block * self.m_block_size + mi
                 if idx < max_idx:
                     # Calculate actual sequence position and head using FastDivmodDivisor
                     if const_expr(not varlen):
@@ -436,22 +443,19 @@ class FlashAttentionForwardCombine:
             # ===============================
             gmem_thr_copy_O_partial = gmem_tiled_copy_O_partial.get_slice(tidx)
-            cO = cute.make_identity_tensor((self.m_block_size, self.k_block_size))
             tOcO = gmem_thr_copy_O_partial.partition_D(cO)
             tOsO_partial = gmem_thr_copy_O_partial.partition_D(sO)
-            if const_expr(cu_seqlens is None):
-                mO_partial_cur = mO_partial[None, None, None, None, batch_idx]
-            else:
-                mO_partial_cur = cute.domain_offset((offset, 0, 0, 0), mO_partial)
             # Precompute these values to avoid recomputing them in the loop
             num_rows = const_expr(cute.size(tOcO, mode=[1]))
-            tOmidx = cute.make_fragment(num_rows, cutlass.Int32)
-            tOhidx = cute.make_fragment(num_rows, cutlass.Int32)
-            tOrOptr = cute.make_fragment(num_rows, cutlass.Int64)
             for m in cutlass.range(num_rows, unroll_full=True):
                 mi = tOcO[0, m, 0][0]  # m coordinate
-                idx = m_block * self.m_block_size + mi
                 if const_expr(not varlen):
                     tOhidx[m], tOmidx[m] = divmod(idx, seqlen_divmod)
                 else:
@@ -463,11 +467,12 @@ class FlashAttentionForwardCombine:
                 if idx >= max_idx:
                     tOhidx[m] = -1
-            tOpO = cute.make_fragment(cute.size(tOcO, [2]), cutlass.Boolean)
             if const_expr(not self.is_even_k):
                 for k in cutlass.range(cute.size(tOpO), unroll_full=True):
                     tOpO[k] = tOcO[0, 0, k][1] < mO_partial.shape[1] - k_block * self.k_block_size
-            # if cute.arch.thread_idx()[0] == 0 and k_block == 1: cute.print_tensor(tOpO)
             load_O_partial = partial(
                 self.load_O_partial,
@@ -501,17 +506,17 @@ class FlashAttentionForwardCombine:
             s2r_thr_copy_LSE = s2r_tiled_copy_LSE.get_slice(tidx)
             ts2rsLSE = s2r_thr_copy_LSE.partition_S(sLSE)
-            ts2rrLSE = cute.make_fragment_like(ts2rsLSE)
             cute.copy(s2r_tiled_copy_LSE, ts2rsLSE, ts2rrLSE)
             # ===============================
             # Step 4: Compute final LSE along split dimension
             # ===============================
-            lse_sum = cute.make_fragment(cute.size(ts2rrLSE, mode=[2]), Float32)
             ts2rcLSE = s2r_thr_copy_LSE.partition_D(cLSE)
             # We compute the max valid split for each row to short-circuit the computation later
-            max_valid_split = cute.make_fragment(cute.size(ts2rrLSE, mode=[2]), Int32)
             assert cute.size(ts2rrLSE, mode=[0]) == 1
             # Compute max, scales, and final LSE for each row
             for m in cutlass.range(cute.size(ts2rrLSE, mode=[2]), unroll_full=True):
@@ -561,7 +566,7 @@ class FlashAttentionForwardCombine:
             for m in cutlass.range(cute.size(ts2rrLSE, mode=[2]), unroll_full=True):
                 if ts2rcLSE[0, 0, m][0] == 0:  # Only thread responsible for s=0 writes
                     mi = ts2rcLSE[0, 0, m][1]
-                    if mi < self.m_block_size:
                         sMaxValidSplit[mi] = max_valid_split[m]
             # ===============================
@@ -577,7 +582,7 @@ class FlashAttentionForwardCombine:
                     for m in cutlass.range(cute.size(ts2rrLSE, mode=[2]), unroll_full=True):
                         if ts2rcLSE[0, 0, m][0] == 0:  # Only thread responsible for s=0 writes
                             mi = ts2rcLSE[0, 0, m][1]
-                            idx = m_block * self.m_block_size + mi
                             if idx < max_idx:
                                 if const_expr(not varlen):
                                     head_idx, m_idx = divmod(idx, seqlen_divmod)
@@ -594,11 +599,11 @@ class FlashAttentionForwardCombine:
             # Get max valid split for this thread
             thr_max_valid_split = sMaxValidSplit[tOcO[0, 0, 0][0]]
-            for m in cutlass.range(1, cute.size(tOcO, mode=[1])):
                 thr_max_valid_split = max(thr_max_valid_split, sMaxValidSplit[tOcO[0, m, 0][0]])
-            tOrO_partial = cute.make_fragment_like(tOsO_partial[None, None, None, 0])
-            tOrO = cute.make_fragment_like(tOrO_partial, Float32)
             tOrO.fill(0.0)
             stage_load = self.stages - 1
@@ -607,7 +612,7 @@ class FlashAttentionForwardCombine:
             # Main accumulation loop
             for s in cutlass.range(thr_max_valid_split + 1, unroll=4):
                 # Get scales for this split
-                scale = cute.make_fragment(num_rows, Float32)
                 for m in cutlass.range(num_rows, unroll_full=True):
                     scale[m] = sLSE[s, tOcO[0, m, 0][0]]  # Get scale from smem
@@ -637,8 +642,9 @@ class FlashAttentionForwardCombine:
             # Step 7: Write final O to gmem
             # ===============================
-            rO = cute.make_fragment_like(tOrO, self.dtype)
             rO.store(tOrO.load().to(self.dtype))
             if const_expr(cu_seqlens is None):
                 mO_cur = mO[None, None, None, batch_idx]
             else:
@@ -665,7 +671,7 @@ class FlashAttentionForwardCombine:
         tOrOptr: cute.Tensor,
         tOsO_partial: cute.Tensor,
         tOhidx: cute.Tensor,
-        tOpO: cute.Tensor,
         tOcO: cute.Tensor,
         mO_cur_partial_layout: cute.Layout,
         split: Int32,
@@ -684,7 +690,7 @@ class FlashAttentionForwardCombine:
                 mO_partial_cur_copy = cute.tiled_divide(mO_partial_cur, (elems_per_load,))
                 for k in cutlass.range(cute.size(tOcO, mode=[2]), unroll_full=True):
                     k_idx = tOcO[0, 0, k][1] // elems_per_load
-                    if const_expr(self.is_even_k) or tOpO[k]:
                         cute.copy(
                             gmem_tiled_copy_O_partial,
                             mO_partial_cur_copy[None, k_idx, split],

 import cutlass
 import cutlass.cute as cute
 from cutlass.cute.nvgpu import cpasync
+from cutlass import Float32, Int32, Boolean, const_expr
 from . import utils
 from .cute_dsl_utils import assume_tensor_aligned
         dtype: Type[cutlass.Numeric],
         dtype_partial: Type[cutlass.Numeric],
         head_dim: int,
+        tile_m: int = 8,
         k_block_size: int = 64,
         log_max_splits: int = 4,
         num_threads: int = 256,
         :param dtype: output data type
         :param dtype_partial: partial accumulation data type
         :param head_dim: head dimension
+        :param tile_m: m block size
         :param k_block_size: k block size
         :param log_max_splits: log2 of maximum splits
         :param num_threads: number of threads
         self.dtype = dtype
         self.dtype_partial = dtype_partial
         self.head_dim = head_dim
+        self.tile_m = tile_m
         self.k_block_size = k_block_size
         self.max_splits = 1 << log_max_splits
         self.num_threads = num_threads
         dtype,
         dtype_partial,
         head_dim,
+        tile_m,
         k_block_size,
         log_max_splits,
         num_threads,
             return False
         if num_threads % 32 != 0:
             return False
+        if tile_m % 8 != 0:
             return False
         max_splits = 1 << log_max_splits
         if max_splits > 256:
             return False
+        if (tile_m * max_splits) % num_threads != 0:
             return False
         return True
         lse_copy_bits = Float32.width  # 1 element per copy, width is in bits
         m_block_smem = (
             128
+            if self.tile_m % 128 == 0
             else (
                 64
+                if self.tile_m % 64 == 0
+                else (32 if self.tile_m % 32 == 0 else (16 if self.tile_m % 16 == 0 else 8))
             )
         )
         gmem_threads_per_row_lse = m_block_smem
             smem_lse_swizzle, 0, cute.make_ordered_layout((8, m_block_smem), order=(1, 0))
         )
         self.smem_layout_lse = cute.tile_to_shape(
+            smem_layout_atom_lse, (self.max_splits, self.tile_m), (0, 1)
         )
         # O partial shared memory layout (simple layout for pipeline stages)
         self.smem_layout_o = cute.make_ordered_layout(
+            (self.tile_m, self.k_block_size, self.stages), order=(1, 0, 2)
         )
     @cute.jit
         cu_seqlens: Optional[cute.Tensor] = None,
         seqused: Optional[cute.Tensor] = None,
         num_splits_dynamic_ptr: Optional[cute.Tensor] = None,
+        varlen_batch_idx: Optional[cute.Tensor] = None,
         semaphore_to_reset: Optional[cute.Tensor] = None,
+        # Always keep stream as the last parameter (EnvStream: obtained implicitly via TVM FFI).
         stream: cuda.CUstream = None,
     ):
         # Type checking
             sLSE: cute.struct.Align[
                 cute.struct.MemRange[Float32, cute.cosize(self.smem_layout_lse)], 128
             ]
+            sMaxValidSplit: cute.struct.Align[cute.struct.MemRange[Int32, self.tile_m], 128]
             sO: cute.struct.Align[
                 cute.struct.MemRange[self.dtype_partial, cute.cosize(self.smem_layout_o)], 128
             ]
         head_divmod = FastDivmodDivisor(num_head)
         grid_dim = (
+            cute.ceil_div(seqlen * num_head, self.tile_m),
             cute.ceil_div(self.head_dim, self.k_block_size),
             batch_size,
         )
             cu_seqlens,
             seqused,
             num_splits_dynamic_ptr,
+            varlen_batch_idx,
             semaphore_to_reset,
             SharedStorage,
             self.smem_layout_lse,
         cu_seqlens: Optional[cute.Tensor],
         seqused: Optional[cute.Tensor],
         num_splits_dynamic_ptr: Optional[cute.Tensor],
+        varlen_batch_idx: Optional[cute.Tensor],
         semaphore_to_reset: Optional[cute.Tensor],
         SharedStorage: cutlass.Constexpr,
         smem_layout_lse: cute.Layout | cute.ComposedLayout,
     ):
         # Thread and block indices
         tidx, _, _ = cute.arch.thread_idx()
+        m_block, k_block, maybe_virtual_batch = cute.arch.block_idx()
+        # Map virtual batch index to real batch index (for persistent tile schedulers)
+        batch_idx = (
+            varlen_batch_idx[maybe_virtual_batch]
+            if const_expr(varlen_batch_idx is not None)
+            else maybe_virtual_batch
+        )
         # ///////////////////////////////////////////////////////////////////////////////
         # Get shared memory buffer
         smem = cutlass.utils.SmemAllocator()
         storage = smem.allocate(SharedStorage)
         sLSE = storage.sLSE.get_tensor(smem_layout_lse)
+        sMaxValidSplit = storage.sMaxValidSplit.get_tensor((self.tile_m,))
         sO = storage.sO.get_tensor(smem_layout_o)
+        # Handle semaphore reset — wait for dependent grids first
         if const_expr(semaphore_to_reset is not None):
             if (
                 tidx == 0
                 and m_block == cute.arch.grid_dim()[0] - 1
                 and k_block == cute.arch.grid_dim()[1] - 1
+                and maybe_virtual_batch == cute.arch.grid_dim()[2] - 1
             ):
+                cute.arch.griddepcontrol_wait()
                 semaphore_to_reset[0] = 0
+        # Get number of splits (use maybe_virtual_batch for per-batch-slot splits)
         num_splits = (
+            num_splits_dynamic_ptr[maybe_virtual_batch]
             if const_expr(num_splits_dynamic_ptr is not None)
             else mLSE_partial.shape[1]
         )
             seqlen_static=mO_partial.shape[0],
             cu_seqlens=cu_seqlens,
             seqused=seqused,
+            # Don't need to pass in tile size since we won't use offset_padded
         )
         seqlen, offset = seqlen_info.seqlen, seqlen_info.offset
         # Early exit for single split if dynamic
         if (const_expr(num_splits_dynamic_ptr is None) or num_splits > 1) and (
+            const_expr(not varlen) or m_block * self.tile_m < max_idx
         ):
+            # Wait for dependent grids (e.g., the main attention kernel that produces O_partial/LSE_partial)
+            cute.arch.griddepcontrol_wait()
             # ===============================
             # Step 1: Load LSE_partial from gmem to shared memory
             # ===============================
+            mLSE_partial_cur = seqlen_info.offset_batch(mLSE_partial, batch_idx, dim=3)
             mLSE_partial_copy = cute.tiled_divide(mLSE_partial_cur, (1,))
             gmem_thr_copy_LSE = gmem_tiled_copy_LSE.get_slice(tidx)
             tLSEsLSE = gmem_thr_copy_LSE.partition_D(sLSE)
             # Create identity tensor for coordinate tracking
+            cLSE = cute.make_identity_tensor((self.max_splits, self.tile_m))
             tLSEcLSE = gmem_thr_copy_LSE.partition_S(cLSE)
             # Load LSE partial values
             for m in cutlass.range(cute.size(tLSEcLSE, mode=[2]), unroll_full=True):
                 mi = tLSEcLSE[0, 0, m][1]  # Get m coordinate
+                idx = m_block * self.tile_m + mi
                 if idx < max_idx:
                     # Calculate actual sequence position and head using FastDivmodDivisor
                     if const_expr(not varlen):
             # ===============================
             gmem_thr_copy_O_partial = gmem_tiled_copy_O_partial.get_slice(tidx)
+            cO = cute.make_identity_tensor((self.tile_m, self.k_block_size))
             tOcO = gmem_thr_copy_O_partial.partition_D(cO)
             tOsO_partial = gmem_thr_copy_O_partial.partition_D(sO)
+            mO_partial_cur = seqlen_info.offset_batch(mO_partial, batch_idx, dim=4)
             # Precompute these values to avoid recomputing them in the loop
             num_rows = const_expr(cute.size(tOcO, mode=[1]))
+            tOmidx = cute.make_rmem_tensor(num_rows, cutlass.Int32)
+            tOhidx = cute.make_rmem_tensor(num_rows, cutlass.Int32)
+            tOrOptr = cute.make_rmem_tensor(num_rows, cutlass.Int64)
             for m in cutlass.range(num_rows, unroll_full=True):
                 mi = tOcO[0, m, 0][0]  # m coordinate
+                idx = m_block * self.tile_m + mi
                 if const_expr(not varlen):
                     tOhidx[m], tOmidx[m] = divmod(idx, seqlen_divmod)
                 else:
                 if idx >= max_idx:
                     tOhidx[m] = -1
+            tOpO = None
             if const_expr(not self.is_even_k):
+                tOpO = cute.make_rmem_tensor(cute.size(tOcO, mode=[2]), Boolean)
                 for k in cutlass.range(cute.size(tOpO), unroll_full=True):
                     tOpO[k] = tOcO[0, 0, k][1] < mO_partial.shape[1] - k_block * self.k_block_size
+                # if cute.arch.thread_idx()[0] == 0 and k_block == 1: cute.print_tensor(tOpO)
             load_O_partial = partial(
                 self.load_O_partial,
             s2r_thr_copy_LSE = s2r_tiled_copy_LSE.get_slice(tidx)
             ts2rsLSE = s2r_thr_copy_LSE.partition_S(sLSE)
+            ts2rrLSE = cute.make_rmem_tensor_like(ts2rsLSE)
             cute.copy(s2r_tiled_copy_LSE, ts2rsLSE, ts2rrLSE)
             # ===============================
             # Step 4: Compute final LSE along split dimension
             # ===============================
+            lse_sum = cute.make_rmem_tensor(cute.size(ts2rrLSE, mode=[2]), Float32)
             ts2rcLSE = s2r_thr_copy_LSE.partition_D(cLSE)
             # We compute the max valid split for each row to short-circuit the computation later
+            max_valid_split = cute.make_rmem_tensor(cute.size(ts2rrLSE, mode=[2]), Int32)
             assert cute.size(ts2rrLSE, mode=[0]) == 1
             # Compute max, scales, and final LSE for each row
             for m in cutlass.range(cute.size(ts2rrLSE, mode=[2]), unroll_full=True):
             for m in cutlass.range(cute.size(ts2rrLSE, mode=[2]), unroll_full=True):
                 if ts2rcLSE[0, 0, m][0] == 0:  # Only thread responsible for s=0 writes
                     mi = ts2rcLSE[0, 0, m][1]
+                    if mi < self.tile_m:
                         sMaxValidSplit[mi] = max_valid_split[m]
             # ===============================
                     for m in cutlass.range(cute.size(ts2rrLSE, mode=[2]), unroll_full=True):
                         if ts2rcLSE[0, 0, m][0] == 0:  # Only thread responsible for s=0 writes
                             mi = ts2rcLSE[0, 0, m][1]
+                            idx = m_block * self.tile_m + mi
                             if idx < max_idx:
                                 if const_expr(not varlen):
                                     head_idx, m_idx = divmod(idx, seqlen_divmod)
             # Get max valid split for this thread
             thr_max_valid_split = sMaxValidSplit[tOcO[0, 0, 0][0]]
+            for m in cutlass.range(1, cute.size(tOcO, mode=[1]), unroll_full=True):
                 thr_max_valid_split = max(thr_max_valid_split, sMaxValidSplit[tOcO[0, m, 0][0]])
+            tOrO_partial = cute.make_rmem_tensor_like(tOsO_partial[None, None, None, 0])
+            tOrO = cute.make_rmem_tensor_like(tOrO_partial, Float32)
             tOrO.fill(0.0)
             stage_load = self.stages - 1
             # Main accumulation loop
             for s in cutlass.range(thr_max_valid_split + 1, unroll=4):
                 # Get scales for this split
+                scale = cute.make_rmem_tensor(num_rows, Float32)
                 for m in cutlass.range(num_rows, unroll_full=True):
                     scale[m] = sLSE[s, tOcO[0, m, 0][0]]  # Get scale from smem
             # Step 7: Write final O to gmem
             # ===============================
+            rO = cute.make_rmem_tensor_like(tOrO, self.dtype)
             rO.store(tOrO.load().to(self.dtype))
+            mO_cur = seqlen_info.offset_batch(mO, batch_idx, dim=3)
             if const_expr(cu_seqlens is None):
                 mO_cur = mO[None, None, None, batch_idx]
             else:
         tOrOptr: cute.Tensor,
         tOsO_partial: cute.Tensor,
         tOhidx: cute.Tensor,
+        tOpO: Optional[cute.Tensor],
         tOcO: cute.Tensor,
         mO_cur_partial_layout: cute.Layout,
         split: Int32,
                 mO_partial_cur_copy = cute.tiled_divide(mO_partial_cur, (elems_per_load,))
                 for k in cutlass.range(cute.size(tOcO, mode=[2]), unroll_full=True):
                     k_idx = tOcO[0, 0, k][1] // elems_per_load
+                    if const_expr(tOpO is None) or tOpO[k]:
                         cute.copy(
                             gmem_tiled_copy_O_partial,
                             mO_partial_cur_copy[None, k_idx, split],

build/torch-cuda/flash_fwd_sm100.py CHANGED Viewed

@@ -13,9 +13,8 @@
 # https://github.com/NVIDIA/cutlass/tree/main/examples/77_blackwell_fmha
 # https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/blackwell/fmha.py
-import enum
 import math
-from typing import Type, Tuple, Callable, Optional, Literal
 from functools import partial
 import cuda.bindings.driver as cuda
@@ -28,6 +27,7 @@ import cutlass.cute.nvgpu.tcgen05 as tcgen05
 import cutlass.utils.blackwell_helpers as sm100_utils_basic
 from cutlass import pipeline
 from cutlass.pipeline import pipeline_init_arrive, pipeline_init_wait
 from cutlass.base_dsl.arch import Arch
 from cutlass.cutlass_dsl import BaseDSL
@@ -35,7 +35,9 @@ from .quack import copy_utils, layout_utils
 from .paged_kv import PagedKVManager
 from .cute_dsl_utils import assume_tensor_aligned
 from . import pipeline as pipeline_custom
 from .mask import AttentionMask
 from .softmax import SoftmaxSm100, apply_score_mod_inner
 from .seqlen_info import SeqlenInfoQK
@@ -47,33 +49,45 @@ from .block_sparse_utils import (
     softmax_block_sparse_sm100,
     handle_block_sparse_empty_tile_correction_sm100,
 )
-from .pack_gqa import PackGQA
 from . import mma_sm100_desc as sm100_desc
 from . import blackwell_helpers as sm100_utils
 from cutlass.cute import FastDivmodDivisor
 from .quack.cute_dsl_utils import ParamsBase
 from .tile_scheduler import (
     TileSchedulerArguments,
     SingleTileScheduler,
     StaticPersistentTileScheduler,
     SingleTileLPTScheduler,
     SingleTileVarlenScheduler,
 )
-class NamedBarrierFwd(enum.IntEnum):
-    Epilogue = enum.auto()  # starts from 1 as barrier 0 is reserved for sync_threads()
-    TmemPtr = enum.auto()
-    SoftmaxStatsW0 = enum.auto()
-    SoftmaxStatsW1 = enum.auto()
-    SoftmaxStatsW2 = enum.auto()
-    SoftmaxStatsW3 = enum.auto()
-    SoftmaxStatsW4 = enum.auto()
-    SoftmaxStatsW5 = enum.auto()
-    SoftmaxStatsW6 = enum.auto()
-    SoftmaxStatsW7 = enum.auto()
-#     WarpSchedulerWG1 = enum.auto()
-#     WarpSchedulerWG2 = enum.auto()
 class FlashAttentionForwardSm100:
@@ -99,6 +113,7 @@ class FlashAttentionForwardSm100:
         paged_kv_non_tma: bool = False,
         is_varlen_q: bool = False,
         use_2cta_instrs: bool = False,
     ):
         self.use_tma_KV = not paged_kv_non_tma
         # self.dtype = dtype
@@ -145,10 +160,6 @@ class FlashAttentionForwardSm100:
         self.is_split_kv = is_split_kv
         self.pack_gqa = pack_gqa
         self.q_subtile_factor = q_subtile_factor
-        if pack_gqa:
-            assert m_block_size % self.qhead_per_kvhead == 0, (
-                "For PackGQA, m_block_size must be divisible by qhead_per_kvhead"
-            )
         assert not (self.is_split_kv and self.head_dim_v_padded >= 192), (
             "SplitKV is not supported for hdim >= 192"
         )
@@ -160,8 +171,10 @@ class FlashAttentionForwardSm100:
         # Does S1 need to wait for S0 to finish
         # self.s0_s1_barrier = self.head_dim_padded in [64, 96] and (not self.is_causal and not self.is_local)
         is_sm103 = self.arch >= Arch.sm_103 and self.arch <= Arch.sm_103f
-        # self.enable_ex2_emu = self.head_dim_padded <= 128 and not is_sm103
-        self.enable_ex2_emu = (self.head_dim_padded <= 128 or (self.head_dim_padded == 192 and self.use_2cta_instrs and not self.is_causal and not self.is_local)) and not is_sm103
         self.s0_s1_barrier = False
         self.overlap_sO_sQ = (
             (self.head_dim_padded == 192 and self.head_dim_v_padded >= 64) or
@@ -174,6 +187,32 @@ class FlashAttentionForwardSm100:
             "Paged KV does not support irregular head dim"
         )
         self.softmax0_warp_ids = (0, 1, 2, 3)
         self.softmax1_warp_ids = (4, 5, 6, 7)
         self.correction_warp_ids = (8, 9, 10, 11)
@@ -195,8 +234,10 @@ class FlashAttentionForwardSm100:
             )
         )
         if self.q_stage == 1:
-            if not self.use_tma_KV:
                 self.empty_warp_ids = self.empty_warp_ids + self.load_warp_ids
                 self.load_warp_ids = self.softmax1_warp_ids
             else:
@@ -212,6 +253,8 @@ class FlashAttentionForwardSm100:
         elif self.is_varlen_q: # fallback
             self.epilogue_warp_ids = (13, 14)
         self.tmem_s_offset = [0, self.n_block_size]  # e.g., 0, 128
         self.tmem_o_offset = [
             self.tmem_s_offset[-1] + self.n_block_size + i * self.head_dim_v_padded
@@ -227,31 +270,26 @@ class FlashAttentionForwardSm100:
         # vec buffer for row_max & row_sum
         self.tmem_vec_offset = self.tmem_s_offset
         if self.head_dim_padded < 96:
             self.num_regs_softmax = 200 if not paged_kv_non_tma else 184
             self.num_regs_correction = 64
             self.num_regs_other = 48 if not paged_kv_non_tma else 80
         else:
-            # self.num_regs_softmax = 192 if self.is_causal or self.is_local else 184
-            if not self.enable_ex2_emu:
-                self.num_regs_softmax = 192 if not paged_kv_non_tma else 184
             else:
-                # self.num_regs_softmax = 200 if not paged_kv_non_tma else 184
-                self.num_regs_softmax = 192 if not paged_kv_non_tma else 184
-            # self.num_regs_softmax = 176
-            # self.num_regs_correction = 96
-            # self.num_regs_correction = 64 if self.is_causal or self.is_local else 80
-            if not self.enable_ex2_emu:
-                self.num_regs_correction = 80 if not paged_kv_non_tma else 64
-            else:
-                # self.num_regs_correction = 64
-                self.num_regs_correction = 80 if not paged_kv_non_tma else 64
-            # self.num_regs_other = 32
-            # self.num_regs_other = 64
-            # self.num_regs_other = 80
-            self.num_regs_other = 48 if not paged_kv_non_tma else 80
-            # self.num_regs_other = 96 if self.is_causal or self.is_local else 80
-            # self.num_regs_other = 64 if self.is_causal or self.is_local else 80
         self.buffer_align_bytes = 1024
@@ -289,7 +327,7 @@ class FlashAttentionForwardSm100:
             self.head_dim_padded == 192 and self.head_dim_v_padded == 128 and self.kv_stage == 3
         )
         self.uneven_kv_smem_offset = (
-            self.m_block_size * (self.head_dim_padded - self.head_dim_v_padded) // 2
             if self.uneven_kv_smem
             else 0
         )
@@ -304,7 +342,6 @@ class FlashAttentionForwardSm100:
         mO: cute.Tensor,  # (b, s_q, h, dv) or (total_q, h, dv) if there is cu_seqlens_q
         mLSE: Optional[cute.Tensor],
         softmax_scale: Float32,
-        stream: cuda.CUstream,
         mCuSeqlensQ: Optional[cute.Tensor] = None,
         mCuSeqlensK: Optional[cute.Tensor] = None,
         mSeqUsedQ: Optional[cute.Tensor] = None,
@@ -315,6 +352,8 @@ class FlashAttentionForwardSm100:
         learnable_sink: Optional[cute.Tensor] = None,
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
         aux_tensors: Optional[list] = None,
     ):
         """Execute the Fused Multi-Head Attention operation on the provided tensors.
@@ -367,22 +406,21 @@ class FlashAttentionForwardSm100:
         if const_expr(self.q_dtype != self.v_dtype):
             raise TypeError(f"Type mismatch: {self.q_dtype} != {self.v_dtype}")
         self._setup_attributes()
-        self.use_tma_O = self.arch >= Arch.sm_90 and mCuSeqlensQ is None and mSeqUsedQ is None
-        # This can be tuned
-        # This is currently very ad-hoc, we should tune it systematically
         self.ex2_emu_freq = 0
-        # self.ex2_emu_start_frg = 1 if self.is_causal else 0
-        self.ex2_emu_start_frg = 1
         if const_expr(self.enable_ex2_emu):
-            self.ex2_emu_freq = 16
-            if const_expr(self.head_dim_padded == 128 and self.use_2cta_instrs):
-                self.ex2_emu_freq = 12
             if const_expr(
                 self.pack_gqa and self.head_dim_padded > 64 and not self.is_causal and not self.is_local
             ):
-                self.ex2_emu_freq = 32 if mCuSeqlensQ is not None or mSeqUsedQ is not None else 10
-            if const_expr(self.head_dim_padded > 64 and self.is_causal):
-                self.ex2_emu_freq = 10
         cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
         q_major_mode = tcgen05.OperandMajorMode.K
@@ -462,50 +500,11 @@ class FlashAttentionForwardSm100:
             )
         if const_expr(self.pack_gqa):
-            shape_Q_packed = (
-                (self.qhead_per_kvhead, mQ.shape[0]),
-                mQ.shape[1],
-                mK.shape[2],
-                *mQ.shape[3:],
-            )
-            stride_Q_packed = (
-                (mQ.stride[2], mQ.stride[0]),
-                mQ.stride[1],
-                mQ.stride[2] * self.qhead_per_kvhead,
-                *mQ.stride[3:],
-            )
-            mQ = cute.make_tensor(
-                mQ.iterator, cute.make_layout(shape_Q_packed, stride=stride_Q_packed)
-            )
-            shape_O_packed = (
-                (self.qhead_per_kvhead, mO.shape[0]),
-                mO.shape[1],
-                mK.shape[2],
-                *mO.shape[3:],
-            )
-            stride_O_packed = (
-                (mO.stride[2], mO.stride[0]),
-                mO.stride[1],
-                mO.stride[2] * self.qhead_per_kvhead,
-                *mO.stride[3:],
-            )
-            mO = cute.make_tensor(
-                mO.iterator, cute.make_layout(shape_O_packed, stride=stride_O_packed)
-            )
             if const_expr(mLSE is not None):
-                shape_LSE_packed = (
-                    (self.qhead_per_kvhead, mLSE.shape[0]),
-                    mK.shape[2],
-                    *mLSE.shape[2:],
-                )
-                stride_LSE_packed = (
-                    (mLSE.stride[1], mLSE.stride[0]),
-                    mLSE.stride[1] * self.qhead_per_kvhead,
-                    *mLSE.stride[2:],
-                )
-                mLSE = cute.make_tensor(
-                    mLSE.iterator, cute.make_layout(shape_LSE_packed, stride=stride_LSE_packed)
-                )
         self.tma_copy_bytes = {
             name: cute.size_in_bytes(mX.element_type, cute.select(layout, mode=[0, 1, 2]))
@@ -522,14 +521,24 @@ class FlashAttentionForwardSm100:
         tma_load_op = cpasync.CopyBulkTensorTileG2SOp(cta_group)
         tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
-        tma_atom_Q, mQ = cute.nvgpu.make_tiled_tma_atom_A(
-            tma_load_op,
-            mQ,
-            cute.select(sQ_layout, mode=[0, 1, 2]),
-            self.mma_tiler_qk,
-            tiled_mma_qk,
-            cta_layout_vmnk.shape,
-        )
         tma_atom_K = None
         tma_atom_V = None
@@ -578,19 +587,10 @@ class FlashAttentionForwardSm100:
             vO_layout = cute.make_layout((1, async_copy_elems))
             gmem_tiled_copy_O = cute.make_tiled_copy_tv(atom_universal_copy, tO_layout, vO_layout)
-        if const_expr(mCuSeqlensQ is not None or mSeqUsedQ is not None):
-            TileScheduler = SingleTileVarlenScheduler
-        else:
-            if const_expr(self.is_causal or self.is_local):
-                TileScheduler = SingleTileLPTScheduler
-            else:
-                TileScheduler = (
-                    SingleTileScheduler
-                    if const_expr(not self.is_persistent)
-                    else StaticPersistentTileScheduler
-                )
         tile_sched_args = TileSchedulerArguments(
-            cute.ceil_div(cute.size(mQ.shape[0]), self.cta_tiler[0]),
             cute.size(mQ.shape[2]),
             cute.size(mQ.shape[3])
             if const_expr(mCuSeqlensQ is None)
@@ -613,8 +613,11 @@ class FlashAttentionForwardSm100:
             lpt=self.is_causal or self.is_local,
             is_split_kv=self.is_split_kv,
             cluster_shape_mn=self.cluster_shape_mn,
         )
-        tile_sched_params = TileScheduler.to_underlying_arguments(tile_sched_args)
         self.tile_scheduler_cls = TileScheduler
         grid_dim = TileScheduler.get_grid_shape(tile_sched_params)
@@ -624,6 +627,9 @@ class FlashAttentionForwardSm100:
             cutlass.max(cute.cosize(sQ_layout), cute.cosize(sO_layout) * self.o_dtype.width // self.q_dtype.width)
         )
         @cute.struct
         class SharedStorage:
             # m_barriers for pipelines
@@ -643,6 +649,13 @@ class FlashAttentionForwardSm100:
             # Smem tensors
             # store row max and row sum
             sScale: cute.struct.MemRange[Float32, self.q_stage * self.m_block_size * 2]
             sO: cute.struct.Align[
                 cute.struct.MemRange[self.o_dtype, sO_size], self.buffer_align_bytes
             ]
@@ -657,35 +670,10 @@ class FlashAttentionForwardSm100:
         self.shared_storage = SharedStorage
-        LOG2_E = math.log2(math.e)
-        if const_expr(self.score_mod is None):
-            softmax_scale_log2 = softmax_scale * LOG2_E
-            softmax_scale = None
-        else:
-            # NB: If a users passes in a score mod, we want to apply the score-mod in the sm_scaled qk
-            # But in the original base 10. We hijack softmax_scale_log2 to just be the change of base
-            # and correctly apply the softmax_scale prior to score_mod in the softmax step
-            softmax_scale_log2 = LOG2_E
-            softmax_scale = softmax_scale
-        if const_expr(window_size_left is not None):
-            window_size_left = Int32(window_size_left)
-        if const_expr(window_size_right is not None):
-            window_size_right = Int32(window_size_right)
-        fastdiv_mods = None
-        if cutlass.const_expr(aux_tensors is not None):
-            seqlen_q = cute.size(mQ.shape[0]) // (
-                self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1
-            )
-            seqlen_k = (
-                cute.size(mK.shape[0])
-                if const_expr(mPageTable is None)
-                else mK.shape[0] * mPageTable.shape[1]
-            )
-            seqlen_q_divmod = FastDivmodDivisor(seqlen_q)
-            seqlen_k_divmod = FastDivmodDivisor(seqlen_k)
-            fastdiv_mods = (seqlen_q_divmod, seqlen_k_divmod)
         head_divmod = None
         if cutlass.const_expr(self.pack_gqa):
@@ -722,6 +710,7 @@ class FlashAttentionForwardSm100:
             tP_layout,
             sV_layout,
             sO_layout,
             gmem_tiled_copy_O,
             tiled_mma_qk,
             tiled_mma_pv,
@@ -752,7 +741,7 @@ class FlashAttentionForwardSm100:
         mSeqUsedQ: Optional[cute.Tensor],
         mSeqUsedK: Optional[cute.Tensor],
         mPageTable: Optional[cute.Tensor],
-        tma_atom_Q: cute.CopyAtom,
         tma_atom_K: Optional[cute.CopyAtom],
         tma_atom_V: Optional[cute.CopyAtom],
         tma_atom_O: Optional[cute.CopyAtom],
@@ -767,6 +756,7 @@ class FlashAttentionForwardSm100:
         tP_layout: cute.ComposedLayout,
         sV_layout: cute.ComposedLayout,
         sO_layout: cute.ComposedLayout,
         gmem_tiled_copy_O: Optional[cute.TiledCopy],
         tiled_mma_qk: cute.TiledMma,
         tiled_mma_pv: cute.TiledMma,
@@ -814,7 +804,7 @@ class FlashAttentionForwardSm100:
         storage = smem.allocate(self.shared_storage)
         tmem_alloc_barrier = pipeline.NamedBarrier(
-            barrier_id=int(NamedBarrierFwd.TmemPtr),
             num_threads=cute.arch.WARP_SIZE * len(
                 (self.mma_warp_id,
                  *self.softmax0_warp_ids,
@@ -833,8 +823,8 @@ class FlashAttentionForwardSm100:
         ThreadCooperativeGroup = partial(pipeline.CooperativeGroup, pipeline.Agent.Thread)
         mma_warp = ThreadCooperativeGroup(len([self.mma_warp_id]))
-        load_warps = ThreadCooperativeGroup(len(self.load_warp_ids))
         tma_warp = ThreadCooperativeGroup(1)
         softmax_warps = ThreadCooperativeGroup(len(self.softmax0_warp_ids))
         softmax_threads = ThreadCooperativeGroup(cute.arch.WARP_SIZE * len(self.softmax0_warp_ids))
         # softmax_threads = ThreadCooperativeGroup(cute.arch.WARP_SIZE)
@@ -857,15 +847,25 @@ class FlashAttentionForwardSm100:
         softmax_correction_threads_cluster = ThreadCooperativeGroup(
             cute.arch.WARP_SIZE * len(self.softmax0_warp_ids + self.correction_warp_ids) * self.cta_group_size
         )
-        pipeline_q = pipeline_custom.PipelineTmaUmma.create(
-            barrier_storage=storage.mbar_load_Q.data_ptr(),
-            num_stages=self.q_stage,
-            producer_group=tma_warp,
-            consumer_group=mma_warp,
-            tx_count=self.tma_copy_bytes["Q"],
-            cta_layout_vmnk=cta_layout_vmnk,
-            defer_sync=True,
-        )
         if const_expr(self.use_tma_KV):
             pipeline_kv = pipeline_custom.PipelineTmaUmma.create(
                 barrier_storage=storage.mbar_load_KV.data_ptr(),
@@ -877,13 +877,10 @@ class FlashAttentionForwardSm100:
                 defer_sync=True,
             )
         else:
-            cpasync_producer_group = pipeline.CooperativeGroup(
-                pipeline.Agent.Thread, len(self.load_warp_ids) * cute.arch.WARP_SIZE
-            )
             pipeline_kv = pipeline.PipelineAsyncUmma.create(
                 barrier_storage=storage.mbar_load_KV.data_ptr(),
                 num_stages=self.kv_stage,
-                producer_group=cpasync_producer_group,
                 consumer_group=mma_warp,
                 cta_layout_vmnk=cta_layout_vmnk,
                 defer_sync=True,
@@ -938,7 +935,7 @@ class FlashAttentionForwardSm100:
         )
         # Should put the NamedBarrier inside the pipeline class so we'll just have pipeline_sm_stats
         sm_stats_barrier = pipeline_custom.NamedBarrier(
-            barrier_id=int(NamedBarrierFwd.SoftmaxStatsW0), num_threads=cute.arch.WARP_SIZE * 2
         )
         pipeline_o_epi = None
         if const_expr(not self.use_correction_warps_for_epi):
@@ -1019,17 +1016,69 @@ class FlashAttentionForwardSm100:
             window_size_right=window_size_right,
             qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
         )
-        TileSchedulerCls = partial(self.tile_scheduler_cls.create, tile_sched_params)
         # Cluster wait before tensor memory alloc
         pipeline_init_wait(cluster_shape_mn=cta_layout_vmnk)
         # ///////////////////////////////////////////////////////////////////////////////
-        #  EMPTY
         # ///////////////////////////////////////////////////////////////////////////////
-        for i in cutlass.range_constexpr(len(self.empty_warp_ids)):
-            if warp_idx == self.empty_warp_ids[i]:
                 cute.arch.setmaxregister_decrease(self.num_regs_other)
         # ///////////////////////////////////////////////////////////////////////////////
         #  LOAD
@@ -1049,13 +1098,14 @@ class FlashAttentionForwardSm100:
                 tma_atom_Q,
                 tma_atom_K,
                 tma_atom_V,
                 pipeline_q,
                 pipeline_kv,
                 block_info,
                 num_splits,
                 SeqlenInfoCls,
-                TileSchedulerCls,
                 blocksparse_tensors,
             )
         # ///////////////////////////////////////////////////////////////////////////////
@@ -1085,8 +1135,8 @@ class FlashAttentionForwardSm100:
                 block_info,
                 num_splits,
                 SeqlenInfoCls,
-                TileSchedulerCls,
                 blocksparse_tensors,
             )
             # Dealloc the tensor memory buffer
             tmem.relinquish_alloc_permit()
@@ -1108,8 +1158,8 @@ class FlashAttentionForwardSm100:
                     block_info,
                     num_splits,
                     SeqlenInfoCls,
-                    TileSchedulerCls,
                     mma_tile_coord_v,
                 )
         # ///////////////////////////////////////////////////////////////////////////////
@@ -1141,11 +1191,11 @@ class FlashAttentionForwardSm100:
                 num_splits=num_splits,
                 SeqlenInfoCls=SeqlenInfoCls,
                 AttentionMaskCls=AttentionMaskCls,
-                TileSchedulerCls=TileSchedulerCls,
                 aux_tensors=aux_tensors,
                 fastdiv_mods=fastdiv_mods,
                 head_divmod=head_divmod,
                 blocksparse_tensors=blocksparse_tensors,
             )
             if const_expr(not self.s0_s1_barrier):
@@ -1189,8 +1239,8 @@ class FlashAttentionForwardSm100:
                 block_info,
                 num_splits,
                 SeqlenInfoCls,
-                TileSchedulerCls,
                 blocksparse_tensors,
             )
             tmem_alloc_barrier.arrive()
@@ -1208,35 +1258,38 @@ class FlashAttentionForwardSm100:
         sK: cute.Tensor,
         sV: cute.Tensor,
         mPageTable: Optional[cute.Tensor],
-        tma_atom_Q: cute.CopyAtom,
         tma_atom_K: Optional[cute.CopyAtom],
         tma_atom_V: Optional[cute.CopyAtom],
         pipeline_q: pipeline.PipelineAsync,
         pipeline_kv: pipeline.PipelineAsync,
         block_info: BlockInfo,
         num_splits: Int32,
         SeqlenInfoCls: Callable,
-        TileSchedulerCls: Callable,
         blocksparse_tensors: Optional[BlockSparseTensors],
     ):
         num_load_threads = len(self.load_warp_ids) * cute.arch.WARP_SIZE
         tidx = cute.arch.thread_idx()[0] % num_load_threads
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
         q_producer_phase = Int32(1)
         kv_producer_state = pipeline.make_pipeline_state(
             pipeline.PipelineUserType.Producer, self.kv_stage
         )
-        tile_scheduler = TileSchedulerCls()
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             m_block, head_idx, batch_idx, split_idx = work_tile.tile_idx
             seqlen = SeqlenInfoCls(batch_idx)
             mQ_cur = seqlen.offset_batch_Q(mQ, batch_idx, dim=3)[None, None, head_idx]
-            tiler_gQ = ((self.mma_tiler_qk[0] * self.q_stage), self.head_dim_padded)
-            gQ = cute.local_tile(mQ_cur, tiler_gQ, (m_block, 0))  # (128 * 2, 128)
-            gQ = layout_utils.select(
-                cute.flat_divide(gQ, (self.mma_tiler_qk[0],)), mode=[0, 2, 1]
-            )  # (128, 128, 2)
             head_idx_kv = (
                 head_idx // self.qhead_per_kvhead if const_expr(not self.pack_gqa) else head_idx
@@ -1258,12 +1311,32 @@ class FlashAttentionForwardSm100:
                 gV = cute.local_tile(
                     mV_cur, cute.select(self.mma_tiler_pv, mode=[1, 2]), (0, None, None)
                 )
-            tSgQ = thr_mma_qk.partition_A(gQ)
             tSgK = thr_mma_qk.partition_B(gK)
             tOgV = thr_mma_pv.partition_B(gV)
-            load_Q_fn, _, _ = copy_utils.tma_get_copy_fn(
-                tma_atom_Q, 0, cute.make_layout(1), tSgQ, sQ
-            )
             if const_expr(self.use_tma_KV):
                 tKsK, tKgK = cpasync.tma_partition(
@@ -1302,7 +1375,6 @@ class FlashAttentionForwardSm100:
                 tKsK, tKgK = None, None
                 tVsV, tVgV = None, None
-            load_Q = partial(self.load_Q, load_Q_fn, pipeline_q=pipeline_q, phase=q_producer_phase)
             load_K = partial(
                 self.load_KV,
                 tma_atom_K,
@@ -1337,24 +1409,19 @@ class FlashAttentionForwardSm100:
                     )
                     if const_expr(not self.use_tma_KV):
                         paged_kv_manager.load_page_table(n_block_first)
-                    load_K(block=n_block_max - 1, producer_state=kv_producer_state, page_idx=page_idx)  # K0
                     # load_K(block=n_block_max - 1, producer_state=kv_producer_state, page_idx=page_idx, extra_tx_count=self.tma_copy_bytes["Q"])  # K0
-                    if const_expr(len(self.load_warp_ids) == 1) or warp_idx == self.load_warp_ids[0]:
-                        # load_Q(block=0, stage=0)  # Q0
-                        pipeline_q.producer_acquire_w_index_phase(0, q_producer_phase)
-                        # pipeline_q.sync_object_empty.wait(0, q_producer_phase)
-                        tma_bar_ptr = pipeline_q.sync_object_full.get_barrier(0)
-                        # tma_bar_ptr = pipeline_kv.producer_get_barrier(kv_producer_state)
-                        load_Q_fn(src_idx=0, dst_idx=0, tma_bar_ptr=tma_bar_ptr)
-                    kv_producer_state.advance()
-                    if const_expr(self.q_stage == 2) and (const_expr(len(self.load_warp_ids) == 1) or warp_idx == self.load_warp_ids[0]):
-                        # load_Q(block=1, stage=1)  # Q1
-                        pipeline_q.producer_acquire_w_index_phase(1, q_producer_phase)
-                        tma_bar_ptr = pipeline_q.sync_object_full.get_barrier(1)
-                        load_Q_fn(src_idx=1, dst_idx=1, tma_bar_ptr=tma_bar_ptr)
                     q_producer_phase ^= 1
-                    load_V(block=n_block_max - 1, producer_state=kv_producer_state, page_idx=page_idx)  # V0
-                    kv_producer_state.advance()
                     for i in cutlass.range(n_block_max - 1 - n_block_min, unroll=1):
                         n_block = n_block_max - 2 - i
                         page_idx = (
@@ -1365,10 +1432,11 @@ class FlashAttentionForwardSm100:
                         if const_expr(not self.use_tma_KV):
                             paged_kv_manager.load_page_table(n_block)
                     # if cute.arch.thread_idx()[0] % 32 == 0: cute.printf("n_block = {}, page_idx = {}", n_block, page_idx)
-                        load_K(block=n_block, producer_state=kv_producer_state, page_idx=page_idx)  # Ki
-                        kv_producer_state.advance()
-                        load_V(block=n_block, producer_state=kv_producer_state, page_idx=page_idx)  # Vi
-                        kv_producer_state.advance()
             else:
                 kv_producer_state, q_producer_phase = produce_block_sparse_loads_sm100(
@@ -1387,14 +1455,14 @@ class FlashAttentionForwardSm100:
                     self.q_subtile_factor if self.q_subtile_factor is not None else 1,
                 )
-            tile_scheduler.prefetch_next_work()
-            tile_scheduler.advance_to_next_work()
-            work_tile = tile_scheduler.get_current_work()
             # End of persistent scheduler loop
-        pipeline_kv.producer_tail(kv_producer_state)
-        # This is equivalent to pipeline_q.producer_tail
-        if const_expr(len(self.load_warp_ids) == 1) or warp_idx == self.load_warp_ids[0]:
             pipeline_q.producer_acquire_w_index_phase(self.q_stage - 1, q_producer_phase)
     @cute.jit
@@ -1417,8 +1485,8 @@ class FlashAttentionForwardSm100:
         block_info: BlockInfo,
         num_splits: Int32,
         SeqlenInfoCls: Callable,
-        TileSchedulerCls: Callable,
         blocksparse_tensors: Optional[BlockSparseTensors],
     ):
         tSrQ = tiled_mma_qk.make_fragment_A(sQ)
         tSrK = tiled_mma_qk.make_fragment_B(sK)
@@ -1507,7 +1575,6 @@ class FlashAttentionForwardSm100:
         )
         P_full_O_rescaled_phase = Int32(0)
-        tile_scheduler = TileSchedulerCls()
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             m_block, head_idx, batch_idx, split_idx = work_tile.tile_idx
@@ -1678,8 +1745,7 @@ class FlashAttentionForwardSm100:
                 # End of GEMM_PV1(i_end) (P1 * Vi_end -> O1)
             # Advance to next tile
-            tile_scheduler.advance_to_next_work()
-            work_tile = tile_scheduler.get_current_work()
         # End of persistent scheduler loop
         # We don't need pipeline_s_p_o.producer_tail() since there's no dangling mbarrier at the end
@@ -1708,11 +1774,11 @@ class FlashAttentionForwardSm100:
         num_splits: Int32,
         SeqlenInfoCls: Callable,
         AttentionMaskCls: Callable,
-        TileSchedulerCls: Callable,
         aux_tensors: Optional[list] = None,
         fastdiv_mods=(None, None),
         head_divmod=None,
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
     ):
         """Compute softmax on attention scores from QK matrix multiplication.
@@ -1772,7 +1838,6 @@ class FlashAttentionForwardSm100:
         warp_idx_in_wg = cute.arch.make_warp_uniform(cute.arch.warp_idx()) % 4
-        tile_scheduler = TileSchedulerCls()
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             m_block, head_idx, batch_idx, split_idx = work_tile.tile_idx
@@ -2015,8 +2080,7 @@ class FlashAttentionForwardSm100:
             #         gLSE[tidx] = lse
             # Advance to next tile
-            tile_scheduler.advance_to_next_work()
-            work_tile = tile_scheduler.get_current_work()
         # End of persistent scheduler loop
         # This is equivalent to pipeline_sm_stats.producer_tail
@@ -2186,8 +2250,8 @@ class FlashAttentionForwardSm100:
         block_info: BlockInfo,
         num_splits: Int32,
         SeqlenInfoCls: Callable,
-        TileSchedulerCls: Callable,
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
     ):
         tidx = cute.arch.thread_idx()[0] % (cute.arch.WARP_SIZE * len(self.correction_warp_ids))
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx()) % 4
@@ -2217,7 +2281,6 @@ class FlashAttentionForwardSm100:
         o_corr_consumer_phase = Int32(0)
         corr_epi_producer_phase = Int32(1)
-        tile_scheduler = TileSchedulerCls()
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             m_block, head_idx, batch_idx, split_idx = work_tile.tile_idx
@@ -2228,12 +2291,14 @@ class FlashAttentionForwardSm100:
                 mO_cur = seqlen.offset_batch_Q(mO, batch_idx, dim=3)[None, None, head_idx, split_idx]
             else:
                 mO_cur = seqlen.offset_batch_Q(mO, batch_idx, dim=3)[None, None, head_idx]
-            tiler_gO = ((self.mma_tiler_pv[0] * self.q_stage), self.head_dim_v_padded)
-            gO = cute.local_tile(mO_cur, tiler_gO, (m_block, 0))  # (128 * 2, 128)
-            gO = layout_utils.select(
-                cute.flat_divide(gO, (self.mma_tiler_pv[0],)), mode=[0, 2, 1]
-            )  # (128, 128, 2)
-            gO = cute.flat_divide(gO, (self.mma_tiler_pv[0] // self.cta_group_size,))[None, mma_tile_coord_v, None, None]
             # Default LSE to -inf for invalid split_idx tiles
             stats = [(0.0, -Float32.inf if const_expr(mLSE is not None or learnable_sink is not None) else None, True)] * self.q_stage
@@ -2334,6 +2399,7 @@ class FlashAttentionForwardSm100:
                     pipeline_o_acc.consumer_wait_w_index_phase(stage, o_corr_consumer_phase)
                     if const_expr(not self.use_correction_warps_for_epi):
                         pipeline_o_epi.producer_acquire_w_index_phase(stage, corr_epi_producer_phase)
                     self.correction_epilogue(
                         thr_mma_pv,
                         tOtO[None, None, None, stage],
@@ -2344,7 +2410,7 @@ class FlashAttentionForwardSm100:
                         scale,
                         sO[None, None, stage],
                         mO_cur,
-                        gO[None, None, stage],
                         gmem_tiled_copy_O,
                     )
                     # Signal for the next work tile that O buffers in tmem are already read, so
@@ -2414,7 +2480,6 @@ class FlashAttentionForwardSm100:
                         mLSE_cur = cute.domain_offset((offset,), mLSE[None, head_idx])
                 for stage in cutlass.range_constexpr(self.q_stage):
                     m_tile_idx = (m_block * self.q_stage + stage) * self.cta_group_size + mma_tile_coord_v
-                    gLSE = cute.local_tile(mLSE_cur, (self.m_block_size,), (m_tile_idx,))
                     row_sum, row_max, acc_O_mn_row_is_zero_or_nan = stats[stage]
                     # if tidx == 0 and stage <= 1:
                     #     cute.printf("row_sum = {}, row_max = {}, acc_O_mn_row_is_zero_or_nan = {}\n", row_sum, row_max, acc_O_mn_row_is_zero_or_nan)
@@ -2429,13 +2494,24 @@ class FlashAttentionForwardSm100:
                         if const_expr(not self.pack_gqa)
                         else seqlen.seqlen_q * self.qhead_per_kvhead
                     )
-                    if tidx < seqlen_q - m_tile_idx * self.m_block_size:
-                        # This actually just works with PackGQA too
-                        gLSE[tidx] = lse
             # Advance to next tile
-            tile_scheduler.advance_to_next_work()
-            work_tile = tile_scheduler.get_current_work()
         # End of persistent scheduler loop
         # This is equivalent to pipeline_o_epi.consumer_tail() for the correction warps
@@ -2574,7 +2650,7 @@ class FlashAttentionForwardSm100:
         if const_expr(self.use_correction_warps_for_epi):
             assert(not self.use_tma_O)
             assert(gmem_tiled_copy_O is not None)
-            cute.arch.barrier(barrier_id=int(NamedBarrierFwd.Epilogue),
                               number_of_threads=len(self.epilogue_warp_ids) * cute.arch.WARP_SIZE)
             mma_tile_coord_v = thr_mma.thr_idx
             m_tile_idx = (m_block * self.q_stage + stage) * self.cta_group_size + mma_tile_coord_v
@@ -2586,7 +2662,7 @@ class FlashAttentionForwardSm100:
     def _store_O_to_gmem(
         self,
         sO_stage: cute.Tensor,
-        gO: cute.Tensor,
         mO_cur: cute.Tensor,
         gmem_tiled_copy_O: cute.TiledCopy,
         tidx: Int32,
@@ -2597,7 +2673,6 @@ class FlashAttentionForwardSm100:
         gmem_thr_copy_O = gmem_tiled_copy_O.get_slice(tidx)
         tOsO = gmem_thr_copy_O.partition_S(sO_stage)
         cO = cute.make_identity_tensor((self.m_block_size, self.head_dim_v_padded))
-        tOgO = gmem_thr_copy_O.partition_D(gO)
         tOcO = gmem_thr_copy_O.partition_S(cO)
         t0OcO = gmem_tiled_copy_O.get_slice(0).partition_S(cO)
         tOpO = copy_utils.predicate_k(tOcO, limit=mO_cur.shape[1])
@@ -2613,6 +2688,8 @@ class FlashAttentionForwardSm100:
         cute.autovec_copy(tOsO, tOrO)
         # copy acc O from rmem to gmem
         if const_expr(not self.pack_gqa):
             for rest_m in cutlass.range_constexpr(cute.size(tOrO.shape[1])):
                 if (
                     t0OcO[0, rest_m, 0][0] < seqlen_q - m_tile_idx * self.m_block_size - tOcO[0][0]
@@ -2641,11 +2718,10 @@ class FlashAttentionForwardSm100:
         block_info: BlockInfo,
         num_splits: int,
         SeqlenInfoCls: Callable,
-        TileSchedulerCls: Callable,
         mma_tile_coord_v: Int32 = 0,
     ):
         epi_consumer_phase = Int32(0)
-        tile_scheduler = TileSchedulerCls()
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             m_block, head_idx, batch_idx, split_idx = work_tile.tile_idx
@@ -2657,12 +2733,14 @@ class FlashAttentionForwardSm100:
                     mO_cur = seqlen.offset_batch_Q(mO, batch_idx, dim=3)[None, None, head_idx, split_idx]
                 else:
                     mO_cur = seqlen.offset_batch_Q(mO, batch_idx, dim=3)[None, None, head_idx]
-                tiler_gO = ((self.mma_tiler_pv[0] * self.q_stage), self.head_dim_v_padded)
-                gO = cute.local_tile(mO_cur, tiler_gO, (m_block, 0))  # (128 * 2, 128)
-                gO = layout_utils.select(
-                    cute.flat_divide(gO, (self.mma_tiler_pv[0],)), mode=[0, 2, 1]
-                )  # (128, 128, 2)
-                gO = cute.flat_divide(gO, (self.mma_tiler_pv[0] // self.cta_group_size,))[None, mma_tile_coord_v, None, None]
                 if const_expr(self.use_tma_O):
                     store_O, _, _ = copy_utils.tma_get_copy_fn(
@@ -2689,8 +2767,9 @@ class FlashAttentionForwardSm100:
                         pipeline_o_epi.consumer_wait_w_index_phase(stage, epi_consumer_phase)
                         # 2. copy O0 / O1 to gmem
                         m_tile_idx = (m_block * self.q_stage + stage) * self.cta_group_size + mma_tile_coord_v
                         self._store_O_to_gmem(
-                            sO[None, None, stage], gO[None, None, stage], mO_cur, gmem_tiled_copy_O,
                             tidx, seqlen.seqlen_q, m_tile_idx,
                         )
                         pipeline_o_epi.consumer_release_w_index(stage)
@@ -2698,8 +2777,39 @@ class FlashAttentionForwardSm100:
                 epi_consumer_phase ^= 1
             # Advance to next tile
-            tile_scheduler.advance_to_next_work()
-            work_tile = tile_scheduler.get_current_work()
     def load_Q(
         self,
@@ -2712,6 +2822,39 @@ class FlashAttentionForwardSm100:
         pipeline_q.producer_acquire_w_index_phase(stage, phase)
         load_Q_fn(src_idx=block, dst_idx=stage, tma_bar_ptr=pipeline_q.sync_object_full.get_barrier(stage))
     @cute.jit
     def load_KV(
         self,
@@ -2754,7 +2897,10 @@ class FlashAttentionForwardSm100:
         else:
             assert paged_kv_manager is not None
             assert extra_tx_count is None
-            paged_kv_manager.load_KV(block, sX[None, None, None, stage], K_or_V)
             cute.arch.cp_async_commit_group()
             pipeline_kv.sync_object_full.arrive_cp_async_mbarrier(stage)
@@ -2765,6 +2911,9 @@ class FlashAttentionForwardSm100:
             # (smem_large + smem_small) // 2. So for stage == 1, move right by offset if
             # phase == 0, or left by offset if phase == 1.
             offset = 0 if stage != 1 else self.uneven_kv_smem_offset * (1 - 2 * phase)
             return cute.make_tensor(sX.iterator + offset, sX.layout)
         else:
             return sX
@@ -2774,12 +2923,12 @@ class FlashAttentionForwardSm100:
     #     warp_group_idx = utils.canonical_warp_group_idx(sync=False)
     #     if warp_group_idx == 0:
     #         cute.arch.barrier_arrive(
-    #             barrier_id=int(NamedBarrierFwd.WarpSchedulerWG1), number_of_threads=2 * 128,
     #         )
     # def warp_scheduler_barrier_sync(self):
     #     cute.arch.barrier(
-    #         barrier_id=int(NamedBarrierFwd.WarpSchedulerWG1) + utils.canonical_warp_group_idx(sync=False),
     #         number_of_threads=2 * 128
     #     )
@@ -2787,7 +2936,7 @@ class FlashAttentionForwardSm100:
     #     cur_wg = utils.canonical_warp_group_idx(sync=False)
     #     next_wg = 1 - cur_wg
     #     cute.arch.barrier_arrive(
-    #         barrier_id=int(NamedBarrierFwd.WarpSchedulerWG1) + next_wg, number_of_threads=2 * 128,
     #     )
     @cute.jit

 # https://github.com/NVIDIA/cutlass/tree/main/examples/77_blackwell_fmha
 # https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/blackwell/fmha.py
 import math
+from typing import Tuple, Callable, Optional, Literal
 from functools import partial
 import cuda.bindings.driver as cuda
 import cutlass.utils.blackwell_helpers as sm100_utils_basic
 from cutlass import pipeline
 from cutlass.pipeline import pipeline_init_arrive, pipeline_init_wait
+from cutlass.utils import ClcDynamicPersistentTileScheduler
 from cutlass.base_dsl.arch import Arch
 from cutlass.cutlass_dsl import BaseDSL
 from .paged_kv import PagedKVManager
 from .cute_dsl_utils import assume_tensor_aligned
+from . import utils
 from . import pipeline as pipeline_custom
+import cutlass.pipeline as cutlass_pipeline
 from .mask import AttentionMask
 from .softmax import SoftmaxSm100, apply_score_mod_inner
 from .seqlen_info import SeqlenInfoQK
     softmax_block_sparse_sm100,
     handle_block_sparse_empty_tile_correction_sm100,
 )
+from .pack_gqa import PackGQA, pack_gqa_layout
 from . import mma_sm100_desc as sm100_desc
 from . import blackwell_helpers as sm100_utils
+from .named_barrier import NamedBarrierFwdSm100
 from cutlass.cute import FastDivmodDivisor
 from .quack.cute_dsl_utils import ParamsBase
 from .tile_scheduler import (
+    ClcState,
+    SchedulingMode,
     TileSchedulerArguments,
+    TileSchedulerProtocol,
     SingleTileScheduler,
     StaticPersistentTileScheduler,
     SingleTileLPTScheduler,
     SingleTileVarlenScheduler,
 )
+from .fa_logging import fa_log, fa_printf
+from .utils import smid
+# === TUNING KNOBS (agent-editable) ===
+# Keys: (use_2cta_instrs: bool, is_causal: bool, head_dim_padded: int, is_sm103: bool)
+# Key combinations not listed here fall back to the built-in defaults.
+# Values:
+#   ex2_emu_freq: int — how often to use emulated exp2 (0 = hardware exp2 only, which disables
+#                        emulation entirely; higher = more emulation).
+#                        SM103 has fast native exp2, so set freq=0 there.
+#   ex2_emu_start_frg: int — fragment index at which emulation starts
+#   num_regs_softmax: int — register count for softmax warps (multiple of 8)
+#   num_regs_correction: int — register count for correction warps (multiple of 8)
+#   num_regs_other is not a key; it is derived as 512 - num_regs_softmax * 2 - num_regs_correction
+_TUNING_CONFIG = {
+    (True, False, 128, False): {'ex2_emu_freq': 10, 'ex2_emu_start_frg': 1, 'num_regs_softmax': 176, 'num_regs_correction': 88},
+    (False, True, 128, False): {'ex2_emu_freq': 16, 'ex2_emu_start_frg': 1, 'num_regs_softmax': 192, 'num_regs_correction': 72},
+    (True, False, 192, False): {"ex2_emu_freq": 16, "ex2_emu_start_frg": 0, "num_regs_softmax": 184, "num_regs_correction": 80},
+    (False, True, 192, False): {"ex2_emu_freq": 32, "ex2_emu_start_frg": 1, "num_regs_softmax": 192, "num_regs_correction": 72},
+    (True, False, 128, True): {"ex2_emu_freq": 0, "ex2_emu_start_frg": 0, "num_regs_softmax": 176, "num_regs_correction": 80},
+    (False, True, 128, True): {"ex2_emu_freq": 0, "ex2_emu_start_frg": 0, "num_regs_softmax": 176, "num_regs_correction": 64},
+    (True, False, 192, True): {"ex2_emu_freq": 0, "ex2_emu_start_frg": 0, "num_regs_softmax": 176, "num_regs_correction": 64},
+    (False, True, 192, True): {"ex2_emu_freq": 0, "ex2_emu_start_frg": 0, "num_regs_softmax": 176, "num_regs_correction": 72},
+}
+# === END TUNING KNOBS ===
 class FlashAttentionForwardSm100:
         paged_kv_non_tma: bool = False,
         is_varlen_q: bool = False,
         use_2cta_instrs: bool = False,
+        use_clc_scheduler: bool = False,
     ):
         self.use_tma_KV = not paged_kv_non_tma
         # self.dtype = dtype
         self.is_split_kv = is_split_kv
         self.pack_gqa = pack_gqa
         self.q_subtile_factor = q_subtile_factor
         assert not (self.is_split_kv and self.head_dim_v_padded >= 192), (
             "SplitKV is not supported for hdim >= 192"
         )
         # Does S1 need to wait for S0 to finish
         # self.s0_s1_barrier = self.head_dim_padded in [64, 96] and (not self.is_causal and not self.is_local)
         is_sm103 = self.arch >= Arch.sm_103 and self.arch <= Arch.sm_103f
+        self.is_sm103 = is_sm103
+        # enable_ex2_emu is derived: True if the tuning config has freq > 0; otherwise fall back to the default logic.
+        _default_enable_ex2_emu = (self.head_dim_padded <= 128 or (self.head_dim_padded == 192 and self.use_2cta_instrs and not self.is_causal and not self.is_local)) and not is_sm103
+        self.enable_ex2_emu = _default_enable_ex2_emu
         self.s0_s1_barrier = False
         self.overlap_sO_sQ = (
             (self.head_dim_padded == 192 and self.head_dim_v_padded >= 64) or
             "Paged KV does not support irregular head dim"
         )
+        self.use_clc_scheduler = (
+            use_clc_scheduler
+            and self.use_tma_KV
+            and not self.overlap_sO_sQ
+        )
+        self.sched_stages = 1
+        if self.use_clc_scheduler:
+            assert self.cluster_shape_mn[1] == 1, f"CLC requires cluster N == 1: {self.cluster_shape_mn}"
+            assert self.cluster_shape_mn[0] in (1, 2), f"bad CLC cluster M: {self.cluster_shape_mn}"
+            assert self.cluster_shape_mn[0] == self.cta_group_size, (
+                f"CLC cluster M != cta_group_size: {self.cluster_shape_mn}, {self.cta_group_size}"
+            )
+        self.scheduling_mode = SchedulingMode.CLC if self.use_clc_scheduler else SchedulingMode.STATIC
+        if is_varlen_q:
+            self.TileScheduler = SingleTileVarlenScheduler
+        elif self.is_causal or self.is_local or self.use_clc_scheduler:
+            self.TileScheduler = SingleTileLPTScheduler
+        elif self.is_persistent:
+            self.TileScheduler = StaticPersistentTileScheduler
+        else:
+            self.TileScheduler = SingleTileScheduler
+        fa_log(1, f"TileScheduler={self.TileScheduler.__name__}, scheduling_mode={self.scheduling_mode.name}, USE_2CTA={self.use_2cta_instrs}")
         self.softmax0_warp_ids = (0, 1, 2, 3)
         self.softmax1_warp_ids = (4, 5, 6, 7)
         self.correction_warp_ids = (8, 9, 10, 11)
             )
         )
+        self.use_tma_Q = not (self.pack_gqa and self.m_block_size % self.qhead_per_kvhead != 0)
         if self.q_stage == 1:
+            if not self.use_tma_KV or not self.use_tma_Q:
                 self.empty_warp_ids = self.empty_warp_ids + self.load_warp_ids
                 self.load_warp_ids = self.softmax1_warp_ids
             else:
         elif self.is_varlen_q: # fallback
             self.epilogue_warp_ids = (13, 14)
+        self.clc_scheduler_warp_id = self.empty_warp_ids[0] if self.use_clc_scheduler else None
         self.tmem_s_offset = [0, self.n_block_size]  # e.g., 0, 128
         self.tmem_o_offset = [
             self.tmem_s_offset[-1] + self.n_block_size + i * self.head_dim_v_padded
         # vec buffer for row_max & row_sum
         self.tmem_vec_offset = self.tmem_s_offset
+        # Look up tuning config for register counts and ex2_emu params
+        _tune_key = (self.use_2cta_instrs, self.is_causal, self.head_dim_padded, self.is_sm103)
+        self._tune = _TUNING_CONFIG.get(_tune_key, {})
+        if "ex2_emu_freq" in self._tune:
+            self.enable_ex2_emu = self._tune["ex2_emu_freq"] > 0
         if self.head_dim_padded < 96:
             self.num_regs_softmax = 200 if not paged_kv_non_tma else 184
             self.num_regs_correction = 64
             self.num_regs_other = 48 if not paged_kv_non_tma else 80
         else:
+            if not paged_kv_non_tma and "num_regs_softmax" in self._tune:
+                self.num_regs_softmax = self._tune["num_regs_softmax"]
+                self.num_regs_correction = self._tune["num_regs_correction"]
+            elif not paged_kv_non_tma:
+                self.num_regs_softmax = 192
+                self.num_regs_correction = 80
             else:
+                self.num_regs_softmax = 184
+                self.num_regs_correction = 64
+            self.num_regs_other = 512 - self.num_regs_softmax * 2 - self.num_regs_correction
         self.buffer_align_bytes = 1024
             self.head_dim_padded == 192 and self.head_dim_v_padded == 128 and self.kv_stage == 3
         )
         self.uneven_kv_smem_offset = (
+            self.n_block_size * (self.head_dim_padded - self.head_dim_v_padded) // 2
             if self.uneven_kv_smem
             else 0
         )
         mO: cute.Tensor,  # (b, s_q, h, dv) or (total_q, h, dv) if there is cu_seqlens_q
         mLSE: Optional[cute.Tensor],
         softmax_scale: Float32,
         mCuSeqlensQ: Optional[cute.Tensor] = None,
         mCuSeqlensK: Optional[cute.Tensor] = None,
         mSeqUsedQ: Optional[cute.Tensor] = None,
         learnable_sink: Optional[cute.Tensor] = None,
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
         aux_tensors: Optional[list] = None,
+        # Always keep stream as the last parameter (EnvStream: obtained implicitly via TVM FFI).
+        stream: cuda.CUstream = None,
     ):
         """Execute the Fused Multi-Head Attention operation on the provided tensors.
         if const_expr(self.q_dtype != self.v_dtype):
             raise TypeError(f"Type mismatch: {self.q_dtype} != {self.v_dtype}")
         self._setup_attributes()
+        self.use_tma_O = (
+            self.arch >= Arch.sm_90
+            and mCuSeqlensQ is None
+            and mSeqUsedQ is None
+            and not (self.pack_gqa and self.m_block_size % self.qhead_per_kvhead != 0)
+            and not (self.pack_gqa and self.is_split_kv)
+        )
         self.ex2_emu_freq = 0
+        self.ex2_emu_start_frg = self._tune.get("ex2_emu_start_frg", 1)
         if const_expr(self.enable_ex2_emu):
+            self.ex2_emu_freq = self._tune.get("ex2_emu_freq", 16)
             if const_expr(
                 self.pack_gqa and self.head_dim_padded > 64 and not self.is_causal and not self.is_local
             ):
+                self.ex2_emu_freq = 32 if mCuSeqlensQ is not None or mSeqUsedQ is not None else self._tune.get("ex2_emu_freq", 10)
         cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
         q_major_mode = tcgen05.OperandMajorMode.K
             )
         if const_expr(self.pack_gqa):
+            nheads_kv = mK.shape[2]
+            mQ = pack_gqa_layout(mQ, self.qhead_per_kvhead, nheads_kv, head_idx=2)
+            mO = pack_gqa_layout(mO, self.qhead_per_kvhead, nheads_kv, head_idx=2)
             if const_expr(mLSE is not None):
+                mLSE = pack_gqa_layout(mLSE, self.qhead_per_kvhead, nheads_kv, head_idx=1)
         self.tma_copy_bytes = {
             name: cute.size_in_bytes(mX.element_type, cute.select(layout, mode=[0, 1, 2]))
         tma_load_op = cpasync.CopyBulkTensorTileG2SOp(cta_group)
         tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
+        if const_expr(self.use_tma_Q):
+            tma_atom_Q, mQ = cute.nvgpu.make_tiled_tma_atom_A(
+                tma_load_op,
+                mQ,
+                cute.select(sQ_layout, mode=[0, 1, 2]),
+                self.mma_tiler_qk,
+                tiled_mma_qk,
+                cta_layout_vmnk.shape,
+            )
+            gmem_tiled_copy_Q = None
+        else:
+            tma_atom_Q = None
+            async_copy_elems = 128 // self.q_dtype.width
+            num_load_threads = cute.arch.WARP_SIZE * len(self.load_warp_ids)
+            threads_per_row = math.gcd(self.head_dim_padded // async_copy_elems, num_load_threads)
+            gmem_tiled_copy_Q = copy_utils.tiled_copy_2d(
+                self.q_dtype, threads_per_row, num_load_threads, async_copy_elems, is_async=True
+            )
         tma_atom_K = None
         tma_atom_V = None
             vO_layout = cute.make_layout((1, async_copy_elems))
             gmem_tiled_copy_O = cute.make_tiled_copy_tv(atom_universal_copy, tO_layout, vO_layout)
+        TileScheduler = self.TileScheduler
+        _num_block_divisor = self.cta_tiler[0] * (self.cta_group_size if not self.is_persistent and self.cta_group_size > 1 else 1)
         tile_sched_args = TileSchedulerArguments(
+            cute.ceil_div(cute.size(mQ.shape[0]), _num_block_divisor),
             cute.size(mQ.shape[2]),
             cute.size(mQ.shape[3])
             if const_expr(mCuSeqlensQ is None)
             lpt=self.is_causal or self.is_local,
             is_split_kv=self.is_split_kv,
             cluster_shape_mn=self.cluster_shape_mn,
+            use_cluster_idx=not self.is_persistent and self.cta_group_size > 1,
+        )
+        tile_sched_params = TileScheduler.to_underlying_arguments(
+            tile_sched_args, scheduling_mode=self.scheduling_mode
         )
         self.tile_scheduler_cls = TileScheduler
         grid_dim = TileScheduler.get_grid_shape(tile_sched_params)
             cutlass.max(cute.cosize(sQ_layout), cute.cosize(sO_layout) * self.o_dtype.width // self.q_dtype.width)
         )
+        clc_response_size = self.sched_stages * 4 if self.use_clc_scheduler else 0
+        clc_mbar_size = self.sched_stages * 2 if self.use_clc_scheduler else 0
         @cute.struct
         class SharedStorage:
             # m_barriers for pipelines
             # Smem tensors
             # store row max and row sum
             sScale: cute.struct.MemRange[Float32, self.q_stage * self.m_block_size * 2]
+            # CLC buffers placed here to utilize padding before sO's 1024-byte alignment.
+            # This avoids adding bytes at the end when we're at the smem limit.
+            # PipelineClcFetchAsync expects 2 * sched_stages mbarriers (full + empty).
+            clc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, clc_mbar_size]
+            # CLC response storage (16 bytes per stage, stored as 4 Int32s).
+            clc_response: cute.struct.MemRange[Int32, clc_response_size]
+            # Large TMA buffers with 1024-byte alignment
             sO: cute.struct.Align[
                 cute.struct.MemRange[self.o_dtype, sO_size], self.buffer_align_bytes
             ]
         self.shared_storage = SharedStorage
+        softmax_scale_log2, softmax_scale = utils.compute_softmax_scale_log2(softmax_scale, self.score_mod)
+        window_size_left = Int32(window_size_left) if window_size_left is not None else None
+        window_size_right = Int32(window_size_right) if window_size_right is not None else None
+        fastdiv_mods = utils.compute_fastdiv_mods(mQ, mK, self.qhead_per_kvhead, self.pack_gqa, aux_tensors, mPageTable)
         head_divmod = None
         if cutlass.const_expr(self.pack_gqa):
             tP_layout,
             sV_layout,
             sO_layout,
+            gmem_tiled_copy_Q,
             gmem_tiled_copy_O,
             tiled_mma_qk,
             tiled_mma_pv,
         mSeqUsedQ: Optional[cute.Tensor],
         mSeqUsedK: Optional[cute.Tensor],
         mPageTable: Optional[cute.Tensor],
+        tma_atom_Q: Optional[cute.CopyAtom],
         tma_atom_K: Optional[cute.CopyAtom],
         tma_atom_V: Optional[cute.CopyAtom],
         tma_atom_O: Optional[cute.CopyAtom],
         tP_layout: cute.ComposedLayout,
         sV_layout: cute.ComposedLayout,
         sO_layout: cute.ComposedLayout,
+        gmem_tiled_copy_Q: Optional[cute.TiledCopy],
         gmem_tiled_copy_O: Optional[cute.TiledCopy],
         tiled_mma_qk: cute.TiledMma,
         tiled_mma_pv: cute.TiledMma,
         storage = smem.allocate(self.shared_storage)
         tmem_alloc_barrier = pipeline.NamedBarrier(
+            barrier_id=int(NamedBarrierFwdSm100.TmemPtr),
             num_threads=cute.arch.WARP_SIZE * len(
                 (self.mma_warp_id,
                  *self.softmax0_warp_ids,
         ThreadCooperativeGroup = partial(pipeline.CooperativeGroup, pipeline.Agent.Thread)
         mma_warp = ThreadCooperativeGroup(len([self.mma_warp_id]))
         tma_warp = ThreadCooperativeGroup(1)
+        load_threads = ThreadCooperativeGroup(len(self.load_warp_ids) * cute.arch.WARP_SIZE)
         softmax_warps = ThreadCooperativeGroup(len(self.softmax0_warp_ids))
         softmax_threads = ThreadCooperativeGroup(cute.arch.WARP_SIZE * len(self.softmax0_warp_ids))
         # softmax_threads = ThreadCooperativeGroup(cute.arch.WARP_SIZE)
         softmax_correction_threads_cluster = ThreadCooperativeGroup(
             cute.arch.WARP_SIZE * len(self.softmax0_warp_ids + self.correction_warp_ids) * self.cta_group_size
         )
+        if const_expr(self.use_tma_Q):
+            pipeline_q = pipeline_custom.PipelineTmaUmma.create(
+                barrier_storage=storage.mbar_load_Q.data_ptr(),
+                num_stages=self.q_stage,
+                producer_group=tma_warp,
+                consumer_group=mma_warp,
+                tx_count=self.tma_copy_bytes["Q"],
+                cta_layout_vmnk=cta_layout_vmnk,
+                defer_sync=True,
+            )
+        else:
+            pipeline_q = pipeline_custom.PipelineAsyncUmma.create(
+                barrier_storage=storage.mbar_load_Q.data_ptr(),
+                num_stages=self.q_stage,
+                producer_group=load_threads,
+                consumer_group=mma_warp,
+                cta_layout_vmnk=cta_layout_vmnk,
+                defer_sync=True,
+            )
         if const_expr(self.use_tma_KV):
             pipeline_kv = pipeline_custom.PipelineTmaUmma.create(
                 barrier_storage=storage.mbar_load_KV.data_ptr(),
                 defer_sync=True,
             )
         else:
             pipeline_kv = pipeline.PipelineAsyncUmma.create(
                 barrier_storage=storage.mbar_load_KV.data_ptr(),
                 num_stages=self.kv_stage,
+                producer_group=load_threads,
                 consumer_group=mma_warp,
                 cta_layout_vmnk=cta_layout_vmnk,
                 defer_sync=True,
         )
         # Should put the NamedBarrier inside the pipeline class so we'll just have pipeline_sm_stats
         sm_stats_barrier = pipeline_custom.NamedBarrier(
+            barrier_id=int(NamedBarrierFwdSm100.SoftmaxStatsW0), num_threads=cute.arch.WARP_SIZE * 2
         )
         pipeline_o_epi = None
         if const_expr(not self.use_correction_warps_for_epi):
             window_size_right=window_size_right,
             qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
         )
         # Cluster wait before tensor memory alloc
         pipeline_init_wait(cluster_shape_mn=cta_layout_vmnk)
+        if const_expr(self.use_clc_scheduler):
+            clc_response_ptr = storage.clc_response.data_ptr()
+            clc_mbar_ptr = storage.clc_mbar_ptr.data_ptr()
+            clc_pipeline_producer_group = cutlass_pipeline.CooperativeGroup(
+                cutlass_pipeline.Agent.Thread
+            )
+            num_clc_consumer_warps_per_cta = self.threads_per_cta // cute.arch.WARP_SIZE
+            # NB: warp15 is the scheduler on CTA0 and an empty warp on CTA1, but both still consume.
+            num_clc_consumer_warps = num_clc_consumer_warps_per_cta * self.cta_group_size
+            clc_pipeline_consumer_group = cutlass_pipeline.CooperativeGroup(
+                cutlass_pipeline.Agent.Thread, cute.arch.WARP_SIZE * num_clc_consumer_warps
+            )
+            block_idx = cute.arch.block_idx()
+            clc = ClcState.create(
+                hw_scheduler=ClcDynamicPersistentTileScheduler.create(
+                    self.tile_scheduler_cls.clc_problem_shape(tile_sched_params),
+                    block_idx,
+                    cute.arch.grid_dim(),
+                    clc_response_ptr,
+                ),
+                pipeline=cutlass_pipeline.PipelineClcFetchAsync.create(
+                    barrier_storage=clc_mbar_ptr,
+                    num_stages=self.sched_stages,
+                    producer_group=clc_pipeline_producer_group,
+                    consumer_group=clc_pipeline_consumer_group,
+                    tx_count=16,
+                    cta_layout_vmnk=cta_layout_vmnk,
+                ),
+                consumer_state=cutlass_pipeline.make_pipeline_state(
+                    cutlass_pipeline.PipelineUserType.Consumer, self.sched_stages
+                ),
+                producer_state=cutlass_pipeline.make_pipeline_state(
+                    cutlass_pipeline.PipelineUserType.Producer, self.sched_stages
+                ),
+            )
+            tile_scheduler = self.tile_scheduler_cls.create(tile_sched_params, clc=clc)
+        else:
+            tile_scheduler = self.tile_scheduler_cls.create(tile_sched_params)
+        assert isinstance(tile_scheduler, TileSchedulerProtocol), f"tile_scheduler is not a TileSchedulerProtocol: {type(tile_scheduler)}"
         # ///////////////////////////////////////////////////////////////////////////////
+        #  EMPTY / CLC SCHEDULER WARP
         # ///////////////////////////////////////////////////////////////////////////////
+        if const_expr(self.use_clc_scheduler):
+            if warp_idx == self.clc_scheduler_warp_id:
                 cute.arch.setmaxregister_decrease(self.num_regs_other)
+                if is_leader_cta:
+                    self.clc_scheduler_warp(tile_scheduler)
+                else:
+                    self.empty_warp(tile_scheduler)
+            for i in cutlass.range_constexpr(len(self.empty_warp_ids)):
+                if warp_idx == self.empty_warp_ids[i] and warp_idx != self.clc_scheduler_warp_id:
+                    cute.arch.setmaxregister_decrease(self.num_regs_other)
+                    self.empty_warp(tile_scheduler)
+        else:
+            for i in cutlass.range_constexpr(len(self.empty_warp_ids)):
+                if warp_idx == self.empty_warp_ids[i]:
+                    cute.arch.setmaxregister_decrease(self.num_regs_other)
         # ///////////////////////////////////////////////////////////////////////////////
         #  LOAD
                 tma_atom_Q,
                 tma_atom_K,
                 tma_atom_V,
+                gmem_tiled_copy_Q,
                 pipeline_q,
                 pipeline_kv,
                 block_info,
                 num_splits,
                 SeqlenInfoCls,
                 blocksparse_tensors,
+                tile_scheduler=tile_scheduler,
             )
         # ///////////////////////////////////////////////////////////////////////////////
                 block_info,
                 num_splits,
                 SeqlenInfoCls,
                 blocksparse_tensors,
+                tile_scheduler=tile_scheduler,
             )
             # Dealloc the tensor memory buffer
             tmem.relinquish_alloc_permit()
                     block_info,
                     num_splits,
                     SeqlenInfoCls,
                     mma_tile_coord_v,
+                    tile_scheduler=tile_scheduler,
                 )
         # ///////////////////////////////////////////////////////////////////////////////
                 num_splits=num_splits,
                 SeqlenInfoCls=SeqlenInfoCls,
                 AttentionMaskCls=AttentionMaskCls,
                 aux_tensors=aux_tensors,
                 fastdiv_mods=fastdiv_mods,
                 head_divmod=head_divmod,
                 blocksparse_tensors=blocksparse_tensors,
+                tile_scheduler=tile_scheduler,
             )
             if const_expr(not self.s0_s1_barrier):
                 block_info,
                 num_splits,
                 SeqlenInfoCls,
                 blocksparse_tensors,
+                tile_scheduler=tile_scheduler,
             )
             tmem_alloc_barrier.arrive()
         sK: cute.Tensor,
         sV: cute.Tensor,
         mPageTable: Optional[cute.Tensor],
+        tma_atom_Q: Optional[cute.CopyAtom],
         tma_atom_K: Optional[cute.CopyAtom],
         tma_atom_V: Optional[cute.CopyAtom],
+        gmem_tiled_copy_Q: Optional[cute.TiledCopy],
         pipeline_q: pipeline.PipelineAsync,
         pipeline_kv: pipeline.PipelineAsync,
         block_info: BlockInfo,
         num_splits: Int32,
         SeqlenInfoCls: Callable,
         blocksparse_tensors: Optional[BlockSparseTensors],
+        tile_scheduler: TileSchedulerProtocol,
     ):
         num_load_threads = len(self.load_warp_ids) * cute.arch.WARP_SIZE
         tidx = cute.arch.thread_idx()[0] % num_load_threads
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
+        issue_kv_for_this_warp = (
+            const_expr(not self.use_tma_KV or len(self.load_warp_ids) == 1) or
+            warp_idx == self.load_warp_ids[0]
+        )
+        issue_q_for_this_warp = (
+            const_expr(not self.use_tma_Q or len(self.load_warp_ids) == 1) or
+            warp_idx == self.load_warp_ids[0]
+        )
         q_producer_phase = Int32(1)
         kv_producer_state = pipeline.make_pipeline_state(
             pipeline.PipelineUserType.Producer, self.kv_stage
         )
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             m_block, head_idx, batch_idx, split_idx = work_tile.tile_idx
             seqlen = SeqlenInfoCls(batch_idx)
             mQ_cur = seqlen.offset_batch_Q(mQ, batch_idx, dim=3)[None, None, head_idx]
             head_idx_kv = (
                 head_idx // self.qhead_per_kvhead if const_expr(not self.pack_gqa) else head_idx
                 gV = cute.local_tile(
                     mV_cur, cute.select(self.mma_tiler_pv, mode=[1, 2]), (0, None, None)
                 )
             tSgK = thr_mma_qk.partition_B(gK)
             tOgV = thr_mma_pv.partition_B(gV)
+            if const_expr(self.use_tma_Q):
+                tiler_gQ = ((self.mma_tiler_qk[0] * self.q_stage), self.head_dim_padded)
+                gQ = cute.local_tile(mQ_cur, tiler_gQ, (m_block, 0))  # (128 * 2, 128)
+                gQ = layout_utils.select(
+                    cute.flat_divide(gQ, (self.mma_tiler_qk[0],)), mode=[0, 2, 1]
+                )  # (128, 128, 2)
+                tSgQ = thr_mma_qk.partition_A(gQ)
+                load_Q_fn, _, _ = copy_utils.tma_get_copy_fn(
+                    tma_atom_Q, 0, cute.make_layout(1), tSgQ, sQ
+                )
+                load_Q = partial(self.load_Q, load_Q_fn, pipeline_q=pipeline_q, phase=q_producer_phase)
+            else:
+                assert gmem_tiled_copy_Q is not None
+                load_Q = partial(
+                    self.load_Q_non_tma,
+                    mQ_cur,
+                    sQ,
+                    gmem_tiled_copy_Q,
+                    pipeline_q,
+                    tidx,
+                    seqlen.seqlen_q,
+                    m_block,
+                    phase=q_producer_phase,
+                )
             if const_expr(self.use_tma_KV):
                 tKsK, tKgK = cpasync.tma_partition(
                 tKsK, tKgK = None, None
                 tVsV, tVgV = None, None
             load_K = partial(
                 self.load_KV,
                 tma_atom_K,
                     )
                     if const_expr(not self.use_tma_KV):
                         paged_kv_manager.load_page_table(n_block_first)
+                    if issue_kv_for_this_warp:
+                        load_K(block=n_block_max - 1, producer_state=kv_producer_state, page_idx=page_idx)  # K0
                     # load_K(block=n_block_max - 1, producer_state=kv_producer_state, page_idx=page_idx, extra_tx_count=self.tma_copy_bytes["Q"])  # K0
+                    if issue_q_for_this_warp:
+                        load_Q(block=0, stage=0)
+                    if issue_kv_for_this_warp:
+                        kv_producer_state.advance()
+                    if const_expr(self.q_stage == 2) and issue_q_for_this_warp:
+                        load_Q(block=1, stage=1)
                     q_producer_phase ^= 1
+                    if issue_kv_for_this_warp:
+                        load_V(block=n_block_max - 1, producer_state=kv_producer_state, page_idx=page_idx)  # V0
+                        kv_producer_state.advance()
                     for i in cutlass.range(n_block_max - 1 - n_block_min, unroll=1):
                         n_block = n_block_max - 2 - i
                         page_idx = (
                         if const_expr(not self.use_tma_KV):
                             paged_kv_manager.load_page_table(n_block)
                     # if cute.arch.thread_idx()[0] % 32 == 0: cute.printf("n_block = {}, page_idx = {}", n_block, page_idx)
+                        if issue_kv_for_this_warp:
+                            load_K(block=n_block, producer_state=kv_producer_state, page_idx=page_idx)  # Ki
+                            kv_producer_state.advance()
+                            load_V(block=n_block, producer_state=kv_producer_state, page_idx=page_idx)  # Vi
+                            kv_producer_state.advance()
             else:
                 kv_producer_state, q_producer_phase = produce_block_sparse_loads_sm100(
                     self.q_subtile_factor if self.q_subtile_factor is not None else 1,
                 )
+            work_tile = tile_scheduler.advance_to_next_work()
             # End of persistent scheduler loop
+        if issue_kv_for_this_warp:
+            pipeline_kv.producer_tail(kv_producer_state)
+        # This is equivalent to pipeline_q.producer_tail for the TMA-Q producer warp.
+        if issue_q_for_this_warp:
             pipeline_q.producer_acquire_w_index_phase(self.q_stage - 1, q_producer_phase)
     @cute.jit
         block_info: BlockInfo,
         num_splits: Int32,
         SeqlenInfoCls: Callable,
         blocksparse_tensors: Optional[BlockSparseTensors],
+        tile_scheduler=None,
     ):
         tSrQ = tiled_mma_qk.make_fragment_A(sQ)
         tSrK = tiled_mma_qk.make_fragment_B(sK)
         )
         P_full_O_rescaled_phase = Int32(0)
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             m_block, head_idx, batch_idx, split_idx = work_tile.tile_idx
                 # End of GEMM_PV1(i_end) (P1 * Vi_end -> O1)
             # Advance to next tile
+            work_tile = tile_scheduler.advance_to_next_work()
         # End of persistent scheduler loop
         # We don't need pipeline_s_p_o.producer_tail() since there's no dangling mbarrier at the end
         num_splits: Int32,
         SeqlenInfoCls: Callable,
         AttentionMaskCls: Callable,
         aux_tensors: Optional[list] = None,
         fastdiv_mods=(None, None),
         head_divmod=None,
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
+        tile_scheduler=None,
     ):
         """Compute softmax on attention scores from QK matrix multiplication.
         warp_idx_in_wg = cute.arch.make_warp_uniform(cute.arch.warp_idx()) % 4
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             m_block, head_idx, batch_idx, split_idx = work_tile.tile_idx
             #         gLSE[tidx] = lse
             # Advance to next tile
+            work_tile = tile_scheduler.advance_to_next_work()
         # End of persistent scheduler loop
         # This is equivalent to pipeline_sm_stats.producer_tail
         block_info: BlockInfo,
         num_splits: Int32,
         SeqlenInfoCls: Callable,
         blocksparse_tensors: Optional[BlockSparseTensors] = None,
+        tile_scheduler=None,
     ):
         tidx = cute.arch.thread_idx()[0] % (cute.arch.WARP_SIZE * len(self.correction_warp_ids))
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx()) % 4
         o_corr_consumer_phase = Int32(0)
         corr_epi_producer_phase = Int32(1)
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             m_block, head_idx, batch_idx, split_idx = work_tile.tile_idx
                 mO_cur = seqlen.offset_batch_Q(mO, batch_idx, dim=3)[None, None, head_idx, split_idx]
             else:
                 mO_cur = seqlen.offset_batch_Q(mO, batch_idx, dim=3)[None, None, head_idx]
+            gO = None
+            if const_expr(self.use_tma_O or not self.pack_gqa):
+                tiler_gO = ((self.mma_tiler_pv[0] * self.q_stage), self.head_dim_v_padded)
+                gO = cute.local_tile(mO_cur, tiler_gO, (m_block, 0))  # (128 * 2, 128)
+                gO = layout_utils.select(
+                    cute.flat_divide(gO, (self.mma_tiler_pv[0],)), mode=[0, 2, 1]
+                )  # (128, 128, 2)
+                gO = cute.flat_divide(gO, (self.mma_tiler_pv[0] // self.cta_group_size,))[None, mma_tile_coord_v, None, None]
             # Default LSE to -inf for invalid split_idx tiles
             stats = [(0.0, -Float32.inf if const_expr(mLSE is not None or learnable_sink is not None) else None, True)] * self.q_stage
                     pipeline_o_acc.consumer_wait_w_index_phase(stage, o_corr_consumer_phase)
                     if const_expr(not self.use_correction_warps_for_epi):
                         pipeline_o_epi.producer_acquire_w_index_phase(stage, corr_epi_producer_phase)
+                    gO_stage = gO[None, None, stage] if const_expr(gO is not None) else None
                     self.correction_epilogue(
                         thr_mma_pv,
                         tOtO[None, None, None, stage],
                         scale,
                         sO[None, None, stage],
                         mO_cur,
+                        gO_stage,
                         gmem_tiled_copy_O,
                     )
                     # Signal for the next work tile that O buffers in tmem are already read, so
                         mLSE_cur = cute.domain_offset((offset,), mLSE[None, head_idx])
                 for stage in cutlass.range_constexpr(self.q_stage):
                     m_tile_idx = (m_block * self.q_stage + stage) * self.cta_group_size + mma_tile_coord_v
                     row_sum, row_max, acc_O_mn_row_is_zero_or_nan = stats[stage]
                     # if tidx == 0 and stage <= 1:
                     #     cute.printf("row_sum = {}, row_max = {}, acc_O_mn_row_is_zero_or_nan = {}\n", row_sum, row_max, acc_O_mn_row_is_zero_or_nan)
                         if const_expr(not self.pack_gqa)
                         else seqlen.seqlen_q * self.qhead_per_kvhead
                     )
+                    if const_expr(not self.pack_gqa or self.m_block_size % self.qhead_per_kvhead == 0):
+                        gLSE = cute.local_tile(mLSE_cur, (self.m_block_size,), (m_tile_idx,))
+                        if tidx < seqlen_q - m_tile_idx * self.m_block_size:
+                            # This actually just works with PackGQA too
+                            gLSE[tidx] = lse
+                    else:
+                        idx = m_tile_idx * self.m_block_size + tidx
+                        if idx < seqlen_q:
+                            m_idx = idx // self.qhead_per_kvhead
+                            h_idx = idx - m_idx * self.qhead_per_kvhead
+                            lse_ptr_i64 = utils.elem_pointer(mLSE_cur, ((h_idx, m_idx),)).toint()
+                            lse_gmem_ptr = cute.make_ptr(
+                                mLSE_cur.element_type, lse_ptr_i64, cute.AddressSpace.gmem, assumed_align=4
+                            )
+                            cute.make_tensor(lse_gmem_ptr, (1,))[0] = lse
             # Advance to next tile
+            work_tile = tile_scheduler.advance_to_next_work()
         # End of persistent scheduler loop
         # This is equivalent to pipeline_o_epi.consumer_tail() for the correction warps
         if const_expr(self.use_correction_warps_for_epi):
             assert(not self.use_tma_O)
             assert(gmem_tiled_copy_O is not None)
+            cute.arch.barrier(barrier_id=int(NamedBarrierFwdSm100.Epilogue),
                               number_of_threads=len(self.epilogue_warp_ids) * cute.arch.WARP_SIZE)
             mma_tile_coord_v = thr_mma.thr_idx
             m_tile_idx = (m_block * self.q_stage + stage) * self.cta_group_size + mma_tile_coord_v
     def _store_O_to_gmem(
         self,
         sO_stage: cute.Tensor,
+        gO: Optional[cute.Tensor],
         mO_cur: cute.Tensor,
         gmem_tiled_copy_O: cute.TiledCopy,
         tidx: Int32,
         gmem_thr_copy_O = gmem_tiled_copy_O.get_slice(tidx)
         tOsO = gmem_thr_copy_O.partition_S(sO_stage)
         cO = cute.make_identity_tensor((self.m_block_size, self.head_dim_v_padded))
         tOcO = gmem_thr_copy_O.partition_S(cO)
         t0OcO = gmem_tiled_copy_O.get_slice(0).partition_S(cO)
         tOpO = copy_utils.predicate_k(tOcO, limit=mO_cur.shape[1])
         cute.autovec_copy(tOsO, tOrO)
         # copy acc O from rmem to gmem
         if const_expr(not self.pack_gqa):
+            assert gO is not None
+            tOgO = gmem_thr_copy_O.partition_D(gO)
             for rest_m in cutlass.range_constexpr(cute.size(tOrO.shape[1])):
                 if (
                     t0OcO[0, rest_m, 0][0] < seqlen_q - m_tile_idx * self.m_block_size - tOcO[0][0]
         block_info: BlockInfo,
         num_splits: int,
         SeqlenInfoCls: Callable,
         mma_tile_coord_v: Int32 = 0,
+        tile_scheduler=None,
     ):
         epi_consumer_phase = Int32(0)
         work_tile = tile_scheduler.initial_work_tile_info()
         while work_tile.is_valid_tile:
             m_block, head_idx, batch_idx, split_idx = work_tile.tile_idx
                     mO_cur = seqlen.offset_batch_Q(mO, batch_idx, dim=3)[None, None, head_idx, split_idx]
                 else:
                     mO_cur = seqlen.offset_batch_Q(mO, batch_idx, dim=3)[None, None, head_idx]
+                gO = None
+                if const_expr(self.use_tma_O or not self.pack_gqa):
+                    tiler_gO = ((self.mma_tiler_pv[0] * self.q_stage), self.head_dim_v_padded)
+                    gO = cute.local_tile(mO_cur, tiler_gO, (m_block, 0))  # (128 * 2, 128)
+                    gO = layout_utils.select(
+                        cute.flat_divide(gO, (self.mma_tiler_pv[0],)), mode=[0, 2, 1]
+                    )  # (128, 128, 2)
+                    gO = cute.flat_divide(gO, (self.mma_tiler_pv[0] // self.cta_group_size,))[None, mma_tile_coord_v, None, None]
                 if const_expr(self.use_tma_O):
                     store_O, _, _ = copy_utils.tma_get_copy_fn(
                         pipeline_o_epi.consumer_wait_w_index_phase(stage, epi_consumer_phase)
                         # 2. copy O0 / O1 to gmem
                         m_tile_idx = (m_block * self.q_stage + stage) * self.cta_group_size + mma_tile_coord_v
+                        gO_stage = gO[None, None, stage] if const_expr(gO is not None) else None
                         self._store_O_to_gmem(
+                            sO[None, None, stage], gO_stage, mO_cur, gmem_tiled_copy_O,
                             tidx, seqlen.seqlen_q, m_tile_idx,
                         )
                         pipeline_o_epi.consumer_release_w_index(stage)
                 epi_consumer_phase ^= 1
             # Advance to next tile
+            work_tile = tile_scheduler.advance_to_next_work()
+    @cute.jit
+    def clc_scheduler_warp(
+        self,
+        tile_scheduler: TileSchedulerProtocol,
+    ):
+        """Run the scheduler-side loop over work tiles.
+        While the current work tile is valid, ask the scheduler to prefetch the
+        next work item and then advance to it. When the scheduler returns an
+        invalid tile the loop ends and ``tile_scheduler.producer_tail()`` is called.
+        Args:
+            tile_scheduler: Scheduler object; this method uses its
+                ``initial_work_tile_info``, ``prefetch_next_work``,
+                ``advance_to_next_work`` and ``producer_tail`` methods.
+        """
+        work_tile = tile_scheduler.initial_work_tile_info()
+        while work_tile.is_valid_tile:
+            tile_scheduler.prefetch_next_work()
+            work_tile = tile_scheduler.advance_to_next_work()
+            # Log from a single thread only: the one whose index equals
+            # clc_scheduler_warp_id * WARP_SIZE (presumably lane 0 of the scheduler
+            # warp -- confirm against the warp-id assignment).
+            # The message is emitted after advancing, so it describes the tile that was
+            # just obtained, including the last one for which valid is false.
+            if cute.arch.thread_idx()[0] == self.clc_scheduler_warp_id * cute.arch.WARP_SIZE:
+                fa_printf(
+                    3,  # NOTE(review): looks like a verbosity level -- confirm in fa_logging
+                    "[CLC] query sm={} cta={} (m_blk={},h={},b={},s={}) valid={}\n",
+                    smid(),
+                    cute.arch.block_idx()[0],
+                    work_tile.tile_idx[0],
+                    work_tile.tile_idx[1],
+                    work_tile.tile_idx[2],
+                    work_tile.tile_idx[3],
+                    work_tile.is_valid_tile,
+                )
+        tile_scheduler.producer_tail()
+    @cute.jit
+    def empty_warp(
+        self,
+        tile_scheduler: TileSchedulerProtocol,
+    ):
+        """Step through every work tile without doing any work on it.
+        The loop body only advances the scheduler until it reports an invalid tile.
+        NOTE(review): presumably this keeps an otherwise idle warp in step with the
+        scheduler calls made by the other warps -- confirm against the tile scheduler.
+        Args:
+            tile_scheduler: Scheduler object; only ``initial_work_tile_info`` and
+                ``advance_to_next_work`` are used.
+        """
+        work_tile = tile_scheduler.initial_work_tile_info()
+        while work_tile.is_valid_tile:
+            work_tile = tile_scheduler.advance_to_next_work()
     def load_Q(
         self,
         pipeline_q.producer_acquire_w_index_phase(stage, phase)
         load_Q_fn(src_idx=block, dst_idx=stage, tma_bar_ptr=pipeline_q.sync_object_full.get_barrier(stage))
+    def load_Q_non_tma(
+        self,
+        mQ: cute.Tensor,
+        sQ: cute.Tensor,
+        gmem_tiled_copy_Q: cute.TiledCopy,
+        pipeline_q: pipeline.PipelineAsync,
+        tidx: Int32,
+        seqlen_q: Int32,
+        m_block: Int32,
+        block: Int32,
+        stage: int,
+        phase: Int32,
+    ):
+        """Load one Q tile into shared memory through ``PackGQA.load_Q`` instead of TMA.
+        Acquires pipeline slot ``stage``, copies the tile, commits the cp.async group
+        and arrives on the slot's full barrier.
+        Args:
+            mQ: Source Q tensor handed to ``PackGQA.load_Q``.
+            sQ: Staged Q buffer; ``stage`` indexes its fourth mode.
+            gmem_tiled_copy_Q: Tiled copy forwarded to ``PackGQA.load_Q``.
+            pipeline_q: Q pipeline whose slot ``stage`` is acquired and signalled.
+            tidx: Thread index forwarded to ``PackGQA.load_Q``.
+            seqlen_q: Query sequence length forwarded to ``PackGQA.load_Q``.
+            m_block: Work-tile M block; combined with ``block`` to pick the tile.
+            block: Index of the Q tile within the work tile.
+            stage: Pipeline / shared-memory stage to fill.
+            phase: Producer phase used for the acquire.
+        """
+        assert self.cta_group_size == 1, "cta_group_size must be 1 for non-tma Q load"
+        # Wait until the consumer has released this stage before overwriting it.
+        pipeline_q.producer_acquire_w_index_phase(stage, phase)
+        pack_gqa = PackGQA(
+            self.m_block_size,
+            self.head_dim_padded,
+            self.check_hdim_oob,
+            self.qhead_per_kvhead,
+        )
+        sQ_stage = sQ[None, None, None, stage]
+        # Re-view the stage as a two-mode tensor: mode 0 is shape[0][0] and mode 1 is
+        # (shape[0][1], shape[2]), with the matching strides. Mode 1 of sQ_stage is not
+        # carried over (presumably it has size 1 -- TODO confirm against the sQ layout).
+        sQ_pi = cute.make_tensor(
+            sQ_stage.iterator,
+            cute.make_layout(
+                (sQ_stage.shape[0][0], (sQ_stage.shape[0][1], sQ_stage.shape[2])),
+                stride=(sQ_stage.stride[0][0], (sQ_stage.stride[0][1], sQ_stage.stride[2])),
+            ),
+        )
+        # The tile index passed on is m_block * q_stage + block.
+        pack_gqa.load_Q(mQ, sQ_pi, gmem_tiled_copy_Q, tidx, m_block * self.q_stage + block, seqlen_q)
+        # Close the cp.async group, then arrive on the stage's full barrier via the
+        # cp.async mbarrier arrive.
+        cute.arch.cp_async_commit_group()
+        pipeline_q.sync_object_full.arrive_cp_async_mbarrier(stage)
     @cute.jit
     def load_KV(
         self,
         else:
             assert paged_kv_manager is not None
             assert extra_tx_count is None
+            sX_cur = sX[None, None, None, stage]
+            if const_expr(self.uneven_kv_smem):
+                sX_cur = self.offset_kv_smem(sX_cur, stage, phase ^ 1)
+            paged_kv_manager.load_KV(block, sX_cur, K_or_V)
             cute.arch.cp_async_commit_group()
             pipeline_kv.sync_object_full.arrive_cp_async_mbarrier(stage)
             # (smem_large + smem_small) // 2. So for stage == 1, move right by offset if
             # phase == 0, or left by offset if phase == 1.
             offset = 0 if stage != 1 else self.uneven_kv_smem_offset * (1 - 2 * phase)
+            # Hint that the offset is 128-bit aligned so that
+            # ptr + offset preserves the alignment needed by cp.async.
+            offset = cute.assume(offset, divby=128 // self.k_dtype.width)
             return cute.make_tensor(sX.iterator + offset, sX.layout)
         else:
             return sX
     #     warp_group_idx = utils.canonical_warp_group_idx(sync=False)
     #     if warp_group_idx == 0:
     #         cute.arch.barrier_arrive(
+    #             barrier_id=int(NamedBarrierFwdSm100.WarpSchedulerWG1), number_of_threads=2 * 128,
     #         )
     # def warp_scheduler_barrier_sync(self):
     #     cute.arch.barrier(
+    #         barrier_id=int(NamedBarrierFwdSm100.WarpSchedulerWG1) + utils.canonical_warp_group_idx(sync=False),
     #         number_of_threads=2 * 128
     #     )
     #     cur_wg = utils.canonical_warp_group_idx(sync=False)
     #     next_wg = 1 - cur_wg
     #     cute.arch.barrier_arrive(
+    #         barrier_id=int(NamedBarrierFwdSm100.WarpSchedulerWG1) + next_wg, number_of_threads=2 * 128,
     #     )
     @cute.jit

build/torch-cuda/flash_fwd_sm120.py ADDED Viewed

	@@ -0,0 +1,59 @@

+# Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+# SM120 (Blackwell GeForce / DGX Spark) forward pass.
+#
+# SM120 uses the same SM80-era MMA instructions (mma.sync.aligned.m16n8k16) but has
+# a smaller shared memory capacity (99 KB vs 163 KB on SM80). This module subclasses
+# FlashAttentionForwardSm80 and overrides the SMEM capacity check accordingly.
+import cutlass
+import cutlass.utils as utils_basic
+from .flash_fwd import FlashAttentionForwardSm80
+class FlashAttentionForwardSm120(FlashAttentionForwardSm80):
+    # Keep arch = 80 to use CpAsync code paths (no TMA for output).
+    # The compilation target is determined by the GPU at compile time, not this field.
+    arch = 80
+    @staticmethod
+    def can_implement(
+        dtype,
+        head_dim,
+        head_dim_v,
+        tile_m,
+        tile_n,
+        num_stages,
+        num_threads,
+        is_causal,
+        Q_in_regs=False,
+    ) -> bool:
+        """Check if the kernel can be implemented on SM120.
+        Same logic as SM80 but uses SM120's shared memory capacity (99 KB).
+        """
+        if dtype not in [cutlass.Float16, cutlass.BFloat16]:
+            return False
+        if head_dim % 8 != 0:
+            return False
+        if head_dim_v % 8 != 0:
+            return False
+        if tile_n % 16 != 0:
+            return False
+        if num_threads % 32 != 0:
+            return False
+        # Shared memory usage: Q tile + (K tile + V tile)
+        smem_usage_Q = tile_m * head_dim * 2
+        smem_usage_K = tile_n * head_dim * num_stages * 2
+        smem_usage_V = tile_n * head_dim_v * num_stages * 2
+        smem_usage_QV = (
+            (smem_usage_Q + smem_usage_V) if not Q_in_regs else max(smem_usage_Q, smem_usage_V)
+        )
+        smem_usage = smem_usage_QV + smem_usage_K
+        # SM120 has 99 KB shared memory (vs 163 KB on SM80)
+        smem_capacity = utils_basic.get_smem_capacity_in_bytes("sm_120")
+        if smem_usage > smem_capacity:
+            return False
+        if (tile_m * 2) % num_threads != 0:
+            return False
+        return True

build/torch-cuda/flash_fwd_sm90.py ADDED Viewed

	@@ -0,0 +1,1534 @@

+# Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+# SM90 (Hopper) forward pass for flash attention, extracted from flash_fwd.py.
+from types import SimpleNamespace
+from typing import Callable, Literal, Optional
+from functools import partial
+import cuda.bindings.driver as cuda
+import cutlass
+import cutlass.cute as cute
+from cutlass import Float32, Int32, const_expr
+from cutlass.cute.nvgpu import cpasync, warpgroup
+from cutlass.utils import LayoutEnum
+import cutlass.utils.hopper_helpers as sm90_utils_basic
+from cutlass import pipeline
+from cutlass.pipeline import pipeline_init_arrive, pipeline_init_wait
+from cutlass.base_dsl.arch import Arch
+from .quack import copy_utils
+from .quack import layout_utils
+from .quack import sm90_utils
+from .cute_dsl_utils import assume_tensor_aligned
+from . import utils
+from .mask import AttentionMask
+from .softmax import Softmax, apply_score_mod_inner
+from .seqlen_info import SeqlenInfoQK
+from .block_info import BlockInfo
+from .block_sparsity import BlockSparseTensors
+from .block_sparse_utils import (
+    produce_block_sparse_loads,
+    consume_block_sparse_loads,
+)
+from . import pipeline as pipeline_custom
+from .pack_gqa import PackGQA, pack_gqa_layout, make_packgqa_tiled_tma_atom
+from .paged_kv import PagedKVManager
+from .named_barrier import NamedBarrierFwd
+from .quack.cute_dsl_utils import ParamsBase
+from .tile_scheduler import (
+    TileSchedulerArguments,
+    SingleTileScheduler,
+    SingleTileLPTScheduler,
+    SingleTileVarlenScheduler,
+)
+from cutlass.cute import FastDivmodDivisor
+from .flash_fwd import FlashAttentionForwardBase
+class FlashAttentionForwardSm90(FlashAttentionForwardBase):
+    def __init__(
+        self,
+        *args,
+        intra_wg_overlap: bool = True,
+        mma_pv_is_rs: bool = True,
+        paged_kv_non_tma: bool = False,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.intra_wg_overlap = intra_wg_overlap
+        self.mma_pv_is_rs = mma_pv_is_rs
+        self.buffer_align_bytes = 1024
+        self.use_tma_KV = not paged_kv_non_tma
+        assert self.use_tma_KV or not (self.check_hdim_oob or self.check_hdim_v_oob), (
+            "Paged KV does not support irregular head dim"
+        )
+        self.cluster_shape_mn = (1, 1)
+        assert self.arch >= Arch.sm_90 and self.arch <= Arch.sm_90a, "Only SM 9.x is supported"
+    def _get_smem_layout_atom(self):
+        sQ_layout_atom = warpgroup.make_smem_layout_atom(
+            sm90_utils_basic.get_smem_layout_atom(LayoutEnum.ROW_MAJOR, self.dtype, self.tile_hdim),
+            self.dtype,
+        )
+        sK_layout_atom = sQ_layout_atom
+        sV_layout_atom = warpgroup.make_smem_layout_atom(
+            sm90_utils_basic.get_smem_layout_atom(
+                LayoutEnum.ROW_MAJOR, self.dtype, self.tile_hdimv
+            ),
+            self.dtype,
+        )
+        sO_layout_atom = sV_layout_atom
+        if not self.mma_pv_is_rs:
+            sP_layout_atom = warpgroup.make_smem_layout_atom(
+                sm90_utils_basic.get_smem_layout_atom(
+                    LayoutEnum.ROW_MAJOR, self.dtype, self.tile_n
+                ),
+                self.dtype,
+            )
+        else:
+            sP_layout_atom = None
+        return sQ_layout_atom, sK_layout_atom, sV_layout_atom, sO_layout_atom, sP_layout_atom
+    def _get_tiled_mma(self):
+        tiled_mma_qk = sm90_utils_basic.make_trivial_tiled_mma(
+            self.dtype,
+            self.dtype,
+            warpgroup.OperandMajorMode.K,
+            warpgroup.OperandMajorMode.K,
+            Float32,
+            atom_layout_mnk=(self.tile_m // 64, 1, 1),
+            tiler_mn=(64, self.tile_n),
+        )
+        tiled_mma_pv = sm90_utils_basic.make_trivial_tiled_mma(
+            self.dtype,
+            self.dtype,
+            warpgroup.OperandMajorMode.K,
+            warpgroup.OperandMajorMode.MN,
+            Float32,
+            atom_layout_mnk=(self.tile_m // 64, 1, 1),  # Might need (1, 2, 1) for hdim 512
+            tiler_mn=(64, self.tile_hdimv),
+            a_source=warpgroup.OperandSource.RMEM
+            if self.mma_pv_is_rs
+            else warpgroup.OperandSource.SMEM,
+        )
+        return tiled_mma_qk, tiled_mma_pv
+    def _get_shared_storage_cls(self):
+        sQ_struct, sK_struct, sV_struct = [
+            cute.struct.Align[
+                cute.struct.MemRange[self.dtype, cute.cosize(layout)], self.buffer_align_bytes
+            ]
+            for layout in (self.sQ_layout, self.sK_layout, self.sV_layout)
+        ]
+        cosize_sQV = max(cute.cosize(self.sQ_layout), cute.cosize(self.sV_layout))
+        sQV_struct = cute.struct.Align[cute.struct.MemRange[self.dtype, cosize_sQV], 1024]
+        cosize_sP = cute.cosize(self.sP_layout) if const_expr(self.sP_layout is not None) else 0
+        sP_struct = cute.struct.Align[cute.struct.MemRange[self.dtype, cosize_sP], 1024]
+        # 1 stage * 2 for Q pipeline (full + empty), self.num_stages*2 for K, self.num_stages*2 for V,
+        mbar_ptr_Q_struct = cute.struct.MemRange[cutlass.Int64, 1 * 2]
+        mbar_ptr_K_struct = cute.struct.MemRange[cutlass.Int64, self.num_stages * 2]
+        mbar_ptr_V_struct = cute.struct.MemRange[cutlass.Int64, self.num_stages * 2]
+        @cute.struct
+        class SharedStorageQKV:
+            mbar_ptr_Q: mbar_ptr_Q_struct
+            mbar_ptr_K: mbar_ptr_K_struct
+            mbar_ptr_V: mbar_ptr_V_struct
+            sV: sV_struct
+            sQ: sQ_struct
+            sK: sK_struct
+            sP: sP_struct
+        @cute.struct
+        class SharedStorageSharedQV:
+            mbar_ptr_Q: mbar_ptr_Q_struct
+            mbar_ptr_K: mbar_ptr_K_struct
+            mbar_ptr_V: mbar_ptr_V_struct
+            sQ: sQV_struct
+            sK: sK_struct
+            sP: sP_struct
+        return SharedStorageQKV if const_expr(not self.Q_in_regs) else SharedStorageSharedQV
+    @cute.jit
+    def __call__(
+        self,
+        mQ: cute.Tensor,  # (b, s_q, h, d) or (total_q, h, d) if there is cu_seqlens_q
+        mK: cute.Tensor,  # (b_k, s_k, h_k, d) or (total_k, h_k, d) if there is cu_seqlens_k or (num_pages, page_size, h_k, d) if there is page_table
+        mV: cute.Tensor,  # (b_k, s_k, h_k, dv) or (total_k, h_k, dv) if there is cu_seqlens_k or (num_pages, page_size, h_k, dv) if there is page_table
+        mO: cute.Tensor,  # (b, s_q, h, dv) or (total_q, h, dv) if there is cu_seqlens_q
+        mLSE: Optional[cute.Tensor],
+        softmax_scale: Float32,
+        mCuSeqlensQ: Optional[cute.Tensor] = None,
+        mCuSeqlensK: Optional[cute.Tensor] = None,
+        mSeqUsedQ: Optional[cute.Tensor] = None,
+        mSeqUsedK: Optional[cute.Tensor] = None,
+        mPageTable: Optional[cute.Tensor] = None,  # (b_k, max_num_pages_per_seq)
+        window_size_left: Int32 | int | None = None,
+        window_size_right: Int32 | int | None = None,
+        learnable_sink: Optional[cute.Tensor] = None,
+        blocksparse_tensors: Optional[BlockSparseTensors] = None,
+        aux_tensors: Optional[list] = None,
+        # Always keep stream as the last parameter (EnvStream: obtained implicitly via TVM FFI).
+        stream: cuda.CUstream = None,
+    ):
+        """Configures and launches the flash attention kernel.
+        mQ/mK/mV/mO have the same data type (fp16 or bf16) and the same layout:
+        (batch_size, seqlen_q, num_head, head_dim):(_, _, _, 1)
+        """
+        self._check_type(
+            *(
+                t.element_type if t is not None else None
+                for t in (mQ, mK, mV, mO, mLSE, mCuSeqlensQ, mCuSeqlensK, mSeqUsedQ, mSeqUsedK)
+            )
+        )
+        self.varlen_q = mCuSeqlensQ is not None or mSeqUsedQ is not None
+        mQ, mK, mV, mO = [assume_tensor_aligned(t) for t in (mQ, mK, mV, mO)]
+        QO_layout_transpose = [1, 3, 2, 0] if const_expr(mCuSeqlensQ is None) else [0, 2, 1]
+        mQ, mO = [layout_utils.select(t, QO_layout_transpose) for t in (mQ, mO)]
+        KV_layout_transpose = [1, 3, 2, 0] if const_expr(mCuSeqlensK is None) else [0, 2, 1]
+        mK, mV = [layout_utils.select(t, KV_layout_transpose) for t in (mK, mV)]
+        LSE_layout_transpose = [2, 1, 0] if const_expr(mCuSeqlensQ is None) else [1, 0]
+        mLSE = (
+            layout_utils.select(mLSE, LSE_layout_transpose)
+            if const_expr(mLSE is not None)
+            else None
+        )
+        tiled_mma_qk, tiled_mma_pv = self._get_tiled_mma()
+        self.num_mma_threads = tiled_mma_qk.size
+        self.num_threads_per_warp_group = 128
+        self.num_wg_mma = self.num_mma_threads // self.num_threads_per_warp_group
+        assert self.num_wg_mma in [1, 2, 3]
+        self.num_threads = self.num_threads_per_warp_group * (self.num_wg_mma + 1)
+        self.num_producer_threads = 32
+        self.num_Q_load_threads = self.num_threads_per_warp_group  # If not TMA_Q
+        self.num_epilogue_threads = self.num_mma_threads
+        self.num_mma_regs, self.num_producer_regs = {1: (256, 56), 2: (240, 24), 3: (160, 32)}[
+            self.num_wg_mma
+        ]
+        self.use_block_sparsity = cutlass.const_expr(blocksparse_tensors is not None)
+        self.use_scheduler_barrier = (
+            (self.num_wg_mma >= 2 and self.tile_hdim <= 128)
+            if const_expr(self.intra_wg_overlap)
+            else (self.num_wg_mma == 2)
+        )
+        self.use_tma_Q = self.arch >= Arch.sm_90 and not (
+            self.pack_gqa and self.tile_m % self.qhead_per_kvhead != 0
+        )
+        self.use_tma_O = self.use_tma_Q
+        # Producer needs more registers when doing cp.async Q or KV loads
+        if const_expr(self.num_wg_mma == 2 and (not self.use_tma_Q or not self.use_tma_KV)):
+            self.num_mma_regs, self.num_producer_regs = 224, 40
+        self.rescale_O_before_gemm = self.tile_hdimv > 128 and self.intra_wg_overlap
+        self._setup_attributes()
+        # TODO: we prob don't need most of what's in _setup_attributes
+        self.sQ_layout, self.sK_layout, self.sV_layout, self.sO_layout = [
+            sm90_utils.make_smem_layout(mX.element_type, LayoutEnum.ROW_MAJOR, shape, stage)
+            for mX, shape, stage in [
+                (mQ, (self.tile_m, self.tile_hdim), None),
+                (mK, (self.tile_n, self.tile_hdim), self.num_stages),
+                (mV, (self.tile_n, self.tile_hdimv), self.num_stages),
+                (mO, (self.tile_m, self.tile_hdimv), None),
+            ]
+        ]
+        self.sP_layout = None
+        if const_expr(not self.mma_pv_is_rs):
+            self.sP_layout = sm90_utils.make_smem_layout(
+                mV.element_type, LayoutEnum.ROW_MAJOR, (self.tile_m, self.tile_n)
+            )
+        SharedStorage = self._get_shared_storage_cls()
+        mQ_og, mO_og = mQ, mO
+        if const_expr(self.pack_gqa):
+            nheads_kv = mK.shape[2]
+            mQ = pack_gqa_layout(mQ, self.qhead_per_kvhead, nheads_kv, head_idx=2)
+            mO = pack_gqa_layout(mO, self.qhead_per_kvhead, nheads_kv, head_idx=2)
+            if const_expr(mLSE is not None):
+                mLSE = pack_gqa_layout(mLSE, self.qhead_per_kvhead, nheads_kv, head_idx=1)
+        # TMA
+        gmem_tiled_copy_Q = cpasync.CopyBulkTensorTileG2SOp()
+        gmem_tiled_copy_KV = cpasync.CopyBulkTensorTileG2SOp()  # Might multicast
+        gmem_tiled_copy_O = cpasync.CopyBulkTensorTileS2GOp()
+        self.tma_copy_bytes = {
+            name: cute.size_in_bytes(mX.element_type, cute.select(layout, mode=[0, 1]))
+            for name, mX, layout in [
+                ("Q", mQ, self.sQ_layout),
+                ("K", mK, self.sK_layout),
+                ("V", mV, self.sV_layout),
+            ]
+        }
+        make_tiled_tma_atom_fn = (
+            partial(make_packgqa_tiled_tma_atom, qhead_per_kvhead=self.qhead_per_kvhead, head_idx=2)
+            if const_expr(self.pack_gqa)
+            else cpasync.make_tiled_tma_atom
+        )
+        tma_atom_Q, tma_tensor_Q = None, None
+        if const_expr(self.use_tma_Q):
+            tma_atom_Q, tma_tensor_Q = make_tiled_tma_atom_fn(
+                gmem_tiled_copy_Q,
+                mQ_og if const_expr(self.pack_gqa) else mQ,
+                self.sQ_layout,
+                (self.tile_m, self.tile_hdim),  # No mcast
+            )
+        tma_atom_K, tma_tensor_K = None, None
+        tma_atom_V, tma_tensor_V = None, None
+        if const_expr(self.use_tma_KV):
+            tma_atom_K, tma_tensor_K = cpasync.make_tiled_tma_atom(
+                gmem_tiled_copy_KV,
+                mK,
+                cute.select(self.sK_layout, mode=[0, 1]),
+                (self.tile_n, self.tile_hdim),
+                1,  # No mcast for now
+            )
+            tma_atom_V, tma_tensor_V = cpasync.make_tiled_tma_atom(
+                gmem_tiled_copy_KV,
+                mV,
+                cute.select(self.sV_layout, mode=[0, 1]),
+                (self.tile_n, self.tile_hdimv),
+                1,  # No mcast for now
+            )
+        tma_atom_O, tma_tensor_O = None, None
+        if const_expr(self.use_tma_O):
+            mO_tma = mO_og if const_expr(self.pack_gqa) else mO
+            if const_expr(self.varlen_q):
+                mO_tma = copy_utils.create_ragged_tensor_for_tma(
+                    mO_tma, ragged_dim=0, ptr_shift=True
+                )
+            tma_atom_O, tma_tensor_O = make_tiled_tma_atom_fn(
+                gmem_tiled_copy_O,
+                mO_tma,
+                self.sO_layout,
+                (self.tile_m, self.tile_hdimv),  # No mcast
+            )
+        if const_expr(mCuSeqlensQ is not None or mSeqUsedQ is not None):
+            TileScheduler = SingleTileVarlenScheduler
+        else:
+            TileScheduler = (
+                SingleTileScheduler
+                if const_expr(not self.is_causal or self.is_local)
+                else SingleTileLPTScheduler
+            )
+        tile_sched_args = TileSchedulerArguments(
+            cute.ceil_div(cute.size(mQ.shape[0]), self.tile_m),
+            cute.size(mQ.shape[2]),
+            cute.size(mQ.shape[3])
+            if const_expr(mCuSeqlensQ is None)
+            else cute.size(mCuSeqlensQ.shape[0] - 1),
+            1,  # num_splits
+            cute.size(mK.shape[0])
+            if const_expr(mPageTable is None)
+            else mK.shape[0] * mPageTable.shape[1],
+            mQ.shape[1],
+            mV.shape[1],
+            total_q=cute.size(mQ.shape[0])
+            if const_expr(mCuSeqlensQ is not None)
+            else cute.size(mQ.shape[0]) * cute.size(mQ.shape[3]),
+            tile_shape_mn=(self.tile_m, self.tile_n),
+            mCuSeqlensQ=mCuSeqlensQ,
+            mSeqUsedQ=mSeqUsedQ,
+            qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
+            element_size=self.dtype.width // 8,
+            is_persistent=False,
+            lpt=self.is_causal or self.is_local,
+        )
+        tile_sched_params = TileScheduler.to_underlying_arguments(tile_sched_args)
+        grid_dim = TileScheduler.get_grid_shape(tile_sched_params)
+        softmax_scale_log2, softmax_scale = utils.compute_softmax_scale_log2(
+            softmax_scale, self.score_mod
+        )
+        window_size_left = Int32(window_size_left) if window_size_left is not None else None
+        window_size_right = Int32(window_size_right) if window_size_right is not None else None
+        fastdiv_mods = utils.compute_fastdiv_mods(
+            mQ, mK, self.qhead_per_kvhead, self.pack_gqa, aux_tensors, mPageTable
+        )
+        self.kernel(
+            tma_tensor_Q if const_expr(self.use_tma_Q) else mQ,
+            tma_tensor_K if const_expr(self.use_tma_KV) else mK,
+            tma_tensor_V if const_expr(self.use_tma_KV) else mV,
+            tma_tensor_O if const_expr(self.use_tma_O) else mO,
+            mLSE,
+            mCuSeqlensQ,
+            mCuSeqlensK,
+            mSeqUsedQ,
+            mSeqUsedK,
+            mPageTable,
+            tma_atom_Q,
+            tma_atom_K,
+            tma_atom_V,
+            tma_atom_O,
+            softmax_scale_log2,
+            softmax_scale,
+            window_size_left,
+            window_size_right,
+            learnable_sink,
+            blocksparse_tensors,
+            self.sQ_layout,
+            self.sK_layout,
+            self.sV_layout,
+            self.sO_layout,
+            self.sP_layout,
+            self.gmem_tiled_copy_Q,
+            self.gmem_tiled_copy_K,
+            self.gmem_tiled_copy_V,
+            self.gmem_tiled_copy_O,
+            tiled_mma_qk,
+            tiled_mma_pv,
+            tile_sched_params,
+            TileScheduler,
+            SharedStorage,
+            aux_tensors,
+            fastdiv_mods,
+        ).launch(
+            grid=grid_dim,
+            block=[self.num_threads, 1, 1],
+            stream=stream,
+            min_blocks_per_mp=1,
+        )
+    @cute.kernel
+    def kernel(
+        self,
+        mQ: cute.Tensor,
+        mK: cute.Tensor,
+        mV: cute.Tensor,
+        mO: cute.Tensor,
+        mLSE: Optional[cute.Tensor],
+        mCuSeqlensQ: Optional[cute.Tensor],
+        mCuSeqlensK: Optional[cute.Tensor],
+        mSeqUsedQ: Optional[cute.Tensor],
+        mSeqUsedK: Optional[cute.Tensor],
+        mPageTable: Optional[cute.Tensor],
+        tma_atom_Q: Optional[cute.CopyAtom],
+        tma_atom_K: Optional[cute.CopyAtom],
+        tma_atom_V: Optional[cute.CopyAtom],
+        tma_atom_O: Optional[cute.CopyAtom],
+        softmax_scale_log2: Float32,
+        softmax_scale: Optional[Float32],
+        window_size_left: Optional[Int32],
+        window_size_right: Optional[Int32],
+        learnable_sink: Optional[cute.Tensor],
+        blocksparse_tensors: Optional[BlockSparseTensors],
+        sQ_layout: cute.ComposedLayout,
+        sK_layout: cute.ComposedLayout,
+        sV_layout: cute.ComposedLayout,
+        sO_layout: cute.ComposedLayout,
+        sP_layout: cute.ComposedLayout | None,
+        gmem_tiled_copy_Q: cute.TiledCopy,
+        gmem_tiled_copy_K: cute.TiledCopy,
+        gmem_tiled_copy_V: cute.TiledCopy,
+        gmem_tiled_copy_O: cute.TiledCopy,
+        tiled_mma_qk: cute.TiledMma,
+        tiled_mma_pv: cute.TiledMma,
+        tile_sched_params: ParamsBase,
+        TileScheduler: cutlass.Constexpr[Callable],
+        SharedStorage: cutlass.Constexpr[Callable],
+        # NOTE(review): the `=` below makes the typing object the *default value* of
+        # `aux_tensors` rather than its annotation; `aux_tensors: Optional[list] = None`
+        # looks like what was intended. The launcher passes this argument positionally,
+        # so the default is not relied on there. Confirm before changing.
+        aux_tensors=Optional[list[cute.Tensor]],
+        fastdiv_mods=None,
+    ):
+        """Device entry point: set up shared memory and pipelines, then dispatch warp roles.
+
+        Warps with ``warp_idx < 4`` run :meth:`load` (producer); every other warp runs
+        :meth:`mma` (consumer).
+
+        Args:
+            mQ, mK, mV, mO: Q/K/V/O tensors. The launcher passes the TMA tensor in place
+                of the plain tensor when the corresponding TMA path is enabled.
+            mLSE: Optional tensor forwarded to :meth:`mma`.
+            mCuSeqlensQ, mCuSeqlensK, mSeqUsedQ, mSeqUsedK: Optional tensors forwarded to
+                ``SeqlenInfoQK.create``.
+            mPageTable: Optional page table. When present, the static K length is
+                ``mK.shape[0] * mPageTable.shape[1]``.
+            tma_atom_Q, tma_atom_K, tma_atom_V, tma_atom_O: Optional TMA copy atoms. Warp 0
+                prefetches the descriptor of each atom that is not ``None``.
+            softmax_scale_log2, softmax_scale: Forwarded to :meth:`mma`.
+            window_size_left, window_size_right: Forwarded to ``BlockInfo`` and
+                ``AttentionMask``.
+            learnable_sink: Forwarded to :meth:`mma`.
+            blocksparse_tensors: Forwarded to both :meth:`load` and :meth:`mma`.
+            sQ_layout, sK_layout, sV_layout, sO_layout, sP_layout: Composed layouts whose
+                ``outer`` / ``inner`` parts are used to build the shared-memory tensors.
+                ``sP_layout`` may be ``None``, in which case no ``sP`` tensor is created.
+            gmem_tiled_copy_Q: Forwarded to :meth:`load`.
+            gmem_tiled_copy_K, gmem_tiled_copy_V: Accepted but not referenced in this body.
+            gmem_tiled_copy_O: Forwarded to :meth:`mma`.
+            tiled_mma_qk, tiled_mma_pv: Tiled MMAs forwarded to :meth:`mma`.
+            tile_sched_params, TileScheduler: Bound together into the scheduler factory
+                handed to both roles.
+            SharedStorage: Storage type allocated from the ``SmemAllocator``.
+            aux_tensors, fastdiv_mods: Forwarded to :meth:`mma`.
+        """
+        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
+        # Prefetch tma descriptor
+        # Only warp 0 does this; atoms that are None (non-TMA paths) are skipped.
+        if warp_idx == 0:
+            for tma_atom in (tma_atom_Q, tma_atom_K, tma_atom_V, tma_atom_O):
+                if const_expr(tma_atom is not None):
+                    cpasync.prefetch_descriptor(tma_atom)
+        smem = cutlass.utils.SmemAllocator()
+        storage = smem.allocate(SharedStorage)
+        # Mbarrier / pipeline init
+        mbar_ptr_Q = storage.mbar_ptr_Q.data_ptr()
+        # Participant groups used as producer / consumer descriptors for the pipelines below:
+        # a group of 1 for TMA producers, num_threads_per_warp_group for cp.async producers,
+        # and num_mma_threads // WARP_SIZE for the consumers.
+        ThreadCooperativeGroup = partial(pipeline.CooperativeGroup, pipeline.Agent.Thread)
+        tma_warp = ThreadCooperativeGroup(1)
+        load_threads = ThreadCooperativeGroup(self.num_threads_per_warp_group)
+        mma_warps = ThreadCooperativeGroup(self.num_mma_threads // cute.arch.WARP_SIZE)
+        # Q pipeline: a single stage; TMA variant when use_tma_Q, cp.async variant otherwise.
+        if const_expr(self.use_tma_Q):
+            pipeline_q = pipeline_custom.PipelineTmaAsync.create(
+                barrier_storage=mbar_ptr_Q,
+                num_stages=1,
+                producer_group=tma_warp,
+                consumer_group=mma_warps,
+                tx_count=self.tma_copy_bytes["Q"],
+                defer_sync=True,
+            )
+        else:
+            pipeline_q = pipeline_custom.PipelineCpAsync.create(
+                barrier_storage=mbar_ptr_Q,
+                num_stages=1,
+                producer_group=load_threads,
+                consumer_group=mma_warps,
+                defer_sync=True,
+                elect_one_release=True,
+                syncwarp_before_release=False,
+            )
+        # K and V pipelines: num_stages each, with the same TMA / cp.async split keyed on
+        # use_tma_KV.
+        if const_expr(self.use_tma_KV):
+            pipeline_k = pipeline_custom.PipelineTmaAsync.create(
+                barrier_storage=storage.mbar_ptr_K.data_ptr(),
+                num_stages=self.num_stages,
+                producer_group=tma_warp,
+                consumer_group=mma_warps,
+                tx_count=self.tma_copy_bytes["K"],
+                defer_sync=True,
+            )
+            pipeline_v = pipeline_custom.PipelineTmaAsync.create(
+                barrier_storage=storage.mbar_ptr_V.data_ptr(),
+                num_stages=self.num_stages,
+                producer_group=tma_warp,
+                consumer_group=mma_warps,
+                tx_count=self.tma_copy_bytes["V"],
+                defer_sync=True,
+            )
+        else:
+            pipeline_k = pipeline_custom.PipelineCpAsync.create(
+                barrier_storage=storage.mbar_ptr_K.data_ptr(),
+                num_stages=self.num_stages,
+                producer_group=load_threads,
+                consumer_group=mma_warps,
+                defer_sync=True,
+                elect_one_release=True,
+                syncwarp_before_release=False,
+            )
+            pipeline_v = pipeline_custom.PipelineCpAsync.create(
+                barrier_storage=storage.mbar_ptr_V.data_ptr(),
+                num_stages=self.num_stages,
+                producer_group=load_threads,
+                consumer_group=mma_warps,
+                defer_sync=True,
+                elect_one_release=True,
+                syncwarp_before_release=False,
+            )
+        # Cluster arrive after barrier init
+        # (every pipeline above was created with defer_sync=True; the matching wait is
+        # pipeline_init_wait further down)
+        pipeline_init_arrive(cluster_shape_mn=self.cluster_shape_mn, is_relaxed=True)
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Get shared memory buffer
+        # ///////////////////////////////////////////////////////////////////////////////
+        sQ = storage.sQ.get_tensor(sQ_layout.outer, swizzle=sQ_layout.inner)
+        sK = storage.sK.get_tensor(sK_layout.outer, swizzle=sK_layout.inner)
+        # When Q_in_regs is set, sV is a view over storage.sQ's buffer (reinterpreted with
+        # V's element type) instead of storage.sV.
+        if const_expr(not self.Q_in_regs):
+            sV = storage.sV.get_tensor(sV_layout.outer, swizzle=sV_layout.inner)
+        else:
+            sV = storage.sQ.get_tensor(
+                sV_layout.outer, swizzle=sV_layout.inner, dtype=mV.element_type
+            )
+        # Transpose view of V to tensor with layout (head_dim_v, tile_n) for tiled mma
+        sVt = layout_utils.transpose_view(sV)
+        sP = None
+        if const_expr(sP_layout is not None):
+            sP = storage.sP.get_tensor(sP_layout.outer, swizzle=sP_layout.inner)
+        # reuse sQ's data iterator
+        sO = storage.sQ.get_tensor(sO_layout.outer, swizzle=sO_layout.inner, dtype=self.dtype)
+        block_info = BlockInfo(
+            self.tile_m,
+            self.tile_n,
+            self.is_causal,
+            self.is_local,
+            False,  # is_split_kv
+            window_size_left,
+            window_size_right,
+            qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
+        )
+        # With pack_gqa, mQ.shape[0] is itself nested and its element [1] is used as the
+        # static Q length — presumably (qhead_per_kvhead, seqlen_q); confirm against
+        # pack_gqa_layout.
+        SeqlenInfoCls = partial(
+            SeqlenInfoQK.create,
+            seqlen_q_static=mQ.shape[0] if const_expr(not self.pack_gqa) else mQ.shape[0][1],
+            seqlen_k_static=mK.shape[0]
+            if const_expr(mPageTable is None)
+            else mK.shape[0] * mPageTable.shape[1],
+            mCuSeqlensQ=mCuSeqlensQ,
+            mCuSeqlensK=mCuSeqlensK,
+            mSeqUsedQ=mSeqUsedQ,
+            mSeqUsedK=mSeqUsedK,
+            # Don't need to pass in tile_mn because we won't access offset_padded
+        )
+        AttentionMaskCls = partial(
+            AttentionMask,
+            self.tile_m,
+            self.tile_n,
+            window_size_left=window_size_left,
+            window_size_right=window_size_right,
+            qhead_per_kvhead_packgqa=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
+        )
+        TileSchedulerCls = partial(TileScheduler.create, tile_sched_params)
+        # Cluster wait before starting
+        pipeline_init_wait(cluster_shape_mn=self.cluster_shape_mn)
+        if warp_idx < 4:  # Producer
+            cute.arch.setmaxregister_decrease(self.num_producer_regs)
+            self.load(
+                mQ,
+                mK,
+                mV,
+                sQ,
+                sK,
+                sV,
+                tma_atom_Q,
+                tma_atom_K,
+                tma_atom_V,
+                pipeline_k,
+                pipeline_v,
+                pipeline_q,
+                gmem_tiled_copy_Q,
+                mPageTable,
+                blocksparse_tensors,
+                block_info,
+                SeqlenInfoCls,
+                TileSchedulerCls,
+            )
+        else:  # Consumer
+            cute.arch.setmaxregister_increase(self.num_mma_regs)
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Tile MMA compute thread partitions and allocate accumulators
+            # ///////////////////////////////////////////////////////////////////////////////
+            tidx, _, _ = cute.arch.thread_idx()
+            # Rebase the thread index past the producer warps (warp_idx < 4) so consumer
+            # indices start at 0; 128 assumes 32 threads per warp — TODO confirm.
+            tidx = tidx - 128
+            self.mma(
+                tiled_mma_qk,
+                tiled_mma_pv,
+                mO,
+                mLSE,
+                sQ,
+                sK,
+                sVt,
+                sP,
+                sO,
+                learnable_sink,
+                pipeline_k,
+                pipeline_v,
+                pipeline_q,
+                gmem_tiled_copy_O,
+                tma_atom_O,
+                tidx,
+                softmax_scale_log2,
+                softmax_scale,
+                block_info,
+                SeqlenInfoCls,
+                AttentionMaskCls,
+                TileSchedulerCls,
+                blocksparse_tensors,
+                aux_tensors,
+                fastdiv_mods,
+            )
+    @cute.jit
+    def load(
+        self,
+        mQ: cute.Tensor,
+        mK: cute.Tensor,
+        mV: cute.Tensor,
+        sQ: cute.Tensor,
+        sK: cute.Tensor,
+        sV: cute.Tensor,
+        tma_atom_Q: Optional[cute.CopyAtom],
+        tma_atom_K: Optional[cute.CopyAtom],
+        tma_atom_V: Optional[cute.CopyAtom],
+        pipeline_k: pipeline.PipelineAsync,
+        pipeline_v: pipeline.PipelineAsync,
+        pipeline_q: pipeline.PipelineAsync,
+        gmem_tiled_copy_Q: cute.TiledCopy,
+        mPageTable: Optional[cute.Tensor],
+        blocksparse_tensors: Optional[BlockSparseTensors],
+        block_info: BlockInfo,
+        SeqlenInfoCls: Callable,
+        TileSchedulerCls: Callable,
+    ):
+        """Producer loop: load Q and stream K/V blocks into shared memory for each work tile.
+
+        For every valid work tile from the scheduler, Q is loaded once through
+        ``pipeline_q`` and K/V blocks are loaded through ``pipeline_k`` / ``pipeline_v``,
+        walking ``n_block`` downward from ``n_block_max - 1`` to ``n_block_min``. With block
+        sparsity enabled, the K/V schedule is delegated to ``produce_block_sparse_loads``.
+
+        Args:
+            mQ, mK, mV: Source tensors for Q, K and V.
+            sQ, sK, sV: Destination shared-memory tensors.
+            tma_atom_Q, tma_atom_K, tma_atom_V: TMA copy atoms, used only on the
+                corresponding TMA path.
+            pipeline_k, pipeline_v, pipeline_q: Pipelines on which this producer acquires
+                and commits stages.
+            gmem_tiled_copy_Q: Tiled copy used by ``PackGQA.load_Q`` when Q is not loaded
+                via TMA.
+            mPageTable: Optional page table. On the TMA KV path it supplies the per-block
+                page index; on the cp.async path it is handed to ``PagedKVManager``.
+            blocksparse_tensors: Forwarded to ``produce_block_sparse_loads`` when block
+                sparsity is enabled.
+            block_info: Provides ``get_n_block_min_max`` for the dense path.
+            SeqlenInfoCls: Factory called with ``batch_idx`` to get per-batch sequence info.
+            TileSchedulerCls: Factory for the tile scheduler driving the work loop.
+        """
+        warp_idx_in_wg = cute.arch.make_warp_uniform(cute.arch.warp_idx()) % 4
+        tidx, _, _ = cute.arch.thread_idx()
+        # TMA: only warp 0 loads. cp_async: all warps load.
+        # When not use_tma_Q, all 128 producer threads participate in Q loading.
+        is_load_warp = warp_idx_in_wg == 0 or const_expr(not self.use_tma_KV or not self.use_tma_Q)
+        # KV loading restricted to warp 0 for TMA, all warps for non-TMA KV
+        is_kv_load_warp = warp_idx_in_wg == 0 or const_expr(not self.use_tma_KV)
+        if is_load_warp:
+            # The Q producer tracks its phase by hand (single stage, index 0) and starts at
+            # 1 — presumably so the first acquire on a freshly initialised barrier does not
+            # block; confirm against the pipeline implementation.
+            q_producer_phase = Int32(1)
+            kv_producer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Producer, self.num_stages
+            )
+            tile_scheduler = TileSchedulerCls()
+            work_tile = tile_scheduler.initial_work_tile_info()
+            while work_tile.is_valid_tile:
+                # if work_tile.is_valid_tile:
+                m_block, head_idx, batch_idx, _ = work_tile.tile_idx
+                seqlen = SeqlenInfoCls(batch_idx)
+                mQ_cur = seqlen.offset_batch_Q(mQ, batch_idx, dim=3)[None, None, head_idx]
+                # Without pack_gqa the Q head index is mapped to its KV head by integer
+                # division; with pack_gqa head_idx is used as the KV head index directly.
+                head_idx_kv = (
+                    head_idx // self.qhead_per_kvhead if const_expr(not self.pack_gqa) else head_idx
+                )
+                load_Q = None
+                if const_expr(self.use_tma_Q):
+                    gQ = cute.local_tile(mQ_cur, (self.tile_m, self.tile_hdim), (m_block, 0))
+                    load_Q, _, _ = copy_utils.tma_get_copy_fn(
+                        tma_atom_Q, 0, cute.make_layout(1), gQ, sQ, single_stage=True
+                    )
+                paged_kv_manager = None
+                tma_load_K_fn = None
+                tma_load_V_fn = None
+                if const_expr(self.use_tma_KV):
+                    # === TMA path (non-paged and paged with page_size == n_block_size) ===
+                    if const_expr(mPageTable is not None):
+                        # Paged TMA: keep page dimension indexable
+                        mK_cur = mK[None, None, head_idx_kv, None]
+                        mV_cur = mV[None, None, head_idx_kv, None]
+                        gK = cute.local_tile(mK_cur, (self.tile_n, self.tile_hdim), (0, 0, None))
+                        gV = cute.local_tile(mV_cur, (self.tile_n, self.tile_hdimv), (0, 0, None))
+                    else:
+                        # Non-paged TMA
+                        mK_cur = seqlen.offset_batch_K(mK, batch_idx, dim=3)[
+                            None, None, head_idx_kv
+                        ]
+                        mV_cur = seqlen.offset_batch_K(mV, batch_idx, dim=3)[
+                            None, None, head_idx_kv
+                        ]
+                        gK = cute.local_tile(mK_cur, (self.tile_n, self.tile_hdim), (None, 0))
+                        gV = cute.local_tile(mV_cur, (self.tile_n, self.tile_hdimv), (None, 0))
+                    # TODO: mcast
+                    tma_load_K_fn, _, _ = copy_utils.tma_get_copy_fn(
+                        tma_atom_K, 0, cute.make_layout(1), gK, sK
+                    )
+                    tma_load_K_fn = copy_utils.tma_producer_copy_fn(tma_load_K_fn, pipeline_k)
+                    tma_load_V_fn, _, _ = copy_utils.tma_get_copy_fn(
+                        tma_atom_V, 0, cute.make_layout(1), gV, sV
+                    )
+                    tma_load_V_fn = copy_utils.tma_producer_copy_fn(tma_load_V_fn, pipeline_v)
+                else:
+                    # === cp_async path (paged KV with page_size != n_block_size) ===
+                    paged_kv_manager = PagedKVManager.create(
+                        mPageTable,
+                        mK,
+                        mV,
+                        FastDivmodDivisor(mK.shape[0]),
+                        batch_idx,
+                        head_idx_kv,
+                        tidx,
+                        seqlen.seqlen_k,
+                        0,  # leftpad_k
+                        self.tile_n,
+                        self.tile_hdim,
+                        self.tile_hdimv,
+                        self.num_threads_per_warp_group,
+                        mK.element_type,
+                        arch=self.arch.major * 10 + self.arch.minor,
+                    )
+                # Bind the per-tile arguments so the loops below only pass block, producer
+                # state and page index.
+                load_K = partial(
+                    self.load_KV,
+                    tma_load_K_fn,
+                    paged_kv_manager,
+                    sK,
+                    pipeline_kv=pipeline_k,
+                    K_or_V="K",
+                )
+                load_V = partial(
+                    self.load_KV,
+                    tma_load_V_fn,
+                    paged_kv_manager,
+                    sV,
+                    pipeline_kv=pipeline_v,
+                    K_or_V="V",
+                )
+                pack_gqa = None
+                if const_expr(not self.use_tma_Q):
+                    pack_gqa = PackGQA(
+                        self.tile_m, self.tile_hdim, self.check_hdim_oob, self.qhead_per_kvhead
+                    )
+                if const_expr(not self.use_block_sparsity):
+                    n_block_min, n_block_max = block_info.get_n_block_min_max(seqlen, m_block)
+                    # if cute.arch.thread_idx()[0] == 0:
+                    #     cute.printf("m_block = %d, n_block_min: %d, n_block_max: %d", m_block, n_block_min, n_block_max)
+                    # Clamp n_block to 0 when n_block_max == 0 (can happen with causal
+                    # + pack_gqa when seqlen_k < tile_n). TMA handles n_block=-1
+                    # gracefully (fills zeros), but cp.async would crash on
+                    # out-of-bounds page table access.
+                    n_block = (
+                        n_block_max - 1
+                        if const_expr(self.use_tma_KV)
+                        else cutlass.max(n_block_max - 1, 0)
+                    )
+                    page_idx = (
+                        mPageTable[batch_idx, n_block]
+                        if const_expr(mPageTable is not None and self.use_tma_KV)
+                        else None
+                    )
+                    # First iteration: load K on pipeline_k, Q on pipeline_q
+                    if is_kv_load_warp:
+                        pipeline_k.producer_acquire(kv_producer_state)
+                        if const_expr(not self.use_tma_KV):
+                            paged_kv_manager.load_page_table(n_block)
+                        load_K(block=n_block, producer_state=kv_producer_state, page_idx=page_idx)
+                    if const_expr(self.use_tma_Q):
+                        if warp_idx_in_wg == 0:
+                            pipeline_q.producer_acquire_w_index_phase(0, q_producer_phase)
+                            load_Q(tma_bar_ptr=pipeline_q.sync_object_full.get_barrier(0))
+                            q_producer_phase ^= 1
+                    else:
+                        pipeline_q.producer_acquire_w_index_phase(0, q_producer_phase)
+                        pack_gqa.load_Q(
+                            mQ_cur, sQ, gmem_tiled_copy_Q, tidx, m_block, seqlen.seqlen_q
+                        )
+                        cute.arch.cp_async_commit_group()
+                        pipeline_q.producer_commit_w_index(0)
+                        q_producer_phase ^= 1
+                    if is_kv_load_warp:
+                        if const_expr(not self.intra_wg_overlap or not self.use_tma_KV):
+                            # Lock-step order: K and V of the same block share one pipeline
+                            # state, which is advanced after V.
+                            pipeline_v.producer_acquire(kv_producer_state)
+                            load_V(
+                                block=n_block, producer_state=kv_producer_state, page_idx=page_idx
+                            )
+                            kv_producer_state.advance()
+                            for i in cutlass.range(n_block_max - 1 - n_block_min, unroll=1):
+                                n_block = n_block_max - 1 - i - 1
+                                page_idx = (
+                                    mPageTable[batch_idx, n_block]
+                                    if const_expr(mPageTable is not None and self.use_tma_KV)
+                                    else None
+                                )
+                                if const_expr(not self.use_tma_KV):
+                                    paged_kv_manager.load_page_table(n_block)
+                                pipeline_k.producer_acquire(kv_producer_state)
+                                load_K(
+                                    block=n_block,
+                                    producer_state=kv_producer_state,
+                                    page_idx=page_idx,
+                                )
+                                pipeline_v.producer_acquire(kv_producer_state)
+                                load_V(
+                                    block=n_block,
+                                    producer_state=kv_producer_state,
+                                    page_idx=page_idx,
+                                )
+                                kv_producer_state.advance()
+                        else:
+                            # Staggered order (intra_wg_overlap with TMA KV): K of block n is
+                            # issued on the advanced state before V of the previous block
+                            # (n + 1) is issued on the pre-advance state, so V trails K by
+                            # one pipeline state.
+                            for i in cutlass.range(n_block_max - 1 - n_block_min, unroll=1):
+                                n_block_prev = n_block_max - i - 1
+                                n_block = n_block_prev - 1
+                                page_idx = (
+                                    mPageTable[batch_idx, n_block]
+                                    if const_expr(mPageTable is not None)
+                                    else None
+                                )
+                                page_idx_prev = (
+                                    mPageTable[batch_idx, n_block_prev]
+                                    if const_expr(mPageTable is not None)
+                                    else None
+                                )
+                                kv_producer_state_prev = kv_producer_state.clone()
+                                kv_producer_state.advance()
+                                pipeline_k.producer_acquire(kv_producer_state)
+                                load_K(
+                                    block=n_block,
+                                    producer_state=kv_producer_state,
+                                    page_idx=page_idx,
+                                )
+                                pipeline_v.producer_acquire(kv_producer_state_prev)
+                                load_V(
+                                    block=n_block_prev,
+                                    producer_state=kv_producer_state_prev,
+                                    page_idx=page_idx_prev,
+                                )
+                            # V of the final block (n_block_min) has not been issued yet;
+                            # load it on the current state and then advance.
+                            n_block = n_block_min
+                            page_idx = (
+                                mPageTable[batch_idx, n_block]
+                                if const_expr(mPageTable is not None)
+                                else None
+                            )
+                            pipeline_v.producer_acquire(kv_producer_state)
+                            load_V(
+                                block=n_block, producer_state=kv_producer_state, page_idx=page_idx
+                            )
+                            kv_producer_state.advance()
+                else:
+                    # Block sparsity: use TMA closures directly (not paged)
+                    # Load Q on pipeline_q, separate from K/V pipeline
+                    if const_expr(self.use_tma_Q):
+                        if warp_idx_in_wg == 0:
+                            pipeline_q.producer_acquire_w_index_phase(0, q_producer_phase)
+                            load_Q(tma_bar_ptr=pipeline_q.sync_object_full.get_barrier(0))
+                            q_producer_phase ^= 1
+                    else:
+                        pipeline_q.producer_acquire_w_index_phase(0, q_producer_phase)
+                        pack_gqa.load_Q(
+                            mQ_cur, sQ, gmem_tiled_copy_Q, tidx, m_block, seqlen.seqlen_q
+                        )
+                        cute.arch.cp_async_commit_group()
+                        pipeline_q.producer_commit_w_index(0)
+                        q_producer_phase ^= 1
+                    if is_kv_load_warp:
+                        # The helper returns the producer state to carry into the next tile.
+                        kv_producer_state = produce_block_sparse_loads(
+                            blocksparse_tensors,
+                            batch_idx,
+                            head_idx,
+                            m_block,
+                            kv_producer_state,
+                            tma_load_K_fn,
+                            tma_load_V_fn,
+                            pipeline_k,
+                            pipeline_v,
+                            self.intra_wg_overlap,
+                            self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
+                            self.q_subtile_factor if self.q_subtile_factor is not None else 1,
+                        )
+                tile_scheduler.prefetch_next_work()
+                tile_scheduler.advance_to_next_work()
+                work_tile = tile_scheduler.get_current_work()
+                # End of persistent scheduler loop
+            # Producer tail is only useful for cluster to avoid early exit of blocks.
+            # We only need producer_tail on V since that's the last that's loaded, we don't
+            # need it for Q (no cluster) and K.
+            if is_kv_load_warp:
+                pipeline_v.producer_tail(kv_producer_state)
+    @cute.jit
+    def load_KV(
+        self,
+        tma_load_fn: Optional[Callable],
+        paged_kv_manager: Optional[PagedKVManager],
+        sX: cute.Tensor,
+        block: Int32,
+        pipeline_kv: pipeline.PipelineAsync,
+        producer_state: pipeline.PipelineState,
+        K_or_V: Literal["K", "V"],
+        page_idx: Optional[Int32] = None,
+    ):
+        if const_expr(self.use_tma_KV):
+            src_idx = block if const_expr(page_idx is None) else page_idx
+            tma_load_fn(src_idx=src_idx, producer_state=producer_state)
+        else:
+            paged_kv_manager.load_KV(block, sX[None, None, producer_state.index], K_or_V)
+            cute.arch.cp_async_commit_group()
+        pipeline_kv.producer_commit(producer_state)
+    @cute.jit
+    def mma(
+        self,
+        tiled_mma_qk: cute.TiledMma,
+        tiled_mma_pv: cute.TiledMma,
+        mO: cute.Tensor,
+        mLSE: Optional[cute.Tensor],
+        sQ: cute.Tensor,
+        sK: cute.Tensor,
+        sVt: cute.Tensor,
+        sP: Optional[cute.Tensor],
+        sO: cute.Tensor,
+        learnable_sink: Optional[cute.Tensor],
+        pipeline_k: pipeline.PipelineAsync,
+        pipeline_v: pipeline.PipelineAsync,
+        pipeline_q: pipeline.PipelineAsync,
+        gmem_tiled_copy_O: cute.TiledCopy,
+        tma_atom_O: Optional[cute.CopyAtom],
+        tidx: Int32,
+        softmax_scale_log2: Float32,
+        softmax_scale: Optional[Float32],
+        block_info: BlockInfo,
+        SeqlenInfoCls: Callable,
+        AttentionMaskCls: Callable,
+        TileSchedulerCls: Callable,
+        blocksparse_tensors: Optional[BlockSparseTensors],
+        aux_tensors: Optional[list],
+        fastdiv_mods=None,
+    ):
+        """Consumer-side main loop: run the attention MMAs for every scheduled work tile.
+
+        For each valid work tile handed out by ``TileSchedulerCls`` this method
+        waits for Q, walks the tile's n-blocks from ``n_block_max - 1`` downwards
+        (S = Q @ K^T, online softmax, O += P @ V via the per-block helpers), then
+        finalizes the softmax, rescales ``acc_O`` and calls ``self.epilogue``.
+
+        The per-block helper is ``mma_one_n_block_intrawg_overlap`` when
+        ``self.intra_wg_overlap`` is set, otherwise ``mma_one_n_block``. When
+        ``self.use_block_sparsity`` is set the n-block iteration is delegated to
+        ``consume_block_sparse_loads`` instead of the dense loops below.
+        """
+        warp_group_idx = cute.arch.make_warp_uniform(tidx // self.num_threads_per_warp_group)
+        warp_group_thread_layout = cute.make_layout(
+            self.num_wg_mma, stride=self.num_threads_per_warp_group
+        )
+        thr_mma_qk = tiled_mma_qk.get_slice(tidx)
+        wg_mma_qk = tiled_mma_qk.get_slice(warp_group_thread_layout(warp_group_idx))
+        wg_mma_pv = tiled_mma_pv.get_slice(warp_group_thread_layout(warp_group_idx))
+        _, tSrQ, tSrK = sm90_utils.partition_fragment_ABC(
+            wg_mma_qk, (self.tile_m, self.tile_n, self.tile_hdim), sQ, sK
+        )
+        # QK GEMM helper: the remaining arguments (stage index, wait count) are bound per call.
+        mma_qk_fn = partial(
+            sm90_utils.gemm_zero_init, tiled_mma_qk, (self.tile_m, self.tile_n), tSrQ, tSrK
+        )
+        acc_O, tOrP, tOrVt = sm90_utils.partition_fragment_ABC(
+            wg_mma_pv, (self.tile_m, self.tile_hdimv, self.tile_n), sP, sVt
+        )
+        # PV GEMM helper accumulating into acc_O.
+        mma_pv_fn = partial(sm90_utils.gemm_w_idx, tiled_mma_pv, acc_O, tOrP, tOrVt)
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Smem copy atom tiling
+        # ///////////////////////////////////////////////////////////////////////////////
+        smem_copy_atom_P = utils.get_smem_store_atom(
+            self.arch.major * 10 + self.arch.minor, self.dtype
+        )
+        smem_thr_copy_P = cute.make_tiled_copy_C(smem_copy_atom_P, tiled_mma_qk).get_slice(tidx)
+        # sP may be None, in which case there is no smem destination for P.
+        tPsP = smem_thr_copy_P.partition_D(sP) if const_expr(sP is not None) else None
+        smem_copy_params = SimpleNamespace(smem_thr_copy_P=smem_thr_copy_P, tPsP=tPsP)
+        self.mma_init()
+        # Q phase bit, flipped once per work tile (see the end of the loop body).
+        q_consumer_phase = Int32(0)
+        kv_consumer_state = pipeline.make_pipeline_state(
+            pipeline.PipelineUserType.Consumer, self.num_stages
+        )
+        tile_scheduler = TileSchedulerCls()
+        work_tile = tile_scheduler.initial_work_tile_info()
+        softmax = Softmax.create(
+            softmax_scale_log2,
+            num_rows=acc_O.shape[0][0] * acc_O.shape[1],
+            softmax_scale=softmax_scale,
+        )
+        # For RescaleOBeforeGemm: persistent scores_scale across iterations
+        scores_scale = None
+        if const_expr(self.rescale_O_before_gemm):
+            scores_scale = cute.make_rmem_tensor_like(softmax.row_max, Float32)
+        # Bind the tile-independent arguments of the per-n-block helper once, outside the tile loop.
+        mma_one_n_block_all = partial(
+            self.mma_one_n_block_intrawg_overlap
+            if const_expr(self.intra_wg_overlap)
+            else self.mma_one_n_block,
+            mma_qk_fn=mma_qk_fn,
+            pipeline_k=pipeline_k,
+            pipeline_v=pipeline_v,
+            acc_O=acc_O,
+            tOrP=tOrP,
+            smem_copy_params=smem_copy_params,
+            check_inf=True,
+            scores_scale=scores_scale,
+        )
+        process_first_half_block = partial(
+            self.first_half_block_overlap,
+            mma_qk_fn=mma_qk_fn,
+            pipeline_k=pipeline_k,
+            tOrP=tOrP,
+            smem_copy_params=smem_copy_params,
+            scores_scale=scores_scale,
+            softmax=softmax,
+            acc_O=acc_O,
+        )
+        process_last_half_block = partial(
+            self.last_half_block_overlap,
+            pipeline_v=pipeline_v,
+            mma_pv_fn=mma_pv_fn,
+            scores_scale=scores_scale,
+            softmax=softmax,
+            acc_O=acc_O,
+        )
+        while work_tile.is_valid_tile:
+            # if work_tile.is_valid_tile:
+            # shape: (atom_v_m * rest_m)
+            m_block, head_idx, batch_idx, _ = work_tile.tile_idx
+            seqlen = SeqlenInfoCls(batch_idx)
+            # Recompute fastdiv_mods if necessary for varlen with aux_tensors
+            recompute_fastdiv_mods_q = cutlass.const_expr(
+                aux_tensors is not None and (seqlen.has_cu_seqlens_q or seqlen.has_seqused_q)
+            )
+            recompute_fastdiv_mods_k = cutlass.const_expr(
+                aux_tensors is not None and (seqlen.has_cu_seqlens_k or seqlen.has_seqused_k)
+            )
+            if cutlass.const_expr(fastdiv_mods is not None):
+                seqlen_q_divmod, seqlen_k_divmod = fastdiv_mods
+                fastdiv_mods = (
+                    seqlen_q_divmod
+                    if not recompute_fastdiv_mods_q
+                    else FastDivmodDivisor(seqlen.seqlen_q),
+                    seqlen_k_divmod
+                    if not recompute_fastdiv_mods_k
+                    else FastDivmodDivisor(seqlen.seqlen_k),
+                )
+            mask = AttentionMaskCls(seqlen)
+            # mask_mod / mask_seqlen / n_block are supplied later, per call site.
+            mask_fn = partial(
+                mask.apply_mask,
+                batch_idx=batch_idx,
+                head_idx=head_idx,
+                m_block=m_block,
+                thr_mma=thr_mma_qk,
+                mask_causal=self.is_causal,
+                mask_local=self.is_local,
+                aux_tensors=aux_tensors,
+                fastdiv_mods=fastdiv_mods,
+            )
+            score_mod_fn = None
+            if const_expr(self.score_mod is not None):
+                score_mod_fn = partial(
+                    self.apply_score_mod,
+                    thr_mma_qk,
+                    batch_idx,
+                    head_idx,
+                    m_block,
+                    softmax_scale=softmax_scale,
+                    aux_tensors=aux_tensors,
+                    fastdiv_mods=fastdiv_mods,
+                )
+            mma_one_n_block = partial(
+                mma_one_n_block_all, seqlen=seqlen, softmax=softmax, score_mod_fn=score_mod_fn
+            )
+            n_block_min, n_block_max = block_info.get_n_block_min_max(seqlen, m_block)
+            # Wait for this tile's Q before issuing any QK GEMM.
+            pipeline_q.consumer_wait_w_index_phase(0, q_consumer_phase)
+            # For performance reasons, we separate out two kinds of iterations:
+            # those that need masking on S, and those that don't.
+            # We need masking on S for the very last block when the length of K and V
+            # is not a multiple of tile_n.
+            # We also need masking on S if it's causal, for the last several blocks.
+            # softmax.reset()  # Don't need reset as we explicitly call softmax w is_first=True
+            O_should_accumulate = False
+            # ==========================================
+            # MAINLOOP
+            # ==========================================
+            if const_expr(not self.use_block_sparsity):
+                # ==========================================
+                # No block-sparsity (original path)
+                # ==========================================
+                # First iteration with seqlen masking
+                if const_expr(self.intra_wg_overlap):
+                    # Overlap path: only the QK half of the first block runs here; its PV
+                    # GEMM is issued by the next n-block call or by the last half block.
+                    kv_consumer_state = process_first_half_block(
+                        n_block=n_block_max - 1,
+                        seqlen=seqlen,
+                        kv_consumer_state=kv_consumer_state,
+                        mask_fn=partial(mask_fn, mask_mod=self.mask_mod),
+                        score_mod_fn=score_mod_fn,
+                        is_first_block=True,
+                    )
+                else:
+                    self.warp_scheduler_barrier_sync()
+                    kv_consumer_state = mma_one_n_block(
+                        kv_consumer_state,
+                        n_block=n_block_max - 1,
+                        seqlen=seqlen,
+                        mma_pv_fn=partial(mma_pv_fn, zero_init=True),
+                        is_first_n_block=True,
+                        mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=True),
+                    )
+                    O_should_accumulate = True
+                # if cute.arch.thread_idx()[0] == 128: cute.printf("m_block = {}, n_block_max = {}, n_block_min = {}", m_block, n_block_max, n_block_min)
+                # The block at n_block_max - 1 was handled above.
+                n_block_max -= 1
+                # Next couple of iterations with causal masking
+                if const_expr(self.is_causal or self.is_local):
+                    n_block_min_causal_local_mask = block_info.get_n_block_min_causal_local_mask(
+                        seqlen, m_block, n_block_min
+                    )
+                    # if cute.arch.thread_idx()[0] == 128: cute.printf("n_block_min_causal_local_mask = {}", n_block_min_causal_local_mask)
+                    for n_tile in cutlass.range(
+                        n_block_max - n_block_min_causal_local_mask, unroll=1
+                    ):
+                        kv_consumer_state = mma_one_n_block(
+                            kv_consumer_state,
+                            n_block=n_block_max - 1 - n_tile,
+                            seqlen=seqlen,
+                            mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
+                            mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=False),
+                        )
+                        O_should_accumulate = True
+                    n_block_max = cutlass.min(n_block_max, n_block_min_causal_local_mask)
+                # The remaining iterations have no masking
+                # NOTE(review): mask_fn is still passed below (with mask_seqlen=False and
+                # self.mask_mod); whether it is a no-op here depends on apply_mask - confirm.
+                n_block_min_before_local_mask = block_info.get_n_block_min_before_local_mask(
+                    seqlen, m_block, n_block_min
+                )
+                # if cute.arch.thread_idx()[0] == 128: cute.printf("n_block_min_before_local_mask = {}, n_block_min = {}", n_block_min_before_local_mask, n_block_min)
+                for n_tile in cutlass.range(n_block_max - n_block_min_before_local_mask, unroll=1):
+                    kv_consumer_state = mma_one_n_block(
+                        kv_consumer_state,
+                        n_block=n_block_max - 1 - n_tile,
+                        seqlen=seqlen,
+                        mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
+                        mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=False),
+                    )
+                    O_should_accumulate = True
+                # Separate iterations with local masking on the left
+                if const_expr(self.is_local and block_info.window_size_left is not None):
+                    n_block_max = cutlass.min(n_block_max, n_block_min_before_local_mask)
+                    for n_tile in cutlass.range(n_block_max - n_block_min, unroll=1):
+                        kv_consumer_state = mma_one_n_block(
+                            kv_consumer_state,
+                            n_block=n_block_max - 1 - n_tile,
+                            seqlen=seqlen,
+                            mma_pv_fn=partial(mma_pv_fn, zero_init=not O_should_accumulate),
+                            mask_fn=partial(mask_fn, mask_mod=self.mask_mod, mask_seqlen=False),
+                        )
+                        O_should_accumulate = True
+                # Release Q pipeline so the producer can load the next tile's Q
+                pipeline_q.consumer_release_w_index(0)
+                # Last "half" iteration
+                if const_expr(self.intra_wg_overlap):
+                    # Issue the PV GEMM that is still pending from the last QK half.
+                    kv_consumer_state = process_last_half_block(
+                        kv_consumer_state=kv_consumer_state,
+                        zero_init=not O_should_accumulate,
+                    )
+                    O_should_accumulate = True
+                else:
+                    self.warp_scheduler_barrier_arrive()
+            else:
+                # ==========================================
+                # Block sparsity
+                # ==========================================
+                kv_consumer_state, O_should_accumulate, processed_any = consume_block_sparse_loads(
+                    blocksparse_tensors,
+                    batch_idx,
+                    head_idx,
+                    m_block,
+                    seqlen,
+                    kv_consumer_state,
+                    mma_pv_fn,
+                    mma_one_n_block,
+                    process_first_half_block,
+                    process_last_half_block,
+                    mask_fn,
+                    score_mod_fn,
+                    O_should_accumulate,
+                    self.mask_mod,
+                    fastdiv_mods,
+                    self.intra_wg_overlap,
+                    self.warp_scheduler_barrier_sync,
+                    self.warp_scheduler_barrier_arrive,
+                    self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
+                    self.q_subtile_factor if self.q_subtile_factor is not None else 1,
+                )
+                # Release Q pipeline so the producer can load the next tile's Q
+                pipeline_q.consumer_release_w_index(0)
+                # Handle empty case (when no blocks to process)
+                if not processed_any:
+                    softmax.reset()
+                    acc_O.fill(0.0)
+            # Flip the Q phase bit for the next work tile.
+            q_consumer_phase ^= 1
+            sink_val = None
+            if const_expr(learnable_sink is not None):
+                if const_expr(not self.pack_gqa):
+                    sink_val = Float32(learnable_sink[head_idx])
+                else:  # Each thread might have a different sink value due to different q_head
+                    sink_val = cute.make_rmem_tensor_like(softmax.row_max, Float32)
+                    cS = cute.make_identity_tensor((self.tile_m, self.tile_n))
+                    tScS_mn = layout_utils.reshape_acc_to_mn(thr_mma_qk.partition_C(cS))
+                    for r in cutlass.range(cute.size(sink_val), unroll_full=True):
+                        # With pack_gqa the q head is derived from the row index:
+                        # row % qhead_per_kvhead selects the head within the KV group.
+                        row = m_block * self.tile_m + tScS_mn[r][0]
+                        q_head_idx = row % self.qhead_per_kvhead + head_idx * self.qhead_per_kvhead
+                        sink_val[r] = Float32(learnable_sink[q_head_idx])
+            # normalize acc_O by row_sum and calculate the lse
+            row_scale = softmax.finalize(sink_val=sink_val)
+            softmax.rescale_O(acc_O, row_scale)
+            # ///////////////////////////////////////////////////////////////////////////////
+            # Epilogue
+            # ///////////////////////////////////////////////////////////////////////////////
+            self.epilogue(
+                acc_O,
+                softmax.row_sum,
+                mO,
+                mLSE,
+                sO,
+                seqlen,
+                gmem_tiled_copy_O,
+                tma_atom_O,
+                tiled_mma_pv,
+                tidx,
+                m_block,
+                head_idx,
+                batch_idx,
+            )
+            tile_scheduler.advance_to_next_work()
+            work_tile = tile_scheduler.get_current_work()
+    @cute.jit
+    def first_half_block_overlap(
+        self,
+        n_block: Int32,
+        mma_qk_fn: Callable,
+        kv_consumer_state,
+        pipeline_k,
+        tOrP: cute.Tensor,
+        smem_copy_params: SimpleNamespace,
+        softmax: Softmax,
+        seqlen: SeqlenInfoQK,
+        scores_scale: Optional[cute.Tensor] = None,
+        acc_O: Optional[cute.Tensor] = None,
+        mask_fn: Callable = None,
+        score_mod_fn: Optional[Callable] = None,
+        is_first_block: bool = False,
+    ):
+        """Processes the first half block when using intra-warpgroup-overlap
+
+        Runs only the QK half of an n-block: waits for K, computes S = Q @ K^T,
+        applies the optional score mod and the mask, runs the online softmax and
+        converts the result to ``self.dtype`` as P. No PV GEMM is issued here and
+        ``kv_consumer_state`` is returned without being advanced.
+
+        ``mask_fn`` defaults to ``None`` but is called unconditionally, so callers
+        must always supply it.
+        """
+        pipeline_k.consumer_wait(kv_consumer_state, pipeline_k.consumer_try_wait(kv_consumer_state))
+        # wg_wait=0: NOTE(review) presumably blocks until the QK GEMM has completed - confirm
+        # against gemm_zero_init.
+        acc_S = mma_qk_fn(B_idx=kv_consumer_state.index, wg_wait=0)
+        pipeline_k.consumer_release(kv_consumer_state)
+        # Apply score modification if present
+        if const_expr(score_mod_fn is not None):
+            score_mod_fn(acc_S, n_block=n_block, seqlen=seqlen)
+        # Apply mask; mask_seqlen always True for first block
+        # Caveat: if full block further right than mask block, seqlen masking is redundant;
+        # however, masking is being applied anyway, so essentially no perf hit
+        mask_fn(acc_S, n_block=n_block, mask_seqlen=True)
+        row_scale = softmax.online_softmax(acc_S, is_first=is_first_block)
+        tOrP_acc = layout_utils.reshape_acc_to_frgA(acc_S)
+        # When mma_pv_is_rs is set, P is written straight into tOrP; otherwise it goes to
+        # a temporary fragment that is then copied to shared memory below.
+        tOrP_cur = (
+            tOrP
+            if const_expr(self.mma_pv_is_rs)
+            else cute.make_rmem_tensor_like(tOrP_acc, self.dtype)
+        )
+        tOrP_cur.store(tOrP_acc.load().to(self.dtype))
+        if const_expr(not self.mma_pv_is_rs):
+            tPrP = smem_copy_params.smem_thr_copy_P.retile(tOrP_cur)
+            cute.copy(smem_copy_params.smem_thr_copy_P, tPrP, smem_copy_params.tPsP)
+            # Fence and barrier to make smem store visible to WGMMA
+            cute.arch.fence_view_async_shared()
+            cute.arch.sync_warp()
+        # For RescaleOBeforeGemm: initialize acc_O
+        if const_expr(self.rescale_O_before_gemm):
+            acc_O.fill(0.0)
+            # Save this block's row scale; it is applied to acc_O before the next PV GEMM.
+            scores_scale.store(row_scale.load())
+        return kv_consumer_state
+    @cute.jit
+    def last_half_block_overlap(
+        self,
+        kv_consumer_state,
+        pipeline_v,
+        mma_pv_fn: Callable,
+        zero_init: bool,
+        scores_scale: Optional[cute.Tensor] = None,
+        softmax: Optional[Softmax] = None,
+        acc_O: Optional[cute.Tensor] = None,
+    ):
+        """Processes the final PV GEMM when using intra-warpgroup-overlap
+
+        Waits for V at ``kv_consumer_state``, issues the PV GEMM for that stage,
+        releases the stage, advances the state and returns it. ``softmax``,
+        ``acc_O`` and ``scores_scale`` are only used when
+        ``self.rescale_O_before_gemm`` is set.
+        """
+        # For RescaleOBeforeGemm: rescale O before the final PV GEMM
+        if const_expr(self.rescale_O_before_gemm):
+            softmax.rescale_O(acc_O, scores_scale)
+        pipeline_v.consumer_wait(kv_consumer_state, pipeline_v.consumer_try_wait(kv_consumer_state))
+        mma_pv_fn(B_idx=kv_consumer_state.index, zero_init=zero_init, wg_wait=0)
+        pipeline_v.consumer_release(kv_consumer_state)
+        # The stage is fully consumed (K was released earlier), so move to the next one.
+        kv_consumer_state.advance()
+        return kv_consumer_state
+    @cute.jit
+    def mma_one_n_block(
+        self,
+        smem_pipe_read: pipeline.PipelineState | pipeline_custom.PipelineStateSimple,
+        n_block: Int32,
+        mma_qk_fn: Callable,
+        mma_pv_fn: Callable,
+        pipeline_k: pipeline.PipelineAsync,
+        pipeline_v: pipeline.PipelineAsync,
+        acc_O: cute.Tensor,
+        tOrP: cute.Tensor,
+        smem_copy_params: SimpleNamespace,
+        softmax: Softmax,
+        seqlen: SeqlenInfoQK,
+        scores_scale: Optional[cute.Tensor] = None,  # not used
+        score_mod_fn: Optional[Callable] = None,
+        mask_fn: Optional[Callable] = None,
+        is_first_n_block: cutlass.Constexpr = False,
+        check_inf: cutlass.Constexpr = True,
+    ):
+        """Process one n-block without intra-warpgroup overlap.
+
+        Sequence: wait for K, S = Q @ K^T, release K, optional score mod and mask,
+        online softmax, convert S to P in ``self.dtype``, rescale ``acc_O`` by the
+        softmax row scale, wait for V, O += P @ V, release V. The pipeline state is
+        advanced by one stage and returned.
+
+        ``scores_scale`` is accepted only so the signature matches
+        ``mma_one_n_block_intrawg_overlap``; it is not read here.
+        """
+        pipeline_k.consumer_wait(smem_pipe_read, pipeline_k.consumer_try_wait(smem_pipe_read))
+        # S = Q @ K.T
+        acc_S = mma_qk_fn(B_idx=smem_pipe_read.index, wg_wait=-1)
+        # The QK GEMM has been issued: signal the scheduler barrier, then wait for the result.
+        self.warp_scheduler_barrier_arrive()
+        warpgroup.wait_group(0)
+        pipeline_k.consumer_release(smem_pipe_read)
+        # handle score mods and masking
+        if const_expr(score_mod_fn is not None):
+            score_mod_fn(acc_S, n_block=n_block, seqlen=seqlen)
+        if const_expr(mask_fn is not None):
+            mask_fn(acc_S=acc_S, n_block=n_block)
+        row_scale = softmax.online_softmax(acc_S, is_first=is_first_n_block, check_inf=check_inf)
+        # if cute.arch.thread_idx()[0] == 0: cute.print_tensor(layout_utils.reshape_acc_to_mn(acc_S))
+        tOrP_acc = layout_utils.reshape_acc_to_frgA(acc_S)
+        tOrP_cur = (
+            tOrP
+            if const_expr(self.mma_pv_is_rs)
+            else cute.make_rmem_tensor_like(tOrP_acc, self.dtype)
+        )
+        # tOrP.store(tOrP_acc.load().to(self.dtype))
+        # the "to(self.dtype)" conversion fails to vectorize for block sizes other
+        # than 128 x 128, i.e. it calls convert on 1 fp32 element at a time instead of
+        # 2 elements. So we just call ptx directly.
+        utils.cvt_f16(tOrP_acc, tOrP_cur)
+        if const_expr(not self.mma_pv_is_rs):
+            tPrP = smem_copy_params.smem_thr_copy_P.retile(tOrP_cur)
+            cute.copy(smem_copy_params.smem_thr_copy_P, tPrP, smem_copy_params.tPsP)
+        # Rescale the running output by this block's row scale before accumulating P @ V.
+        softmax.rescale_O(acc_O, row_scale)
+        if const_expr(not self.mma_pv_is_rs):
+            # Fence and barrier to make sure smem store is visible to WGMMA
+            cute.arch.fence_view_async_shared()
+            cute.arch.sync_warp()  # Only need syncwarp since each warp is using its own P values for MmaPV
+        pipeline_v.consumer_wait(smem_pipe_read, pipeline_v.consumer_try_wait(smem_pipe_read))
+        self.warp_scheduler_barrier_sync()
+        # O += P @ V
+        mma_pv_fn(B_idx=smem_pipe_read.index, wg_wait=0)
+        pipeline_v.consumer_release(smem_pipe_read)
+        smem_pipe_read.advance()
+        return smem_pipe_read
+    @cute.jit
+    def mma_one_n_block_intrawg_overlap(
+        self,
+        smem_pipe_read: pipeline.PipelineState | pipeline_custom.PipelineStateSimple,
+        n_block: Int32,
+        mma_qk_fn: Callable,
+        mma_pv_fn: Callable,
+        pipeline_k: pipeline.PipelineAsync,
+        pipeline_v: pipeline.PipelineAsync,
+        acc_O: cute.Tensor,
+        tOrP: cute.Tensor,
+        smem_copy_params: SimpleNamespace,
+        softmax: Softmax,
+        seqlen: SeqlenInfoQK,
+        scores_scale: Optional[cute.Tensor] = None,
+        score_mod_fn: Optional[Callable] = None,
+        mask_fn: Optional[Callable] = None,
+        check_inf: cutlass.Constexpr = True,
+    ):
+        """Process one n-block with intra-warpgroup overlap.
+
+        The QK GEMM for the stage after ``smem_pipe_read`` and the PV GEMM for the
+        stage at ``smem_pipe_read`` (the previous block's P and V) are issued back
+        to back. Score mod, mask and the online softmax for the new S then run
+        before the PV GEMM is waited on. Returns the advanced pipeline state; the
+        PV GEMM for the P produced here is left to the next call or to
+        ``last_half_block_overlap``.
+        """
+        # Keep the incoming state for V (previous block) and advance a copy-free
+        # state for K (this block).
+        smem_pipe_read_v = smem_pipe_read.clone()
+        smem_pipe_read.advance()
+        pipeline_k.consumer_wait(smem_pipe_read, pipeline_k.consumer_try_wait(smem_pipe_read))
+        self.warp_scheduler_barrier_sync()
+        # S = Q @ K.T
+        acc_S = mma_qk_fn(B_idx=smem_pipe_read.index, wg_wait=-1)
+        # RescaleOBeforeGemm: rescale O while QK GEMM is in flight, before PV GEMM
+        if const_expr(self.rescale_O_before_gemm):
+            softmax.rescale_O(acc_O, scores_scale)
+        pipeline_v.consumer_wait(smem_pipe_read_v, pipeline_v.consumer_try_wait(smem_pipe_read_v))
+        # O += P @ V
+        mma_pv_fn(B_idx=smem_pipe_read_v.index, wg_wait=-1)
+        self.warp_scheduler_barrier_arrive()
+        # NOTE(review): presumably waits until at most one WGMMA group is still in flight,
+        # i.e. the QK GEMM is done while the PV GEMM may still be running - confirm.
+        warpgroup.wait_group(1)
+        pipeline_k.consumer_release(smem_pipe_read)
+        # handle score mods and masking
+        if const_expr(score_mod_fn is not None):
+            score_mod_fn(acc_S, n_block=n_block, seqlen=seqlen)
+        if const_expr(mask_fn is not None):
+            mask_fn(acc_S=acc_S, n_block=n_block)
+        # if cute.arch.thread_idx()[0] == 128: cute.print_tensor(layout_utils.reshape_acc_to_mn(acc_S))
+        row_scale = softmax.online_softmax(acc_S, check_inf=check_inf)
+        # Wait for the PV GEMM before releasing V and before P / acc_O are touched below.
+        warpgroup.wait_group(0)
+        pipeline_v.consumer_release(smem_pipe_read_v)
+        tOrP_acc = layout_utils.reshape_acc_to_frgA(acc_S)
+        tOrP_cur = (
+            tOrP
+            if const_expr(self.mma_pv_is_rs)
+            else cute.make_rmem_tensor_like(tOrP_acc, self.dtype)
+        )
+        # tOrP_cur.store(tOrP_acc.load().to(self.dtype))
+        # the "to(self.dtype)" conversion fails to vectorize for block sizes other
+        # than 128 x 128, i.e. it calls convert on 1 fp32 element at a time instead of
+        # 2 elements. So we just call ptx directly.
+        utils.cvt_f16(tOrP_acc, tOrP_cur)
+        if const_expr(not self.mma_pv_is_rs):
+            tPrP = smem_copy_params.smem_thr_copy_P.retile(tOrP_cur)
+            cute.copy(smem_copy_params.smem_thr_copy_P, tPrP, smem_copy_params.tPsP)
+        # Either rescale acc_O now, or stash the row scale so it is applied before the
+        # next PV GEMM (RescaleOBeforeGemm).
+        if const_expr(not self.rescale_O_before_gemm):
+            softmax.rescale_O(acc_O, row_scale)
+        if const_expr(self.rescale_O_before_gemm):
+            scores_scale.store(row_scale.load())
+        if const_expr(not self.mma_pv_is_rs):
+            # Fence and barrier to make sure smem store is visible to WGMMA
+            cute.arch.fence_view_async_shared()
+            cute.arch.sync_warp()  # Only need syncwarp since each warp is using its own P values for MmaPV
+        return smem_pipe_read
+    @cute.jit
+    def mma_init(self):
+        """Prime the warp-scheduler named barrier before the main loop.
+
+        When ``self.use_scheduler_barrier`` is set, warp group 1 performs one
+        arrive on the ``WarpSchedulerWG1`` barrier; otherwise this is a no-op.
+        """
+        warp_group_idx = utils.canonical_warp_group_idx(sync=False)
+        if const_expr(self.use_scheduler_barrier):
+            # NOTE(review): presumably this lets the first warp_scheduler_barrier_sync()
+            # on WarpSchedulerWG1 complete without a prior arrive from another WG - confirm.
+            if warp_group_idx == 1:
+                cute.arch.barrier_arrive(
+                    barrier_id=int(NamedBarrierFwd.WarpSchedulerWG1),
+                    number_of_threads=2 * self.num_threads_per_warp_group,
+                )
+    @cute.jit
+    def apply_score_mod(
+        self,
+        thr_mma_qk,
+        batch_idx,
+        head_idx,
+        m_block,
+        acc_S,
+        n_block,
+        softmax_scale,
+        seqlen,
+        aux_tensors: Optional[list] = None,
+        fastdiv_mods=None,
+    ):
+        """Apply ``self.score_mod`` to the score accumulator of one (m_block, n_block) tile.
+
+        Builds a coordinate tensor for the tile, offset by the tile's position
+        (``m_block * tile_m``, ``n_block * tile_n``), partitions it with
+        ``thr_mma_qk`` and forwards everything to ``apply_score_mod_inner``.
+        """
+        # Prepare index tensor
+        cS = cute.make_identity_tensor((self.tile_m, self.tile_n))
+        cS = cute.domain_offset((m_block * self.tile_m, n_block * self.tile_n), cS)
+        tScS = thr_mma_qk.partition_C(cS)
+        apply_score_mod_inner(
+            acc_S,
+            tScS,
+            self.score_mod,
+            batch_idx,
+            head_idx,
+            softmax_scale,
+            self.vec_size,
+            self.qk_acc_dtype,
+            aux_tensors,
+            fastdiv_mods,
+            seqlen_info=seqlen,
+            constant_q_idx=None,
+            # The pack_gqa factor is only passed through when pack_gqa is enabled.
+            qhead_per_kvhead=self.qhead_per_kvhead if const_expr(self.pack_gqa) else 1,
+        )
+    def warp_scheduler_barrier_sync(self):
+        """Wait on the calling warp group's warp-scheduler named barrier.
+
+        No-op unless ``self.use_scheduler_barrier`` is set.
+        """
+        if const_expr(self.use_scheduler_barrier):
+            # Barrier id is WarpSchedulerWG1 + (canonical warp group index - 1), i.e. one
+            # barrier per warp group starting at WarpSchedulerWG1 for warp group 1.
+            cute.arch.barrier(
+                barrier_id=int(NamedBarrierFwd.WarpSchedulerWG1)
+                - 1
+                + utils.canonical_warp_group_idx(sync=False),
+                number_of_threads=2 * self.num_threads_per_warp_group,
+            )
+    def warp_scheduler_barrier_arrive(self):
+        """Arrive on the next warp group's warp-scheduler named barrier.
+
+        No-op unless ``self.use_scheduler_barrier`` is set. Only 2 or 3 MMA warp
+        groups are supported (asserted below).
+        """
+        if const_expr(self.use_scheduler_barrier):
+            assert self.num_wg_mma in [2, 3]
+            # Zero-based index of this warp group, taking canonical warp group 1 as index 0.
+            cur_wg = utils.canonical_warp_group_idx(sync=False) - 1
+            # Pick the next warp group in round-robin order.
+            if const_expr(self.num_wg_mma == 2):
+                next_wg = 1 - cur_wg
+            else:
+                t = cur_wg + 1
+                next_wg = t % self.num_wg_mma
+            cute.arch.barrier_arrive(
+                barrier_id=int(NamedBarrierFwd.WarpSchedulerWG1) + next_wg,
+                number_of_threads=2 * self.num_threads_per_warp_group,
+            )

build/torch-cuda/interface.py CHANGED Viewed

@@ -21,6 +21,7 @@
 import os
 import math
 from functools import lru_cache
 from typing import Optional, Tuple, Callable
@@ -31,6 +32,8 @@ import cuda.bindings.driver as cuda
 import cutlass
 import cutlass.cute as cute
 from .cache_utils import get_jit_cache
 from .testing import is_fake_mode
@@ -43,30 +46,201 @@ if os.environ.get("CUTE_DSL_PTXAS_PATH", None) is not None:
 from . import utils
 from .cute_dsl_utils import (
     to_cute_tensor, to_cute_aux_tensor, get_aux_tensor_metadata, get_broadcast_dims,
 )
-from .flash_fwd import FlashAttentionForwardSm90
 from .flash_fwd_sm100 import FlashAttentionForwardSm100
 from .flash_bwd_preprocess import FlashAttentionBackwardPreprocess
 from .flash_bwd import FlashAttentionBackwardSm80
 from .flash_bwd_sm90 import FlashAttentionBackwardSm90
 from .flash_bwd_sm100 import FlashAttentionBackwardSm100
 from .flash_bwd_postprocess import FlashAttentionBackwardPostprocess
 from .flash_fwd_combine import FlashAttentionForwardCombine
 from .block_sparsity import (
     BlockSparseTensorsTorch,
     to_cute_block_sparse_tensors,
     normalize_block_sparse_config,
     normalize_block_sparse_config_bwd,
 )
 @lru_cache(maxsize=None)
 def _get_device_arch():
-    """Cached device arch check."""
     major, minor = torch.cuda.get_device_capability()
-    return major * 10 + minor
 def maybe_contiguous(x):
     return x.contiguous() if x is not None and x.stride(-1) != 1 else x
@@ -76,7 +250,8 @@ def _validate_tensor(t, name, expected_shape, expected_dtype, expected_device):
     assert t.shape == expected_shape, f"{name} shape {t.shape} != expected {expected_shape}"
     assert t.dtype == expected_dtype, f"{name} dtype {t.dtype} != expected {expected_dtype}"
     assert t.device == expected_device, f"{name} device {t.device} != expected {expected_device}"
-    assert t.is_cuda, f"{name} must be on CUDA"
 torch2cute_dtype_map = {
@@ -96,6 +271,29 @@ def num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, max_splits):
     return min(num_SMs // total_mblocks, max_splits, num_n_blocks)
 def _flash_attn_fwd(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -113,11 +311,9 @@ def _flash_attn_fwd(
     window_size_left: Optional[int] = None,
     window_size_right: Optional[int] = None,
     learnable_sink: Optional[torch.Tensor] = None,
-    # m_block_size: int = 128,
-    # n_block_size: int = 64,
-    # num_threads: int = 128,
-    m_block_size: int = 128,
-    n_block_size: int = 128,
     num_threads: int = 384,
     num_splits: int = 1,
     pack_gqa: Optional[bool] = None,
@@ -138,7 +334,7 @@ def _flash_attn_fwd(
         mask_mod: A callable that takes token position information and selectively masks
         block_sparse_tensors: A tuple of tensors used for block sparsity.
         return_lse: Whether to return the log softmax of the attention scores. If set to True will always calculate
-            Note: the returned LSE currently does not support taking gradient.
         out: Optional pre-allocated output tensor. If None, will be allocated internally.
         lse: Optional pre-allocated log-sum-exp tensor. If None, will be allocated when needed.
         aux_tensors: Some score_mods will want to read from global aux_tensors. This is how we thread them through to the inner kernel.
@@ -203,25 +399,27 @@ def _flash_attn_fwd(
         assert learnable_sink.shape == (num_head,)
         assert learnable_sink.dtype == torch.bfloat16, "learnable_sink must be bfloat16"
-    assert all(
-        t is None or t.is_cuda
-        for t in (
-            q,
-            k,
-            v,
-            cu_seqlens_q,
-            cu_seqlens_k,
-            seqused_q,
-            seqused_k,
-            page_table,
-            learnable_sink,
-        )
-    ), "inputs must be on CUDA device"
     assert num_head % num_head_kv == 0, "num_head must be divisible by num_head_kv"
-    assert head_dim <= 256, "head_dim must be less than or equal to 256"
     alignment = 16 // q.element_size()
-    assert head_dim % alignment == 0, f"head_dim must be divisible by {alignment}"
-    assert head_dim_v % alignment == 0, f"head_dim_v must be divisible by {alignment}"
     if softmax_scale is None:
         softmax_scale = 1.0 / math.sqrt(head_dim)
     if softcap == 0.0:
@@ -253,43 +451,47 @@ def _flash_attn_fwd(
         _validate_tensor(lse, "lse", lse_shape, torch.float32, device)
     dtype = torch2cute_dtype_map[q.dtype]
-    arch = _get_device_arch() if _arch is None else _arch
-    assert arch // 10 in [9, 10, 11], "Unsupported compute capability. Supported: 9.x, 10.x, 11.x"
-    use_block_sparsity = block_sparse_tensors is not None
-    if mask_mod is None:
-        if causal:
-            window_size_right = 0
-        if window_size_left is not None and window_size_right is not None and window_size_left + window_size_right < 0:
-            window_size_left = None
-            window_size_right = None
-        local = window_size_left is not None or window_size_right is not None
-        if window_size_left is not None or window_size_right is not None:
-            if window_size_left is None and window_size_right == 0:
-                causal, local = True, False
-                window_size_right = None
             else:
-                causal, local = False, True
     else:
-        causal, local = False, False
-    current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-    if arch // 10 == 9:  # TODO: tune block size according to hdim.
-        if head_dim == head_dim_v == 128 and not causal and not local and not use_block_sparsity:
-            n_block_size = 192
-    if arch // 10 in [10, 11]:
-        if (
-            pack_gqa
-            and (128 % qhead_per_kvhead != 0)
-        ):
-            pack_gqa = False
-        # TODO: fix GQA + SplitKV + non-varlen
-        if pack_gqa and num_splits != 1 and cu_seqlens_q is None:
-            pack_gqa = False
     if max_seqlen_q is None:
         max_seqlen_q = seqlen_q if cu_seqlens_q is None else total_q
@@ -297,28 +499,50 @@ def _flash_attn_fwd(
         max_seqlen_k = seqlen_k
     seqlen_q_packgqa = max_seqlen_q * qhead_per_kvhead
     if arch // 10 == 10:
-        q_stage = 2 if seqlen_q_packgqa > m_block_size else 1
     else:
         q_stage = 1
     if num_splits < 1:
-        m_block_size_effective = q_stage * m_block_size
-        seqlen_k_loaded = max_seqlen_k if not local else max(0, min(max_seqlen_k, window_size_right + window_size_left + 1 + m_block_size))
-        num_n_blocks = (seqlen_k_loaded + n_block_size - 1) // n_block_size
-        num_m_blocks = (seqlen_q_packgqa + m_block_size_effective - 1) // m_block_size_effective
-        total_mblocks = batch_size * num_head_kv * num_m_blocks
-        num_splits = num_splits_heuristic(
-            total_mblocks,
-            torch.cuda.get_device_properties(device).multi_processor_count,
-            num_n_blocks,
-            128,
-        )
     is_split_kv = num_splits > 1
     if is_split_kv:
         out_partial = torch.empty(num_splits, *q_batch_seqlen_shape, num_head, head_dim_v, dtype=torch.float32, device=device)
         lse_partial = torch.empty(num_splits, *lse_shape, dtype=torch.float32, device=device)
     # hash score and mask mods for compile cache
     score_mod_hash = utils.hash_callable(score_mod) if score_mod is not None else False
     mask_mod_hash = utils.hash_callable(mask_mod) if mask_mod is not None else False
@@ -370,14 +594,14 @@ def _flash_attn_fwd(
             num_head=num_head,
             seqlen_q=seqlen_q,
             seqlen_k=seqlen_k,
-            block_size=(m_block_size, n_block_size),
             q_stage=q_stage,
         )
-    if aux_tensors is not None:
         aux_tensor_metadata = get_aux_tensor_metadata(aux_tensors)
     else:
         aux_tensor_metadata = None
     compile_key = (
         dtype,
         head_dim,
@@ -398,15 +622,20 @@ def _flash_attn_fwd(
         window_size_left is not None,
         window_size_right is not None,
         learnable_sink is not None,
-        m_block_size,
-        n_block_size,
         q_stage,
         num_threads,
         is_split_kv,
         pack_gqa,
         arch,
-        page_size not in [None, 128],  # paged KV non-TMA
         q_subtile_factor,
     )
     if compile_key not in _flash_attn_fwd.compile_cache:
         (
@@ -445,10 +674,28 @@ def _flash_attn_fwd(
         if aux_tensors is not None:
             cute_aux_tensors = [to_cute_aux_tensor(buf) for buf in aux_tensors]
-        if arch // 10 == 9:
-            assert page_table is None, "paged KV not supported on SM 9.0"
             assert not is_split_kv, "SplitKV not supported on SM 9.0"
-            # fa_fwd = FlashAttentionForwardSm80(
             fa_fwd = FlashAttentionForwardSm90(
                 dtype,
                 head_dim,
@@ -457,33 +704,21 @@ def _flash_attn_fwd(
                 is_causal=causal,
                 is_local=local,
                 pack_gqa=pack_gqa,
-                tile_m=m_block_size,
-                tile_n=n_block_size,
                 # num_stages=1,
                 num_stages=2,
                 num_threads=num_threads,
                 Q_in_regs=False,
-                intra_wg_overlap=True,
-                mma_pv_is_rs=True,
                 mask_mod=mask_mod,
                 score_mod=score_mod,
                 has_aux_tensors=aux_tensors is not None,
                 q_subtile_factor=q_subtile_factor,
             )
         elif arch // 10 in [10, 11]:
-            head_dim_padded = int(math.ceil(head_dim / 16) * 16)
-            head_dim_v_padded = int(math.ceil(head_dim / 16) * 16)
-            use_2cta_instrs = (
-                not causal
-                and not local
-                and not is_split_kv
-                and cu_seqlens_q is None
-                and seqused_q is None
-                and not use_block_sparsity
-                and page_size in [None, 128]
-                and head_dim_padded == 128
-                and head_dim_v_padded == 128
-            )
             fa_fwd = FlashAttentionForwardSm100(
                 head_dim,
                 head_dim_v,
@@ -492,8 +727,8 @@ def _flash_attn_fwd(
                 is_local=local,
                 is_split_kv=is_split_kv,
                 pack_gqa=pack_gqa,
-                m_block_size=m_block_size,
-                n_block_size=n_block_size,
                 q_stage=q_stage,
                 is_persistent=not causal
                     and not local
@@ -503,14 +738,37 @@ def _flash_attn_fwd(
                 score_mod=score_mod,
                 mask_mod=mask_mod,
                 has_aux_tensors=aux_tensors is not None,
-                paged_kv_non_tma=page_size not in [None, 128],
                 is_varlen_q=cu_seqlens_q is not None or seqused_q is not None,
                 q_subtile_factor=q_subtile_factor,
                 use_2cta_instrs=use_2cta_instrs,
             )
         else:
             raise ValueError(
-                f"Unsupported compute capability: {arch}. Supported: 9.x, 10.x, 11.x"
             )
         # TODO: check @can_implement
         _flash_attn_fwd.compile_cache[compile_key] = cute.compile(
@@ -521,7 +779,6 @@ def _flash_attn_fwd(
             o_tensor,
             lse_tensor,
             softmax_scale,
-            current_stream,
             cu_seqlens_q_tensor,
             cu_seqlens_k_tensor,
             seqused_q_tensor,
@@ -532,6 +789,7 @@ def _flash_attn_fwd(
             learnable_sink_tensor,
             sparse_tensors,
             cute_aux_tensors,
             options="--enable-tvm-ffi",
         )
@@ -547,7 +805,6 @@ def _flash_attn_fwd(
             out.detach() if not is_split_kv else out_partial,
             lse_partial if is_split_kv else lse,
             softmax_scale,
-            current_stream,
             cu_seqlens_q,
             cu_seqlens_k,
             seqused_q,
@@ -574,6 +831,140 @@ def _flash_attn_fwd(
 _flash_attn_fwd.compile_cache = get_jit_cache("fwd")
 def _flash_attn_bwd(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -614,47 +1005,74 @@ def _flash_attn_bwd(
     mask_mod: Optional[Callable] = None,
     aux_tensors: Optional[list[torch.Tensor]] = None,
     block_sparse_tensors: Optional[BlockSparseTensorsTorch] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     arch = _get_device_arch()
-    assert arch // 10 in [9, 10, 11], "Unsupported compute capability. Supported: 9.x, 10.x, 11.x"
     num_head, head_dim = q.shape[-2:]
-    if causal:
-        window_size_right = 0
-    if window_size_left is not None and window_size_right is not None and window_size_left + window_size_right < 0:
-        window_size_left = None
-        window_size_right = None
-    local = window_size_left is not None or window_size_right is not None
-    if local:
-        if window_size_left is None and window_size_right == 0:
-            causal, local = True, False
-            window_size_right = None
-        else:
-            causal, local = False, True
-    if arch // 10 == 9:
-        m_block_size = 80 if not causal else 64
-        n_block_size = 128
-        num_stages_Q = 2
-        num_stages_dO = 2
-        num_stages_PdS = 2
-        SdP_swapAB = True
         dKV_swapAB = False
-        dQ_swapAB = not causal
-        AtomLayoutMSdP = 1
-        AtomLayoutNdKV = 2
-        AtomLayoutMdQ = 1
         cluster_size = 1
         use_2cta_instrs = False
-        assert window_size_left is None and window_size_right is None, "local not supported yet on 9.x"
         is_varlen = (
             cu_seqlens_q is not None
             or cu_seqlens_k is not None
             or seqused_q is not None
             or seqused_k is not None
         )
-        assert not is_varlen, "varlen backward is not yet supported on sm90"
     else:
         m_block_size = 128
         n_block_size = 128
@@ -662,15 +1080,17 @@ def _flash_attn_bwd(
         dKV_swapAB = False
         AtomLayoutMdQ = 1
         AtomLayoutNdKV = 1
         disable_2cta = (
-            local
             or score_mod is not None
             or score_mod_bwd is not None
             or mask_mod is not None
         )
         cluster_size = 2 if head_dim >= 128 and not disable_2cta else 1
         use_2cta_instrs = cluster_size==2
     q, k, v, out, dout, lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k = [
         maybe_contiguous(t)
         for t in (q, k, v, out, dout, lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k)
@@ -692,19 +1112,9 @@ def _flash_attn_bwd(
         seqlen_k = max_seqlen_k if max_seqlen_k is not None else total_k
     num_head_kv = k.shape[-2]
-    head_dim_v = v.shape[-1]
     use_block_sparsity = block_sparse_tensors is not None
-    # SM90 block-sparse backward: tile_m=64 is the GCD between a m_block_size that fits,
-    # the base block_m of 128 from forward, and block-sparse size for subtiling.
-    if arch // 10 == 9 and use_block_sparsity:
-        m_block_size = 64
-        # dQ_swapAB tuning: use False when m_block_size=64 (same as causal case)
-        dQ_swapAB = False
-    # NB: this could be derived from the block_sparse_tensors but for now we hardcode it to 2
-    subtile_factor = 2
     seqlen_q_rounded = (seqlen_q + m_block_size - 1) // m_block_size * m_block_size
     seqlen_k_rounded = (seqlen_k + n_block_size - 1) // n_block_size * n_block_size
     num_n_blocks = seqlen_k_rounded // n_block_size
@@ -744,14 +1154,16 @@ def _flash_attn_bwd(
         if t is not None:
             assert t.dtype == torch.int32, "cu_seqlens_q, cu_seqlens_k must be int32"
     assert lse.dtype == torch.float32, "lse must be float32"
-    assert all(
-        t is None or t.is_cuda for t in (q, k, v, out, dout, lse, cu_seqlens_q, cu_seqlens_k)
-    ), "inputs must be on CUDA device"
     assert num_head % num_head_kv == 0, "num_head must be divisible by num_head_kv"
-    assert head_dim <= 256, "head_dim must be less than or equal to 256"
     alignment = 16 // q.element_size()
-    assert head_dim % alignment == 0, f"head_dim must be divisible by {alignment}"
-    assert head_dim_v % alignment == 0, f"head_dim_v must be divisible by {alignment}"
     if softmax_scale is None:
         softmax_scale = 1.0 / math.sqrt(head_dim)
     qhead_per_kvhead = num_head // num_head_kv
@@ -759,9 +1171,6 @@ def _flash_attn_bwd(
         pack_gqa = qhead_per_kvhead > 1
     # pack_gqa backward not yet supported in bwd
     pack_gqa = False
-    if arch // 10 not in [10, 11]:
-        assert deterministic is False, "bwd deterministic only supported for sm100/sm110 for now"
     if score_mod is not None:
         assert score_mod_bwd is not None, "score_mod_bwd is required when score_mod is provided"
         assert softcap == 0.0, "softcap and score_mod are mutually exclusive (different log2 scaling)"
@@ -813,6 +1222,9 @@ def _flash_attn_bwd(
         dpsum = torch.empty(num_head, total_q_rounded_padded, dtype=torch.float32, device=device)
         lse_log2 = torch.empty(num_head, total_q_rounded_padded, dtype=torch.float32, device=device)
     dKV_postprocess = qhead_per_kvhead > 1
     if dKV_postprocess:
         head_dim_v_rounded = (head_dim_v + 32 - 1) // 32 * 32
@@ -850,83 +1262,30 @@ def _flash_attn_bwd(
             )
     dtype = torch2cute_dtype_map[q.dtype]
-    current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
     if deterministic:
-        dQ_semaphore = torch.zeros(batch_size, num_head, seqlen_q_rounded // m_block_size, cluster_size, dtype=torch.int32, device="cuda")
     else:
         dQ_semaphore = None
     if deterministic and qhead_per_kvhead > 1:
-        dK_semaphore = torch.zeros(batch_size, num_head_kv, seqlen_k_rounded // n_block_size, 2, dtype=torch.int32, device="cuda")
-        dV_semaphore = torch.zeros(batch_size, num_head_kv, seqlen_k_rounded // n_block_size, 2, dtype=torch.int32, device="cuda")
     else:
         dK_semaphore = None
         dV_semaphore = None
-    # Preprocess kernel: compute (o * dout).sum(dim=-1), lse * log2_e, and zero out dq_accum.
-    compile_key_pre = (
-        arch,
-        dtype,
-        head_dim,
-        head_dim_v,
-        m_block_size,
-        num_threads,
-        cu_seqlens_q is None,
-        seqused_q is None,
-        get_broadcast_dims(out),
-        get_broadcast_dims(dout),
     )
-    if compile_key_pre not in _flash_attn_bwd.compile_cache_pre:
-        o_tensor, do_tensor = [to_cute_tensor(t) for t in (out, dout)]
-        dq_accum_tensor, dpsum_tensor, lse_log2_tensor = [
-            to_cute_tensor(t) for t in (dq_accum, dpsum, lse_log2)
-        ]
-        lse_tensor = to_cute_tensor(lse, assumed_align=4)
-        cu_seqlens_q_tensor, seqused_q_tensor = [
-            to_cute_tensor(t, assumed_align=4) if t is not None else None
-            for t in (cu_seqlens_q, seqused_q)
-        ]
-        fa_bwd_pre = FlashAttentionBackwardPreprocess(
-            dtype,
-            head_dim,
-            head_dim_v,
-            arch,
-            m_block_size,
-            num_threads=num_threads,
-        )
-        # TODO: check @can_implement
-        _flash_attn_bwd.compile_cache_pre[compile_key_pre] = cute.compile(
-            fa_bwd_pre,
-            o_tensor,
-            do_tensor,
-            dpsum_tensor,
-            lse_tensor,
-            lse_log2_tensor,
-            dq_accum_tensor,
-            cu_seqlens_q_tensor,
-            seqused_q_tensor,
-            current_stream,
-            options="--enable-tvm-ffi",
-        )
-    if not is_fake_mode():
-        _flash_attn_bwd.compile_cache_pre[compile_key_pre](
-            out,
-            dout,
-            dpsum,
-            lse,
-            lse_log2,
-            dq_accum,
-            cu_seqlens_q,
-            seqused_q,
-            current_stream,
-        )
-    # NB num_threads application for 3 kernels
-    # There are pre, main, post processing kernels, currenlty num_threads is only actually
-    # used for the pre proc, and then we hard code to 384 for the main and post proc, and we do
-    # before cache key gen
-    num_threads = 384
     # Backward kernel: compute dk, dv, dq_accum.
     score_mod_hash = utils.hash_callable(score_mod) if score_mod else False
@@ -953,7 +1312,7 @@ def _flash_attn_bwd(
             subtile_factor=subtile_factor,
         )
-    if arch // 10 == 9:
         compile_key = (
             arch,
             dtype,
@@ -961,6 +1320,8 @@ def _flash_attn_bwd(
             head_dim_v,
             qhead_per_kvhead,
             causal,
             softcap != 0.0,
             m_block_size,
             n_block_size,
@@ -975,6 +1336,8 @@ def _flash_attn_bwd(
             AtomLayoutNdKV,
             AtomLayoutMdQ,
             V_in_regs,
             cu_seqlens_q is None,
             cu_seqlens_k is None,
             seqused_q is None,
@@ -1043,51 +1406,56 @@ def _flash_attn_bwd(
             if t is not None else None
             for t in (dQ_semaphore, dK_semaphore, dV_semaphore)
         ]
-        fa_bwd_sm80 = FlashAttentionBackwardSm80(
-            dtype,
-            head_dim,
-            head_dim_v,
-            qhead_per_kvhead,
-            m_block_size,
-            n_block_size,
-            num_stages_Q,
-            num_stages_dO,
-            num_threads,
-            pack_gqa,
-            causal,
-            SdP_swapAB,
-            dKV_swapAB,
-            dQ_swapAB,
-            AtomLayoutMSdP,
-            AtomLayoutNdKV,
-            AtomLayoutMdQ,
-            V_in_regs=V_in_regs,
-        )
-        if arch // 10 == 9:
-            fa_bwd_obj = FlashAttentionBackwardSm90(
                 dtype,
                 head_dim,
                 head_dim_v,
                 qhead_per_kvhead,
-                causal,
                 m_block_size,
                 n_block_size,
                 num_stages_Q,
                 num_stages_dO,
-                num_stages_PdS,
                 SdP_swapAB,
                 dKV_swapAB,
                 dQ_swapAB,
                 AtomLayoutMSdP,
                 AtomLayoutNdKV,
                 AtomLayoutMdQ,
-                num_threads,
                 V_in_regs=V_in_regs,
                 score_mod=score_mod,
                 score_mod_bwd=score_mod_bwd,
                 mask_mod=mask_mod,
                 has_aux_tensors=aux_tensors is not None,
                 subtile_factor=subtile_factor,
             )
         else:
             fa_bwd_obj = FlashAttentionBackwardSm100(
@@ -1126,7 +1494,6 @@ def _flash_attn_bwd(
             dk_tensor if not dKV_postprocess else dk_accum_tensor,
             dv_tensor if not dKV_postprocess else dv_accum_tensor,
             softmax_scale,
-            current_stream,
             cu_seqlens_q_tensor,
             cu_seqlens_k_tensor,
             seqused_q_tensor,
@@ -1139,6 +1506,7 @@ def _flash_attn_bwd(
             dV_semaphore_tensor,
             cute_aux_tensors,
             sparse_tensors_compile,
             options="--enable-tvm-ffi",
         )
     if not is_fake_mode():
@@ -1153,7 +1521,6 @@ def _flash_attn_bwd(
             dk if not dKV_postprocess else dk_accum,
             dv if not dKV_postprocess else dv_accum,
             softmax_scale,
-            current_stream,
             cu_seqlens_q,
             cu_seqlens_k,
             seqused_q,
@@ -1168,157 +1535,45 @@ def _flash_attn_bwd(
             normalized_block_sparse_tensors[:4] if normalized_block_sparse_tensors is not None else None,
         )
-    num_threads = 256 if arch // 10 == 9 else 128
-    # Postprocess kernel: convert dq_accum from float32 to dq in bf16/fp16
-    compile_key_post = (
-        arch,
-        dtype,
-        head_dim,
-        m_block_size,
-        num_threads,
-        AtomLayoutMdQ,
-        dQ_swapAB,
-        cu_seqlens_q is None,
-        seqused_q is None,
-        use_2cta_instrs,
-        1, # no cluster for tile_m
-        get_broadcast_dims(dq_accum),
-        get_broadcast_dims(dq),
     )
-    if compile_key_post not in _flash_attn_bwd.compile_cache_post:
-        dq_accum_tensor = to_cute_tensor(dq_accum)
-        dq_tensor = to_cute_tensor(dq)
-        cu_seqlens_q_tensor, seqused_q_tensor = [
-            to_cute_tensor(t, assumed_align=4) if t is not None else None
-            for t in (cu_seqlens_q, seqused_q)
-        ]
-        fa_bwd_post = FlashAttentionBackwardPostprocess(
-            dtype, head_dim, arch, m_block_size, num_threads, AtomLayoutMdQ, dQ_swapAB,
-            use_2cta_instrs=use_2cta_instrs,
-        )
-        # TODO: check @can_implement
-        _flash_attn_bwd.compile_cache_post[compile_key_post] = cute.compile(
-            fa_bwd_post,
-            dq_accum_tensor,
-            dq_tensor,
-            softmax_scale,
-            cu_seqlens_q_tensor,
-            seqused_q_tensor,
-            current_stream,
-            options="--enable-tvm-ffi",
-        )
-    if not is_fake_mode():
-        _flash_attn_bwd.compile_cache_post[compile_key_post](
-            dq_accum,
-            dq,
-            softmax_scale,
-            cu_seqlens_q,
-            seqused_q,
-            current_stream,
-        )
     if dKV_postprocess:
-        # Postprocess kernel: convert dk_accum & dv_accum from float32 to bf16/fp16
-        compile_key_post = (
-            arch,
-            dtype,
-            head_dim,
-            n_block_size,
-            num_threads,
-            AtomLayoutNdKV,
-            dKV_swapAB,
-            cu_seqlens_k is None,
-            seqused_k is None,
-            False, # even for 2cta, is split along hdim, so always False
-            cluster_size, # cluster is for tile_n
-            get_broadcast_dims(dk_accum),
-            get_broadcast_dims(dk),
         )
-        if compile_key_post not in _flash_attn_bwd.compile_cache_post:
-            dk_accum_tensor = to_cute_tensor(dk_accum)
-            dk_tensor = to_cute_tensor(dk)
-            cu_seqlens_k_tensor, seqused_k_tensor = [
-                to_cute_tensor(t, assumed_align=4) if t is not None else None
-                for t in (cu_seqlens_k, seqused_k)
-            ]
-            fa_bwd_post = FlashAttentionBackwardPostprocess(
-                dtype, head_dim, arch, n_block_size, num_threads, AtomLayoutNdKV, dKV_swapAB,
-                cluster_size=cluster_size,
-            )
-            # TODO: check @can_implement
-            _flash_attn_bwd.compile_cache_post[compile_key_post] = cute.compile(
-                fa_bwd_post,
-                dk_accum_tensor,
-                dk_tensor,
-                softmax_scale,
-                cu_seqlens_k_tensor,
-                seqused_k_tensor,
-                current_stream,
-                options="--enable-tvm-ffi",
-            )
-        if not is_fake_mode():
-            _flash_attn_bwd.compile_cache_post[compile_key_post](
-                dk_accum,
-                dk,
-                softmax_scale,
-                cu_seqlens_k,
-                seqused_k,
-                current_stream,
-            )
-        compile_key_post = (
-            arch,
-            dtype,
-            head_dim_v,
-            n_block_size,
-            num_threads,
-            AtomLayoutNdKV,
-            dKV_swapAB,
-            cu_seqlens_k is None,
-            seqused_k is None,
-            False,
-            cluster_size,
-            get_broadcast_dims(dv_accum),
-            get_broadcast_dims(dv),
         )
-        if compile_key_post not in _flash_attn_bwd.compile_cache_post:
-            dv_accum_tensor = to_cute_tensor(dv_accum)
-            dv_tensor = to_cute_tensor(dv)
-            cu_seqlens_k_tensor, seqused_k_tensor = [
-                to_cute_tensor(t, assumed_align=4) if t is not None else None
-                for t in (cu_seqlens_k, seqused_k)
-            ]
-            fa_bwd_post = FlashAttentionBackwardPostprocess(
-                dtype, head_dim_v, arch, n_block_size, num_threads, AtomLayoutNdKV, dKV_swapAB,
-                cluster_size=cluster_size,
-            )
-            # TODO: check @can_implement
-            _flash_attn_bwd.compile_cache_post[compile_key_post] = cute.compile(
-                fa_bwd_post,
-                dv_accum_tensor,
-                dv_tensor,
-                cutlass.Float32(1.0),
-                cu_seqlens_k_tensor,
-                seqused_k_tensor,
-                current_stream,
-                options="--enable-tvm-ffi",
-            )
-        if not is_fake_mode():
-            _flash_attn_bwd.compile_cache_post[compile_key_post](
-                dv_accum,
-                dv,
-                1.0,
-                cu_seqlens_k,
-                seqused_k,
-                current_stream,
-            )
     return dq, dk, dv
-_flash_attn_bwd.compile_cache_pre = get_jit_cache("bwd_pre")
 _flash_attn_bwd.compile_cache = get_jit_cache("bwd")
-_flash_attn_bwd.compile_cache_post = get_jit_cache("bwd_post")
 class FlashAttnFunc(torch.autograd.Function):
@@ -1376,14 +1631,17 @@ class FlashAttnFunc(torch.autograd.Function):
         ctx.window_size = window_size
         ctx.softcap = softcap
         ctx.deterministic = deterministic
-        # LSE gradient is not supported yet
-        if lse is not None:
-            ctx.mark_non_differentiable(lse)
         return out, lse
     @staticmethod
-    def backward(ctx, dout, *args):
         q, k, v, out, lse = ctx.saved_tensors
         dq, dk, dv = _flash_attn_bwd(
             q,
             k,
@@ -1397,6 +1655,7 @@ class FlashAttnFunc(torch.autograd.Function):
             window_size_left=ctx.window_size[0],
             window_size_right=ctx.window_size[1],
             deterministic=ctx.deterministic,
         )
         return dq, dk, dv, *((None,) * 20)  # Extra Nones is fine
@@ -1458,15 +1717,18 @@ class FlashAttnVarlenFunc(torch.autograd.Function):
         ctx.deterministic = deterministic
         ctx.max_seqlen_q = max_seqlen_q
         ctx.max_seqlen_k = max_seqlen_k
-        # LSE gradient is not supported yet
-        if lse is not None:
-            ctx.mark_non_differentiable(lse)
         return out, lse
     @staticmethod
-    def backward(ctx, dout, *args):
         q, k, v, out, lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k = ctx.saved_tensors
         assert ctx.softcap == 0.0
         dq, dk, dv = _flash_attn_bwd(
             q,
             k,
@@ -1486,6 +1748,7 @@ class FlashAttnVarlenFunc(torch.autograd.Function):
             max_seqlen_q=ctx.max_seqlen_q,
             max_seqlen_k=ctx.max_seqlen_k,
             deterministic=ctx.deterministic,
         )
         return dq, dk, dv, *((None,) * 20)
@@ -1581,6 +1844,63 @@ def flash_attn_varlen_func(
     )
 def _flash_attn_fwd_combine(
     out_partial: torch.Tensor,
     lse_partial: torch.Tensor,
@@ -1589,6 +1909,7 @@ def _flash_attn_fwd_combine(
     cu_seqlens: Optional[torch.Tensor] = None,
     seqused: Optional[torch.Tensor] = None,
     num_splits_dynamic_ptr: Optional[torch.Tensor] = None,
     semaphore_to_reset: Optional[torch.Tensor] = None,
 ) -> None:
     """Forward combine kernel for split attention computation.
@@ -1612,27 +1933,13 @@ def _flash_attn_fwd_combine(
     Returns:
         None
     """
-    # Input validation
-    assert out_partial.dim() in [4, 5], "out_partial must have 4 or 5 dimensions"
-    assert lse_partial.dim() in [3, 4], "lse_partial must have 3 or 4 dimensions"
     assert out_partial.dtype in [torch.float16, torch.bfloat16, torch.float32], (
         "out_partial must be fp16, bf16, or fp32"
     )
-    assert lse_partial.dtype == torch.float32, "lse_partial must be fp32"
-    assert out_partial.is_cuda and lse_partial.is_cuda, "tensors must be on CUDA device"
-    assert out_partial.stride(-1) == 1, "out_partial must be contiguous in the last dimension"
-    assert lse_partial.stride(-2) == 1, "lse_partial must be contiguous in the seqlen dimension"
-    assert lse_partial.shape == out_partial.shape[:-1]
     # Determine if this is variable length based on dimensions
     is_varlen = out_partial.dim() == 4
-    # Validate output tensor shapes and types
-    assert out.shape == out_partial.shape[1:], "out shape mismatch"
-    if lse is not None:
-        assert lse.shape == lse_partial.shape[1:], "lse shape mismatch"
-        assert lse.dtype == torch.float32, "lse must be fp32"
     # Validate optional tensors
     for t, name in [
         (cu_seqlens, "cu_seqlens"),
@@ -1640,10 +1947,9 @@ def _flash_attn_fwd_combine(
         (num_splits_dynamic_ptr, "num_splits_dynamic_ptr"),
     ]:
         if t is not None:
-            assert t.dtype == torch.int32, f"{name} must be int32"
-            assert t.is_cuda, f"{name} must be on CUDA device"
             assert t.is_contiguous(), f"{name} must be contiguous"
     head_dim = out_partial.shape[-1]
     num_splits = out_partial.shape[0]
     assert num_splits <= 256
@@ -1652,101 +1958,37 @@ def _flash_attn_fwd_combine(
     k_block_size = 64 if head_dim <= 64 else 128
     # We want kBlockM to be as small as possible to maximize parallelism.
     # E.g., if hdim is 64, we want kBlockM to be 16 so that we can use 256 threads, each reading 4 elements (floats).
-    m_block_size = 8 if k_block_size % 128 == 0 else (16 if k_block_size % 64 == 0 else 32)
     log_max_splits = max(math.ceil(math.log2(num_splits)), 4)
-    if m_block_size == 8:
         # If kBlockM == 8 then the minimum number of splits is 32.
         # TODO: we can deal w this by using 128 threads instead
         log_max_splits = max(log_max_splits, 5)
-    current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
     # Create combine kernel configuration
     dtype = torch2cute_dtype_map[out.dtype]
     dtype_partial = torch2cute_dtype_map[out_partial.dtype]
     compile_key = (
         dtype,
         dtype_partial,
         head_dim,
-        m_block_size,
         k_block_size,
         log_max_splits,
         cu_seqlens is not None,
         seqused is not None,
         lse is not None,
     )
     if compile_key not in _flash_attn_fwd_combine.compile_cache:
-        out_partial_tensor = to_cute_tensor(
-            out_partial, leading_dim=4 if not is_varlen else 3
-        )
-        lse_partial_tensor = to_cute_tensor(
-            lse_partial, assumed_align=4, leading_dim=lse_partial.ndim - 2
-        )
-        out_tensor = to_cute_tensor(out, leading_dim=3 if not is_varlen else 2)
-        lse_tensor = (
-            to_cute_tensor(lse, assumed_align=4, leading_dim=lse.ndim - 2)
-            if lse is not None
-            else None
-        )
-        optional_tensors = [
-            to_cute_tensor(t, assumed_align=4, leading_dim=0)
-            if t is not None
-            else None
-            for t in (cu_seqlens, seqused, num_splits_dynamic_ptr, semaphore_to_reset)
-        ]
-        cu_seqlens_tensor, seqused_tensor, num_splits_dynamic_tensor, semaphore_tensor = (
-            optional_tensors
-        )
-        fa_combine = FlashAttentionForwardCombine(
-            dtype=dtype,
-            dtype_partial=dtype_partial,
-            head_dim=head_dim,
-            m_block_size=m_block_size,
-            k_block_size=k_block_size,
-            log_max_splits=log_max_splits,
-        )
-        # Check if implementation is supported
-        if not fa_combine.can_implement(
-            dtype,
-            dtype_partial,
-            head_dim,
-            m_block_size,
-            k_block_size,
-            log_max_splits,
-            num_threads=256,
-        ):
-            raise RuntimeError(
-                "FlashAttention combine kernel cannot be implemented with given parameters"
-            )
-        _flash_attn_fwd_combine.compile_cache[compile_key] = cute.compile(
-            fa_combine,
-            out_partial_tensor,
-            lse_partial_tensor,
-            out_tensor,
-            lse_tensor,
-            cu_seqlens_tensor,
-            seqused_tensor,
-            num_splits_dynamic_tensor,
-            semaphore_tensor,
-            current_stream,
-            options="--enable-tvm-ffi",
         )
     if not is_fake_mode():
         _flash_attn_fwd_combine.compile_cache[compile_key](
-            out_partial,
-            lse_partial,
-            out,
-            lse,
-            cu_seqlens,
-            seqused,
-            num_splits_dynamic_ptr,
             semaphore_to_reset,
-            current_stream,
         )
@@ -1760,6 +2002,7 @@ def flash_attn_combine(
     out_dtype: Optional[torch.dtype] = None,
     cu_seqlens: Optional[torch.Tensor] = None,
     seqused: Optional[torch.Tensor] = None,
     return_lse: bool = True,
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
     """Flash Attention combine function for split attention computation.
@@ -1779,6 +2022,9 @@ def flash_attn_combine(
         out_dtype: Optional output dtype. If None, will use fp16/bf16 based on input.
         cu_seqlens: Cumulative sequence lengths for variable length sequences
         seqused: Used sequence lengths for each batch
         return_lse: Whether to return the combined LSE tensor. Default is True.
     Returns:
@@ -1795,32 +2041,19 @@ def flash_attn_combine(
     """
     # Input validation
     assert out_partial.dim() in [4, 5], "out_partial must have 4 or 5 dimensions"
-    assert lse_partial.dim() in [3, 4], "lse_partial must have 3 or 4 dimensions"
-    assert out_partial.dtype == torch.float32, "out_partial must be fp32 (from accumulation)"
-    assert lse_partial.dtype == torch.float32, "lse_partial must be fp32"
     # Determine if this is variable length based on dimensions
     is_varlen = out_partial.dim() == 4
     if is_varlen:
         # Variable length: (num_splits, total_q, num_heads, head_size)
         num_splits, total_q, num_heads, head_size = out_partial.shape
-        assert lse_partial.shape == (num_splits, total_q, num_heads), (
-            "lse_partial shape mismatch for varlen"
-        )
         batch_size = 1  # Treat as single batch for varlen
         seqlen = total_q
     else:
         # Regular batched: (num_splits, batch_size, seqlen, num_heads, head_size)
         num_splits, batch_size, seqlen, num_heads, head_size = out_partial.shape
-        assert lse_partial.shape == (num_splits, batch_size, seqlen, num_heads), (
-            "lse_partial shape mismatch"
-        )
     # Determine output dtype
     if out_dtype is None:
         out_dtype = out_partial.dtype
     # Create output if not provided
     device = out_partial.device
     if out is None:
@@ -1830,20 +2063,15 @@ def flash_attn_combine(
             out = torch.empty(
                 batch_size, seqlen, num_heads, head_size, dtype=out_dtype, device=device
             )
     # Create lse output only if requested
     if return_lse:
         if is_varlen:
-            lse = torch.empty(num_heads, total_q, dtype=torch.float32, device=device).transpose(
-                0, 1
-            )
         else:
-            lse = torch.empty(
-                batch_size, num_heads, seqlen, dtype=torch.float32, device=device
-            ).transpose(1, 2)
     else:
         lse = None
     _flash_attn_fwd_combine(
         out_partial,
         lse_partial,
@@ -1851,5 +2079,6 @@ def flash_attn_combine(
         lse,
         cu_seqlens,
         seqused,
     )
     return out, lse

 import os
 import math
+from dataclasses import dataclass
 from functools import lru_cache
 from typing import Optional, Tuple, Callable
 import cutlass
 import cutlass.cute as cute
+from cutlass import Int32, Float32
+from .quack.compile_utils import make_fake_tensor as fake_tensor
 from .cache_utils import get_jit_cache
 from .testing import is_fake_mode
 from . import utils
+from . import fa_logging
 from .cute_dsl_utils import (
     to_cute_tensor, to_cute_aux_tensor, get_aux_tensor_metadata, get_broadcast_dims,
 )
+from .flash_fwd import FlashAttentionForwardSm80
+from .flash_fwd_sm90 import FlashAttentionForwardSm90
 from .flash_fwd_sm100 import FlashAttentionForwardSm100
+from .flash_fwd_sm120 import FlashAttentionForwardSm120
 from .flash_bwd_preprocess import FlashAttentionBackwardPreprocess
 from .flash_bwd import FlashAttentionBackwardSm80
 from .flash_bwd_sm90 import FlashAttentionBackwardSm90
 from .flash_bwd_sm100 import FlashAttentionBackwardSm100
+from .flash_bwd_sm120 import FlashAttentionBackwardSm120
 from .flash_bwd_postprocess import FlashAttentionBackwardPostprocess
 from .flash_fwd_combine import FlashAttentionForwardCombine
 from .block_sparsity import (
     BlockSparseTensorsTorch,
+    get_sparse_q_block_size,
     to_cute_block_sparse_tensors,
     normalize_block_sparse_config,
     normalize_block_sparse_config_bwd,
 )
+def _parse_arch_str(arch_str):
+    """Parse arch string (e.g. 'sm_80', 'sm_90a', '80', '100') to int (e.g. 80, 90, 100).
+    The last digit is taken as the minor version and all digits before it as the
+    major version. An optional 'sm' / 'sm_' prefix and an optional trailing 'a' or
+    'f' suffix are accepted and ignored. Matching is case-insensitive and
+    surrounding whitespace is ignored, so a value such as ' SM_90A ' read from an
+    environment variable parses the same as 'sm_90a'.
+    Args:
+        arch_str: architecture string to parse.
+    Returns:
+        int: major * 10 + minor.
+    Raises:
+        TypeError: if arch_str is not a str.
+        ValueError: if arch_str does not match the expected format.
+    """
+    import re
+    if not isinstance(arch_str, str):
+        # Same exception type re.match raised for non-string input before.
+        raise TypeError(f"arch string must be str, got {type(arch_str).__name__}")
+    match = re.fullmatch(r"(?:sm_?)?(\d+)(\d)([af]?)", arch_str.strip(), flags=re.IGNORECASE)
+    if match is None:
+        raise ValueError(f"Invalid arch format: {arch_str}")
+    major, minor, _ = match.groups()
+    return int(major) * 10 + int(minor)
 @lru_cache(maxsize=None)
 def _get_device_arch():
+    """Cached device arch check.
+    Override with FLASH_ATTENTION_ARCH (e.g. 'sm_80' or '80') to select which
+    kernel path to use (SM80/SM90/SM100/SM120) independently of the compilation
+    target (CUTE_DSL_ARCH).
+    For CPU-only compilation (no GPU), set both:
+      FLASH_ATTENTION_ARCH=sm_80  (kernel selection)
+      CUTE_DSL_ARCH=sm_80         (compilation target)
+    The result is memoized by lru_cache and the function takes no arguments, so
+    the environment variable and the device are only consulted on the first
+    call; later changes to either are not picked up within the same process.
+    Returns:
+        int: major * 10 + minor, e.g. 90 for compute capability 9.0.
+    Raises:
+        ValueError: propagated from _parse_arch_str when the override is malformed.
+    """
+    # The override is checked first, so no CUDA query happens when it is set.
+    arch_override = os.environ.get("FLASH_ATTENTION_ARCH", None)
+    if arch_override is not None:
+        return _parse_arch_str(arch_override)
+    # Called without a device argument (torch documents this as the current device).
     major, minor = torch.cuda.get_device_capability()
+    return major * 10 + int(minor)
+def _validate_head_dims(head_dim: int, head_dim_v: int, compute_capability: int, alignment: int) -> None:
+    """Assert that (head_dim, head_dim_v) is supported on the given architecture.
+    Only compute_capability 9 (SM90) and 10 / 11 (SM100 / SM110) are checked; any
+    other value returns without validating anything.
+    Args:
+        head_dim: first head dimension to check.
+        head_dim_v: second head dimension to check (the V head dimension).
+        compute_capability: major compute capability (callers pass arch // 10).
+        alignment: both head dimensions must be divisible by this value.
+    Raises:
+        AssertionError: if the head dimensions are out of range or misaligned.
+    """
+    dims = (head_dim, head_dim_v)
+    matches_deepseek = head_dim == 192 and head_dim_v == 128
+    within_128 = all(8 <= d <= 128 for d in dims)
+    within_256 = all(8 <= d <= 256 for d in dims)
+    def _aligned():
+        # Evaluated lazily so the modulo only runs after the range check passed.
+        return head_dim % alignment == 0 and head_dim_v % alignment == 0
+    if compute_capability == 9:
+        assert within_256 and _aligned(), (
+            f"(head_dim, head_dim_v)=({head_dim}, {head_dim_v}) is not supported on SM90. "
+            f"head_dim and head_dim_v must be between 8 and 256 and divisible by {alignment}."
+        )
+        return
+    if compute_capability in (10, 11):
+        assert (within_128 or matches_deepseek) and _aligned(), (
+            f"(head_dim, head_dim_v)=({head_dim}, {head_dim_v}) is not supported on SM100/SM110. "
+            f"head_dim and head_dim_v must be between 8 and 128 and divisible by {alignment}, or (192, 128) for DeepSeek."
+        )
+@dataclass(frozen=True)
+class FwdConfig:
+    """Immutable bundle of forward-kernel tuning choices.
+    Instances are built positionally in this file, e.g. FwdConfig(128, 128, True, True),
+    so the field order is part of the interface.
+    """
+    # Tile size along M; _flash_attn_fwd reads it as tile_m.
+    m_block_size: int
+    # Tile size along N; _flash_attn_fwd reads it as tile_n.
+    n_block_size: int
+    # Default for the mma_pv_is_rs kernel argument when the caller passes None.
+    # The tuning comments in _tile_size_fwd_sm90 abbreviate it "RS"
+    # (presumably a register-sourced operand for the P@V MMA -- TODO confirm).
+    mma_pv_is_rs: bool
+    # Default for the intra_wg_overlap kernel argument when the caller passes None.
+    # The tuning comments in _tile_size_fwd_sm90 abbreviate it "OL".
+    intra_wg_overlap: bool
+def _tile_size_fwd_sm90(head_dim, head_dim_v, is_causal, is_local, sparse_block_size_q=None):
+    """Choose the FwdConfig (tile_m, tile_n, mma_pv_is_rs, intra_wg_overlap) for SM90 forward.
+    Tile sizes and flags are based on tile_size_fwd_sm90 in hopper/tile_size.h, adjusted
+    for the Python kernel's different register/smem tradeoffs (benchmarked on H100 SXM).
+    For head_dim <= 96 the preferred tile_m is 192. When sparse_block_size_q is given
+    and is not a multiple of 192, a 128x128 tile is returned instead. Larger head
+    dims always use tile_m=128 and do not consult sparse_block_size_q.
+    """
+    def _tile_m_192_allowed():
+        # No sparse Q block size, or 192 divides it evenly.
+        return sparse_block_size_q is None or sparse_block_size_q % 192 == 0
+    if head_dim <= 64:
+        # C++: 192×192 non-causal, 192×128 causal/local.
+        # Python: 192×128 RS+OL is consistently best across seqlens.
+        tile_m = 192 if _tile_m_192_allowed() else 128
+        return FwdConfig(tile_m, 128, True, True)
+    if head_dim <= 96:
+        # C++: 192×144 noRS+OL for all cases.
+        # Python: RS is catastrophic with 192× tiles (~300 vs ~600 TFLOPS).
+        # noRS+OL is always required. Causal: 192×128 slightly better short seqlen.
+        if not _tile_m_192_allowed():
+            return FwdConfig(128, 128, False, True)
+        tile_n = 128 if (is_causal or is_local) else 144
+        return FwdConfig(192, tile_n, False, True)
+    if head_dim <= 128:
+        return FwdConfig(128, 128, True, True)
+    if head_dim <= 192:
+        if is_local:
+            tile_n = 96
+        elif head_dim_v <= 128:
+            tile_n = 128
+        else:
+            tile_n = 112
+        return FwdConfig(128, tile_n, True, True)
+    # hdim 256
+    tile_n = 64 if is_local else 80
+    return FwdConfig(128, tile_n, True, True)
+@dataclass(frozen=True)
+class BwdConfig:
+    """Immutable bundle of backward-kernel tuning choices returned by _tile_size_bwd_sm90.
+    _flash_attn_bwd copies each field into a local variable of the same name.
+    """
+    # Tile size along the query sequence; seqlen_q is rounded up to a multiple of it.
+    m_block_size: int
+    # Tile size along the key sequence; seqlen_k is rounded up to a multiple of it.
+    n_block_size: int
+    # Pipeline stage counts for the Q, dO and P/dS buffers
+    # (presumably shared-memory staging depth -- TODO confirm against the kernel).
+    num_stages_Q: int
+    num_stages_dO: int
+    num_stages_PdS: int
+    # Per-GEMM operand-swap flags for the S/dP, dK/dV and dQ matmuls
+    # (exact semantics live in the kernel -- TODO confirm).
+    SdP_swapAB: bool
+    dKV_swapAB: bool
+    dQ_swapAB: bool
+    # MMA atom layout factors for the same three matmuls (semantics defined by the kernel).
+    AtomLayoutMSdP: int
+    AtomLayoutNdKV: int
+    AtomLayoutMdQ: int
+    num_wg: int = 2  # MMA warp groups (total threads = (num_wg + 1) * 128)
+    # Only set to True by the 64 < head_dim <= 96 configuration in this file.
+    dQ_single_wg: bool = False
+def _tile_size_bwd_sm90(head_dim, head_dim_v, causal, local, sparse_block_size_q=None):
+    """Choose the BwdConfig for the SM90 backward pass from the head dimensions.
+    Configs are based on C++ FA3 hopper/flash_bwd_launch_template.h and were
+    benchmarked on H100 SXM.
+    sparse_block_size_q is only consulted for 96 < head_dim <= 128: there
+    m_block_size falls back to 64 when it does not divide the sparse Q block size.
+    """
+    # Settings shared by every head_dim <= 128 configuration.
+    upto_128 = dict(
+        n_block_size=128,
+        num_stages_Q=2, num_stages_dO=2, num_stages_PdS=2,
+        SdP_swapAB=True, dKV_swapAB=False,
+        AtomLayoutMSdP=1, AtomLayoutNdKV=2,
+    )
+    if head_dim <= 64:
+        # C++ FA3: 128, 128, 64, ..., 2, 2, true, false, false, 2, 1, 2, 2
+        return BwdConfig(m_block_size=128, dQ_swapAB=False, AtomLayoutMdQ=2, **upto_128)
+    if head_dim <= 96:
+        # C++ FA3: 64, 128, 96, dQ_swapAB=False
+        return BwdConfig(
+            m_block_size=64, dQ_swapAB=False, AtomLayoutMdQ=1, dQ_single_wg=True, **upto_128
+        )
+    if head_dim <= 128:
+        # C++ FA3: causal/local: 64, 128; non-causal: 80, 128 with dQ_swapAB
+        tile_m = 64 if (causal or local) else 80
+        if sparse_block_size_q is not None and sparse_block_size_q % tile_m != 0:
+            tile_m = 64
+        # dQ_swapAB is enabled exactly when the M tile is not a multiple of 64.
+        return BwdConfig(
+            m_block_size=tile_m, dQ_swapAB=tile_m % 64 != 0, AtomLayoutMdQ=1, **upto_128
+        )
+    if head_dim <= 192:
+        # The two head_dim_v cases differ only in the number of dO stages.
+        return BwdConfig(
+            m_block_size=64, n_block_size=96,
+            num_stages_Q=2,
+            num_stages_dO=2 if head_dim_v <= 128 else 1,
+            num_stages_PdS=1,
+            SdP_swapAB=False, dKV_swapAB=True, dQ_swapAB=False,
+            AtomLayoutMSdP=1, AtomLayoutNdKV=2, AtomLayoutMdQ=1,
+            num_wg=2,
+        )
+    # hdim 256
+    return BwdConfig(
+        m_block_size=64, n_block_size=64,
+        num_stages_Q=1, num_stages_dO=1, num_stages_PdS=1,
+        SdP_swapAB=False, dKV_swapAB=False, dQ_swapAB=False,
+        AtomLayoutMSdP=1, AtomLayoutNdKV=1, AtomLayoutMdQ=1,
+    )
 def maybe_contiguous(x):
+    """Return x.contiguous() when x is not None and its last-dim stride is not 1; otherwise return x unchanged (None stays None)."""
     return x.contiguous() if x is not None and x.stride(-1) != 1 else x
     assert t.shape == expected_shape, f"{name} shape {t.shape} != expected {expected_shape}"
     assert t.dtype == expected_dtype, f"{name} dtype {t.dtype} != expected {expected_dtype}"
     assert t.device == expected_device, f"{name} device {t.device} != expected {expected_device}"
+    if not is_fake_mode():
+        assert t.is_cuda, f"{name} must be on CUDA"
 torch2cute_dtype_map = {
     return min(num_SMs // total_mblocks, max_splits, num_n_blocks)
+def _resolve_causal_local_window(causal, window_size_left, window_size_right, mask_mod=None):
+    """Normalize the causal / sliding-window arguments into canonical form.
+    Rules, applied in order:
+      * With a mask_mod, causal and local are both off and the windows pass through.
+      * causal forces the right window to 0.
+      * If both windows are set and sum to a negative value, both are dropped.
+      * No window left: local is off and causal is returned as given.
+      * Only a right window of 0: plain causal, with the window cleared.
+      * Anything else: local attention with the given windows.
+    Returns (causal, local, window_size_left, window_size_right).
+    """
+    if mask_mod is not None:
+        return False, False, window_size_left, window_size_right
+    left = window_size_left
+    right = 0 if causal else window_size_right
+    if left is not None and right is not None and left + right < 0:
+        left = right = None
+    if left is None and right is None:
+        return causal, False, left, right
+    if left is None and right == 0:
+        # A right window of 0 with no left bound is exactly causal masking.
+        return True, False, left, None
+    return False, True, left, right
 def _flash_attn_fwd(
     q: torch.Tensor,
     k: torch.Tensor,
     window_size_left: Optional[int] = None,
     window_size_right: Optional[int] = None,
     learnable_sink: Optional[torch.Tensor] = None,
+    tile_mn: Optional[Tuple[int, int]] = None,
+    mma_pv_is_rs: Optional[bool] = None,
+    intra_wg_overlap: Optional[bool] = None,
     num_threads: int = 384,
     num_splits: int = 1,
     pack_gqa: Optional[bool] = None,
         mask_mod: A callable that takes token position information and selectively masks
         block_sparse_tensors: A tuple of tensors used for block sparsity.
         return_lse: Whether to return the log softmax of the attention scores. If set to True will always calculate
+            The returned LSE supports taking gradient.
         out: Optional pre-allocated output tensor. If None, will be allocated internally.
         lse: Optional pre-allocated log-sum-exp tensor. If None, will be allocated when needed.
         aux_tensors: Some score_mods will want to read from global aux_tensors. This is how we thread them through to the inner kernel.
         assert learnable_sink.shape == (num_head,)
         assert learnable_sink.dtype == torch.bfloat16, "learnable_sink must be bfloat16"
+    if not is_fake_mode():
+        assert all(
+            t is None or t.is_cuda
+            for t in (
+                q,
+                k,
+                v,
+                cu_seqlens_q,
+                cu_seqlens_k,
+                seqused_q,
+                seqused_k,
+                page_table,
+                learnable_sink,
+            )
+        ), "inputs must be on CUDA device"
+    arch = _get_device_arch() if _arch is None else _arch
+    assert arch // 10 in [8, 9, 10, 11, 12], "Unsupported compute capability. Supported: 8.x, 9.x, 10.x, 11.x, 12.x"
     assert num_head % num_head_kv == 0, "num_head must be divisible by num_head_kv"
     alignment = 16 // q.element_size()
+    if arch // 10 not in [8, 12]:
+        _validate_head_dims(head_dim, head_dim_v, arch // 10, alignment)
     if softmax_scale is None:
         softmax_scale = 1.0 / math.sqrt(head_dim)
     if softcap == 0.0:
         _validate_tensor(lse, "lse", lse_shape, torch.float32, device)
     dtype = torch2cute_dtype_map[q.dtype]
+    use_block_sparsity = block_sparse_tensors is not None
+    causal, local, window_size_left, window_size_right = _resolve_causal_local_window(
+        causal, window_size_left, window_size_right, mask_mod
+    )
+    requested_use_clc_scheduler = utils._get_use_clc_scheduler_default()
+    requested_disable_2cta = utils._get_disable_2cta_default()
+    current_stream = cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True)
+    # SM80/SM120: uses SM80 MMA, 128 threads (4 warps)
+    if arch // 10 in [8, 12]:
+        num_threads = 128
+    fwd_cfg = FwdConfig(128, 128, True, True)  # default
+    if tile_mn is None:
+        if arch // 10 == 12:
+            # SM120 tile sizes tuned for 99 KB SMEM capacity:
+            # D<=64:  128x128 → 48 KB (good occupancy)
+            # D>64:   128x64  → 64 KB (128x128 would use 96 KB, hurting occupancy)
+            if head_dim <= 64:
+                fwd_cfg = FwdConfig(128, 128, True, True)
             else:
+                fwd_cfg = FwdConfig(128, 64, True, True)
+        elif arch // 10 == 8:
+            fwd_cfg = FwdConfig(128, 64, True, True)  # SM80, should tune
+        elif arch // 10 == 9:
+            sparse_q = get_sparse_q_block_size(block_sparse_tensors, seqlen_q)
+            fwd_cfg = _tile_size_fwd_sm90(head_dim, head_dim_v, causal, local, sparse_block_size_q=sparse_q)
     else:
+        fwd_cfg = FwdConfig(tile_mn[0], tile_mn[1], fwd_cfg.mma_pv_is_rs, fwd_cfg.intra_wg_overlap)
+    tile_m, tile_n = fwd_cfg.m_block_size, fwd_cfg.n_block_size
+    if mma_pv_is_rs is None:
+        mma_pv_is_rs = fwd_cfg.mma_pv_is_rs
+    if intra_wg_overlap is None:
+        intra_wg_overlap = fwd_cfg.intra_wg_overlap
+    # TODO: fix GQA + SplitKV + non-varlen
+    if pack_gqa and num_splits != 1 and cu_seqlens_q is None:
+        pack_gqa = False
     if max_seqlen_q is None:
         max_seqlen_q = seqlen_q if cu_seqlens_q is None else total_q
         max_seqlen_k = seqlen_k
     seqlen_q_packgqa = max_seqlen_q * qhead_per_kvhead
     if arch // 10 == 10:
+        q_stage = 2 if seqlen_q_packgqa > tile_m else 1
     else:
         q_stage = 1
+    m_block_size_effective = q_stage * tile_m
+    seqlen_k_loaded = max_seqlen_k if not local else max(0, min(max_seqlen_k, (window_size_right or max_seqlen_k) + (window_size_left or max_seqlen_k) + 1 + tile_m))
+    num_m_blocks = (seqlen_q_packgqa + m_block_size_effective - 1) // m_block_size_effective
+    total_mblocks = batch_size * num_head_kv * num_m_blocks
+    num_n_blocks = (seqlen_k_loaded + tile_n - 1) // tile_n
+    num_SMs = 132 if is_fake_mode() else torch.cuda.get_device_properties(device).multi_processor_count
     if num_splits < 1:
+        num_splits = num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, 128)
+    # SplitKV uses float32 partial output, which doubles the O buffer size
+    # in shared memory, causing OOM for diff-headdim (192, 128)
+    if arch // 10 in [10, 11] and head_dim != head_dim_v and num_splits > 1:
+        if num_n_blocks >= 64:
+            tile_n = 64
+            num_n_blocks = (seqlen_k_loaded + tile_n - 1) // tile_n
+            num_splits = num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, 128)
+        else:
+            num_splits = 1
     is_split_kv = num_splits > 1
     if is_split_kv:
         out_partial = torch.empty(num_splits, *q_batch_seqlen_shape, num_head, head_dim_v, dtype=torch.float32, device=device)
         lse_partial = torch.empty(num_splits, *lse_shape, dtype=torch.float32, device=device)
+    use_2cta_instrs = (
+        arch // 10 in [10, 11]
+        and not requested_disable_2cta
+        and not causal
+        and not local
+        and not is_split_kv
+        and cu_seqlens_q is None
+        and seqused_q is None
+        and not use_block_sparsity
+        and page_size in [None, 128]
+        and int(math.ceil(head_dim / 16) * 16) in [128, 192]
+        and int(math.ceil(head_dim_v / 16) * 16) == 128
+        and seqlen_q_packgqa > 2 * tile_m
+        and (tile_m % qhead_per_kvhead == 0 or not pack_gqa)
+    )
     # hash score and mask mods for compile cache
     score_mod_hash = utils.hash_callable(score_mod) if score_mod is not None else False
     mask_mod_hash = utils.hash_callable(mask_mod) if mask_mod is not None else False
             num_head=num_head,
             seqlen_q=seqlen_q,
             seqlen_k=seqlen_k,
+            block_size=(tile_m, tile_n),
             q_stage=q_stage,
         )
+    if aux_tensors is not None:
         aux_tensor_metadata = get_aux_tensor_metadata(aux_tensors)
     else:
         aux_tensor_metadata = None
     compile_key = (
         dtype,
         head_dim,
         window_size_left is not None,
         window_size_right is not None,
         learnable_sink is not None,
+        tile_m,
+        tile_n,
         q_stage,
         num_threads,
         is_split_kv,
         pack_gqa,
         arch,
+        page_size not in [None, tile_n],  # paged KV non-TMA
+        use_2cta_instrs,
         q_subtile_factor,
+        mma_pv_is_rs,
+        intra_wg_overlap,
+        requested_use_clc_scheduler,
+        fa_logging.get_fa_log_level(),
     )
     if compile_key not in _flash_attn_fwd.compile_cache:
         (
         if aux_tensors is not None:
             cute_aux_tensors = [to_cute_aux_tensor(buf) for buf in aux_tensors]
+        if arch // 10 == 8:
+            assert page_table is None, "paged KV not supported on SM 8.0"
+            assert not is_split_kv, "SplitKV not supported on SM 8.0"
+            fa_fwd = FlashAttentionForwardSm80(
+                dtype,
+                head_dim,
+                head_dim_v,
+                qhead_per_kvhead,
+                is_causal=causal,
+                is_local=local,
+                pack_gqa=pack_gqa,
+                tile_m=tile_m,
+                tile_n=tile_n,
+                num_stages=1,
+                num_threads=num_threads,
+                Q_in_regs=False,
+                score_mod=score_mod,
+                mask_mod=mask_mod,
+                has_aux_tensors=aux_tensors is not None,
+            )
+        elif arch // 10 == 9:
             assert not is_split_kv, "SplitKV not supported on SM 9.0"
             fa_fwd = FlashAttentionForwardSm90(
                 dtype,
                 head_dim,
                 is_causal=causal,
                 is_local=local,
                 pack_gqa=pack_gqa,
+                tile_m=tile_m,
+                tile_n=tile_n,
                 # num_stages=1,
                 num_stages=2,
                 num_threads=num_threads,
                 Q_in_regs=False,
+                intra_wg_overlap=intra_wg_overlap,
+                mma_pv_is_rs=mma_pv_is_rs,
                 mask_mod=mask_mod,
                 score_mod=score_mod,
                 has_aux_tensors=aux_tensors is not None,
                 q_subtile_factor=q_subtile_factor,
+                paged_kv_non_tma=page_size not in [None, tile_n],
             )
         elif arch // 10 in [10, 11]:
             fa_fwd = FlashAttentionForwardSm100(
                 head_dim,
                 head_dim_v,
                 is_local=local,
                 is_split_kv=is_split_kv,
                 pack_gqa=pack_gqa,
+                m_block_size=tile_m,
+                n_block_size=tile_n,
                 q_stage=q_stage,
                 is_persistent=not causal
                     and not local
                 score_mod=score_mod,
                 mask_mod=mask_mod,
                 has_aux_tensors=aux_tensors is not None,
+                paged_kv_non_tma=page_size not in [None, tile_n],
                 is_varlen_q=cu_seqlens_q is not None or seqused_q is not None,
                 q_subtile_factor=q_subtile_factor,
                 use_2cta_instrs=use_2cta_instrs,
+                use_clc_scheduler=requested_use_clc_scheduler,
+            )
+        elif arch // 10 == 12:
+            # SM120 (Blackwell GeForce / DGX Spark): uses SM80 MMA with SM120 SMEM capacity
+            assert not use_block_sparsity, "Block sparsity not supported on SM 12.0"
+            assert page_table is None, "Paged KV not supported on SM 12.0 in this PR"
+            assert not is_split_kv, "SplitKV not supported on SM 12.0 in this PR"
+            fa_fwd = FlashAttentionForwardSm120(
+                dtype,
+                head_dim,
+                head_dim_v,
+                qhead_per_kvhead,
+                is_causal=causal,
+                is_local=local,
+                pack_gqa=pack_gqa,
+                tile_m=tile_m,
+                tile_n=tile_n,
+                num_stages=1,
+                num_threads=num_threads,
+                Q_in_regs=False,
+                score_mod=score_mod,
+                mask_mod=mask_mod,
+                has_aux_tensors=aux_tensors is not None,
             )
         else:
             raise ValueError(
+                f"Unsupported compute capability: {arch}. Supported: 8.x, 9.x, 10.x, 11.x, 12.x"
             )
         # TODO: check @can_implement
         _flash_attn_fwd.compile_cache[compile_key] = cute.compile(
             o_tensor,
             lse_tensor,
             softmax_scale,
             cu_seqlens_q_tensor,
             cu_seqlens_k_tensor,
             seqused_q_tensor,
             learnable_sink_tensor,
             sparse_tensors,
             cute_aux_tensors,
+            current_stream,
             options="--enable-tvm-ffi",
         )
             out.detach() if not is_split_kv else out_partial,
             lse_partial if is_split_kv else lse,
             softmax_scale,
             cu_seqlens_q,
             cu_seqlens_k,
             seqused_q,
 _flash_attn_fwd.compile_cache = get_jit_cache("fwd")
+def make_fake_bwd_tensors(dtype, has_gqa, varlen_q, varlen_k):
+    """Build the symbolic ("fake") tensors used to compile the backward kernels.
+    Every dimension is a fresh cute.sym_int(); the same symbol is reused wherever
+    two tensors must agree on a dimension.
+    Args:
+        dtype: element type of Q/K/V/O/dO/dQ/dK/dV; dtype.width sets their divisibility.
+        has_gqa: if True, KV heads get their own symbol and the float32 dK/dV
+            accumulators are created; if False, h_kv aliases h_q and both
+            accumulators are None.
+        varlen_q: if True, Q-side tensors use a flattened (total_q, ...) leading
+            dimension instead of (b, seqlen_q, ...).
+        varlen_k: same as varlen_q, for the K-side tensors.
+    Returns:
+        Tuple (mQ, mK, mV, mO, mdO, mdQ, mdK, mdV, mLSE, mLSElog2, mPdPsum,
+        dQaccum, mdKaccum, mdVaccum).
+    """
+    sym = cute.sym_int
+    # divisibility in elements: assumed_align_bytes = divisibility * dtype.width // 8
+    # For 16-byte align: fp16/bf16 → divisibility=8, float32 → divisibility=4
+    div = 128 // dtype.width  # 8 for fp16/bf16
+    # Shared sym_ints for dimensions that must match across tensors
+    b, seqlen_q, seqlen_k, h_q, d, d_v = sym(), sym(), sym(), sym(), sym(), sym()
+    h_kv = h_q if not has_gqa else sym()
+    seqlen_q_rounded, seqlen_k_rounded = sym(), sym()
+    # NOTE(review): seqlen_k_d_rounded (here) and total_k_d_rounded (below) are created
+    # but never used; mdKaccum uses seqlen_k_rounded / total_k_rounded instead, while
+    # dQaccum uses the *_d_rounded symbol. Confirm whether this is intentional.
+    seqlen_q_d_rounded, seqlen_k_d_rounded, seqlen_k_dv_rounded = sym(), sym(), sym()
+    total_q, total_k, total_q_rounded, total_k_rounded = sym(), sym(), sym(), sym()
+    total_q_d_rounded, total_k_d_rounded, total_k_dv_rounded = sym(), sym(), sym()
+    # Leading dims: (batch, seqlen) for fixed-length, (total,) for varlen.
+    b_seqlenq = (b, seqlen_q) if not varlen_q else (total_q,)
+    b_seqlenk = (b, seqlen_k) if not varlen_k else (total_k,)
+    mQ = fake_tensor(dtype, (*b_seqlenq, h_q, d), divisibility=div)
+    mO = fake_tensor(dtype, (*b_seqlenq, h_q, d_v), divisibility=div)
+    mdO = fake_tensor(dtype, (*b_seqlenq, h_q, d_v), divisibility=div)
+    mK = fake_tensor(dtype, (*b_seqlenk, h_kv, d), divisibility=div)
+    mV = fake_tensor(dtype, (*b_seqlenk, h_kv, d_v), divisibility=div)
+    mdQ = fake_tensor(dtype, (*b_seqlenq, h_q, d), divisibility=div)
+    mdK = fake_tensor(dtype, (*b_seqlenk, h_kv, d), divisibility=div)
+    mdV = fake_tensor(dtype, (*b_seqlenk, h_kv, d_v), divisibility=div)
+    # Float32 per-row statistics and the dQ accumulator; the head dimension comes
+    # before the sequence dimension in these tensors.
+    if not varlen_q:
+        mLSE = fake_tensor(Float32, (b, h_q, seqlen_q), divisibility=1)
+        mLSElog2 = fake_tensor(Float32, (b, h_q, seqlen_q_rounded), divisibility=4)
+        mPdPsum = fake_tensor(Float32, (b, h_q, seqlen_q_rounded), divisibility=4)
+        dQaccum = fake_tensor(Float32, (b, h_q, seqlen_q_d_rounded), divisibility=4)
+    else:
+        mLSE = fake_tensor(Float32, (h_q, total_q), divisibility=1)
+        mLSElog2 = fake_tensor(Float32, (h_q, total_q_rounded), divisibility=4)
+        mPdPsum = fake_tensor(Float32, (h_q, total_q_rounded), divisibility=4)
+        dQaccum = fake_tensor(Float32, (h_q, total_q_d_rounded), divisibility=4)
+    # Float32 dK/dV accumulators are only created when has_gqa is set.
+    if not has_gqa:
+        mdKaccum, mdVaccum = None, None
+    else:
+        if not varlen_k:
+            mdKaccum = fake_tensor(Float32, (b, h_kv, seqlen_k_rounded), divisibility=4)
+            mdVaccum = fake_tensor(Float32, (b, h_kv, seqlen_k_dv_rounded), divisibility=4)
+        else:
+            mdKaccum = fake_tensor(Float32, (h_kv, total_k_rounded), divisibility=4)
+            mdVaccum = fake_tensor(Float32, (h_kv, total_k_dv_rounded), divisibility=4)
+    return mQ, mK, mV, mO, mdO, mdQ, mdK, mdV, mLSE, mLSElog2, mPdPsum, dQaccum, mdKaccum, mdVaccum
+def _compile_bwd_preprocess(
+    dtype, head_dim, head_dim_v, m_block_size, has_cuseqlens_q, has_seqused_q, has_dlse,
+):
+    """Compile bwd preprocess kernel using cute fake tensors (no real GPU tensors needed).
+    The parameter order matches the compile_key tuple built in _bwd_preprocess,
+    which calls this function as _compile_bwd_preprocess(*compile_key).
+    Args:
+        dtype, head_dim, head_dim_v, m_block_size: forwarded to
+            FlashAttentionBackwardPreprocess.
+        has_cuseqlens_q: compile the varlen-Q variant and include a cu_seqlens_q tensor.
+        has_seqused_q: include a seqused_q tensor argument.
+        has_dlse: include a dLSE tensor argument with the same shape as LSE.
+    Returns:
+        The object returned by cute.compile.
+    """
+    # Only mO, mdO, mPdPsum, mLSE, mLSElog2 and mdQaccum are used below; the
+    # remaining fake tensors are unpacked but ignored.
+    mQ, mK, mV, mO, mdO, mdQ, mdK, mdV, mLSE, mLSElog2, mPdPsum, mdQaccum, mdKaccum, mdVaccum = make_fake_bwd_tensors(
+        dtype, has_gqa=True, varlen_q=has_cuseqlens_q, varlen_k=False
+    )
+    # Fixed-length Q shares its batch symbol with mQ; varlen Q has no batch dim
+    # on mQ, so a fresh symbol is used.
+    batch = mQ.shape[0] if not has_cuseqlens_q else cute.sym_int()
+    batchp1 = cute.sym_int()
+    mCuSeqlensQ = fake_tensor(Int32, (batchp1,), divisibility=1) if has_cuseqlens_q else None
+    mSequsedQ = fake_tensor(Int32, (batch,), divisibility=1) if has_seqused_q else None
+    mdLSE = fake_tensor(Float32, mLSE.shape, divisibility=1) if has_dlse else None
+    fa_bwd_pre = FlashAttentionBackwardPreprocess(dtype, head_dim, head_dim_v, m_block_size)
+    # Tensor argument order mirrors the runtime call in _bwd_preprocess.
+    return cute.compile(
+        fa_bwd_pre, mO, mdO, mPdPsum, mLSE, mLSElog2, mdQaccum, mCuSeqlensQ, mSequsedQ, mdLSE,
+        cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True),
+        options="--enable-tvm-ffi",
+    )
+def _bwd_preprocess(
+    out,
+    dout,
+    dpsum,
+    lse,
+    lse_log2,
+    dq_accum,
+    cu_seqlens_q,
+    seqused_q,
+    dlse,
+    dtype,
+    head_dim,
+    head_dim_v,
+    m_block_size,
+):
+    """Backward preprocess: compute (o * dout).sum(dim=-1) - dLSE, lse * log2_e, and zero out dq_accum.
+    The kernel is compiled once per distinct compile key and cached on
+    _bwd_preprocess.compile_cache. In fake mode the kernel is compiled (if
+    needed) but not launched.
+    """
+    # The key order must match the parameter order of _compile_bwd_preprocess.
+    compile_key = (
+        dtype,
+        head_dim,
+        head_dim_v,
+        m_block_size,
+        cu_seqlens_q is not None,
+        seqused_q is not None,
+        dlse is not None,
+    )
+    cache = _bwd_preprocess.compile_cache
+    if compile_key not in cache:
+        cache[compile_key] = _compile_bwd_preprocess(*compile_key)
+    if is_fake_mode():
+        return
+    cache[compile_key](out, dout, dpsum, lse, lse_log2, dq_accum, cu_seqlens_q, seqused_q, dlse)
+_bwd_preprocess.compile_cache = get_jit_cache("bwd_pre")
+def _compile_bwd_postprocess(
+    dtype, hdim, block_size, num_threads, atom_layout, swap_ab,
+    has_cuseqlens_q, has_seqused_q,
+    use_2cta_instrs, cluster_size, arch,
+):
+    """Compile bwd postprocess kernel using cute fake tensors.
+    The parameter order matches the compile_key tuple built in
+    _bwd_postprocess_convert, which calls this function as
+    _compile_bwd_postprocess(*compile_key).
+    Args:
+        dtype, hdim, arch, block_size, num_threads, atom_layout, swap_ab,
+        use_2cta_instrs, cluster_size: forwarded to FlashAttentionBackwardPostprocess.
+        has_cuseqlens_q: compile the varlen variant and include a cu_seqlens tensor.
+        has_seqused_q: include a seqused tensor argument.
+    Returns:
+        The object returned by cute.compile.
+    """
+    # Only mdQaccum and mdQ are used below; the remaining fake tensors are
+    # unpacked but ignored.
+    mQ, mK, mV, mO, mdO, mdQ, mdK, mdV, mLSE, mLSElog2, mPdPsum, mdQaccum, mdKaccum, mdVaccum = make_fake_bwd_tensors(
+        dtype, has_gqa=True, varlen_q=has_cuseqlens_q, varlen_k=False
+    )
+    # Fixed-length Q shares its batch symbol with mQ; varlen Q has no batch dim
+    # on mQ, so a fresh symbol is used.
+    batch = mQ.shape[0] if not has_cuseqlens_q else cute.sym_int()
+    batchp1 = cute.sym_int()
+    mCuSeqlensQ = fake_tensor(Int32, (batchp1,), divisibility=1) if has_cuseqlens_q else None
+    mSeqUsedQ = fake_tensor(Int32, (batch,), divisibility=1) if has_seqused_q else None
+    fa_bwd_post = FlashAttentionBackwardPostprocess(
+        dtype, hdim, arch, block_size, num_threads, atom_layout, swap_ab,
+        use_2cta_instrs=use_2cta_instrs,
+        cluster_size=cluster_size,
+    )
+    # Float32(0.0) stands in for the scale argument that
+    # _bwd_postprocess_convert passes in the same position at call time.
+    return cute.compile(
+        fa_bwd_post, mdQaccum, mdQ, Float32(0.0), mCuSeqlensQ, mSeqUsedQ,
+        cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True),
+        options="--enable-tvm-ffi",
+    )
+def _bwd_postprocess_convert(
+    accum,
+    output,
+    scale,
+    cu_seqlens,
+    seqused,
+    arch,
+    dtype,
+    hdim,
+    block_size,
+    num_threads,
+    atom_layout,
+    swap_ab,
+    use_2cta_instrs=False,
+    cluster_size=1,
+):
+    """Backward postprocess: convert float32 accumulator to bf16/fp16 output.
+    The kernel is compiled once per distinct compile key and cached on
+    _bwd_postprocess_convert.compile_cache. In fake mode the kernel is compiled
+    (if needed) but not launched.
+    """
+    # The key order must match the parameter order of _compile_bwd_postprocess.
+    compile_key = (
+        dtype,
+        hdim,
+        block_size,
+        num_threads,
+        atom_layout,
+        swap_ab,
+        cu_seqlens is not None,
+        seqused is not None,
+        use_2cta_instrs,
+        cluster_size,
+        arch,
+    )
+    cache = _bwd_postprocess_convert.compile_cache
+    if compile_key not in cache:
+        cache[compile_key] = _compile_bwd_postprocess(*compile_key)
+    if is_fake_mode():
+        return
+    cache[compile_key](accum, output, scale, cu_seqlens, seqused)
+_bwd_postprocess_convert.compile_cache = get_jit_cache("bwd_post")
 def _flash_attn_bwd(
     q: torch.Tensor,
     k: torch.Tensor,
     mask_mod: Optional[Callable] = None,
     aux_tensors: Optional[list[torch.Tensor]] = None,
     block_sparse_tensors: Optional[BlockSparseTensorsTorch] = None,
+    dlse: Optional[torch.Tensor] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     arch = _get_device_arch()
+    assert arch // 10 in [9, 10, 11, 12], "Unsupported compute capability. Supported: 9.x, 10.x, 11.x, 12.x"
+    sparse_q = None
+    if block_sparse_tensors is not None and arch // 10 == 9:
+        sparse_q = block_sparse_tensors.block_size[0] if block_sparse_tensors.block_size is not None else 128
     num_head, head_dim = q.shape[-2:]
+    head_dim_v = v.shape[-1]
+    causal, local, window_size_left, window_size_right = _resolve_causal_local_window(
+        causal, window_size_left, window_size_right
+    )
+    if arch // 10 == 12:
+        # SM120: uses SM80 MMA with 99 KB SMEM, 128 threads (4 warps).
+        m_block_size = 64
+        n_block_size = 64
+        if head_dim <= 64:
+            num_stages_Q = 2
+            num_stages_dO = 2
+        else:
+            num_stages_Q = 1
+            num_stages_dO = 1
+        SdP_swapAB = False
         dKV_swapAB = False
+        dQ_swapAB = False
+        AtomLayoutMSdP = 4
+        AtomLayoutNdKV = 4
+        AtomLayoutMdQ = 4
+        V_in_regs = False
+        cluster_size = 1
+        use_2cta_instrs = False
+        num_threads = 128
+        assert not (block_sparse_tensors is not None), "Block sparsity backward not supported on SM 12.0"
+        assert score_mod is None and score_mod_bwd is None, "score_mod backward not supported on SM 12.0"
+        assert mask_mod is None, "mask_mod backward not supported on SM 12.0"
+        assert deterministic is False, "deterministic backward not supported on SM 12.0"
+    elif arch // 10 == 9:
+        cfg = _tile_size_bwd_sm90(
+            head_dim,
+            head_dim_v,
+            causal,
+            local,
+            sparse_block_size_q=sparse_q,
+        )
+        m_block_size = cfg.m_block_size
+        n_block_size = cfg.n_block_size
+        num_stages_Q = cfg.num_stages_Q
+        num_stages_dO = cfg.num_stages_dO
+        num_stages_PdS = cfg.num_stages_PdS
+        SdP_swapAB = cfg.SdP_swapAB
+        dKV_swapAB = cfg.dKV_swapAB
+        dQ_swapAB = cfg.dQ_swapAB
+        AtomLayoutMSdP = cfg.AtomLayoutMSdP
+        AtomLayoutNdKV = cfg.AtomLayoutNdKV
+        AtomLayoutMdQ = cfg.AtomLayoutMdQ
+        num_threads = (cfg.num_wg + 1) * 128
+        dQ_single_wg = cfg.dQ_single_wg
         cluster_size = 1
         use_2cta_instrs = False
         is_varlen = (
             cu_seqlens_q is not None
             or cu_seqlens_k is not None
             or seqused_q is not None
             or seqused_k is not None
         )
     else:
         m_block_size = 128
         n_block_size = 128
         dKV_swapAB = False
         AtomLayoutMdQ = 1
         AtomLayoutNdKV = 1
+        requested_disable_2cta = utils._get_disable_2cta_default()
         disable_2cta = (
+            requested_disable_2cta
             or score_mod is not None
             or score_mod_bwd is not None
             or mask_mod is not None
+            or block_sparse_tensors is not None
         )
         cluster_size = 2 if head_dim >= 128 and not disable_2cta else 1
         use_2cta_instrs = cluster_size==2
     q, k, v, out, dout, lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k = [
         maybe_contiguous(t)
         for t in (q, k, v, out, dout, lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k)
         seqlen_k = max_seqlen_k if max_seqlen_k is not None else total_k
     num_head_kv = k.shape[-2]
     use_block_sparsity = block_sparse_tensors is not None
+    subtile_factor = sparse_q // m_block_size if sparse_q is not None else 2
     seqlen_q_rounded = (seqlen_q + m_block_size - 1) // m_block_size * m_block_size
     seqlen_k_rounded = (seqlen_k + n_block_size - 1) // n_block_size * n_block_size
     num_n_blocks = seqlen_k_rounded // n_block_size
         if t is not None:
             assert t.dtype == torch.int32, "cu_seqlens_q, cu_seqlens_k must be int32"
     assert lse.dtype == torch.float32, "lse must be float32"
+    if dlse is not None:
+        dlse = maybe_contiguous(dlse)
+    if not is_fake_mode():
+        assert all(
+            t is None or t.is_cuda for t in (q, k, v, out, dout, lse, cu_seqlens_q, cu_seqlens_k)
+        ), "inputs must be on CUDA device"
     assert num_head % num_head_kv == 0, "num_head must be divisible by num_head_kv"
     alignment = 16 // q.element_size()
+    if arch // 10 != 12:
+        _validate_head_dims(head_dim, head_dim_v, arch // 10, alignment)
     if softmax_scale is None:
         softmax_scale = 1.0 / math.sqrt(head_dim)
     qhead_per_kvhead = num_head // num_head_kv
         pack_gqa = qhead_per_kvhead > 1
     # pack_gqa backward not yet supported in bwd
     pack_gqa = False
     if score_mod is not None:
         assert score_mod_bwd is not None, "score_mod_bwd is required when score_mod is provided"
         assert softcap == 0.0, "softcap and score_mod are mutually exclusive (different log2 scaling)"
         dpsum = torch.empty(num_head, total_q_rounded_padded, dtype=torch.float32, device=device)
         lse_log2 = torch.empty(num_head, total_q_rounded_padded, dtype=torch.float32, device=device)
+    # GQA (qhead_per_kvhead > 1) needs dK/dV accum+postprocess since multiple Q heads
+    # accumulate into the same dK/dV. SM90 varlen_k with qhead_per_kvhead==1 now uses
+    # ragged TMA tensors for direct store, so no longer needs accum+postprocess.
     dKV_postprocess = qhead_per_kvhead > 1
     if dKV_postprocess:
         head_dim_v_rounded = (head_dim_v + 32 - 1) // 32 * 32
             )
     dtype = torch2cute_dtype_map[q.dtype]
+    current_stream = cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True)
     if deterministic:
+        dQ_semaphore = torch.zeros(batch_size, num_head, seqlen_q_rounded // m_block_size, cluster_size, dtype=torch.int32, device=device)
     else:
         dQ_semaphore = None
     if deterministic and qhead_per_kvhead > 1:
+        dK_semaphore = torch.zeros(batch_size, num_head_kv, seqlen_k_rounded // n_block_size, 2, dtype=torch.int32, device=device)
+        dV_semaphore = torch.zeros(batch_size, num_head_kv, seqlen_k_rounded // n_block_size, 2, dtype=torch.int32, device=device)
     else:
         dK_semaphore = None
         dV_semaphore = None
+    # Preprocess kernel: compute (o * dout).sum(dim=-1) - dLSE, lse * log2_e, and zero out dq_accum.
+    _bwd_preprocess(
+        out, dout, dpsum, lse, lse_log2, dq_accum,
+        cu_seqlens_q, seqused_q, dlse,
+        dtype, head_dim, head_dim_v, m_block_size,
     )
+    # num_threads: SM90 derives it from BwdConfig.num_wg, SM120 sets it to 128 above,
+    # and every other arch (SM100/SM110) is explicitly set to 384 just below.
+    if arch // 10 not in [9, 12]:
+        num_threads = 384
     # Backward kernel: compute dk, dv, dq_accum.
     score_mod_hash = utils.hash_callable(score_mod) if score_mod else False
             subtile_factor=subtile_factor,
         )
+    if arch // 10 in [8, 9, 12]:
         compile_key = (
             arch,
             dtype,
             head_dim_v,
             qhead_per_kvhead,
             causal,
+            window_size_left is not None,
+            window_size_right is not None,
             softcap != 0.0,
             m_block_size,
             n_block_size,
             AtomLayoutNdKV,
             AtomLayoutMdQ,
             V_in_regs,
+            dQ_single_wg,
+            deterministic,
             cu_seqlens_q is None,
             cu_seqlens_k is None,
             seqused_q is None,
             if t is not None else None
             for t in (dQ_semaphore, dK_semaphore, dV_semaphore)
         ]
+        if arch // 10 in [8, 12]:
+            flash_bwd_obj_cls = FlashAttentionBackwardSm120 if arch // 10 == 12 else FlashAttentionBackwardSm80
+            fa_bwd_obj = flash_bwd_obj_cls(
                 dtype,
                 head_dim,
                 head_dim_v,
                 qhead_per_kvhead,
                 m_block_size,
                 n_block_size,
                 num_stages_Q,
                 num_stages_dO,
+                num_threads,
+                pack_gqa,
+                causal,
                 SdP_swapAB,
                 dKV_swapAB,
                 dQ_swapAB,
                 AtomLayoutMSdP,
                 AtomLayoutNdKV,
                 AtomLayoutMdQ,
+                V_in_regs=V_in_regs,
+            )
+        elif arch // 10 == 9:
+            fa_bwd_obj = FlashAttentionBackwardSm90(
+                dtype,
+                head_dim,
+                head_dim_v,
+                qhead_per_kvhead,
+                causal,
+                is_local=local,
+                deterministic=deterministic,
+                tile_m=m_block_size,
+                tile_n=n_block_size,
+                Q_stage=num_stages_Q,
+                dO_stage=num_stages_dO,
+                PdS_stage=num_stages_PdS,
+                SdP_swapAB=SdP_swapAB,
+                dKV_swapAB=dKV_swapAB,
+                dQ_swapAB=dQ_swapAB,
+                AtomLayoutMSdP=AtomLayoutMSdP,
+                AtomLayoutNdKV=AtomLayoutNdKV,
+                AtomLayoutMdQ=AtomLayoutMdQ,
+                num_threads=num_threads,
                 V_in_regs=V_in_regs,
                 score_mod=score_mod,
                 score_mod_bwd=score_mod_bwd,
                 mask_mod=mask_mod,
                 has_aux_tensors=aux_tensors is not None,
                 subtile_factor=subtile_factor,
+                dQ_single_wg=dQ_single_wg,
             )
         else:
             fa_bwd_obj = FlashAttentionBackwardSm100(
             dk_tensor if not dKV_postprocess else dk_accum_tensor,
             dv_tensor if not dKV_postprocess else dv_accum_tensor,
             softmax_scale,
             cu_seqlens_q_tensor,
             cu_seqlens_k_tensor,
             seqused_q_tensor,
             dV_semaphore_tensor,
             cute_aux_tensors,
             sparse_tensors_compile,
+            current_stream,
             options="--enable-tvm-ffi",
         )
     if not is_fake_mode():
             dk if not dKV_postprocess else dk_accum,
             dv if not dKV_postprocess else dv_accum,
             softmax_scale,
             cu_seqlens_q,
             cu_seqlens_k,
             seqused_q,
             normalized_block_sparse_tensors[:4] if normalized_block_sparse_tensors is not None else None,
         )
+    if arch // 10 == 9:
+        # dQ postprocess: match main kernel's MMA WG count, unless dQ_single_wg
+        num_threads_post_dQ = 128 if dQ_single_wg else cfg.num_wg * 128
+        num_threads_post_dKV = cfg.num_wg * 128
+    else:
+        num_threads_post_dQ = 128
+        num_threads_post_dKV = 128
+    # Postprocess: convert dq_accum from float32 to dq in bf16/fp16
+    _bwd_postprocess_convert(
+        dq_accum, dq, softmax_scale,
+        cu_seqlens_q, seqused_q,
+        arch, dtype, head_dim, m_block_size, num_threads_post_dQ,
+        AtomLayoutMdQ, dQ_swapAB,
+        use_2cta_instrs=use_2cta_instrs, cluster_size=1,
     )
     if dKV_postprocess:
+        # Postprocess: convert dk_accum from float32 to dk in bf16/fp16
+        _bwd_postprocess_convert(
+            dk_accum, dk, softmax_scale,
+            cu_seqlens_k, seqused_k,
+            arch, dtype, head_dim, n_block_size, num_threads_post_dKV,
+            AtomLayoutNdKV, dKV_swapAB,
+            cluster_size=cluster_size,
         )
+        # Postprocess: convert dv_accum from float32 to dv in bf16/fp16
+        _bwd_postprocess_convert(
+            dv_accum, dv, 1.0,
+            cu_seqlens_k, seqused_k,
+            arch, dtype, head_dim_v, n_block_size, num_threads_post_dKV,
+            AtomLayoutNdKV, dKV_swapAB,
+            cluster_size=cluster_size,
         )
     return dq, dk, dv
 _flash_attn_bwd.compile_cache = get_jit_cache("bwd")
 class FlashAttnFunc(torch.autograd.Function):
         ctx.window_size = window_size
         ctx.softcap = softcap
         ctx.deterministic = deterministic
+        ctx.return_lse = return_lse
+        ctx.set_materialize_grads(False)
         return out, lse
     @staticmethod
+    def backward(ctx, dout, dlse):
         q, k, v, out, lse = ctx.saved_tensors
+        if not ctx.return_lse:
+            dlse = None
+        if dout is None:
+            dout = torch.zeros_like(out)
         dq, dk, dv = _flash_attn_bwd(
             q,
             k,
             window_size_left=ctx.window_size[0],
             window_size_right=ctx.window_size[1],
             deterministic=ctx.deterministic,
+            dlse=dlse,
         )
         return dq, dk, dv, *((None,) * 20)  # Extra Nones is fine
         ctx.deterministic = deterministic
         ctx.max_seqlen_q = max_seqlen_q
         ctx.max_seqlen_k = max_seqlen_k
+        ctx.return_lse = return_lse
+        ctx.set_materialize_grads(False)
         return out, lse
     @staticmethod
+    def backward(ctx, dout, dlse):
         q, k, v, out, lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k = ctx.saved_tensors
         assert ctx.softcap == 0.0
+        if not ctx.return_lse:
+            dlse = None
+        if dout is None:
+            dout = torch.zeros_like(out)
         dq, dk, dv = _flash_attn_bwd(
             q,
             k,
             max_seqlen_q=ctx.max_seqlen_q,
             max_seqlen_k=ctx.max_seqlen_k,
             deterministic=ctx.deterministic,
+            dlse=dlse,
         )
         return dq, dk, dv, *((None,) * 20)
     )
+def _compile_fwd_combine(
+    dtype, dtype_partial, head_dim, tile_m, k_block_size, log_max_splits,
+    has_cu_seqlens, has_seqused, has_lse, has_varlen_batch_idx,
+):
+    """Compile fwd combine kernel using cute fake tensors (no real GPU tensors needed).
+
+    The positional parameter order mirrors the ``compile_key`` tuple built in
+    ``_flash_attn_fwd_combine``, which calls this function as
+    ``_compile_fwd_combine(*compile_key)``; keep the two in sync.
+
+    Args:
+        dtype: Element type of the combined output ``mO``.
+        dtype_partial: Element type of the per-split partial output ``mO_partial``.
+        head_dim, tile_m, k_block_size, log_max_splits: Kernel configuration,
+            forwarded unchanged to ``FlashAttentionForwardCombine``.
+        has_cu_seqlens: Selects the varlen (4-D partial) layout and compiles in a
+            ``cu_seqlens`` argument; otherwise the batched (5-D partial) layout is used.
+        has_seqused: Whether a ``seqused`` tensor argument is compiled in.
+        has_lse: Whether the combined LSE output is compiled in.
+        has_varlen_batch_idx: Whether a ``varlen_batch_idx`` tensor argument is compiled in.
+
+    Returns:
+        The object returned by ``cute.compile`` for this configuration.
+
+    Raises:
+        RuntimeError: If ``can_implement`` rejects the configuration.
+    """
+    sym = cute.sym_int
+    # NOTE(review): div is derived from dtype_partial only, but is also used for mO
+    # (element type `dtype`) below -- confirm this is intended when the widths differ.
+    div = 128 // dtype_partial.width  # 16-byte alignment in elements
+    fa_combine = FlashAttentionForwardCombine(
+        dtype=dtype,
+        dtype_partial=dtype_partial,
+        head_dim=head_dim,
+        tile_m=tile_m,
+        k_block_size=k_block_size,
+        log_max_splits=log_max_splits,
+    )
+    # Feasibility is checked against a fixed thread count of 256.
+    if not fa_combine.can_implement(
+        dtype, dtype_partial, head_dim, tile_m, k_block_size, log_max_splits,
+        num_threads=256,
+    ):
+        raise RuntimeError(
+            "FlashAttention combine kernel cannot be implemented with given parameters"
+        )
+    # Every dimension except head_dim is symbolic; only head_dim is a concrete value here.
+    # NOTE(review): leading_dim picks the seqlen axis of the LSE tensors, which matches
+    # flash_attn_combine allocating LSE transposed -- presumably the stride-1 axis, confirm.
+    if has_cu_seqlens:
+        # Varlen: (num_splits, total_q, nheads, headdim)
+        num_splits, total_q, nheads = sym(), sym(), sym()
+        mO_partial = fake_tensor(dtype_partial, (num_splits, total_q, nheads, head_dim), divisibility=div)
+        mLSE_partial = fake_tensor(Float32, (num_splits, total_q, nheads), divisibility=1, leading_dim=1)
+        mO = fake_tensor(dtype, (total_q, nheads, head_dim), divisibility=div)
+        mLSE = fake_tensor(Float32, (total_q, nheads), divisibility=1, leading_dim=0) if has_lse else None
+    else:
+        # Batched: (num_splits, batch, seqlen, nheads, headdim)
+        num_splits, batch, seqlen, nheads = sym(), sym(), sym(), sym()
+        mO_partial = fake_tensor(dtype_partial, (num_splits, batch, seqlen, nheads, head_dim), divisibility=div)
+        mLSE_partial = fake_tensor(Float32, (num_splits, batch, seqlen, nheads), divisibility=1, leading_dim=2)
+        mO = fake_tensor(dtype, (batch, seqlen, nheads, head_dim), divisibility=div)
+        mLSE = fake_tensor(Float32, (batch, seqlen, nheads), divisibility=1, leading_dim=1) if has_lse else None
+        # Rebind `batch` to the dimension object stored on the fake tensor; presumably so
+        # the 1-D tensors below share the same batch dimension -- TODO confirm.
+        batch = mO_partial.shape[1]
+    # In the varlen layout there is no batch axis on the partials, so the per-batch 1-D
+    # tensors get their own fresh symbolic length.
+    batch_for_1d = batch if not has_cu_seqlens else sym()
+    batchp1 = sym()
+    mCuSeqlens = fake_tensor(Int32, (batchp1,), divisibility=1) if has_cu_seqlens else None
+    mSeqused = fake_tensor(Int32, (batch_for_1d,), divisibility=1) if has_seqused else None
+    mNumSplitsDynamic = None  # Not parametrized in compile_key
+    mVarlenBatchIdx = fake_tensor(Int32, (batch_for_1d,), divisibility=1) if has_varlen_batch_idx else None
+    mSemaphore = None  # Not parametrized in compile_key
+    # The stream is a placeholder as well: a fake stream created with
+    # use_tvm_ffi_env_stream=True is passed instead of a real CUDA stream.
+    return cute.compile(
+        fa_combine,
+        mO_partial, mLSE_partial, mO, mLSE,
+        mCuSeqlens, mSeqused, mNumSplitsDynamic, mVarlenBatchIdx, mSemaphore,
+        cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True),
+        options="--enable-tvm-ffi",
+    )
 def _flash_attn_fwd_combine(
     out_partial: torch.Tensor,
     lse_partial: torch.Tensor,
     cu_seqlens: Optional[torch.Tensor] = None,
     seqused: Optional[torch.Tensor] = None,
     num_splits_dynamic_ptr: Optional[torch.Tensor] = None,
+    varlen_batch_idx: Optional[torch.Tensor] = None,
     semaphore_to_reset: Optional[torch.Tensor] = None,
 ) -> None:
     """Forward combine kernel for split attention computation.
     Returns:
         None
     """
     assert out_partial.dtype in [torch.float16, torch.bfloat16, torch.float32], (
         "out_partial must be fp16, bf16, or fp32"
     )
+    if not is_fake_mode():
+        assert out_partial.is_cuda and lse_partial.is_cuda, "tensors must be on CUDA device"
     # Determine if this is variable length based on dimensions
     is_varlen = out_partial.dim() == 4
     # Validate optional tensors
     for t, name in [
         (cu_seqlens, "cu_seqlens"),
         (num_splits_dynamic_ptr, "num_splits_dynamic_ptr"),
     ]:
         if t is not None:
+            if not is_fake_mode():
+                assert t.is_cuda, f"{name} must be on CUDA device"
             assert t.is_contiguous(), f"{name} must be contiguous"
     head_dim = out_partial.shape[-1]
     num_splits = out_partial.shape[0]
     assert num_splits <= 256
     k_block_size = 64 if head_dim <= 64 else 128
     # We want kBlockM to be as small as possible to maximize parallelism.
     # E.g., if hdim is 64, we want kBlockM to be 16 so that we can use 256 threads, each reading 4 elements (floats).
+    tile_m = 8 if k_block_size % 128 == 0 else (16 if k_block_size % 64 == 0 else 32)
     log_max_splits = max(math.ceil(math.log2(num_splits)), 4)
+    if tile_m == 8:
         # If tile_m (kBlockM) == 8 then the minimum number of splits is 32.
         # TODO: we can deal with this by using 128 threads instead
         log_max_splits = max(log_max_splits, 5)
     # Create combine kernel configuration
     dtype = torch2cute_dtype_map[out.dtype]
     dtype_partial = torch2cute_dtype_map[out_partial.dtype]
     compile_key = (
         dtype,
         dtype_partial,
         head_dim,
+        tile_m,
         k_block_size,
         log_max_splits,
         cu_seqlens is not None,
         seqused is not None,
         lse is not None,
+        varlen_batch_idx is not None,
     )
     if compile_key not in _flash_attn_fwd_combine.compile_cache:
+        _flash_attn_fwd_combine.compile_cache[compile_key] = _compile_fwd_combine(
+            *compile_key
         )
     if not is_fake_mode():
         _flash_attn_fwd_combine.compile_cache[compile_key](
+            out_partial, lse_partial, out, lse,
+            cu_seqlens, seqused, num_splits_dynamic_ptr, varlen_batch_idx,
             semaphore_to_reset,
         )
     out_dtype: Optional[torch.dtype] = None,
     cu_seqlens: Optional[torch.Tensor] = None,
     seqused: Optional[torch.Tensor] = None,
+    varlen_batch_idx: Optional[torch.Tensor] = None,
     return_lse: bool = True,
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
     """Flash Attention combine function for split attention computation.
         out_dtype: Optional output dtype. If None, will use fp16/bf16 based on input.
         cu_seqlens: Cumulative sequence lengths for variable length sequences
         seqused: Used sequence lengths for each batch
+        varlen_batch_idx: Optional mapping from virtual batch index to real batch index
+            (int32 tensor of shape (batch_size,)). Used by persistent tile schedulers
+            that reorder batch processing for load balancing.
         return_lse: Whether to return the combined LSE tensor. Default is True.
     Returns:
     """
     # Input validation
     assert out_partial.dim() in [4, 5], "out_partial must have 4 or 5 dimensions"
     # Determine if this is variable length based on dimensions
     is_varlen = out_partial.dim() == 4
     if is_varlen:
         # Variable length: (num_splits, total_q, num_heads, head_size)
         num_splits, total_q, num_heads, head_size = out_partial.shape
         batch_size = 1  # Treat as single batch for varlen
         seqlen = total_q
     else:
         # Regular batched: (num_splits, batch_size, seqlen, num_heads, head_size)
         num_splits, batch_size, seqlen, num_heads, head_size = out_partial.shape
     # Determine output dtype
     if out_dtype is None:
         out_dtype = out_partial.dtype
     # Create output if not provided
     device = out_partial.device
     if out is None:
             out = torch.empty(
                 batch_size, seqlen, num_heads, head_size, dtype=out_dtype, device=device
             )
     # Create lse output only if requested
     if return_lse:
         if is_varlen:
+            lse = torch.empty(num_heads, total_q, dtype=torch.float32, device=device)
         else:
+            lse = torch.empty(batch_size, num_heads, seqlen, dtype=torch.float32, device=device)
+        lse = lse.transpose(-1, -2)
     else:
         lse = None
     _flash_attn_fwd_combine(
         out_partial,
         lse_partial,
         lse,
         cu_seqlens,
         seqused,
+        varlen_batch_idx=varlen_batch_idx,
     )
     return out, lse

build/torch-cuda/mask.py CHANGED Viewed

@@ -1,109 +1,102 @@
 # Copyright (c) 2025, Tri Dao.
-from typing import Optional, Callable
 from dataclasses import dataclass
 import cutlass
 import cutlass.cute as cute
-from cutlass import Float32, Int32, const_expr
 from .quack import layout_utils
-from . import utils
 from .seqlen_info import SeqlenInfoQK
 @cute.jit
-def mask_r2p(X: cute.Tensor, col_limit: Int32, arch: int = 90, rank1: bool = False) -> None:
-    # Bit manipulation, compiles down to the R2P instruction
-    # For sm100: we know that tScS_t2r[i][1] == i, for the particular tmem copy atom we're using.
-    # For sm90: instead of comparing limit to 0, 1, 8, 9, 16, 17, ...,
-    # we compare a transformed version of limit to 0, 1, 2, 3, 4, 5, ...
-    if const_expr(arch == 90):
-        col_limit_transformed = col_limit // 8 * 2 + min(col_limit % 8, 2)
-    else:
-        col_limit_transformed = col_limit
-    ncol = const_expr(cute.size(X.shape[cute.rank(X) - 1]) if not rank1 else cute.size(X.shape))
-    # Ideally we'd move by 32 instead of 24, but mask >> i isn't correct for i == 31
-    for s in cutlass.range_constexpr(cute.ceil_div(ncol, 24)):
-        # Don't need to clamp to 32 since the shr.u32 instruction does that already
-        col_limit_right_s = max(col_limit_transformed - s * 24, 0)
-        # 0 -> 0b00...00, 1 -> 0b00...01, ..., 31 -> 0b01...11, 32 -> 0b11...11
-        mask = (1 << col_limit_right_s) - 1
-        # This needs to be range_constexpr, o/w the compiler can't generate the R2P instruction
-        for i in cutlass.range_constexpr(min(24, ncol - s * 24)):
-            in_bound = cutlass.Boolean(mask & (1 << i))
-            c = s * 24 + i
-            if const_expr(rank1):
-                X[c] = X[c] if in_bound else -Float32.inf
-                # This is the equivalent of:
-                # X[s * 24 + i] = X[s * 24 + i] if col_limit_right_s <= i else -Float32.inf
-            else:
-                for r in cutlass.range_constexpr(cute.size(X.shape[0])):
-                    X[r, c] = X[r, c] if in_bound else -Float32.inf
 @cute.jit
-def mask_r2p_transposed(X: cute.Tensor, row_limit_top: Int32, num_rep: int) -> None:
-    # Bit manipulation, compiles down to the R2P instruction
-    # For sm100: we know that tScS_t2r[i][0] has the form 0, 1, ..., 31, 64, ..., 127
-    # or 0, 1, ..., 15, 32, ..., 47, 64, ...
-    # We compare a transformed version of limit to 0, 1, 2, 3, 4, 5, ...
-    # Here we hardcode for the case of 2 warp groups.
-    num_wg = 2
-    row_limit_top_transformed = row_limit_top // (num_rep * num_wg) * num_rep + min(
-        row_limit_top % (num_rep * num_wg), num_rep
-    )
-    ncol = cute.size(X.shape)
-    # Ideally we'd move by 32 instead of 24, but mask >> i isn't correct for i == 31
-    for s in cutlass.range_constexpr(cute.ceil_div(ncol, 24)):
-        row_limit_top_s = max(row_limit_top_transformed - s * 24, 0)
-        # 0 -> 0b00...00, 1 -> 0b00...01, ..., 31 -> 0b01...11, 32 -> 0b11...11
-        mask = (1 << row_limit_top_s) - 1
-        # This needs to be range_constexpr, o/w the compiler can't generate the R2P instruction
-        for i in cutlass.range_constexpr(min(24, ncol - s * 24)):
-            out_bound = cutlass.Boolean(mask & (1 << i))
-            c = s * 24 + i
-            X[c] = -Float32.inf if out_bound else X[c]
-            # tidx = cute.arch.thread_idx()[0] % 256
-            # if tidx == 128:
-            #     cute.printf("tidx = {}, s = {}, i = {}, row_limit_top = {}, row_limit_top_s = {}, mask = {}, out_bound = {}", tidx, s, i, row_limit_top, row_limit_top_s, mask, out_bound)
 @cute.jit
-def mask_r2p_dual_bound(
     X: cute.Tensor,
-    col_limit_left: Int32,  # Inclusive lower bound
-    col_limit_right: Int32,  # Exclusive upper bound
 ) -> None:
-    """
-    Dual-bound masking using two bitmasks for SM100, following mask_r2p.
-    Masks elements where: NOT (col_limit_left <= col < col_limit_right)
-    Uses bit manipulation to create a range mask:
-        mask_right = (1 << right) - 1  -> bits (right-1)..0 are 1
-        mask_left  = (1 << left) - 1   -> bits (left-1)..0 are 1
-        mask_range = mask_range = mask_right & ~ mask_left -> bits (right-1)..left are 1
     """
-    ncol = const_expr(cute.size(X.shape))
-    for s in cutlass.range_constexpr(cute.ceil_div(ncol, 24)):
-        right_s = max(col_limit_right - s * 24, 0)
-        left_s = max(col_limit_left - s * 24, 0)
-        # otherwise cute dsl complains about python int too large to convert into c long
-        right_s = min(right_s, 24)
-        left_s = min(left_s, 24)
-        # bits (right-1)..left are 1
-        mask_right = (1 << right_s) - 1
-        mask_left = (1 << left_s) - 1
-        mask_range = mask_right & ~mask_left
-        # This needs to be range_constexpr, o/w the compiler can't generate the R2P instruction
-        for i in cutlass.range_constexpr(min(24, ncol - s * 24)):
-            in_bound = cutlass.Boolean(mask_range & (1 << i))
-            c = s * 24 + i
-            X[c] = X[c] if in_bound else -Float32.inf
 @dataclass(frozen=True)
@@ -161,8 +154,7 @@ class AttentionMask:
         seqlenk_col_limit = self.seqlen_k - n_block * self.tile_n - thr_col_offset
         if const_expr(not mask_causal and not mask_local and mask_mod is None):
             if const_expr(mask_seqlen):
-                # The compiler now choses not to use R2P
-                r2p = const_expr(False and not self.swap_AB)
                 if const_expr(not r2p):
                     # traverse column index.
                     for c in cutlass.range(cute.size(tScS_mn.shape[1]), unroll_full=True):
@@ -170,7 +162,8 @@ class AttentionMask:
                         for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                             acc_S_mn[r, c] = -Float32.inf if oob else acc_S_mn[r, c]
                 else:
-                    mask_r2p(acc_S_mn, seqlenk_col_limit, arch=90)
         elif const_expr(
             not mask_causal and not mask_local and mask_mod is not None
@@ -272,7 +265,12 @@ class AttentionMask:
                                     else acc_S_mn[r, c]
                                 )
                         else:
-                            mask_r2p(acc_S_mn[r, None], col_limit_right, arch=90, rank1=True)
                 else:  # Local
                     local_row_offset_right = (
                         causal_row_offset + self.window_size_right
@@ -284,6 +282,7 @@ class AttentionMask:
                         if const_expr(self.window_size_left is not None)
                         else None
                     )
                     for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                         if const_expr(self.qhead_per_kvhead_packgqa == 1):
                             row_idx = tScS_mn[r, 0][0] + m_block * self.tile_m
@@ -302,13 +301,22 @@ class AttentionMask:
                             if const_expr(self.window_size_left is not None)
                             else 0
                         )
-                        # if cute.arch.thread_idx()[0] == 128: cute.printf("n_block = {}, r = {}, row_idx = {}, causal_row_offset = {}, col_limit_right = {}, col_limit_left = {}", n_block, r, row_idx, causal_row_offset, col_limit_right, col_limit_left)
-                        # traverse column index.
-                        for c in cutlass.range(cute.size(tScS_mn.shape[1]), unroll_full=True):
-                            col_idx = t0ScS_mn[0, c][1]
-                            # only consider the column index, so the row index sets to 0.
-                            if col_idx >= col_limit_right or col_idx < col_limit_left:
-                                acc_S_mn[r, c] = -Float32.inf
             else:  # swap_AB
                 assert self.qhead_per_kvhead_packgqa == 1
                 thr_row_offset = tScS_mn[0][ROW]
@@ -338,11 +346,18 @@ class AttentionMask:
                         # column, by setting row limit to be self.tile_m.
                         row_limit_top = (
                             self.tile_m
-                            if col0 >= seqlenk_col_limit
-                            else col0 - causal_row_offset - self.window_size_right
                         )
-                        # TODO: do we need col_limit_sink?
-                        row_limit_bot = col0 - causal_row_offset + self.window_size_left
                         for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                             row_idx = t0ScS_mn[r, 0][ROW]
                             acc_S_mn[r, c] = (
@@ -392,7 +407,11 @@ class AttentionMask:
                         # For some reason the 2 lines above generate really bad SASS
                         acc_S[i] = -Float32.inf if tScS_t2r[i][1] >= seqlenk_col_limit else acc_S[i]
                 else:
-                    mask_r2p(acc_S, seqlenk_col_limit, arch=100, rank1=True)
         elif const_expr(not mask_causal and not mask_local and mask_mod is not None):
             # Block sparse case w/ mask_mod
@@ -445,12 +464,12 @@ class AttentionMask:
                     acc_S[i] = -Float32.inf if mask_row >= self.seqlen_q else acc_S[i]
         else:  # Causal or local
-            causal_row_offset = 1 + self.seqlen_k - n_block * self.tile_n - self.seqlen_q
             row_idx = tScS_t2r[0][0] + m_block * self.tile_m
             if const_expr(self.qhead_per_kvhead_packgqa != 1):
                 row_idx = row_idx // self.qhead_per_kvhead_packgqa
             if const_expr(mask_causal):
-                col_limit_right = row_idx + causal_row_offset
                 if const_expr(mask_seqlen):
                     col_limit_right = cutlass.min(col_limit_right, seqlenk_col_limit)
                 # if cute.arch.thread_idx()[0] % 32 == 0:
@@ -460,15 +479,19 @@ class AttentionMask:
                     for i in cutlass.range(ncol, unroll_full=True):
                         acc_S[i] = -Float32.inf if tScS_t2r[i][1] >= col_limit_right else acc_S[i]
                 else:
-                    mask_r2p(acc_S, col_limit_right, arch=100, rank1=True)
             else:
                 local_row_offset_right = (
-                    causal_row_offset + self.window_size_right
                     if const_expr(self.window_size_right is not None)
                     else None
                 )
                 local_row_offset_left = (
-                    causal_row_offset - 1 - self.window_size_left
                     if const_expr(self.window_size_left is not None)
                     else None
                 )
@@ -493,8 +516,15 @@ class AttentionMask:
                             else acc_S[i]
                         )
                 else:
-                    # XOR-based R2P dual bound masking
-                    mask_r2p_dual_bound(acc_S, col_limit_left, col_limit_right)
     @cute.jit
     def apply_mask_sm100_transposed(
@@ -634,7 +664,13 @@ class AttentionMask:
                         )
                 else:
                     num_rep = cute.size(tScS_t2r, mode=[0])  # 16 or 32
-                    mask_r2p_transposed(acc_S, row_limit_top, num_rep)
             else:
                 if const_expr(self.window_size_right is not None):
                     row_limit_top = causal_offset - self.window_size_right
@@ -645,9 +681,31 @@ class AttentionMask:
                 if const_expr(mask_seqlen):
                     if seqlenk_col_limit <= 0:
                         row_limit_top = self.tile_m
-                for i in cutlass.range(cute.size(acc_S.shape), unroll_full=True):
-                    row_idx = t0ScS_t2r[i][ROW]
-                    local_mask = row_idx < row_limit_top
-                    if const_expr(self.window_size_left is not None):
-                        local_mask |= row_idx > row_limit_bot
-                    acc_S[i] = -cutlass.Float32.inf if local_mask else acc_S[i]

 # Copyright (c) 2025, Tri Dao.
+from typing import Optional, Callable, TypeAlias
 from dataclasses import dataclass
 import cutlass
 import cutlass.cute as cute
+from cutlass import Float32, Int32, Uint32, const_expr
 from .quack import layout_utils
+from . import utils as utils
 from .seqlen_info import SeqlenInfoQK
+MaskGenFn: TypeAlias = Callable[[int], Uint32]
+MASK_R2P_CHUNK_SIZE: int = 32
 @cute.jit
+def r2p_bitmask_below(limit: Int32, s: int) -> Uint32:
+    """32-bit R2P bitmask keeping positions < limit (exclusive upper bound).
+    Bit i of the result is 1 (keep) when element index
+    s * MASK_R2P_CHUNK_SIZE + i is < limit, and 0 (mask) otherwise. For chunks
+    that lie entirely at or beyond `limit` the shift count exceeds 31; the
+    all-masked result then relies on utils.shr_u32 yielding 0 for such counts
+    (presumably what the PTX note below refers to -- confirm in utils).
+    Args:
+        limit: exclusive upper bound, expressed as an element index over the whole row.
+        s: Python-int index of the MASK_R2P_CHUNK_SIZE-wide chunk to build the mask for.
+    Returns:
+        0xFFFFFFFF shifted right (utils.shr_u32) by the number of positions by
+        which the end of chunk `s` overshoots `limit`.
+    Uses inline PTX to avoid shift-by-type-width UB.
+    """
+    # Distance from `limit` to the end of chunk `s`, clamped so it is never negative.
+    m = max((s + 1) * MASK_R2P_CHUNK_SIZE - limit, 0)
+    return utils.shr_u32(Uint32(0xFFFFFFFF), Uint32(m))
 @cute.jit
+def r2p_bitmask_above(limit: Int32, s: int) -> Uint32:
+    """32-bit R2P bitmask keeping positions >= limit (inclusive lower bound).
+    Bit i of the result is 1 (keep) when element index
+    s * MASK_R2P_CHUNK_SIZE + i is >= limit, and 0 (mask) otherwise. For chunks
+    that lie entirely below `limit` the shift count exceeds 31; the all-masked
+    result then relies on utils.shl_u32 yielding 0 for such counts
+    (presumably what the PTX note below refers to -- confirm in utils).
+    Args:
+        limit: inclusive lower bound, expressed as an element index over the whole row.
+        s: Python-int index of the MASK_R2P_CHUNK_SIZE-wide chunk to build the mask for.
+    Returns:
+        0xFFFFFFFF shifted left (utils.shl_u32) by the number of positions by
+        which `limit` lies past the start of chunk `s`.
+    Uses inline PTX to avoid shift-by-type-width UB.
+    """
+    # Distance from the start of chunk `s` to `limit`, clamped so it is never negative.
+    n = max(limit - s * MASK_R2P_CHUNK_SIZE, 0)
+    return utils.shl_u32(Uint32(0xFFFFFFFF), Uint32(n))
 @cute.jit
+def mask_r2p_lambda(
     X: cute.Tensor,
+    mask_gen_fn: cutlass.Constexpr[MaskGenFn],
+    rank1: bool = False,
 ) -> None:
+    """Apply R2P masking with a custom bitmask generator.
+    mask_gen_fn(chunk_idx: constexpr int) -> Uint32:
+        Returns a 32-bit bitmask for the chunk. Bit i set means column
+        chunk_idx * chunk_size + i is KEPT; bit i clear means masked to -inf.
+    Args:
+        X: tensor masked in place; elements whose bit is clear are overwritten
+            with -Float32.inf, the others are left unchanged.
+        mask_gen_fn: bitmask generator described above; called once per chunk.
+        rank1: when True, X is indexed with a single coordinate (X[c]) and the
+            column count is the total size of X; when False, X is indexed as
+            X[r, c], the column count is the size of X's last mode, and the same
+            column mask is applied to every r in X.shape[0].
     """
+    ncol = const_expr(cute.size(X.shape[cute.rank(X) - 1]) if not rank1 else cute.size(X.shape))
+    # 32-column chunks. The mask_gen_fn returns a Uint32 bitmask (1=keep).
+    CHUNK_SIZE = MASK_R2P_CHUNK_SIZE
+    for s in cutlass.range_constexpr(cute.ceil_div(ncol, CHUNK_SIZE)):
+        mask = mask_gen_fn(s)
+        # This needs to be range_constexpr, o/w the compiler can't generate the R2P instruction
+        # The last chunk may be partial, hence the min() on the trip count.
+        for i in cutlass.range_constexpr(min(CHUNK_SIZE, ncol - s * CHUNK_SIZE)):
+            in_bound = cutlass.Boolean(mask & (Uint32(1) << i))
+            c = s * CHUNK_SIZE + i
+            if const_expr(rank1):
+                X[c] = X[c] if in_bound else -Float32.inf
+            else:
+                for r in cutlass.range_constexpr(cute.size(X.shape[0])):
+                    X[r, c] = X[r, c] if in_bound else -Float32.inf
+@cute.jit
+def sm90_col_to_r2p_idx(col_limit: Int32) -> Int32:
+    """Transform SM90 MMA column coordinate to R2P element index.
+    SM90 MMA accumulator column indices are non-contiguous: 0, 1, 8, 9, 16, 17, ...
+    Element indices are contiguous: 0, 1, 2, 3, 4, 5, ...
+    This converts a column-space threshold to element-space for r2p_bitmask_below/above.
+    Args:
+        col_limit: threshold in column coordinates.
+    Returns:
+        The number of owned columns (per the pattern above) that are strictly
+        below `col_limit`: each full group of 8 columns contributes 2 elements,
+        and the remainder contributes at most 2 more. For example
+        0 -> 0, 1 -> 1, 2..8 -> 2, 9 -> 3, 16 -> 4.
+    NOTE(review): for negative `col_limit` the result depends on how the DSL
+    evaluates `//` and `%` on Int32 -- confirm against callers.
+    """
+    return col_limit // 8 * 2 + min(col_limit % 8, 2)
+@cute.jit
+def row_to_r2p_idx(x: Int32, num_rep: int, num_wg: int) -> Int32:
+    """Convert a row coordinate to an R2P element index in the warp-group interleaved layout.
+    In the SM100 backward pass, 2 warp groups share TMEM. The TMEM load atom
+    distributes rows in an interleaved pattern: elements 0..num_rep-1 map to
+    rows 0..num_rep-1 (warp group 0), elements num_rep..2*num_rep-1 map to
+    rows num_rep*num_wg..num_rep*num_wg+num_rep-1 (warp group 1), and so on.
+    Row-coordinate thresholds (causal limits, window bounds, uih_len) must be
+    converted to element indices before use with r2p_bitmask_above/below.
+    Rows not owned by this thread (in the gap between warp groups) are clamped
+    to the boundary element index, which is safe because R2P thresholds are
+    monotonic.
+    Example with num_rep=16, num_wg=2:
+        row  0 -> elem  0,  row 15 -> elem 15,
+        row 16 -> elem 16 (clamped), row 31 -> elem 16 (clamped),
+        row 32 -> elem 16, row 33 -> elem 17, row 47 -> elem 31.
+    Args:
+        x: row-coordinate threshold to convert.
+        num_rep: number of consecutive rows in each owned run.
+        num_wg: interleave factor; one owned run of `num_rep` rows occurs every
+            num_rep * num_wg rows.
+    Returns:
+        num_rep elements for every complete period of num_rep * num_wg rows
+        below `x`, plus the offset of `x` inside its period capped at num_rep.
+    """
+    return x // (num_rep * num_wg) * num_rep + min(x % (num_rep * num_wg), num_rep)
 @dataclass(frozen=True)
         seqlenk_col_limit = self.seqlen_k - n_block * self.tile_n - thr_col_offset
         if const_expr(not mask_causal and not mask_local and mask_mod is None):
             if const_expr(mask_seqlen):
+                r2p = const_expr(not self.swap_AB)
                 if const_expr(not r2p):
                     # traverse column index.
                     for c in cutlass.range(cute.size(tScS_mn.shape[1]), unroll_full=True):
                         for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                             acc_S_mn[r, c] = -Float32.inf if oob else acc_S_mn[r, c]
                 else:
+                    seqlenk_col_limit_r2p = sm90_col_to_r2p_idx(seqlenk_col_limit)
+                    mask_r2p_lambda(acc_S_mn, lambda s: r2p_bitmask_below(seqlenk_col_limit_r2p, s))
         elif const_expr(
             not mask_causal and not mask_local and mask_mod is not None
                                     else acc_S_mn[r, c]
                                 )
                         else:
+                            col_limit_r2p = sm90_col_to_r2p_idx(col_limit_right)
+                            mask_r2p_lambda(
+                                acc_S_mn[r, None],
+                                lambda s: r2p_bitmask_below(col_limit_r2p, s),
+                                rank1=True,
+                            )
                 else:  # Local
                     local_row_offset_right = (
                         causal_row_offset + self.window_size_right
                         if const_expr(self.window_size_left is not None)
                         else None
                     )
+                    r2p_local = const_expr(not self.swap_AB)
                     for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                         if const_expr(self.qhead_per_kvhead_packgqa == 1):
                             row_idx = tScS_mn[r, 0][0] + m_block * self.tile_m
                             if const_expr(self.window_size_left is not None)
                             else 0
                         )
+                        if const_expr(not r2p_local):
+                            # traverse column index.
+                            for c in cutlass.range(cute.size(tScS_mn.shape[1]), unroll_full=True):
+                                col_idx = t0ScS_mn[0, c][1]
+                                if col_idx >= col_limit_right or col_idx < col_limit_left:
+                                    acc_S_mn[r, c] = -Float32.inf
+                        else:
+                            col_limit_right_r2p = sm90_col_to_r2p_idx(col_limit_right)
+                            col_limit_left_r2p = sm90_col_to_r2p_idx(col_limit_left)
+                            def mask_gen_fn(s: int) -> Uint32:
+                                return r2p_bitmask_below(
+                                    col_limit_right_r2p, s
+                                ) & r2p_bitmask_above(col_limit_left_r2p, s)
+                            mask_r2p_lambda(acc_S_mn[r, None], mask_gen_fn, rank1=True)
             else:  # swap_AB
                 assert self.qhead_per_kvhead_packgqa == 1
                 thr_row_offset = tScS_mn[0][ROW]
                         # column, by setting row limit to be self.tile_m.
                         row_limit_top = (
                             self.tile_m
+                            if col0 >= seqlenk_col_limit and mask_seqlen
+                            else (
+                                col0 - causal_row_offset - self.window_size_right
+                                if const_expr(self.window_size_right is not None)
+                                else 0
+                            )
+                        )
+                        row_limit_bot = (
+                            col0 - causal_row_offset + self.window_size_left
+                            if const_expr(self.window_size_left is not None)
+                            else self.tile_m
                         )
                         for r in cutlass.range(cute.size(tScS_mn.shape[0]), unroll_full=True):
                             row_idx = t0ScS_mn[r, 0][ROW]
                             acc_S_mn[r, c] = (
                         # For some reason the 2 lines above generate really bad SASS
                         acc_S[i] = -Float32.inf if tScS_t2r[i][1] >= seqlenk_col_limit else acc_S[i]
                 else:
+                    mask_r2p_lambda(
+                        acc_S,
+                        lambda s: r2p_bitmask_below(seqlenk_col_limit, s),
+                        rank1=True,
+                    )
         elif const_expr(not mask_causal and not mask_local and mask_mod is not None):
             # Block sparse case w/ mask_mod
                     acc_S[i] = -Float32.inf if mask_row >= self.seqlen_q else acc_S[i]
         else:  # Causal or local
+            causal_row_offset = self.seqlen_k - n_block * self.tile_n - self.seqlen_q
             row_idx = tScS_t2r[0][0] + m_block * self.tile_m
             if const_expr(self.qhead_per_kvhead_packgqa != 1):
                 row_idx = row_idx // self.qhead_per_kvhead_packgqa
             if const_expr(mask_causal):
+                col_limit_right = row_idx + causal_row_offset + 1
                 if const_expr(mask_seqlen):
                     col_limit_right = cutlass.min(col_limit_right, seqlenk_col_limit)
                 # if cute.arch.thread_idx()[0] % 32 == 0:
                     for i in cutlass.range(ncol, unroll_full=True):
                         acc_S[i] = -Float32.inf if tScS_t2r[i][1] >= col_limit_right else acc_S[i]
                 else:
+                    mask_r2p_lambda(
+                        acc_S,
+                        lambda s: r2p_bitmask_below(col_limit_right, s),
+                        rank1=True,
+                    )
             else:
                 local_row_offset_right = (
+                    causal_row_offset + 1 + self.window_size_right
                     if const_expr(self.window_size_right is not None)
                     else None
                 )
                 local_row_offset_left = (
+                    causal_row_offset - self.window_size_left
                     if const_expr(self.window_size_left is not None)
                     else None
                 )
                             else acc_S[i]
                         )
                 else:
+                    # Dual-bound R2P masking for SM100.
+                    # Masks elements where: NOT (col_limit_left <= col < col_limit_right)
+                    def mask_gen_fn(s: int) -> Uint32:
+                        return r2p_bitmask_below(col_limit_right, s) & r2p_bitmask_above(
+                            col_limit_left, s
+                        )
+                    mask_r2p_lambda(acc_S, mask_gen_fn, rank1=True)
     @cute.jit
     def apply_mask_sm100_transposed(
                         )
                 else:
                     num_rep = cute.size(tScS_t2r, mode=[0])  # 16 or 32
+                    num_wg = 2
+                    row_limit = row_to_r2p_idx(row_limit_top, num_rep, num_wg)
+                    mask_r2p_lambda(
+                        acc_S,
+                        lambda s: r2p_bitmask_above(row_limit, s),
+                        rank1=True,
+                    )
             else:
                 if const_expr(self.window_size_right is not None):
                     row_limit_top = causal_offset - self.window_size_right
                 if const_expr(mask_seqlen):
                     if seqlenk_col_limit <= 0:
                         row_limit_top = self.tile_m
+                r2p = True
+                if const_expr(not r2p):
+                    for i in cutlass.range(cute.size(acc_S.shape), unroll_full=True):
+                        row_idx = t0ScS_t2r[i][ROW]
+                        local_mask = row_idx < row_limit_top
+                        if const_expr(self.window_size_left is not None):
+                            local_mask |= row_idx > row_limit_bot
+                        acc_S[i] = -cutlass.Float32.inf if local_mask else acc_S[i]
+                else:
+                    def mask_gen_fn(s: int) -> Uint32:
+                        num_rep = cute.size(tScS_t2r, mode=[0])
+                        num_wg = 2
+                        row_limit = row_to_r2p_idx(row_limit_top, num_rep, num_wg)
+                        mask = r2p_bitmask_above(row_limit, s)
+                        if const_expr(self.window_size_left is not None):
+                            row_limit_bottom = row_to_r2p_idx(row_limit_bot + 1, num_rep, num_wg)
+                            mask = mask & r2p_bitmask_below(row_limit_bottom, s)
+                        return mask
+                    mask_r2p_lambda(
+                        acc_S,
+                        mask_gen_fn,
+                        rank1=True,
+                    )

build/torch-cuda/named_barrier.py CHANGED Viewed

@@ -12,6 +12,19 @@ class NamedBarrierFwd(enum.IntEnum):
     PEmpty = enum.auto()
 class NamedBarrierBwd(enum.IntEnum):
     Epilogue = enum.auto()
     WarpSchedulerWG1 = enum.auto()
@@ -20,8 +33,10 @@ class NamedBarrierBwd(enum.IntEnum):
     PdS = enum.auto()
     dQFullWG0 = enum.auto()
     dQFullWG1 = enum.auto()
     dQEmptyWG0 = enum.auto()
     dQEmptyWG1 = enum.auto()
 class NamedBarrierBwdSm100(enum.IntEnum):

     PEmpty = enum.auto()
+class NamedBarrierFwdSm100(enum.IntEnum):
+    """Named-barrier identifiers, presumably for the SM100 forward kernel (inferred from the name).
+    Members are numbered by enum.auto(), i.e. consecutively from 1 in declaration
+    order (Epilogue = 1 ... SoftmaxStatsW7 = 10), so reordering or inserting
+    members changes every later value.
+    """
+    Epilogue = enum.auto()  # starts from 1 as barrier 0 is reserved for sync_threads()
+    TmemPtr = enum.auto()
+    SoftmaxStatsW0 = enum.auto()
+    SoftmaxStatsW1 = enum.auto()
+    SoftmaxStatsW2 = enum.auto()
+    SoftmaxStatsW3 = enum.auto()
+    SoftmaxStatsW4 = enum.auto()
+    SoftmaxStatsW5 = enum.auto()
+    SoftmaxStatsW6 = enum.auto()
+    SoftmaxStatsW7 = enum.auto()
 class NamedBarrierBwd(enum.IntEnum):
     Epilogue = enum.auto()
     WarpSchedulerWG1 = enum.auto()
     PdS = enum.auto()
     dQFullWG0 = enum.auto()
     dQFullWG1 = enum.auto()
+    dQFullWG2 = enum.auto()
     dQEmptyWG0 = enum.auto()
     dQEmptyWG1 = enum.auto()
+    dQEmptyWG2 = enum.auto()
 class NamedBarrierBwdSm100(enum.IntEnum):

build/torch-cuda/pack_gqa.py CHANGED Viewed

@@ -1,25 +1,123 @@
 # Copyright (c) 2025, Tri Dao.
 import cutlass
 import cutlass.cute as cute
 from .quack import layout_utils
-from . import utils
 class PackGQA:
-    def __init__(
-        self,
-        m_block_size: cutlass.Constexpr[int],
-        head_dim_padded: cutlass.Constexpr[int],
-        check_hdim_oob: cutlass.Constexpr[bool],
-        qhead_per_kvhead: cutlass.Constexpr[bool],
-    ):
-        self.m_block_size = m_block_size
-        self.head_dim_padded = head_dim_padded
-        self.check_hdim_oob = check_hdim_oob
-        self.qhead_per_kvhead = qhead_per_kvhead
     @cute.jit
     def compute_ptr(

 # Copyright (c) 2025, Tri Dao.
+from dataclasses import dataclass
+from typing import Union, Tuple
 import cutlass
 import cutlass.cute as cute
+from cutlass.cute.nvgpu import cpasync
 from .quack import layout_utils
+from . import utils as utils
+def pack_gqa_layout(T, qhead_per_kvhead, nheads_kv, head_idx):
+    """Reshape a tensor to fold qhead_per_kvhead into the seqlen dimension (mode 0).
+    The head dimension is at mode ``head_idx``.  Modes before it (1..head_idx-1)
+    are kept as-is (e.g. headdim for Q/O tensors), and modes after it are kept
+    as-is (e.g. batch).
+    For Q/O tensors (head_idx=2):
+        (seqlen_q, headdim, nheads, batch, ...) -> ((qhead_per_kvhead, seqlen_q), headdim, nheads_kv, batch, ...)
+    For LSE tensors (head_idx=1):
+        (seqlen_q, nheads, batch, ...) -> ((qhead_per_kvhead, seqlen_q), nheads_kv, batch, ...)
+    Args:
+        T: tensor to re-view; only its iterator, shape and stride are read.
+        qhead_per_kvhead: number of query heads folded into mode 0 per KV head.
+        nheads_kv: extent of the head mode in the result. It is taken from the
+            caller and not checked here -- presumably
+            T.shape[head_idx] // qhead_per_kvhead.
+        head_idx: index of the head mode in T.
+    Returns:
+        A new tensor over the same iterator as T with the packed layout; no data is moved.
+    """
+    head_stride = T.stride[head_idx]
+    shape_packed = (
+        (qhead_per_kvhead, T.shape[0]),
+        *[T.shape[i] for i in range(1, head_idx)],
+        nheads_kv,
+        *[T.shape[i] for i in range(head_idx + 1, len(T.shape))],
+    )
+    stride_packed = (
+        # Within mode 0, stepping the query-head-in-group index moves by one head.
+        (head_stride, T.stride[0]),
+        *[T.stride[i] for i in range(1, head_idx)],
+        # Stepping one KV head skips a whole group of qhead_per_kvhead query heads.
+        head_stride * qhead_per_kvhead,
+        *[T.stride[i] for i in range(head_idx + 1, len(T.shape))],
+    )
+    return cute.make_tensor(T.iterator, cute.make_layout(shape_packed, stride=stride_packed))
+def make_packgqa_tiled_tma_atom(
+    op: cute.atom.CopyOp,
+    gmem_tensor: cute.Tensor,
+    smem_layout: Union[cute.Layout, cute.ComposedLayout],
+    cta_tiler: Tuple[int, int],
+    qhead_per_kvhead: int,
+    head_idx: int,
+):
+    """Build a tiled TMA atom for a tensor whose query heads are packed into the seqlen mode.
+    Args:
+        op: copy operation forwarded to cpasync.make_tiled_tma_atom.
+        gmem_tensor: unpacked global-memory tensor with the head mode at ``head_idx``.
+        smem_layout: shared-memory layout forwarded to cpasync.make_tiled_tma_atom.
+        cta_tiler: (seqlen tile, second-mode tile); cta_tiler[0] must be divisible
+            by qhead_per_kvhead (asserted below).
+        qhead_per_kvhead: number of query heads sharing one KV head.
+        head_idx: index of the head mode in ``gmem_tensor``.
+    Returns:
+        (tma_atom, tma_tensor), where tma_tensor is the TMA tensor re-viewed as
+        ((qhead_per_kvhead, seqlen), ..., nheads_kv, ...).
+    """
+    # This packing and unpacking of the layout is so that we keep the same TMA dimension as usual.
+    # e.g. for (seqlen, d, nheads, b) layout, we still have 4D TMA after packing to
+    # ((nheads, seqlen), d, b).
+    # If we instead pack directly to ((qhead_per_kvhead, seqlen), d, nheads_kv, b) we'd have 5D TMA.
+    # Pack the heads (nheads) and seqlen modes into 1: (seqlen, d, nheads, b) -> ((nheads, seqlen), d, b)
+    gmem_tensor = layout_utils.select(
+        gmem_tensor, [head_idx, *range(head_idx), *range(head_idx + 1, cute.rank(gmem_tensor))]
+    )
+    gmem_tensor = cute.group_modes(gmem_tensor, 0, 2)
+    assert cta_tiler[0] % qhead_per_kvhead == 0, (
+        "CTA tile size in the seqlen dimension must be divisible by qhead_per_kvhead"
+    )
+    tma_atom, tma_tensor = cpasync.make_tiled_tma_atom(
+        op,
+        gmem_tensor,
+        smem_layout,
+        ((qhead_per_kvhead, cta_tiler[0] // qhead_per_kvhead), cta_tiler[1]),  # No mcast
+    )
+    # Unpack from ((nheads, seqlen), d, b) -> ((qhead_per_kvhead, seqlen), d, nheads_kv, b)
+    T = tma_tensor
+    shape_packed = (
+        (qhead_per_kvhead, T.shape[0][1]),
+        *[T.shape[i] for i in range(1, head_idx)],
+        T.shape[0][0] // qhead_per_kvhead,
+        *[T.shape[i] for i in range(head_idx, len(T.shape))],
+    )
+    stride_packed = (
+        # Mode 0 keeps the (head, seqlen) strides of the grouped TMA tensor unchanged.
+        *[T.stride[i] for i in range(head_idx)],
+        # The re-inserted KV-head mode steps over qhead_per_kvhead heads at a time.
+        T.stride[0][0] * qhead_per_kvhead,
+        *[T.stride[i] for i in range(head_idx, len(T.shape))],
+    )
+    tma_tensor = cute.make_tensor(T.iterator, cute.make_layout(shape_packed, stride=stride_packed))
+    return tma_atom, tma_tensor
+def unpack_gqa_layout(T, qhead_per_kvhead, head_idx):
+    """Reverse of pack_gqa_layout: unfold qhead_per_kvhead from the seqlen dimension (mode 0).
+    The head dimension is at mode ``head_idx``.  Modes before it (1..head_idx-1)
+    are kept as-is (e.g. headdim for Q/O tensors), and modes after it are kept
+    as-is (e.g. batch).
+    For Q/O tensors (head_idx=2):
+        ((qhead_per_kvhead, seqlen_q), headdim, nheads_kv, batch, ...) -> (seqlen_q, headdim, nheads, batch, ...)
+    For LSE tensors (head_idx=1):
+        ((qhead_per_kvhead, seqlen_q), nheads_kv, batch, ...) -> (seqlen_q, nheads, batch, ...)
+    Args:
+        T: packed tensor whose mode 0 is the nested (qhead_per_kvhead, seqlen_q) mode.
+        qhead_per_kvhead: number of query heads per KV head; multiplies the head extent.
+        head_idx: index of the head mode in T.
+    Returns:
+        A new tensor over the same iterator as T with the unpacked layout; no data is moved.
+    """
+    # Mode 0 of the packed layout is (head-in-group, seqlen); read both strides from it.
+    seqlen_stride = T.stride[0][1]
+    head_stride = T.stride[0][0]
+    shape_unpacked = (
+        T.shape[0][1],
+        *[T.shape[i] for i in range(1, head_idx)],
+        T.shape[head_idx] * qhead_per_kvhead,
+        *[T.shape[i] for i in range(head_idx + 1, len(T.shape))],
+    )
+    stride_unpacked = (
+        seqlen_stride,
+        *[T.stride[i] for i in range(1, head_idx)],
+        # The per-query-head stride comes from mode 0, not from the packed KV-head mode.
+        head_stride,
+        *[T.stride[i] for i in range(head_idx + 1, len(T.shape))],
+    )
+    return cute.make_tensor(T.iterator, cute.make_layout(shape_unpacked, stride=stride_unpacked))
+@dataclass
 class PackGQA:
+    m_block_size: cutlass.Constexpr[int]
+    head_dim_padded: cutlass.Constexpr[int]
+    check_hdim_oob: cutlass.Constexpr[bool]
+    qhead_per_kvhead: cutlass.Constexpr[bool]
     @cute.jit
     def compute_ptr(

build/torch-cuda/paged_kv.py CHANGED Viewed

@@ -28,6 +28,9 @@ class PagedKVManager(ParamsBase):
     head_dim_padded: cutlass.Constexpr[Int32]
     head_dim_v_padded: cutlass.Constexpr[Int32]
     gmem_threads_per_row: cutlass.Constexpr[Int32]
     page_entry_per_thread: Int32
     async_copy_elems: Int32
@@ -55,7 +58,11 @@ class PagedKVManager(ParamsBase):
         head_dim_v_padded: cutlass.Constexpr[Int32],
         num_threads: cutlass.Constexpr[Int32],
         dtype: Type[cutlass.Numeric],
     ):
         universal_copy_bits = 128
         async_copy_elems = universal_copy_bits // dtype.width
         dtype_bytes = dtype.width // 8
@@ -97,7 +104,8 @@ class PagedKVManager(ParamsBase):
         else:
             cV = cute.make_identity_tensor((n_block_size, head_dim_v_padded))
             tVcV = gmem_thr_copy_KV.partition_S(cV)
-            tVpV = utils.predicate_k(tVcV, limit=mV_paged.shape[0])
         return PagedKVManager(
             mPageTable,
@@ -111,6 +119,8 @@ class PagedKVManager(ParamsBase):
             num_threads,
             head_dim_padded,
             head_dim_v_padded,
             gmem_threads_per_row,
             page_entry_per_thread,
             async_copy_elems,
@@ -146,13 +156,17 @@ class PagedKVManager(ParamsBase):
     @cute.jit
     def compute_X_ptr(self, K_or_V: str):
         tPrXPtr = cute.make_rmem_tensor((self.page_entry_per_thread,), cutlass.Int64)
         for i in cutlass.range(self.page_entry_per_thread, unroll=1):
             page = self.tPrPage[i]
             page_offset = self.tPrPageOffset[i]
-            if const_expr(K_or_V == "K"):
-                tPrXPtr[i] = utils.elem_pointer(self.mK_paged, (page_offset, 0, page)).toint()
             else:
-                tPrXPtr[i] = utils.elem_pointer(self.mV_paged, (0, page_offset, page)).toint()
         return tPrXPtr
     @cute.jit
@@ -161,18 +175,24 @@ class PagedKVManager(ParamsBase):
         tPrXPtr = self.compute_X_ptr(K_or_V)
-        # Finesse sX layout to be (M, N).
-        sX_pi = cute.make_tensor(
-            sX.iterator,
-            cute.make_layout(
-                (sX.shape[0][0], (sX.shape[0][1], sX.shape[2])),
-                stride=(sX.stride[0][0], (sX.stride[0][1], sX.stride[2])),
-            ),
-        )
-        if const_expr(K_or_V == "V"):
-            # Need to transpose V
-            sX_pi = cute.make_tensor(sX_pi.iterator, cute.select(sX_pi.layout, mode=[1, 0]))
         head_dim = self.head_dim_v_padded if const_expr(K_or_V == "V") else self.head_dim_padded
         cX = cute.make_identity_tensor((self.n_block_size, head_dim))

     head_dim_padded: cutlass.Constexpr[Int32]
     head_dim_v_padded: cutlass.Constexpr[Int32]
+    arch: cutlass.Constexpr[Int32]
+    v_gmem_transposed: cutlass.Constexpr[bool]
     gmem_threads_per_row: cutlass.Constexpr[Int32]
     page_entry_per_thread: Int32
     async_copy_elems: Int32
         head_dim_v_padded: cutlass.Constexpr[Int32],
         num_threads: cutlass.Constexpr[Int32],
         dtype: Type[cutlass.Numeric],
+        arch: cutlass.Constexpr[int] = 100,
     ):
+        # SM100 transposes V in gmem to (dv, page_size, num_pages);
+        # SM90 keeps V as (page_size, dv, num_pages), same layout as K.
+        v_gmem_transposed = arch != 90
         universal_copy_bits = 128
         async_copy_elems = universal_copy_bits // dtype.width
         dtype_bytes = dtype.width // 8
         else:
             cV = cute.make_identity_tensor((n_block_size, head_dim_v_padded))
             tVcV = gmem_thr_copy_KV.partition_S(cV)
+            # When V is transposed in gmem, dv is shape[0]; otherwise dv is shape[1] (same as K)
+            tVpV = utils.predicate_k(tVcV, limit=mV_paged.shape[0 if v_gmem_transposed else 1])
         return PagedKVManager(
             mPageTable,
             num_threads,
             head_dim_padded,
             head_dim_v_padded,
+            arch,
+            v_gmem_transposed,
             gmem_threads_per_row,
             page_entry_per_thread,
             async_copy_elems,
     @cute.jit
     def compute_X_ptr(self, K_or_V: str):
+        """Compute, per page entry handled by this thread, the address of its row in paged K or V.
+        Args:
+            K_or_V: "K" selects self.mK_paged; any other value selects self.mV_paged.
+        Returns:
+            A tensor of self.page_entry_per_thread Int64 values, each the integer
+            form (``.toint()``) of the element pointer for entry i, built from
+            self.tPrPage[i] and self.tPrPageOffset[i].
+        """
         tPrXPtr = cute.make_rmem_tensor((self.page_entry_per_thread,), cutlass.Int64)
+        mX = self.mK_paged if const_expr(K_or_V == "K") else self.mV_paged
+        # K is always (page_size, d, num_pages). V matches K when not transposed,
+        # but is (dv, page_size, num_pages) when transposed (SM100).
+        transposed = const_expr(K_or_V == "V" and self.v_gmem_transposed)
         for i in cutlass.range(self.page_entry_per_thread, unroll=1):
             page = self.tPrPage[i]
             page_offset = self.tPrPageOffset[i]
+            if const_expr(transposed):
+                # Transposed layout: the in-page offset indexes mode 1.
+                tPrXPtr[i] = utils.elem_pointer(mX, (0, page_offset, page)).toint()
             else:
+                # K-style layout: the in-page offset indexes mode 0.
+                tPrXPtr[i] = utils.elem_pointer(mX, (page_offset, 0, page)).toint()
         return tPrXPtr
     @cute.jit
         tPrXPtr = self.compute_X_ptr(K_or_V)
+        if const_expr(self.arch == 90):
+            # SM90: sX is already stage-sliced by caller (sK[None, None, stage]).
+            # Flatten hierarchical modes to get (n_block_size, head_dim).
+            sX_pi = cute.group_modes(sX, 0, 1)
+            # SM90 does NOT transpose V here (it's transposed via utils.transpose_view before MMA)
+        else:
+            # SM100: Finesse sX layout to be (M, N).
+            sX_pi = cute.make_tensor(
+                sX.iterator,
+                cute.make_layout(
+                    (sX.shape[0][0], (sX.shape[0][1], sX.shape[2])),
+                    stride=(sX.stride[0][0], (sX.stride[0][1], sX.stride[2])),
+                ),
+            )
+            if const_expr(K_or_V == "V"):
+                # Transpose smem V to match transposed gmem layout
+                sX_pi = cute.make_tensor(sX_pi.iterator, cute.select(sX_pi.layout, mode=[1, 0]))
         head_dim = self.head_dim_v_padded if const_expr(K_or_V == "V") else self.head_dim_padded
         cX = cute.make_identity_tensor((self.n_block_size, head_dim))

build/torch-cuda/pipeline.py CHANGED Viewed

@@ -1,6 +1,5 @@
 # Copyright (c) 2025, Tri Dao.
-# import math
 from typing import Optional
 from dataclasses import dataclass
@@ -11,12 +10,31 @@ from cutlass.pipeline import PipelineState
 from cutlass.pipeline import PipelineUserType
 from cutlass.pipeline import NamedBarrier as NamedBarrierOg
 from cutlass.pipeline import PipelineAsync as PipelineAsyncOg
 from cutlass.pipeline import PipelineTmaAsync as PipelineTmaAsyncOg
 from cutlass.pipeline import PipelineTmaUmma as PipelineTmaUmmaOg
 from cutlass.pipeline import PipelineUmmaAsync as PipelineUmmaAsyncOg
 from cutlass.pipeline import PipelineAsyncUmma as PipelineAsyncUmmaOg
 class PipelineStateSimple:
     """
     Pipeline state contains an index and phase bit corresponding to the current position in the circular buffer.
@@ -25,9 +43,6 @@ class PipelineStateSimple:
     """
     def __init__(self, stages: int, phase_index: Int32):
-        # assert stages < 2**16
-        # self._log_stages = int(math.log2(stages))
-        # assert 1 << self._log_stages == stages, "Number of stages must be a power of 2."
         self._stages = stages
         self._phase_index = phase_index
@@ -36,13 +51,10 @@ class PipelineStateSimple:
     @property
     def stages(self) -> int:
-        # return 1 << self._log_stages
         return self._stages
     @property
     def index(self) -> Int32:
-        # return self._phase_index & 0xFFFF
-        # return self._phase_index & ((1 << self._log_stages) - 1)
         if const_expr(self._stages == 1):
             return Int32(0)
         else:
@@ -50,11 +62,8 @@ class PipelineStateSimple:
     @property
     def phase(self) -> Int32:
-        # return self._phase_index >> 16
         # PTX docs say that the phase parity needs to be 0 or 1, so by right we need to
         # take modulo 2. But in practice just passing the phase in without modulo works fine.
-        # return (self._phase_index >> self._log_stages) % 2
-        # return self._phase_index >> self._log_stages
         if const_expr(self._stages == 1):
             return self._phase_index
         else:
@@ -66,21 +75,6 @@ class PipelineStateSimple:
         else:
             self._phase_index += 1
-        # def then_body(phase_index):
-        #     # XOR the phase bit and set the index to 0
-        #     return (phase_index & 0xFFFF0000) ^ (1 << 16)
-        # def else_body(phase_index):
-        #     return phase_index
-        # self._phase_index = if_generate(
-        #     (self._phase_index & 0xFFFF) == self.stages,
-        #     then_body,
-        #     else_body,
-        #     [self._phase_index],
-        #     [Int32],
-        # )
     def __extract_mlir_values__(self):
         phase_index = self._phase_index
         return [phase_index.ir_value()]
@@ -94,7 +88,6 @@ def make_pipeline_state(type: PipelineUserType, stages: int):
     Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1.
     """
     if type is PipelineUserType.Producer:
-        # return PipelineStateSimple(stages, Int32(1 << 16))
         return PipelineStateSimple(stages, Int32(stages))
     elif type is PipelineUserType.Consumer:
         return PipelineStateSimple(stages, Int32(0))
@@ -102,14 +95,73 @@ def make_pipeline_state(type: PipelineUserType, stages: int):
         assert False, "Error: invalid PipelineUserType specified for make_pipeline_state."
 @dataclass(frozen=True)
 class NamedBarrier(NamedBarrierOg):
-    @staticmethod
-    def create(*args, **kwargs):
-        obj = NamedBarrierOg.create(*args, **kwargs)
-        # Can't assign to __class__ directly since the dataclass is frozen
-        object.__setattr__(obj, "__class__", NamedBarrier)
-        return obj
     @dsl_user_op
     def arrive_w_index(self, index: Int32, *, loc=None, ip=None) -> None:
@@ -134,72 +186,121 @@ class NamedBarrier(NamedBarrierOg):
         )
 @dataclass(frozen=True)
-class PipelineAsync(PipelineAsyncOg):
     @staticmethod
-    def create(*args, **kwargs):
         obj = PipelineAsyncOg.create(*args, **kwargs)
-        # Can't assign to __class__ directly since the dataclass is frozen
-        # obj.__class__ = PipelineAsync
         object.__setattr__(obj, "__class__", PipelineAsync)
         return obj
     @dsl_user_op
-    def producer_acquire_w_index_phase(
-        self,
-        index: Int32,
-        phase: Int32,
-        try_acquire_token: Optional[Boolean] = None,
-        *,
-        loc=None,
-        ip=None,
-    ):
-        if_generate(
-            try_acquire_token is None or try_acquire_token == 0,
-            lambda: self.sync_object_empty.wait(index, phase, loc=loc, ip=ip),
-            loc=loc,
-            ip=ip,
         )
     @dsl_user_op
-    def producer_commit_w_index(self, index: Int32, *, loc=None, ip=None):
-        self.sync_object_full.arrive(index, self.producer_mask, loc=loc, ip=ip)
-    @dsl_user_op
-    def consumer_wait_w_index_phase(
-        self,
-        index: Int32,
-        phase: Int32,
-        try_wait_token: Optional[Boolean] = None,
-        *,
-        loc=None,
-        ip=None,
-    ):
-        if_generate(
-            try_wait_token is None or try_wait_token == 0,
-            lambda: self.sync_object_full.wait(index, phase, loc=loc, ip=ip),
-            loc=loc,
-            ip=ip,
         )
-    @dsl_user_op
-    def consumer_release_w_index(self, index: Int32, *, loc=None, ip=None):
-        self.sync_object_empty.arrive(index, self.consumer_mask, loc=loc, ip=ip)
 @dataclass(frozen=True)
-class PipelineTmaAsync(PipelineTmaAsyncOg):
-    """
-    Override producer_acquire to take in extra_tx_count parameter.
-    """
     @staticmethod
-    def create(*args, **kwargs):
-        obj = PipelineTmaAsyncOg.create(*args, **kwargs)
-        # Can't assign to __class__ directly since the dataclass is frozen
-        object.__setattr__(obj, "__class__", PipelineTmaAsync)
         return obj
     @dsl_user_op
     def producer_acquire(
         self,
@@ -226,19 +327,15 @@ class PipelineTmaAsync(PipelineTmaAsyncOg):
             self.sync_object_full.arrive_and_expect_tx(state.index, tx_count, loc=loc, ip=ip)
-@dataclass(frozen=True)
-class PipelineTmaUmma(PipelineTmaUmmaOg):
-    """
-    Override producer_acquire to take in extra_tx_count parameter.
-    """
-    @staticmethod
-    def create(*args, **kwargs):
-        obj = PipelineTmaUmmaOg.create(*args, **kwargs)
-        # Can't assign to __class__ directly since the dataclass is frozen
-        # obj.__class__ = PipelineTmaUmma
-        object.__setattr__(obj, "__class__", PipelineTmaUmma)
-        return obj
     @dsl_user_op
     def producer_acquire(
@@ -279,162 +376,27 @@ class PipelineTmaUmma(PipelineTmaUmmaOg):
                 ip=ip,
             )
-    @dsl_user_op
-    def producer_acquire_w_index_phase(
-        self,
-        index: Int32,
-        phase: Int32,
-        try_acquire_token: Optional[Boolean] = None,
-        *,
-        loc=None,
-        ip=None,
-    ):
-        """
-        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks.
-        """
-        if_generate(
-            try_acquire_token is None or try_acquire_token == 0,
-            lambda: self.sync_object_empty.wait(index, phase, loc=loc, ip=ip),
-            loc=loc,
-            ip=ip,
-        )
-        if_generate(
-            self.is_leader_cta,
-            lambda: self.sync_object_full.arrive(index, self.producer_mask, loc=loc, ip=ip),
-            loc=loc,
-            ip=ip,
-        )
-    @dsl_user_op
-    def consumer_wait_w_index_phase(
-        self,
-        index: Int32,
-        phase: Int32,
-        try_wait_token: Optional[Boolean] = None,
-        *,
-        loc=None,
-        ip=None,
-    ):
-        if_generate(
-            try_wait_token is None or try_wait_token == 0,
-            lambda: self.sync_object_full.wait(index, phase, loc=loc, ip=ip),
-            loc=loc,
-            ip=ip,
-        )
-    @dsl_user_op
-    def consumer_release_w_index(self, index: Int32, *, loc=None, ip=None):
-        """
-        UMMA consumer release buffer empty, cta_group needs to be provided.
-        """
-        self.sync_object_empty.arrive(index, self.consumer_mask, self.cta_group, loc=loc, ip=ip)
 @dataclass(frozen=True)
-class PipelineUmmaAsync(PipelineUmmaAsyncOg):
-    @staticmethod
-    def create(*args, **kwargs):
-        obj = PipelineUmmaAsyncOg.create(*args, **kwargs)
-        # Can't assign to __class__ directly since the dataclass is frozen
-        object.__setattr__(obj, "__class__", PipelineUmmaAsync)
-        return obj
-    @dsl_user_op
-    def producer_acquire_w_index_phase(
-        self,
-        index: Int32,
-        phase: Int32,
-        try_acquire_token: Optional[Boolean] = None,
-        *,
-        loc=None,
-        ip=None,
-    ):
-        if_generate(
-            try_acquire_token is None or try_acquire_token == 0,
-            lambda: self.sync_object_empty.wait(index, phase, loc=loc, ip=ip),
-            loc=loc,
-            ip=ip,
-        )
-    @dsl_user_op
-    def producer_commit_w_index(self, index: Int32, *, loc=None, ip=None):
-        """
-        UMMA producer commit buffer full, cta_group needs to be provided.
-        """
-        self.sync_object_full.arrive(index, self.producer_mask, self.cta_group, loc=loc, ip=ip)
-    @dsl_user_op
-    def consumer_wait_w_index_phase(
-        self,
-        index: Int32,
-        phase: Int32,
-        try_wait_token: Optional[Boolean] = None,
-        *,
-        loc=None,
-        ip=None,
-    ):
-        if_generate(
-            try_wait_token is None or try_wait_token == 0,
-            lambda: self.sync_object_full.wait(index, phase, loc=loc, ip=ip),
-            loc=loc,
-            ip=ip,
-        )
-    @dsl_user_op
-    def consumer_release_w_index(self, index: Int32, *, loc=None, ip=None):
-        self.sync_object_empty.arrive(index, self.consumer_mask, loc=loc, ip=ip)
 @dataclass(frozen=True)
-class PipelineAsyncUmma(PipelineAsyncUmmaOg):
-    @staticmethod
-    def create(*args, **kwargs):
-        obj = PipelineAsyncUmmaOg.create(*args, **kwargs)
-        # Can't assign to __class__ directly since the dataclass is frozen
-        object.__setattr__(obj, "__class__", PipelineAsyncUmma)
-        return obj
-    @dsl_user_op
-    def producer_acquire_w_index_phase(
-        self,
-        index: Int32,
-        phase: Int32,
-        try_acquire_token: Optional[Boolean] = None,
-        *,
-        loc=None,
-        ip=None,
-    ):
-        if_generate(
-            try_acquire_token is None or try_acquire_token == 0,
-            lambda: self.sync_object_empty.wait(index, phase, loc=loc, ip=ip),
-            loc=loc,
-            ip=ip,
-        )
-    @dsl_user_op
-    def producer_commit_w_index(self, index: Int32, *, loc=None, ip=None):
-        self.sync_object_full.arrive(index, self.producer_mask, loc=loc, ip=ip)
-    @dsl_user_op
-    def consumer_wait_w_index_phase(
-        self,
-        index: Int32,
-        phase: Int32,
-        try_wait_token: Optional[Boolean] = None,
-        *,
-        loc=None,
-        ip=None,
-    ):
-        if_generate(
-            try_wait_token is None or try_wait_token == 0,
-            lambda: self.sync_object_full.wait(index, phase, loc=loc, ip=ip),
-            loc=loc,
-            ip=ip,
-        )
-    @dsl_user_op
-    def consumer_release_w_index(self, index: Int32, *, loc=None, ip=None):
-        """
-        UMMA consumer release buffer empty, cta_group needs to be provided.
-        """
-        self.sync_object_empty.arrive(index, self.consumer_mask, self.cta_group, loc=loc, ip=ip)

 # Copyright (c) 2025, Tri Dao.
 from typing import Optional
 from dataclasses import dataclass
 from cutlass.pipeline import PipelineUserType
 from cutlass.pipeline import NamedBarrier as NamedBarrierOg
 from cutlass.pipeline import PipelineAsync as PipelineAsyncOg
+from cutlass.pipeline import PipelineCpAsync as PipelineCpAsyncOg
 from cutlass.pipeline import PipelineTmaAsync as PipelineTmaAsyncOg
 from cutlass.pipeline import PipelineTmaUmma as PipelineTmaUmmaOg
 from cutlass.pipeline import PipelineUmmaAsync as PipelineUmmaAsyncOg
 from cutlass.pipeline import PipelineAsyncUmma as PipelineAsyncUmmaOg
+def _override_create(parent_cls, child_cls):
+    """Create a static factory that constructs parent_cls then re-classes to child_cls."""
+    @staticmethod
+    def create(*args, **kwargs):
+        obj = parent_cls.create(*args, **kwargs)
+        # Can't assign to __class__ directly since the dataclass is frozen
+        object.__setattr__(obj, "__class__", child_cls)
+        return obj
+    return create
+def _make_state(index: Int32, phase: Int32) -> PipelineState:
+    """Construct a PipelineState from index and phase (count/stages unused by callers)."""
+    return PipelineState(stages=0, count=Int32(0), index=index, phase=phase)
 class PipelineStateSimple:
     """
     Pipeline state contains an index and phase bit corresponding to the current position in the circular buffer.
     """
     def __init__(self, stages: int, phase_index: Int32):
         self._stages = stages
         self._phase_index = phase_index
     @property
     def stages(self) -> int:
         return self._stages
     @property
     def index(self) -> Int32:
         if const_expr(self._stages == 1):
             return Int32(0)
         else:
     @property
     def phase(self) -> Int32:
         # PTX docs say that the phase parity needs to be 0 or 1, so by right we need to
         # take modulo 2. But in practice just passing the phase in without modulo works fine.
         if const_expr(self._stages == 1):
             return self._phase_index
         else:
         else:
             self._phase_index += 1
     def __extract_mlir_values__(self):
         phase_index = self._phase_index
         return [phase_index.ir_value()]
     Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1.
     """
     if type is PipelineUserType.Producer:
         return PipelineStateSimple(stages, Int32(stages))
     elif type is PipelineUserType.Consumer:
         return PipelineStateSimple(stages, Int32(0))
         assert False, "Error: invalid PipelineUserType specified for make_pipeline_state."
+# ── Shared helpers ───────────────────────────────────────────────────────────
+def _call_with_elect_one(parent_method, self, state, elect_one, syncwarp, loc, ip):
+    """Optionally wrap a parent pipeline method call in sync_warp + elect_one."""
+    if const_expr(elect_one):
+        if const_expr(syncwarp):
+            cute.arch.sync_warp()
+        with cute.arch.elect_one():
+            parent_method(self, state, loc=loc, ip=ip)
+    else:
+        parent_method(self, state, loc=loc, ip=ip)
+# ── Mixin: _w_index / _w_index_phase variants that delegate to parent ───────
+# Each parent class has PipelineState-based methods (producer_acquire, producer_commit,
+# consumer_wait, consumer_release). The _w_index_phase variants just construct a
+# PipelineState from (index, phase) and delegate.
+class _PipelineIndexPhaseMixin:
+    """Mixin providing _w_index_phase / _w_index methods that delegate to PipelineState-based parents."""
+    @dsl_user_op
+    def producer_acquire_w_index_phase(
+        self,
+        index: Int32,
+        phase: Int32,
+        try_acquire_token: Optional[Boolean] = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        state = _make_state(index, phase)
+        # Dispatch to self.producer_acquire (the concrete pipeline class's version, which takes a PipelineState)
+        self.producer_acquire(state, try_acquire_token, loc=loc, ip=ip)
+    @dsl_user_op
+    def producer_commit_w_index(self, index: Int32, *, loc=None, ip=None):
+        state = _make_state(index, Int32(0))
+        self.producer_commit(state, loc=loc, ip=ip)
+    @dsl_user_op
+    def consumer_wait_w_index_phase(
+        self,
+        index: Int32,
+        phase: Int32,
+        try_wait_token: Optional[Boolean] = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        state = _make_state(index, phase)
+        self.consumer_wait(state, try_wait_token, loc=loc, ip=ip)
+    @dsl_user_op
+    def consumer_release_w_index(self, index: Int32, *, loc=None, ip=None):
+        state = _make_state(index, Int32(0))
+        self.consumer_release(state, loc=loc, ip=ip)
+# ── NamedBarrier ─────────────────────────────────────────────────────────────
 @dataclass(frozen=True)
 class NamedBarrier(NamedBarrierOg):
+    create = _override_create(NamedBarrierOg, None)  # patched below
     @dsl_user_op
     def arrive_w_index(self, index: Int32, *, loc=None, ip=None) -> None:
         )
+NamedBarrier.create = _override_create(NamedBarrierOg, NamedBarrier)
+# ── PipelineAsync ────────────────────────────────────────────────────────────
 @dataclass(frozen=True)
+class PipelineAsync(_PipelineIndexPhaseMixin, PipelineAsyncOg):
+    """
+    PipelineAsync with optional elect_one for producer_commit and consumer_release.
+    When elect_one_*=True (set at create time), only one elected thread per warp
+    signals the barrier arrive. This is useful when the mask count is set to 1 per warp.
+    Args (to create):
+        elect_one_commit: If True, only elected thread signals producer_commit.
+        syncwarp_before_commit: If True (default), issue syncwarp before elect_one.
+        elect_one_release: If True, only elected thread signals consumer_release.
+        syncwarp_before_release: If True (default), issue syncwarp before elect_one.
+            Set the syncwarp_before_* flags to False when threads are already converged (e.g. after wgmma wait_group).
+    """
+    _elect_one_commit: bool = False
+    _syncwarp_before_commit: bool = True
+    _elect_one_release: bool = False
+    _syncwarp_before_release: bool = True
     @staticmethod
+    def create(
+        *args,
+        elect_one_commit: bool = False,
+        syncwarp_before_commit: bool = True,
+        elect_one_release: bool = False,
+        syncwarp_before_release: bool = True,
+        **kwargs,
+    ):
         obj = PipelineAsyncOg.create(*args, **kwargs)
         object.__setattr__(obj, "__class__", PipelineAsync)
+        object.__setattr__(obj, "_elect_one_commit", elect_one_commit)
+        object.__setattr__(obj, "_syncwarp_before_commit", syncwarp_before_commit)
+        object.__setattr__(obj, "_elect_one_release", elect_one_release)
+        object.__setattr__(obj, "_syncwarp_before_release", syncwarp_before_release)
         return obj
     @dsl_user_op
+    def producer_commit(self, state: PipelineState, *, loc=None, ip=None):
+        _call_with_elect_one(
+            PipelineAsyncOg.producer_commit,
+            self,
+            state,
+            self._elect_one_commit,
+            self._syncwarp_before_commit,
+            loc,
+            ip,
         )
     @dsl_user_op
+    def consumer_release(self, state: PipelineState, *, loc=None, ip=None):
+        _call_with_elect_one(
+            PipelineAsyncOg.consumer_release,
+            self,
+            state,
+            self._elect_one_release,
+            self._syncwarp_before_release,
+            loc,
+            ip,
         )
+    # _w_index variants inherited from _PipelineIndexPhaseMixin, which delegate
+    # to producer_commit / consumer_release above.
+# ── PipelineCpAsync ──────────────────────────────────────────────────────────
 @dataclass(frozen=True)
+class PipelineCpAsync(_PipelineIndexPhaseMixin, PipelineCpAsyncOg):
+    _elect_one_release: bool = False
+    _syncwarp_before_release: bool = True
     @staticmethod
+    def create(
+        *args,
+        elect_one_release: bool = False,
+        syncwarp_before_release: bool = True,
+        **kwargs,
+    ):
+        obj = PipelineCpAsyncOg.create(*args, **kwargs)
+        object.__setattr__(obj, "__class__", PipelineCpAsync)
+        object.__setattr__(obj, "_elect_one_release", elect_one_release)
+        object.__setattr__(obj, "_syncwarp_before_release", syncwarp_before_release)
         return obj
+    @dsl_user_op
+    def consumer_release(self, state: PipelineState, *, loc=None, ip=None):
+        _call_with_elect_one(
+            PipelineCpAsyncOg.consumer_release,
+            self,
+            state,
+            self._elect_one_release,
+            self._syncwarp_before_release,
+            loc,
+            ip,
+        )
+    # _w_index variants inherited from _PipelineIndexPhaseMixin.
+# ── PipelineTmaAsync ────────────────────────────────────────────────────────
+@dataclass(frozen=True)
+class PipelineTmaAsync(_PipelineIndexPhaseMixin, PipelineTmaAsyncOg):
+    """Override producer_acquire to take in extra_tx_count parameter."""
     @dsl_user_op
     def producer_acquire(
         self,
             self.sync_object_full.arrive_and_expect_tx(state.index, tx_count, loc=loc, ip=ip)
+PipelineTmaAsync.create = _override_create(PipelineTmaAsyncOg, PipelineTmaAsync)
+# ── PipelineTmaUmma ─────────────────────────────────────────────────────────
+@dataclass(frozen=True)
+class PipelineTmaUmma(_PipelineIndexPhaseMixin, PipelineTmaUmmaOg):
+    """Override producer_acquire to take in extra_tx_count parameter."""
     @dsl_user_op
     def producer_acquire(
                 ip=ip,
             )
+PipelineTmaUmma.create = _override_create(PipelineTmaUmmaOg, PipelineTmaUmma)
+# ── PipelineUmmaAsync ───────────────────────────────────────────────────────
 @dataclass(frozen=True)
+class PipelineUmmaAsync(_PipelineIndexPhaseMixin, PipelineUmmaAsyncOg):
+    pass
+PipelineUmmaAsync.create = _override_create(PipelineUmmaAsyncOg, PipelineUmmaAsync)
+# ── PipelineAsyncUmma ───────────────────────────────────────────────────────
 @dataclass(frozen=True)
+class PipelineAsyncUmma(_PipelineIndexPhaseMixin, PipelineAsyncUmmaOg):
+    pass
+PipelineAsyncUmma.create = _override_create(PipelineAsyncUmmaOg, PipelineAsyncUmma)

build/torch-cuda/quack/copy_utils.py CHANGED Viewed

@@ -15,6 +15,9 @@ from cutlass._mlir.dialects import llvm
 from cutlass._mlir import ir
 from cutlass._mlir.dialects import cute_nvgpu as _cute_nvgpu_ir
 Sm100MmaPeerBitMask = 0xFEFFFFFF
@@ -41,6 +44,30 @@ def cvt_copy(
     cute.copy(tiled_copy, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
 @dsl_user_op
 def load_s2r(src: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
     dst = cute.make_rmem_tensor_like(src, src.element_type, loc=loc, ip=ip)
@@ -796,17 +823,17 @@ def gather_m_get_copy_fn(
     limit_m: Int32,
     limit_k: Int32,
 ) -> Callable:
-    tile_shape_mk = (cute.size(sA, mode=[0]), cute.size(sA, mode=[1]))
-    tAsA = thr_copy_A.partition_D(sA)
     # k-major
     assert tAsA.shape[2] == 1
     tAsA = cute.group_modes(cute.slice_(tAsA, (None, None, 0, None)), 0, 2)
-    is_even_m_smem = tile_shape_mk[0] % thr_copy_A.tiler_mn[0].shape == 0
     if const_expr(not is_even_m_smem):
-        limit_m = min(limit_m, tile_shape_mk[0])
     elems_per_load = cute.size(tAsA.shape[0][0])
-    cA = cute.make_identity_tensor(tile_shape_mk)
     tAcA = thr_copy_A.partition_S(cA)
     t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
     # Instead of comparing tAcA to limit_m, we instead compare t0AcA to limit_m - tAcA[0][0]
@@ -828,13 +855,13 @@ def gather_m_get_copy_fn(
         else:
             m_idx[m] = 0  # It's ok to load row 0 in the case of OOB
-    mA_k = cute.logical_divide(mA, (None, tile_shape_mk[1]))
     def copy_fn(src_idx, dst_idx, pred: bool = False):
         tApA_k = None
         if const_expr(pred):
             tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
-            limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
         mA_cur = mA_k[None, (None, src_idx)]
@@ -997,11 +1024,162 @@ def gather_m_get_tma_copy_fn(
     tma_gather4_load_fn = partial(tma_gather4_load, tma_desc_ptr, num_cta=cta_group)
     def copy_fn(src_idx, dst_idx, tma_bar_ptr: cute.Pointer):
         col_idx = tile_K * src_idx
         for m in cutlass.range(cute.size(tSR_rAIdx, mode=[1]), unroll_full=True):
             row_indices = [tSR_rAIdx[v, m] for v in range(4)]
-            smem_ptr = tSR_sA[None, m, None, dst_idx].iterator
             with cute.arch.elect_one():
                 tma_gather4_load_fn(smem_ptr, tma_bar_ptr, col_idx, row_indices)
     return copy_fn

 from cutlass._mlir import ir
 from cutlass._mlir.dialects import cute_nvgpu as _cute_nvgpu_ir
+from . import layout_utils
+from .utils import make_vector
 Sm100MmaPeerBitMask = 0xFEFFFFFF
     cute.copy(tiled_copy, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
+@dsl_user_op
+def sr_cvt_copy(
+    tiled_copy: cute.TiledCopy,
+    src: cute.Tensor,
+    dst: cute.Tensor,
+    seed: Int32,
+    tidx: Int32,
+    *,
+    loc=None,
+    ip=None,
+) -> None:
+    """Like cvt_copy but uses stochastic rounding for FP32 -> BF16 conversion."""
+    assert isinstance(src.iterator, cute.Pointer) and src.memspace == cute.AddressSpace.rmem
+    from .rounding import convert_f32_to_bf16_sr
+    from cutlass.cute.tensor import TensorSSA
+    src_cvt = cute.make_rmem_tensor_like(src, dst.element_type)
+    src_vec = src.load()
+    raw_vec = convert_f32_to_bf16_sr(src_vec, seed, tidx, loc=loc, ip=ip)
+    src_cvt.store(TensorSSA(raw_vec, src_vec.shape, dst.element_type))
+    src = src_cvt
+    cute.copy(tiled_copy, src, dst, loc=loc, ip=ip)
 @dsl_user_op
 def load_s2r(src: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
     dst = cute.make_rmem_tensor_like(src, src.element_type, loc=loc, ip=ip)
     limit_m: Int32,
     limit_k: Int32,
 ) -> Callable:
+    tile_M, tile_K = cute.size(sA, mode=[0]), cute.size(sA, mode=[1])
+    tAsA = partition_D_position_independent(thr_copy_A, sA)
     # k-major
     assert tAsA.shape[2] == 1
     tAsA = cute.group_modes(cute.slice_(tAsA, (None, None, 0, None)), 0, 2)
+    is_even_m_smem = tile_M % thr_copy_A.tiler_mn[0].shape == 0
     if const_expr(not is_even_m_smem):
+        limit_m = min(limit_m, tile_M)
     elems_per_load = cute.size(tAsA.shape[0][0])
+    cA = cute.make_identity_tensor((tile_M, tile_K))
     tAcA = thr_copy_A.partition_S(cA)
     t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
     # Instead of comparing tAcA to limit_m, we instead compare t0AcA to limit_m - tAcA[0][0]
         else:
             m_idx[m] = 0  # It's ok to load row 0 in the case of OOB
+    mA_k = cute.logical_divide(mA, (None, tile_K))
     def copy_fn(src_idx, dst_idx, pred: bool = False):
         tApA_k = None
         if const_expr(pred):
             tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
+            limit_k_cur = limit_k - src_idx * tile_K
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
         mA_cur = mA_k[None, (None, src_idx)]
     tma_gather4_load_fn = partial(tma_gather4_load, tma_desc_ptr, num_cta=cta_group)
     def copy_fn(src_idx, dst_idx, tma_bar_ptr: cute.Pointer):
+        tSR_sA_cur = tSR_sA[None, None, None, dst_idx]
         col_idx = tile_K * src_idx
         for m in cutlass.range(cute.size(tSR_rAIdx, mode=[1]), unroll_full=True):
             row_indices = [tSR_rAIdx[v, m] for v in range(4)]
+            smem_ptr = tSR_sA_cur[None, m, None].iterator
             with cute.arch.elect_one():
                 tma_gather4_load_fn(smem_ptr, tma_bar_ptr, col_idx, row_indices)
     return copy_fn
+@cute.jit
+def gather_k_get_tma_copy_fn(
+    tma_atom: cute.CopyAtom,
+    sA: cute.Tensor,  # ((4, tile_K/4), (tile_M,), STAGE) — K-grouped load layout
+    sAIdx: cute.Tensor,  # (tile_K, a_prefetch_stage) — K indices in smem
+    col_idx: Int32,  # M offset in global tensor (contiguous dim for M-major)
+    warp_idx: Int32,
+    num_warps: int,
+    num_cta: int = 1,
+) -> Tuple[Callable, Callable]:
+    """Build a copy function for TMA gather4 in K dimension (M-major A).
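+    Each gather4 instruction loads 4 K-columns × gather_dim contiguous M-elements (gather_dim is
+    read from the smem layout, typically 64); one gather4 is issued per gather_dim-sized chunk of M.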
+    col_idx is the absolute M position in the global tensor.
+    K indices come from sAIdx (prefetched to smem by the scheduler warp).
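+    Returns a tuple (copy_fn, prefetch_from_smem_fn):
+      copy_fn(src_idx, dst_idx, tSR_rAIdx, tma_bar_ptr) issues gather4 calls into stage dst_idx,
+        using the K indices held in tSR_rAIdx as row_indices.
+      prefetch_from_smem_fn(a_prefetch_pipeline, src_idx, dst_idx, a_prefetch_consumer_state)
+        waits on the prefetch pipeline, loads the K indices for stage dst_idx from smem into
+        registers, releases the pipeline stage, and returns the loaded indices.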
+    """
+    tile_K = cute.size(sAIdx, mode=[0])
+    assert tile_K % 4 == 0
+    cta_group = num_cta
+    # Tiled copy for loading K indices from smem to registers (4 per vector, across warps)
+    copy_AIdx_s2r = cute.make_tiled_copy_tv(
+        cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Int32, num_bits_per_copy=128),
+        cute.make_layout(num_warps),  # thr_layout
+        cute.make_layout(4),  # val_layout — 4 K indices per gather4
+    )
+    warp_idx = cute.arch.make_warp_uniform(warp_idx)
+    warp_copy_AIdx_s2r = copy_AIdx_s2r.get_slice(warp_idx)
+    tSR_sAIdx = warp_copy_AIdx_s2r.partition_S(sAIdx)  # (((4,1),4,4))
+    # ((4,1),4,(64,2),(1,4)):((64,0),1024,(1,4096),(0,8192))
+    tSR_sA = warp_copy_AIdx_s2r.partition_S(layout_utils.transpose_view(sA))
+    tma_desc_ptr = get_tma_desc_addr(tma_atom)
+    tma_gather4_load_fn = partial(tma_gather4_load, tma_desc_ptr, num_cta=cta_group)
+    def prefetch_from_smem_fn(
+        a_prefetch_pipeline,
+        src_idx,
+        dst_idx,
+        a_prefetch_consumer_state,
+    ) -> cute.Tensor:
+        a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
+        tSR_rAIdx = load_s2r(tSR_sAIdx[None, None, dst_idx])
+        cute.arch.sync_warp()
+        with cute.arch.elect_one():
+            a_prefetch_pipeline.consumer_release(a_prefetch_consumer_state)
+        return tSR_rAIdx
+    def copy_fn(src_idx, dst_idx, tSR_rAIdx, tma_bar_ptr: cute.Pointer):
+        # Issue gather4: col_idx = M position, row_indices = 4 K positions
+        tSR_sA_cur = tSR_sA[None, None, None, dst_idx]
+        gather_dim = cute.size(tSR_sA_cur, mode=[2, 0])  # Typically 64
+        for k in cutlass.range(cute.size(tSR_rAIdx, mode=[1]), unroll_full=True):
+            row_indices = [tSR_rAIdx[v, k] for v in range(4)]
+            for m in cutlass.range(cute.size(tSR_sA_cur, mode=[2, 1]), unroll_full=True):
+                smem_ptr = tSR_sA_cur[None, k, (None, m)].iterator
+                with cute.arch.elect_one():
+                    tma_gather4_load_fn(
+                        smem_ptr, tma_bar_ptr, col_idx + m * gather_dim, row_indices
+                    )
+    return copy_fn, prefetch_from_smem_fn
+# ---------------------------------------------------------------------------
+# Store helpers
+# ---------------------------------------------------------------------------
+@dsl_user_op
+@cute.jit
+def store(
+    ptr: cute.Pointer,
+    val,
+    pred: Optional[Boolean] = None,
+    cop: cutlass.Constexpr = None,
+    *,
+    loc=None,
+    ip=None,
+):
+    """Store a scalar value via cute.arch.store.
+    ptr:  cute.Pointer (any address space).
+    val:  DSL Numeric value.
+    pred: None → unconditional.  DSL Boolean → skipped when pred == 0.
+    cop:  Cache operator — "wb" (default), "cg", "cs" (streaming), "wt".
+    """
+    if const_expr(pred is None):
+        cute.arch.store(ptr.llvm_ptr, type(val)(val), cop=cop, loc=loc, ip=ip)
+    else:
+        if pred:
+            cute.arch.store(ptr.llvm_ptr, type(val)(val), cop=cop, loc=loc, ip=ip)
+@dsl_user_op
+@cute.jit
+def store_v2(
+    ptr: cute.Pointer,
+    v0,
+    v1,
+    pred: Optional[Boolean] = None,
+    cop: cutlass.Constexpr = None,
+    *,
+    loc=None,
+    ip=None,
+):
+    """Vectorized store of 2 elements via cute.arch.store.
+    Packs v0, v1 into an MLIR <2 x T> vector.
+    ptr:  cute.Pointer (any address space, must be aligned for vector width).
+    cop:  Cache operator — "wb" (default), "cg", "cs" (streaming), "wt".
+    """
+    vec = make_vector(type(v0), v0, v1, loc=loc, ip=ip)
+    if const_expr(pred is None):
+        cute.arch.store(ptr.llvm_ptr, vec, cop=cop, loc=loc, ip=ip)
+    else:
+        if pred:
+            cute.arch.store(ptr.llvm_ptr, vec, cop=cop, loc=loc, ip=ip)
+@dsl_user_op
+@cute.jit
+def store_v4(
+    ptr: cute.Pointer,
+    v0,
+    v1,
+    v2,
+    v3,
+    pred: Optional[Boolean] = None,
+    cop: cutlass.Constexpr = None,
+    *,
+    loc=None,
+    ip=None,
+):
+    """Vectorized store of 4 elements via cute.arch.store.
+    Packs v0–v3 into an MLIR <4 x T> vector.
+    ptr:  cute.Pointer (any address space, must be aligned for vector width).
+    cop:  Cache operator — "wb" (default), "cg", "cs" (streaming), "wt".
+    """
+    vec = make_vector(type(v0), v0, v1, v2, v3, loc=loc, ip=ip)
+    if const_expr(pred is None):
+        cute.arch.store(ptr.llvm_ptr, vec, cop=cop, loc=loc, ip=ip)
+    else:
+        if pred:
+            cute.arch.store(ptr.llvm_ptr, vec, cop=cop, loc=loc, ip=ip)

build/torch-cuda/quack/cute_dsl_utils.py CHANGED Viewed

@@ -4,6 +4,9 @@ from typing import Tuple, get_origin
 from functools import lru_cache
 from dataclasses import dataclass, fields
 import torch
 try:
@@ -14,7 +17,6 @@ except ImportError:
 import cutlass
 import cutlass.cute as cute
 from cutlass import Int32, Int64, Float16, BFloat16, Float32
-from cutlass.base_dsl.typing import JitArgument
 from cutlass.base_dsl.tvm_ffi_builder import spec
 from cutlass.cutlass_dsl import NumericMeta
@@ -65,8 +67,25 @@ def get_max_active_clusters(cluster_size):
     return cutlass.utils.HardwareInfo().get_max_active_clusters(cluster_size=cluster_size)
 @lru_cache
 def get_device_capacity(device: torch.device = None) -> Tuple[int, int]:
     return torch.cuda.get_device_capability(device)
@@ -138,28 +157,3 @@ class ParamsBase:
         return values
     __new_from_mlir_values__ = _new_from_mlir_values
-@dataclass
-class ArgumentsBase(JitArgument):
-    def __c_pointers__(self):
-        _, non_constexpr_fields = _partition_fields(self)
-        c_ptrs = []
-        for obj in non_constexpr_fields.values():
-            if hasattr(obj, "__c_pointers__"):
-                c_ptrs.extend(obj.__c_pointers__())
-        return c_ptrs
-    def __get_mlir_types__(self):
-        _, non_constexpr_fields = _partition_fields(self)
-        types, self._values_pos = [], []
-        for obj in non_constexpr_fields.values():
-            if hasattr(obj, "__get_mlir_types__"):
-                obj_types = obj.__get_mlir_types__()
-                types.extend(obj_types)
-                self._values_pos.append(len(obj_types))
-            else:
-                self._values_pos.append(0)
-        return types
-    __new_from_mlir_values__ = _new_from_mlir_values

 from functools import lru_cache
 from dataclasses import dataclass, fields
+import os
+import re
 import torch
 try:
 import cutlass
 import cutlass.cute as cute
 from cutlass import Int32, Int64, Float16, BFloat16, Float32
 from cutlass.base_dsl.tvm_ffi_builder import spec
 from cutlass.cutlass_dsl import NumericMeta
     return cutlass.utils.HardwareInfo().get_max_active_clusters(cluster_size=cluster_size)
+def _parse_arch_str(arch_str: str) -> Tuple[int, int]:
+    """Parse arch string (e.g. 'sm_90', 'sm90', '90', 'sm_100a') to (major, minor) tuple."""
+    match = re.match(r"^(?:sm_?)?(\d+)(\d)([af]?)$", arch_str.strip(), re.IGNORECASE)
+    if not match:
+        raise ValueError(f"Invalid QUACK_ARCH format: {arch_str!r} (expected e.g. '90', 'sm_90')")
+    major, minor, _ = match.groups()
+    return int(major), int(minor)
 @lru_cache
 def get_device_capacity(device: torch.device = None) -> Tuple[int, int]:
+    """Return (major, minor) device capability.
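+    Set the QUACK_ARCH environment variable (e.g. 'sm_90' or '90') to override the result,
+    e.g. for CPU-only compilation without a GPU present. Because this function is wrapped in
+    @lru_cache, the variable is only read on the first call for a given `device` argument.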
+    """
+    arch_override = os.environ.get("QUACK_ARCH")
+    if arch_override is not None:
+        return _parse_arch_str(arch_override)
     return torch.cuda.get_device_capability(device)
         return values
     __new_from_mlir_values__ = _new_from_mlir_values

build/torch-cuda/quack/layout_utils.py CHANGED Viewed

@@ -295,3 +295,37 @@ def mma_partition_A_vec(
     sVec_mma = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
     tC_sVec = make_acc_tensor_mn_view(thr_mma.partition_A(sVec_mma))
     return tC_sVec[None, 0, None] if const_expr(is_colvec) else tC_sVec[0, None, None]

     sVec_mma = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
     tC_sVec = make_acc_tensor_mn_view(thr_mma.partition_A(sVec_mma))
     return tC_sVec[None, 0, None] if const_expr(is_colvec) else tC_sVec[0, None, None]
def copy_partition_S_vec(
    sVec: cute.Tensor, thr_copy: cute.core.ThrCopy, expand_shape: int, is_colvec: bool
) -> cute.Tensor:
    """Partition a staged vector as the source operand of ``thr_copy``.

    ``sVec`` must be rank 2 with unit stride in mode 0; mode 1 is treated as
    the stage mode. The vector is re-viewed as a 3-mode tensor by inserting a
    stride-0 (broadcast) mode of extent ``expand_shape`` -- after the vector
    mode when ``is_colvec`` is true, before it otherwise. That view is passed
    through ``thr_copy.partition_S`` and ``reshape_acc_to_mn``, and the
    broadcast mode of the result is then sliced at index 0.
    """
    assert cute.rank(sVec) == 2
    assert sVec.stride[0] == 1
    vec_len = sVec.shape[0]
    stage = sVec.shape[1]
    stage_stride = sVec.stride[1]
    if const_expr(is_colvec):
        bcast_layout = cute.make_layout(
            (vec_len, expand_shape, stage), stride=(1, 0, stage_stride)
        )
    else:
        bcast_layout = cute.make_layout(
            (expand_shape, vec_len, stage), stride=(0, 1, stage_stride)
        )
    bcast_view = cute.make_tensor(sVec.iterator, bcast_layout)
    partitioned = reshape_acc_to_mn(thr_copy.partition_S(bcast_view))
    if const_expr(is_colvec):
        return partitioned[None, 0, None]
    return partitioned[0, None, None]
def copy_partition_D_vec(
    sVec: cute.Tensor, thr_copy: cute.core.ThrCopy, expand_shape: int, is_colvec: bool
) -> cute.Tensor:
    """Partition a staged vector as the destination operand of ``thr_copy``.

    ``sVec`` must be rank 2 with unit stride in mode 0; mode 1 is treated as
    the stage mode. The vector is re-viewed as a 3-mode tensor by inserting a
    stride-0 (broadcast) mode of extent ``expand_shape`` -- after the vector
    mode when ``is_colvec`` is true, before it otherwise. That view is passed
    through ``thr_copy.partition_D`` and ``reshape_acc_to_mn``, and the
    broadcast mode of the result is then sliced at index 0.
    """
    assert cute.rank(sVec) == 2
    assert sVec.stride[0] == 1
    vec_len = sVec.shape[0]
    stage = sVec.shape[1]
    stage_stride = sVec.stride[1]
    if const_expr(is_colvec):
        bcast_layout = cute.make_layout(
            (vec_len, expand_shape, stage), stride=(1, 0, stage_stride)
        )
    else:
        bcast_layout = cute.make_layout(
            (expand_shape, vec_len, stage), stride=(0, 1, stage_stride)
        )
    bcast_view = cute.make_tensor(sVec.iterator, bcast_layout)
    partitioned = reshape_acc_to_mn(thr_copy.partition_D(bcast_view))
    if const_expr(is_colvec):
        return partitioned[None, 0, None]
    return partitioned[0, None, None]

build/torch-cuda/quack/utils.py ADDED Viewed

	@@ -0,0 +1,324 @@

+# Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+import math
+from typing import Optional, Tuple, Union
+import cutlass
+import cutlass.cute as cute
+from cutlass import Float32, Int32, const_expr
+from cutlass._mlir.dialects import arith as _arith
+from cutlass._mlir.dialects import llvm, nvvm, vector
+from cutlass.cutlass_dsl import T, dsl_user_op
@dsl_user_op
def elem_pointer(x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None) -> cute.Pointer:
    """Return ``x``'s iterator advanced to the element at ``coord``.

    The coordinate is turned into a linear index with ``cute.crd2idx`` using
    the layout of ``x``, and that index is added to ``x.iterator``.
    """
    linear_index = cute.crd2idx(coord, x.layout, loc=loc, ip=ip)
    return x.iterator + linear_index
@cute.jit
def load_scalar_or_pointer(x, dtype=Float32):
    """Return ``x`` itself, or the value it points to when it is a pointer.

    If ``x`` is a ``cute.Pointer`` it is wrapped in a one-element tensor, the
    element is read and converted with ``dtype``. Any other ``x`` is returned
    unchanged (``dtype`` is not applied in that case).
    """
    if const_expr(not isinstance(x, cute.Pointer)):
        return x
    else:
        single_elem = cute.make_tensor(x, cute.make_layout(1))
        return dtype(single_elem[0])
@dsl_user_op
def set_block_rank(
    smem_ptr: cute.Pointer, peer_cta_rank_in_cluster: Int32, *, loc=None, ip=None
) -> Int32:
    """Map the given smem pointer to the address at another CTA rank in the cluster.

    Emits the PTX instruction ``mapa.shared::cluster.u32`` with the integer
    value of ``smem_ptr`` and ``peer_cta_rank_in_cluster`` as inputs, and
    returns the 32-bit result wrapped as ``Int32``.
    """
    local_addr = smem_ptr.toint(loc=loc, ip=ip).ir_value()
    peer_rank = peer_cta_rank_in_cluster.ir_value()
    mapped_addr = llvm.inline_asm(
        T.i32(),
        [local_addr, peer_rank],
        "mapa.shared::cluster.u32 $0, $1, $2;",
        "=r,r,r",
        has_side_effects=False,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
    return Int32(mapped_addr)
@dsl_user_op
def store_shared_remote(
    val: float | Float32 | Int32 | cutlass.Int64,
    smem_ptr: cute.Pointer,
    mbar_ptr: cute.Pointer,
    peer_cta_rank_in_cluster: cute.typing.Int,
    *,
    loc=None,
    ip=None,
) -> None:
    """Store one scalar into a peer CTA's shared memory with mbarrier signalling.

    Both ``smem_ptr`` and ``mbar_ptr`` are first translated to the peer CTA
    with ``set_block_rank``. The value is then written with the PTX
    instruction ``st.async.shared::cluster.mbarrier::complete_tx::bytes``.

    Args:
        val: Value to store. A Python ``float`` is converted to ``Float32``;
            otherwise it must be ``Float32``, ``Int32`` or ``Int64``.
        smem_ptr: Destination address, expressed in the local CTA.
        mbar_ptr: mbarrier address, expressed in the local CTA.
        peer_cta_rank_in_cluster: Rank of the CTA that receives the store.
    """
    peer_smem_addr = set_block_rank(
        smem_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
    ).ir_value()
    peer_mbar_addr = set_block_rank(
        mbar_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
    ).ir_value()
    if const_expr(isinstance(val, float)):
        val = Float32(val)
    assert isinstance(val, (Float32, Int32, cutlass.Int64)), "val must be Float32, Int32, or Int64"
    # Per-type PTX type suffix and inline-asm register constraint.
    suffix, constraint = {
        Float32: ("f32", "f"),
        Int32: ("s32", "r"),
        cutlass.Int64: ("s64", "l"),
    }[type(val)]
    operands = [peer_smem_addr, val.ir_value(loc=loc, ip=ip), peer_mbar_addr]
    llvm.inline_asm(
        None,
        operands,
        f"st.async.shared::cluster.mbarrier::complete_tx::bytes.{suffix} [$0], $1, [$2];",
        f"r,{constraint},r",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
@dsl_user_op
def store_shared_remote_x4(
    val0: Float32 | Int32,
    val1: Float32 | Int32,
    val2: Float32 | Int32,
    val3: Float32 | Int32,
    smem_ptr: cute.Pointer,
    mbar_ptr: cute.Pointer,
    peer_cta_rank_in_cluster: cute.typing.Int,
    *,
    loc=None,
    ip=None,
) -> None:
    """Store four scalars as one ``.v4`` vector into a peer CTA's shared memory.

    Both ``smem_ptr`` and ``mbar_ptr`` are translated to the peer CTA with
    ``set_block_rank``. The four values are moved into a ``.v4`` PTX register
    and written with
    ``st.async.shared::cluster.mbarrier::complete_tx::bytes.v4``.

    The element type is taken from ``val0`` (``Float32`` or ``Int32``); the
    remaining values are converted to that same type.
    """
    peer_smem_addr = set_block_rank(
        smem_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
    ).ir_value()
    peer_mbar_addr = set_block_rank(
        mbar_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
    ).ir_value()
    assert isinstance(val0, (Float32, Int32)), "val must be Float32, or Int32"
    if isinstance(val0, Float32):
        dtype, suffix, constraint = Float32, "f32", "f"
    else:
        dtype, suffix, constraint = Int32, "s32", "r"
    operands = [peer_smem_addr, peer_mbar_addr]
    for scalar in (val0, val1, val2, val3):
        operands.append(dtype(scalar).ir_value(loc=loc, ip=ip))
    # Operands $2..$5 fill components x, y, z, w of the vector register.
    asm_lines = ["{", f".reg .v4 .{suffix} abcd;"]
    for operand_idx, component in enumerate("xyzw", start=2):
        asm_lines.append(f"mov.{suffix} abcd.{component}, ${operand_idx};")
    asm_lines.append(
        f"st.async.shared::cluster.mbarrier::complete_tx::bytes.v4.{suffix} [$0], abcd, [$1];"
    )
    asm_lines.append("}")
    llvm.inline_asm(
        None,
        operands,
        "\n\t".join(asm_lines) + "\n",
        ",".join(["r", "r"] + [constraint] * 4),
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
@dsl_user_op
def fmin(a: Union[float, Float32], b: Union[float, Float32], *, loc=None, ip=None) -> Float32:
    """Return the ``Float32`` minimum of ``a`` and ``b`` via ``nvvm.fmin``.

    Both inputs are converted to ``Float32`` first. When
    ``cutlass.CUDA_VERSION.major`` is 12 the result type ``T.f32()`` is passed
    explicitly as the first argument of ``nvvm.fmin``; otherwise it is omitted.
    """
    lhs = Float32(a).ir_value(loc=loc, ip=ip)
    rhs = Float32(b).ir_value(loc=loc, ip=ip)
    if cutlass.const_expr(cutlass.CUDA_VERSION.major) == 12:
        smaller = nvvm.fmin(T.f32(), lhs, rhs, loc=loc, ip=ip)
    else:
        smaller = nvvm.fmin(lhs, rhs, loc=loc, ip=ip)
    return Float32(smaller)
@dsl_user_op
def sqrt(a: float | Float32, *, loc=None, ip=None) -> Float32:
    """Return the square root of ``a`` using the PTX ``sqrt.approx.f32`` instruction.

    ``a`` is converted to ``Float32`` before the instruction is emitted.
    """
    operand = Float32(a).ir_value(loc=loc, ip=ip)
    root = llvm.inline_asm(
        T.f32(),
        [operand],
        "sqrt.approx.f32 $0, $1;",
        "=f,f",
        has_side_effects=False,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
    return Float32(root)
@dsl_user_op
def ceil(a: float | Float32, *, loc=None, ip=None) -> Int32:
    """Convert ``a`` to ``Int32`` with the PTX ``cvt.rpi.ftz.s32.f32`` instruction.

    ``a`` is converted to ``Float32`` first; the ``.rpi`` modifier selects
    rounding toward positive infinity, i.e. a ceiling.
    """
    operand = Float32(a).ir_value(loc=loc, ip=ip)
    rounded = llvm.inline_asm(
        T.i32(),
        [operand],
        "cvt.rpi.ftz.s32.f32 $0, $1;",
        "=r,f",
        has_side_effects=False,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
    return Int32(rounded)
@cute.jit
def fill_oob(tXsX: cute.Tensor, tXpX: Optional[cute.Tensor], fill_value: cute.Numeric) -> None:
    """Overwrite the out-of-bounds slices of ``tXsX`` with ``fill_value``.

    A fragment shaped like ``tXsX[(None, 0), None, 0]`` is filled with
    ``fill_value`` once and then copied over each selected
    ``tXsX[(None, v), None, k]`` slice.

    Args:
        tXsX: Tensor to patch (a shared memory tensor).
        tXpX: Predicate tensor marking valid elements. A slice is overwritten
            only when ``tXpX[v, 0, k]`` is false. If ``None``, every slice is
            overwritten.
        fill_value: Value written to the selected slices.
    """
    filler = cute.make_rmem_tensor_like(tXsX[(None, 0), None, 0])
    filler.fill(fill_value)
    for v_idx in cutlass.range_constexpr(tXsX.shape[0][1]):
        for k_idx in cutlass.range_constexpr(tXsX.shape[2]):
            if const_expr(tXpX is None):
                cute.autovec_copy(filler, tXsX[(None, v_idx), None, k_idx])
            else:
                if not tXpX[v_idx, 0, k_idx]:
                    cute.autovec_copy(filler, tXsX[(None, v_idx), None, k_idx])
+# ---------------------------------------------------------------------------
+# General-purpose DSL store / vector helpers
+# ---------------------------------------------------------------------------
@dsl_user_op
def make_vector(elem_type, *values, loc=None, ip=None):
    """Assemble N scalar DSL values into an MLIR vector ``<N x elem_type>``.

    Starts from an undefined vector and inserts each value, converted with
    ``elem_type``, at its positional index.

    Example: make_vector(cutlass.Uint32, v0, v1) -> <2 x i32> MLIR vector
    """
    from cutlass._mlir import ir

    vec_type = ir.VectorType.get([len(values)], elem_type.mlir_type)
    result = llvm.mlir_undef(vec_type, loc=loc, ip=ip)
    for lane, scalar in enumerate(values):
        lane_value = elem_type(scalar).ir_value(loc=loc, ip=ip)
        lane_index = _arith.constant(T.i32(), lane, loc=loc, ip=ip)
        result = vector.insertelement(
            lane_value,
            result,
            position=lane_index,
            loc=loc,
            ip=ip,
        )
    return result
@dsl_user_op
def f32x2_to_i64(a: Float32, b: Float32, *, loc=None, ip=None) -> cutlass.Int64:
    """Pack two ``Float32`` values into one ``Int64`` by bit reinterpretation.

    ``(a, b)`` are gathered into a ``vector<2 x f32>``, bitcast to a
    ``vector<1 x i64>``, and the single element is extracted.
    """
    pair_f32 = vector.from_elements(
        T.vector(2, T.f32()), (a.ir_value(), b.ir_value()), loc=loc, ip=ip
    )
    packed_i64 = vector.bitcast(T.vector(1, T.i64()), pair_f32)
    scalar_i64 = vector.extract(
        packed_i64, dynamic_position=[], static_position=[0], loc=loc, ip=ip
    )
    return cutlass.Int64(scalar_i64)
@dsl_user_op
def i64_to_f32x2(c: cutlass.Int64, *, loc=None, ip=None) -> Tuple[Float32, Float32]:
    """Split one ``Int64`` into two ``Float32`` values by bit reinterpretation.

    ``c`` is placed in a ``vector<1 x i64>``, bitcast to a ``vector<2 x f32>``,
    and elements 0 and 1 are returned in that order.
    """
    packed_i64 = vector.from_elements(T.vector(1, T.i64()), (c.ir_value(),), loc=loc, ip=ip)
    pair_f32 = vector.bitcast(T.vector(2, T.f32()), packed_i64)
    first = Float32(
        vector.extract(pair_f32, dynamic_position=[], static_position=[0], loc=loc, ip=ip)
    )
    second = Float32(
        vector.extract(pair_f32, dynamic_position=[], static_position=[1], loc=loc, ip=ip)
    )
    return first, second
@cute.jit
def warp_prefix_sum(val: Int32, lane: Optional[Int32] = None) -> Int32:
    """Return the running sum of ``val`` over lanes ``0..lane`` of the warp.

    Runs ``log2(WARP_SIZE)`` shuffle-up rounds with doubling distance; in each
    round a lane adds the value shuffled from ``distance`` lanes below it, but
    only when its own lane index is at least ``distance``.

    Args:
        val: This lane's contribution.
        lane: This lane's index; read from ``cute.arch.lane_idx()`` when omitted.
    """
    if const_expr(lane is None):
        lane = cute.arch.lane_idx()
    for round_idx in cutlass.range_constexpr(int(math.log2(cute.arch.WARP_SIZE))):
        distance = 1 << round_idx
        # Very important that we set mask_and_clamp to 0
        from_below = cute.arch.shuffle_sync_up(val, offset=distance, mask_and_clamp=0)
        if lane >= distance:
            val += from_below
    return val
@dsl_user_op
def atomic_inc_i32(a: int | Int32, gmem_ptr: cute.Pointer, *, loc=None, ip=None) -> Int32:
    """Emit an ``nvvm.atomicrmw`` with op ``INC`` on ``gmem_ptr`` and operand ``a``.

    ``a`` is converted to ``Int32``. The value produced by ``nvvm.atomicrmw``
    is returned as-is.
    """
    from cutlass import CUDA_VERSION

    # The nvvm binding's signature depends on the CUDA version.
    needs_explicit_result_type = CUDA_VERSION.major == 12 and CUDA_VERSION.minor == 9
    if needs_explicit_result_type:
        # Old API: the result type has to be supplied by the caller.
        return nvvm.atomicrmw(
            res=T.i32(), op=nvvm.AtomicOpKind.INC, ptr=gmem_ptr.llvm_ptr, a=Int32(a).ir_value()
        )
    # New API: the result type is inferred automatically.
    return nvvm.atomicrmw(
        op=nvvm.AtomicOpKind.INC, ptr=gmem_ptr.llvm_ptr, a=Int32(a).ir_value()
    )
@dsl_user_op
def atomic_add_i32(a: int | Int32, gmem_ptr: cute.Pointer, *, loc=None, ip=None) -> Int32:
    """Emit an ``nvvm.atomicrmw`` with op ``ADD`` on ``gmem_ptr`` and operand ``a``.

    ``a`` is converted to ``Int32``. The value produced by ``nvvm.atomicrmw``
    is returned as-is.
    """
    from cutlass import CUDA_VERSION

    # The nvvm binding's signature depends on the CUDA version.
    needs_explicit_result_type = CUDA_VERSION.major == 12 and CUDA_VERSION.minor == 9
    if needs_explicit_result_type:
        # Old API: the result type has to be supplied by the caller.
        return nvvm.atomicrmw(
            res=T.i32(), op=nvvm.AtomicOpKind.ADD, ptr=gmem_ptr.llvm_ptr, a=Int32(a).ir_value()
        )
    # New API: the result type is inferred automatically.
    return nvvm.atomicrmw(
        op=nvvm.AtomicOpKind.ADD, ptr=gmem_ptr.llvm_ptr, a=Int32(a).ir_value()
    )
@dsl_user_op
def issue_clc_query_nomulticast(
    mbar_ptr: cute.Pointer,
    clc_response_ptr: cute.Pointer,
    loc=None,
    ip=None,
) -> None:
    """
    Issue a ``clusterlaunchcontrol.try_cancel`` request.

    The instruction atomically requests cancelling the launch of a cluster that
    has not started running yet. An opaque response is written asynchronously
    to shared memory, indicating whether the request succeeded or failed. On
    success, the response contains the ctaid of the first CTA of the canceled
    cluster.

    :param mbar_ptr: A pointer to the mbarrier address in SMEM
    :type mbar_ptr:  Pointer
    :param clc_response_ptr: A pointer to the cluster launch control response address in SMEM
    :type clc_response_ptr:  Pointer
    """
    nvvm.clusterlaunchcontrol_try_cancel(
        clc_response_ptr.llvm_ptr,
        mbar_ptr.llvm_ptr,
        loc=loc,
        ip=ip,
    )

build/torch-cuda/seqlen_info.py CHANGED Viewed

@@ -5,6 +5,8 @@ import cutlass
 import cutlass.cute as cute
 from cutlass import Int32, const_expr
 """
 This consolidates all the info related to sequence length. This is so that we can do all
 the gmem reads once at the beginning of each tile, rather than having to repeat these reads
@@ -14,34 +16,61 @@ to compute various things like n_block_min, n_block_max, etc.
 @dataclass(frozen=True)
 class SeqlenInfo:
-    offset: cutlass.Int32
-    seqlen: cutlass.Int32
     @staticmethod
     def create(
-        batch_idx: cutlass.Int32,
-        seqlen_static: cutlass.Int32,
         cu_seqlens: Optional[cute.Tensor] = None,
         seqused: Optional[cute.Tensor] = None,
     ):
         offset = 0 if const_expr(cu_seqlens is None) else cu_seqlens[batch_idx]
         if const_expr(seqused is not None):
             seqlen = seqused[batch_idx]
         elif const_expr(cu_seqlens is not None):
             seqlen = cu_seqlens[batch_idx + 1] - cu_seqlens[batch_idx]
         else:
             seqlen = seqlen_static
-        return SeqlenInfo(offset, seqlen)
 @dataclass(frozen=True)
 class SeqlenInfoQK:
-    offset_q: cutlass.Int32
-    offset_k: cutlass.Int32
-    padded_offset_q: cutlass.Int32
-    padded_offset_k: cutlass.Int32
-    seqlen_q: cutlass.Int32
-    seqlen_k: cutlass.Int32
     has_cu_seqlens_q: cutlass.Constexpr[bool]
     has_cu_seqlens_k: cutlass.Constexpr[bool]
     has_seqused_q: cutlass.Constexpr[bool]
@@ -49,27 +78,27 @@ class SeqlenInfoQK:
     @staticmethod
     def create(
-        batch_idx: cutlass.Int32,
-        seqlen_q_static: cutlass.Int32,
-        seqlen_k_static: cutlass.Int32,
         mCuSeqlensQ: Optional[cute.Tensor] = None,
         mCuSeqlensK: Optional[cute.Tensor] = None,
         mSeqUsedQ: Optional[cute.Tensor] = None,
         mSeqUsedK: Optional[cute.Tensor] = None,
-        tile_m: cutlass.Constexpr[cutlass.Int32] = 128,
-        tile_n: cutlass.Constexpr[cutlass.Int32] = 128,
     ):
         offset_q = 0 if const_expr(mCuSeqlensQ is None) else mCuSeqlensQ[batch_idx]
         offset_k = 0 if const_expr(mCuSeqlensK is None) else mCuSeqlensK[batch_idx]
         padded_offset_q = (
             0
             if const_expr(mCuSeqlensQ is None)
-            else (offset_q + batch_idx * tile_m) // tile_m * tile_m
         )
         padded_offset_k = (
             0
             if const_expr(mCuSeqlensK is None)
-            else (offset_k + batch_idx * tile_n) // tile_n * tile_n
         )
         if const_expr(mSeqUsedQ is not None):
             seqlen_q = mSeqUsedQ[batch_idx]
@@ -87,10 +116,6 @@ class SeqlenInfoQK:
                 if const_expr(mCuSeqlensK is None)
                 else mCuSeqlensK[batch_idx + 1] - offset_k
             )
-        has_cu_seqlens_q: int = mCuSeqlensQ is not None
-        has_cu_seqlens_k: int = mCuSeqlensK is not None
-        has_seqused_q: int = mSeqUsedQ is not None
-        has_seqused_k: int = mSeqUsedK is not None
         return SeqlenInfoQK(
             offset_q,
             offset_k,
@@ -98,10 +123,10 @@ class SeqlenInfoQK:
             padded_offset_k,
             seqlen_q,
             seqlen_k,
-            has_cu_seqlens_q,
-            has_cu_seqlens_k,
-            has_seqused_q,
-            has_seqused_k,
         )
     def offset_batch_Q(
@@ -110,16 +135,38 @@ class SeqlenInfoQK:
         batch_idx: Int32,
         dim: int,
         padded: cutlass.Constexpr[bool] = False,
     ) -> cute.Tensor:
         """Seqlen must be the first dimension of mQ"""
-        if const_expr(not self.has_cu_seqlens_q):
-            idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mQ) - 1 - dim)
-            return mQ[idx]
         else:
-            offset_q = self.offset_q if const_expr(not padded) else self.padded_offset_q
-            offset = offset_q if const_expr(cute.rank(mQ.shape[0]) == 1) else (0, offset_q)
-            idx = (offset,) + (0,) * (cute.rank(mQ) - 1)
-            return cute.domain_offset(idx, mQ)
     def offset_batch_K(
         self,
@@ -127,12 +174,114 @@ class SeqlenInfoQK:
         batch_idx: Int32,
         dim: int,
         padded: cutlass.Constexpr[bool] = False,
     ) -> cute.Tensor:
         """Seqlen must be the first dimension of mK"""
-        if const_expr(not self.has_cu_seqlens_k):
-            idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mK) - 1 - dim)
-            return mK[idx]
         else:
-            offset_k = self.offset_k if const_expr(not padded) else self.padded_offset_k
-            idx = (offset_k,) + (0,) * (cute.rank(mK) - 1)
-            return cute.domain_offset(idx, mK)

 import cutlass.cute as cute
 from cutlass import Int32, const_expr
+from .quack import copy_utils
 """
 This consolidates all the info related to sequence length. This is so that we can do all
 the gmem reads once at the beginning of each tile, rather than having to repeat these reads
 @dataclass(frozen=True)
 class SeqlenInfo:
+    offset: Int32
+    offset_padded: Int32
+    seqlen: Int32
+    has_cu_seqlens: cutlass.Constexpr[bool] = False
     @staticmethod
     def create(
+        batch_idx: Int32,
+        seqlen_static: Int32,
         cu_seqlens: Optional[cute.Tensor] = None,
         seqused: Optional[cute.Tensor] = None,
+        tile: cutlass.Constexpr[int] = 128,
     ):
         offset = 0 if const_expr(cu_seqlens is None) else cu_seqlens[batch_idx]
+        offset_padded = (
+            0
+            if const_expr(cu_seqlens is None)
+            # Add divby so that the compiler knows the alignment when moving by offset_padded
+            else cute.assume((offset + batch_idx * tile) // tile * tile, divby=tile)
+        )
         if const_expr(seqused is not None):
             seqlen = seqused[batch_idx]
         elif const_expr(cu_seqlens is not None):
             seqlen = cu_seqlens[batch_idx + 1] - cu_seqlens[batch_idx]
         else:
             seqlen = seqlen_static
+        return SeqlenInfo(offset, offset_padded, seqlen, has_cu_seqlens=cu_seqlens is not None)
+    def offset_batch(
+        self,
+        mT: cute.Tensor,
+        batch_idx: Int32,
+        dim: int,
+        padded: cutlass.Constexpr[bool] = False,
+        multiple: int = 1,
+    ) -> cute.Tensor:
+        """Offset a tensor by batch index. batch dim is at position `dim`, seqlen is at dim=0."""
+        if const_expr(not self.has_cu_seqlens):
+            idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mT) - 1 - dim)
+            return mT[idx]
+        else:
+            off = multiple * (self.offset if const_expr(not padded) else self.offset_padded)
+            offset = off if const_expr(cute.rank(mT.shape[0]) == 1) else (0, off)
+            idx = (offset,) + (None,) * (cute.rank(mT) - 1)
+            return cute.domain_offset(idx, mT)
 @dataclass(frozen=True)
 class SeqlenInfoQK:
+    offset_q: Int32
+    offset_k: Int32
+    padded_offset_q: Int32
+    padded_offset_k: Int32
+    seqlen_q: Int32
+    seqlen_k: Int32
     has_cu_seqlens_q: cutlass.Constexpr[bool]
     has_cu_seqlens_k: cutlass.Constexpr[bool]
     has_seqused_q: cutlass.Constexpr[bool]
     @staticmethod
     def create(
+        batch_idx: Int32,
+        seqlen_q_static: Int32,
+        seqlen_k_static: Int32,
         mCuSeqlensQ: Optional[cute.Tensor] = None,
         mCuSeqlensK: Optional[cute.Tensor] = None,
         mSeqUsedQ: Optional[cute.Tensor] = None,
         mSeqUsedK: Optional[cute.Tensor] = None,
+        tile_m: cutlass.Constexpr[Int32] = 128,
+        tile_n: cutlass.Constexpr[Int32] = 128,
     ):
         offset_q = 0 if const_expr(mCuSeqlensQ is None) else mCuSeqlensQ[batch_idx]
         offset_k = 0 if const_expr(mCuSeqlensK is None) else mCuSeqlensK[batch_idx]
         padded_offset_q = (
             0
             if const_expr(mCuSeqlensQ is None)
+            else cute.assume((offset_q + batch_idx * tile_m) // tile_m * tile_m, divby=tile_m)
         )
         padded_offset_k = (
             0
             if const_expr(mCuSeqlensK is None)
+            else cute.assume((offset_k + batch_idx * tile_n) // tile_n * tile_n, divby=tile_n)
         )
         if const_expr(mSeqUsedQ is not None):
             seqlen_q = mSeqUsedQ[batch_idx]
                 if const_expr(mCuSeqlensK is None)
                 else mCuSeqlensK[batch_idx + 1] - offset_k
             )
         return SeqlenInfoQK(
             offset_q,
             offset_k,
             padded_offset_k,
             seqlen_q,
             seqlen_k,
+            has_cu_seqlens_q=mCuSeqlensQ is not None,
+            has_cu_seqlens_k=mCuSeqlensK is not None,
+            has_seqused_q=mSeqUsedQ is not None,
+            has_seqused_k=mSeqUsedK is not None,
         )
     def offset_batch_Q(
         batch_idx: Int32,
         dim: int,
         padded: cutlass.Constexpr[bool] = False,
+        ragged: cutlass.Constexpr[bool] = False,
     ) -> cute.Tensor:
         """Seqlen must be the first dimension of mQ"""
+        if const_expr(not ragged):
+            if const_expr(not self.has_cu_seqlens_q):
+                idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mQ) - 1 - dim)
+                return mQ[idx]
+            else:
+                offset_q = self.offset_q if const_expr(not padded) else self.padded_offset_q
+                offset_q = offset_q if const_expr(cute.rank(mQ.shape[0]) == 1) else (None, offset_q)
+                idx = (offset_q,) + (None,) * (cute.rank(mQ) - 1)
+                return cute.domain_offset(idx, mQ)
         else:
+            if const_expr(not self.has_cu_seqlens_q):
+                offset_q = 0
+                idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mQ) - 1 - dim)
+                mQ = mQ[idx]
+            else:
+                offset_q = self.offset_q if const_expr(not padded) else self.padded_offset_q
+            if const_expr(cute.rank(mQ.shape[0]) == 1):
+                return copy_utils.offset_ragged_tensor(
+                    mQ, offset_q, self.seqlen_q, ragged_dim=0, ptr_shift=True
+                )
+            else:  # PackGQA
+                assert cute.rank(mQ.shape[0]) == 2
+                # Unpack before calling offset_ragged_tensor, then pack
+                idx = ((None, None),) + (None,) * (cute.rank(mQ) - 1)
+                mQ = mQ[idx]
+                mQ = copy_utils.offset_ragged_tensor(
+                    mQ, offset_q, self.seqlen_q, ragged_dim=1, ptr_shift=True
+                )
+                return cute.group_modes(mQ, 0, 2)
     def offset_batch_K(
         self,
         batch_idx: Int32,
         dim: int,
         padded: cutlass.Constexpr[bool] = False,
+        ragged: cutlass.Constexpr[bool] = False,
+        multiple: int = 1,
     ) -> cute.Tensor:
         """Seqlen must be the first dimension of mK"""
+        if const_expr(not ragged):
+            if const_expr(not self.has_cu_seqlens_k):
+                idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mK) - 1 - dim)
+                return mK[idx]
+            else:
+                offset_k = self.offset_k if const_expr(not padded) else self.padded_offset_k
+                offset_k *= multiple
+                idx = (offset_k,) + (None,) * (cute.rank(mK) - 1)
+                return cute.domain_offset(idx, mK)
         else:
+            if const_expr(not self.has_cu_seqlens_k):
+                offset_k = 0
+                idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mK) - 1 - dim)
+                mK = mK[idx]
+            else:
+                offset_k = self.offset_k if const_expr(not padded) else self.padded_offset_k
+                offset_k *= multiple
+            return copy_utils.offset_ragged_tensor(
+                mK, offset_k, self.seqlen_k, ragged_dim=0, ptr_shift=True
+            )
@dataclass(frozen=True)
class SeqlenInfoQKNewK:
    """Sequence length info for append-KV with left-padding and new K support.

    On top of what SeqlenInfoQK carries, this tracks:
    - leftpad_k: left padding for K (tokens to skip at the start of the KV cache)
    - offset_k_new: offset into the new K tensor
    - seqlen_k_og: original K length (before appending new K), excluding leftpad
    - seqlen_k_new: length of new K to append
    - seqlen_k: total K length (seqlen_k_og + seqlen_k_new)
    - seqlen_rotary: position for rotary embedding computation
    """

    leftpad_k: Int32
    offset_q: Int32
    offset_k: Int32
    offset_k_new: Int32
    seqlen_q: Int32
    seqlen_k_og: Int32
    seqlen_k_new: Int32
    seqlen_k: Int32
    seqlen_rotary: Int32

    @staticmethod
    def create(
        batch_idx: Int32,
        seqlen_q_static: Int32,
        seqlen_k_static: Int32,
        shape_K_new_0: Int32,
        mCuSeqlensQ: Optional[cute.Tensor] = None,
        mCuSeqlensK: Optional[cute.Tensor] = None,
        mCuSeqlensKNew: Optional[cute.Tensor] = None,
        mSeqUsedQ: Optional[cute.Tensor] = None,
        mSeqUsedK: Optional[cute.Tensor] = None,
        mLeftpadK: Optional[cute.Tensor] = None,
        mSeqlensRotary: Optional[cute.Tensor] = None,
    ):
        """Gather all per-batch sequence quantities for ``batch_idx``.

        Each optional tensor, when given, is read at ``batch_idx`` (and
        ``batch_idx + 1`` for cumulative lengths); otherwise the static value
        or 0 is used. Several fallbacks also depend on whether ``mCuSeqlensQ``
        was supplied, as spelled out in the branches below.
        """
        has_cu_q = mCuSeqlensQ is not None
        leftpad_k = mLeftpadK[batch_idx] if const_expr(mLeftpadK is not None) else 0
        offset_q = mCuSeqlensQ[batch_idx] if const_expr(has_cu_q) else 0
        if const_expr(mCuSeqlensK is None):
            offset_k = leftpad_k if const_expr(has_cu_q) else 0
        else:
            offset_k = mCuSeqlensK[batch_idx] + leftpad_k
        offset_k_new = (
            mCuSeqlensKNew[batch_idx] if const_expr(mCuSeqlensKNew is not None) else 0
        )
        # Query length: seqused overrides cu_seqlens, which overrides the static length.
        if const_expr(mSeqUsedQ is not None):
            seqlen_q = mSeqUsedQ[batch_idx]
        elif const_expr(has_cu_q):
            seqlen_q = mCuSeqlensQ[batch_idx + 1] - mCuSeqlensQ[batch_idx]
        else:
            seqlen_q = seqlen_q_static
        # Original K length (excluding leftpad). In the static fallback the
        # leftpad is subtracted only when mCuSeqlensQ is present.
        if const_expr(mSeqUsedK is not None):
            seqlen_k_og = mSeqUsedK[batch_idx] - leftpad_k
        elif const_expr(mCuSeqlensK is not None):
            seqlen_k_og = mCuSeqlensK[batch_idx + 1] - mCuSeqlensK[batch_idx] - leftpad_k
        elif const_expr(has_cu_q):
            seqlen_k_og = seqlen_k_static - leftpad_k
        else:
            seqlen_k_og = seqlen_k_static
        # Length of the K being appended.
        if const_expr(mCuSeqlensKNew is not None):
            seqlen_k_new = mCuSeqlensKNew[batch_idx + 1] - mCuSeqlensKNew[batch_idx]
        elif const_expr(has_cu_q):
            seqlen_k_new = shape_K_new_0
        else:
            seqlen_k_new = 0
        seqlen_k = seqlen_k_og + seqlen_k_new if const_expr(has_cu_q) else seqlen_k_og
        # Rotary position: defaults to seqlen_k_og + leftpad_k unless explicitly provided.
        if const_expr(mSeqlensRotary is None):
            seqlen_rotary = seqlen_k_og + leftpad_k
        else:
            seqlen_rotary = mSeqlensRotary[batch_idx]
        return SeqlenInfoQKNewK(
            leftpad_k=leftpad_k,
            offset_q=offset_q,
            offset_k=offset_k,
            offset_k_new=offset_k_new,
            seqlen_q=seqlen_q,
            seqlen_k_og=seqlen_k_og,
            seqlen_k_new=seqlen_k_new,
            seqlen_k=seqlen_k,
            seqlen_rotary=seqlen_rotary,
        )

build/torch-cuda/sm90_config_search.py ADDED Viewed

	@@ -0,0 +1,402 @@

+"""Search feasible SM90 fwd/bwd attention configs for given (head_dim, head_dim_v).
+Enumerates tile sizes, swap modes, atom layouts, and staging options.
+Checks GMMA divisibility, register budget, and shared memory budget.
+Usage:
+    python flash_attn/cute/sm90_config_search.py --headdim 128
+    python flash_attn/cute/sm90_config_search.py --mode fwd --headdim 192-128
+    python flash_attn/cute/sm90_config_search.py --mode bwd --headdim 192 --tile-n 64,96
+"""
+import math
# H100 hardware limits
# Shared-memory budget per config, in bytes (224 KB), leaving ~3 KB for LSE, dPsum, mbarriers.
# NOTE(review): the earlier note read "228 KB minus ~3 KB", which does not give 224 KB;
# presumably the 227 KB per-thread-block maximum is the intended starting point -- confirm.
SMEM_LIMIT = 224 * 1024
# Register budget keyed by the number of warpgroups (WGs) in the config.
REG_LIMITS = {2: 216, 3: 128}  # per-WG budget: 2WG=240-24, 3WG=160-32
# Threads in one warpgroup; used to turn accumulator sizes into per-thread register counts.
THREADS_PER_WG = 128
+def _divisors(n):
+    return [d for d in range(1, n + 1) if n % d == 0]
def _acc_regs(M, N, num_wg):
    """Accumulator registers per thread per WG.

    An ``M x N`` accumulator is spread over ``num_wg * THREADS_PER_WG`` threads;
    the quotient is floored.
    """
    total_threads = num_wg * THREADS_PER_WG
    return (M * N) // total_threads
+def _check_mma(M, N, num_wg, atom_layout_m, swap_AB):
+    """Check MMA feasibility. Returns regs per WG, or None if infeasible.
+    GMMA atom M=64. Swap exchanges (M, N) and atom layout.
+    Requires: M divisible by (atom_layout_m * 64), N by (atom_layout_n * 8).
+    """
+    if swap_AB:
+        M, N = N, M
+        atom_layout_m = num_wg // atom_layout_m
+    atom_layout_n = num_wg // atom_layout_m
+    if M % (atom_layout_m * 64) != 0 or N % (atom_layout_n * 8) != 0:
+        return None
+    return _acc_regs(M, N, num_wg)
+def _mma_traffic(M_eff, N_eff, K_red, num_wg, wg_n, is_rs=False):
+    """Total SMEM read traffic for one MMA (all WGs combined).
+    num_instr = (M_eff / 64) * wg_n instructions total.
+    Each reads A(64, K_red) and B(N_eff/wg_n, K_red) from smem (bf16).
+    """
+    num_instr = (M_eff // 64) * wg_n
+    A_per = 64 * K_red * 2 if not is_rs else 0
+    B_per = (N_eff // wg_n) * K_red * 2
+    return num_instr * (A_per + B_per)
+# ============================================================================
+# Backward
+# ============================================================================
+def _check_bwd_config(
+    hdim,
+    hdimv,
+    tile_m,
+    tile_n,
+    num_wg,
+    SdP_swapAB,
+    dKV_swapAB,
+    dQ_swapAB,
+    AtomLayoutMSdP,
+    AtomLayoutNdKV,
+    AtomLayoutMdQ,
+):
+    reg_limit = REG_LIMITS[num_wg]
+    # MMA feasibility
+    regs_SdP = _check_mma(tile_m, tile_n, num_wg, AtomLayoutMSdP, SdP_swapAB)
+    regs_dK = _check_mma(tile_n, hdim, num_wg, AtomLayoutNdKV, dKV_swapAB)
+    regs_dV = _check_mma(tile_n, hdimv, num_wg, AtomLayoutNdKV, dKV_swapAB)
+    regs_dQ = _check_mma(tile_m, hdim, num_wg, AtomLayoutMdQ, dQ_swapAB)
+    if any(r is None for r in (regs_SdP, regs_dK, regs_dV, regs_dQ)):
+        return None
+    # Peak regs: max(S+dP, dQ) + dK + dV
+    total_regs = max(2 * regs_SdP, regs_dQ) + regs_dK + regs_dV
+    if total_regs > reg_limit:
+        return None
+    # SMEM
+    mma_dkv_is_rs = (
+        AtomLayoutMSdP == 1 and AtomLayoutNdKV == num_wg and SdP_swapAB and not dKV_swapAB
+    )
+    Q_stage, PdS_stage = 2, 1
+    for dO_stage in (2, 1):
+        sQ = tile_m * hdim * 2 * Q_stage
+        sK = tile_n * hdim * 2
+        sV = tile_n * hdimv * 2
+        sdO = tile_m * hdimv * 2 * dO_stage
+        sPdS = tile_m * tile_n * 2 * PdS_stage
+        sP = sPdS if not mma_dkv_is_rs else 0
+        sdQaccum = tile_m * hdim * 4
+        smem = sQ + sK + sV + sdO + sP + sPdS + sdQaccum
+        if smem <= SMEM_LIMIT:
+            break
+    else:
+        return None
+    # SMEM traffic
+    def _swap(a, b, s):
+        return (b, a) if s else (a, b)
+    def _wg_n(al_m, s):
+        return al_m if s else num_wg // al_m
+    M_s, N_s = _swap(tile_m, tile_n, SdP_swapAB)
+    wn_SdP = _wg_n(AtomLayoutMSdP, SdP_swapAB)
+    traffic_S = _mma_traffic(M_s, N_s, hdim, num_wg, wn_SdP)
+    traffic_dP = _mma_traffic(M_s, N_s, hdimv, num_wg, wn_SdP)
+    wn_dKV = _wg_n(AtomLayoutNdKV, dKV_swapAB)
+    M_dv, N_dv = _swap(tile_n, hdimv, dKV_swapAB)
+    traffic_dV = _mma_traffic(M_dv, N_dv, tile_m, num_wg, wn_dKV, is_rs=mma_dkv_is_rs)
+    M_dk, N_dk = _swap(tile_n, hdim, dKV_swapAB)
+    traffic_dK = _mma_traffic(M_dk, N_dk, tile_m, num_wg, wn_dKV, is_rs=mma_dkv_is_rs)
+    M_dq, N_dq = _swap(tile_m, hdim, dQ_swapAB)
+    wn_dQ = _wg_n(AtomLayoutMdQ, dQ_swapAB)
+    traffic_dQ = _mma_traffic(M_dq, N_dq, tile_n, num_wg, wn_dQ)
+    traffic_P_store = tile_m * tile_n * 2 if not mma_dkv_is_rs else 0
+    traffic_dS_store = tile_m * tile_n * 2
+    traffic_dQ_smem = tile_m * hdim * 4 * 2  # store + TMA load
+    smem_traffic = (
+        traffic_S
+        + traffic_dP
+        + traffic_dV
+        + traffic_dK
+        + traffic_dQ
+        + traffic_P_store
+        + traffic_dS_store
+        + traffic_dQ_smem
+    )
+    return dict(
+        tile_m=tile_m,
+        tile_n=tile_n,
+        num_wg=num_wg,
+        Q_stage=Q_stage,
+        dO_stage=dO_stage,
+        PdS_stage=PdS_stage,
+        SdP_swapAB=SdP_swapAB,
+        dKV_swapAB=dKV_swapAB,
+        dQ_swapAB=dQ_swapAB,
+        AtomLayoutMSdP=AtomLayoutMSdP,
+        AtomLayoutNdKV=AtomLayoutNdKV,
+        AtomLayoutMdQ=AtomLayoutMdQ,
+        mma_dkv_is_rs=mma_dkv_is_rs,
+        regs_SdP=regs_SdP,
+        regs_dK=regs_dK,
+        regs_dV=regs_dV,
+        regs_dQ=regs_dQ,
+        total_regs=total_regs,
+        reg_limit=reg_limit,
+        smem_bytes=smem,
+        smem_kb=smem / 1024,
+        smem_traffic=smem_traffic,
+        smem_traffic_kb=smem_traffic / 1024,
+        smem_traffic_per_block=smem_traffic / (tile_m * tile_n),
+    )
+def find_feasible_bwd_configs(
+    head_dim,
+    head_dim_v=None,
+    tile_m_choices=(64, 80, 96, 112, 128),
+    tile_n_choices=(64, 80, 96, 112, 128),
+):
+    if head_dim_v is None:
+        head_dim_v = head_dim
+    hdim = int(math.ceil(head_dim / 32) * 32)
+    hdimv = int(math.ceil(head_dim_v / 32) * 32)
+    results = []
+    for num_wg in (2, 3):
+        divs = _divisors(num_wg)
+        for tile_m in tile_m_choices:
+            for tile_n in tile_n_choices:
+                for SdP_swap in (False, True):
+                    if (tile_n if SdP_swap else tile_m) % 64 != 0:
+                        continue
+                    for dKV_swap in (False, True):
+                        if not dKV_swap and tile_n % 64 != 0:
+                            continue
+                        if dKV_swap and (hdim % 64 != 0 or hdimv % 64 != 0):
+                            continue
+                        for dQ_swap in (False, True):
+                            if (hdim if dQ_swap else tile_m) % 64 != 0:
+                                continue
+                            for a1 in divs:
+                                for a2 in divs:
+                                    for a3 in divs:
+                                        cfg = _check_bwd_config(
+                                            hdim,
+                                            hdimv,
+                                            tile_m,
+                                            tile_n,
+                                            num_wg,
+                                            SdP_swap,
+                                            dKV_swap,
+                                            dQ_swap,
+                                            a1,
+                                            a2,
+                                            a3,
+                                        )
+                                        if cfg is not None:
+                                            results.append(cfg)
+    results.sort(key=lambda c: (-c["tile_n"], -c["tile_m"], c["smem_traffic_per_block"]))
+    return results
+def print_bwd_configs(configs, max_results=20):
+    if not configs:
+        print("No feasible configs found!")
+        return
+    n = min(len(configs), max_results)
+    print(f"Found {len(configs)} feasible configs (showing top {n}):\n")
+    hdr = (
+        f"{'wg':>2} {'tm':>3} {'tn':>3}  "
+        f"{'SdP':>3} {'dKV':>3} {'dQ':>3}  "
+        f"{'aSdP':>4} {'adKV':>4} {'adQ':>4}  "
+        f"{'Qs':>2} {'dOs':>3}  "
+        f"{'rS':>3} {'rdK':>3} {'rdV':>3} {'rdQ':>3} {'tot':>4}/{'':<3}  "
+        f"{'smem':>5}  {'traffic':>7}  {'tr/blk':>6}"
+    )
+    print(hdr)
+    print("-" * len(hdr))
+    B = lambda b: "T" if b else "F"
+    for c in configs[:max_results]:
+        print(
+            f"{c['num_wg']:>2} {c['tile_m']:>3} {c['tile_n']:>3}  "
+            f"{B(c['SdP_swapAB']):>3} {B(c['dKV_swapAB']):>3} {B(c['dQ_swapAB']):>3}  "
+            f"{c['AtomLayoutMSdP']:>4} {c['AtomLayoutNdKV']:>4} {c['AtomLayoutMdQ']:>4}  "
+            f"{c['Q_stage']:>2} {c['dO_stage']:>3}  "
+            f"{c['regs_SdP']:>3} {c['regs_dK']:>3} {c['regs_dV']:>3} {c['regs_dQ']:>3} "
+            f"{c['total_regs']:>4}/{c['reg_limit']:<3}  "
+            f"{c['smem_kb']:>4.0f}K  "
+            f"{c['smem_traffic_kb']:>6.0f}K  "
+            f"{c['smem_traffic_per_block']:>6.1f}"
+        )
+# ============================================================================
+# Forward
+# ============================================================================
+def _check_fwd_config(hdim, hdimv, tile_n, num_wg, pv_is_rs, overlap_wg):
+    reg_limit = REG_LIMITS[num_wg]
+    tile_m = num_wg * 64
+    if tile_n % 8 != 0:
+        return None
+    regs_S = _acc_regs(tile_m, tile_n, num_wg)
+    regs_O = _acc_regs(tile_m, hdimv, num_wg)
+    regs_P = regs_S // 2  # bf16 = half of f32
+    if overlap_wg:
+        total_regs = regs_S + regs_P + regs_O
+    else:
+        total_regs = regs_S + regs_O
+    if total_regs > reg_limit:
+        return None
+    # SMEM: 1 stage Q, 2 stages K/V, O overlaps Q, sP if not RS
+    sQ = tile_m * hdim * 2
+    sK = tile_n * hdim * 2 * 2
+    sV = tile_n * hdimv * 2 * 2
+    sO = tile_m * hdimv * 2
+    sP = tile_m * tile_n * 2 if not pv_is_rs else 0
+    smem = max(sQ, sO) + sK + sV + sP
+    if smem > SMEM_LIMIT:
+        return None
+    # SMEM traffic: num_instr = num_wg (all WGs in M, wg_n=1)
+    traffic_S = num_wg * (64 * hdim * 2 + tile_n * hdim * 2)
+    A_pv = 64 * tile_n * 2 if not pv_is_rs else 0
+    traffic_O = num_wg * (A_pv + hdimv * tile_n * 2)
+    traffic_P_store = tile_m * tile_n * 2 if not pv_is_rs else 0
+    smem_traffic = traffic_S + traffic_O + traffic_P_store
+    return dict(
+        tile_m=tile_m,
+        tile_n=tile_n,
+        num_wg=num_wg,
+        pv_is_rs=pv_is_rs,
+        overlap_wg=overlap_wg,
+        regs_S=regs_S,
+        regs_O=regs_O,
+        regs_P=regs_P,
+        total_regs=total_regs,
+        reg_limit=reg_limit,
+        smem_bytes=smem,
+        smem_kb=smem / 1024,
+        smem_traffic=smem_traffic,
+        smem_traffic_kb=smem_traffic / 1024,
+        smem_traffic_per_block=smem_traffic / (tile_m * tile_n),
+    )
+def find_feasible_fwd_configs(
+    head_dim, head_dim_v=None, tile_n_choices=(64, 80, 96, 112, 128, 144, 160, 176, 192)
+):
+    if head_dim_v is None:
+        head_dim_v = head_dim
+    hdim = int(math.ceil(head_dim / 32) * 32)
+    hdimv = int(math.ceil(head_dim_v / 32) * 32)
+    results = []
+    for num_wg in (2, 3):
+        for tile_n in tile_n_choices:
+            for pv_is_rs in (True, False):
+                for overlap_wg in (True, False):
+                    cfg = _check_fwd_config(hdim, hdimv, tile_n, num_wg, pv_is_rs, overlap_wg)
+                    if cfg is not None:
+                        results.append(cfg)
+    results.sort(key=lambda c: (-c["tile_n"], c["smem_traffic_per_block"]))
+    return results
+def print_fwd_configs(configs, max_results=20):
+    if not configs:
+        print("No feasible configs found!")
+        return
+    n = min(len(configs), max_results)
+    print(f"Found {len(configs)} feasible configs (showing top {n}):\n")
+    hdr = (
+        f"{'wg':>2} {'tm':>3} {'tn':>3}  "
+        f"{'RS':>2} {'olap':>4}  "
+        f"{'rS':>3} {'rP':>3} {'rO':>3} {'tot':>4}/{'':<3}  "
+        f"{'smem':>5}  {'traffic':>7}  {'tr/blk':>6}"
+    )
+    print(hdr)
+    print("-" * len(hdr))
+    B = lambda b: "T" if b else "F"
+    for c in configs[:max_results]:
+        print(
+            f"{c['num_wg']:>2} {c['tile_m']:>3} {c['tile_n']:>3}  "
+            f"{B(c['pv_is_rs']):>2} {B(c['overlap_wg']):>4}  "
+            f"{c['regs_S']:>3} {c['regs_P']:>3} {c['regs_O']:>3} "
+            f"{c['total_regs']:>4}/{c['reg_limit']:<3}  "
+            f"{c['smem_kb']:>4.0f}K  "
+            f"{c['smem_traffic_kb']:>6.0f}K  "
+            f"{c['smem_traffic_per_block']:>6.1f}"
+        )
+# ============================================================================
+# CLI
+# ============================================================================
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Search feasible SM90 MMA configs")
+    parser.add_argument("--mode", choices=["fwd", "bwd", "both"], default="both")
+    parser.add_argument(
+        "--headdim", type=str, default="128", help="Head dim, or hdim-hdimv (e.g. 192-128)"
+    )
+    parser.add_argument("--tile-m", type=str, default="64,80,96,112,128", help="Bwd tile_m choices")
+    parser.add_argument(
+        "--tile-n",
+        type=str,
+        default=None,
+        help="tile_n choices (default: fwd up to 192, bwd up to 128)",
+    )
+    parser.add_argument("-n", "--num-results", type=int, default=30)
+    args = parser.parse_args()
+    parts = args.headdim.split("-")
+    hdim = int(parts[0])
+    hdimv = int(parts[1]) if len(parts) > 1 else hdim
+    TN_FWD = "64,80,96,112,128,144,160,176,192"
+    TN_BWD = "64,80,96,112,128"
+    if args.mode in ("fwd", "both"):
+        tn = tuple(int(x) for x in (args.tile_n or TN_FWD).split(","))
+        print(f"=== FWD configs: hdim={hdim}, hdimv={hdimv} ===\n")
+        print_fwd_configs(find_feasible_fwd_configs(hdim, hdimv, tn), args.num_results)
+        print()
+    if args.mode in ("bwd", "both"):
+        tm = tuple(int(x) for x in args.tile_m.split(","))
+        tn = tuple(int(x) for x in (args.tile_n or TN_BWD).split(","))
+        print(f"=== BWD configs: hdim={hdim}, hdimv={hdimv} ===\n")
+        print_bwd_configs(find_feasible_bwd_configs(hdim, hdimv, tm, tn), args.num_results)

build/torch-cuda/softmax.py CHANGED Viewed

@@ -10,7 +10,7 @@ import cutlass.cute as cute
 from cutlass import Float32
 from .quack import layout_utils
-from . import utils
 from .quack.cute_dsl_utils import ParamsBase
 from .seqlen_info import SeqlenInfoQK

 from cutlass import Float32
 from .quack import layout_utils
+from . import utils as utils
 from .quack.cute_dsl_utils import ParamsBase
 from .seqlen_info import SeqlenInfoQK

build/torch-cuda/tile_scheduler.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # Copyright (c) 2025, Tri Dao.
-from typing import Optional, Tuple
 from dataclasses import dataclass
 try:
@@ -9,17 +10,80 @@ except ImportError:  # Python < 3.12
     from typing_extensions import override
 import cutlass
 from cutlass._mlir import ir
 import cutlass.cute as cute
 from cutlass import Int32, const_expr
 from cutlass.cute import FastDivmodDivisor
 from .quack.cute_dsl_utils import ParamsBase
-from . import utils
 from .fast_math import clz
 class WorkTileInfo(cutlass.utils.WorkTileInfo):
     """Altered WorkTileInfo which includes four axes: (block, head, batch, split)"""
@@ -31,6 +95,47 @@ class WorkTileInfo(cutlass.utils.WorkTileInfo):
         return WorkTileInfo(new_tile_idx, new_is_valid_tile)
 @dataclass
 class TileSchedulerArguments(ParamsBase):
     num_block: Int32
@@ -51,6 +156,7 @@ class TileSchedulerArguments(ParamsBase):
     lpt: cutlass.Constexpr[bool] = False
     is_split_kv: cutlass.Constexpr[bool] = False
     head_swizzle: cutlass.Constexpr[bool] = False
 class SingleTileScheduler:
@@ -63,6 +169,7 @@ class SingleTileScheduler:
         num_splits_divmod: FastDivmodDivisor
         is_split_kv: cutlass.Constexpr[bool] = False
         cluster_shape_mn: cutlass.Constexpr[Tuple[int, int]] = (1, 1)
         @staticmethod
         def create(
@@ -76,6 +183,7 @@ class SingleTileScheduler:
                 FastDivmodDivisor(args.num_splits),
                 args.is_split_kv,
                 args.cluster_shape_mn,
             )
     def __init__(self, params: Params, blk_coord: cute.Coord, *, loc=None, ip=None):
@@ -86,18 +194,26 @@ class SingleTileScheduler:
         self._ip = ip
     @staticmethod
-    def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
         return SingleTileScheduler.Params.create(args, loc=loc, ip=ip)
     @staticmethod
-    def create(params: Params, *, loc=None, ip=None) -> "SingleTileScheduler":
-        # if const_expr(cute.size(params.cluster_shape_mn) == 1):
-        #     blk_coord = cute.arch.block_idx()
-        # else:
-        #     # All CTAs in a cluster must get the same block coordinate
-        #     blk_coord = cute.arch.cluster_idx()
-        # Temporary set to block_idx until we sort out the best way to handle cluster
-        blk_coord = cute.arch.block_idx()
         return SingleTileScheduler(params, blk_coord, loc=loc, ip=ip)
     # called by host
@@ -110,8 +226,13 @@ class SingleTileScheduler:
     ) -> Tuple[Int32, Int32, Int32]:
         # TODO: this hard-codes the fact that we only use cluster = (1, 1) or (2, 1)
         assert params.cluster_shape_mn[1] == 1, "Only cluster_shape_mn[1] == 1 is supported"
         return (
-            cute.round_up(params.num_block, params.cluster_shape_mn[0]),
             params.num_head * params.num_splits,
             params.num_batch,
         )
@@ -135,6 +256,10 @@ class SingleTileScheduler:
     def advance_to_next_work(self, *, loc=None, ip=None):
         self._is_first_block = False
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
@@ -180,18 +305,28 @@ class StaticPersistentTileScheduler:
         self._ip = ip
     @staticmethod
-    def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
         return StaticPersistentTileScheduler.Params.create(args, loc=loc, ip=ip)
     @staticmethod
-    def create(params: Params, *, loc=None, ip=None) -> "StaticPersistentTileScheduler":
         if const_expr(cute.size(params.cluster_shape_m) == 1):
             tile_idx = cute.arch.block_idx()[0]
         else:
             tile_idx = cute.arch.cluster_idx()[0]
         return StaticPersistentTileScheduler(params, tile_idx, loc=loc, ip=ip)
-    # called by host
     @staticmethod
     def get_grid_shape(
         params: Params,
@@ -201,18 +336,14 @@ class StaticPersistentTileScheduler:
     ) -> Tuple[Int32, Int32, Int32]:
         hardware_info = cutlass.utils.HardwareInfo()
         sm_count = hardware_info.get_device_multiprocessor_count()
-        # Grid must be a multiple of cluster_shape_m for CUDA cluster launch.
         max_ctas = (sm_count // params.cluster_shape_m) * params.cluster_shape_m
         grid_x = cutlass.min(max_ctas, params.total_blocks_cluster * params.cluster_shape_m)
         return (grid_x, Int32(1), Int32(1))
-    # @cute.jit
     def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
         hn_idx, block_idx = divmod(self._tile_idx, self.params.num_block_cluster_divmod)
         batch_idx, head_idx = divmod(hn_idx, self.params.num_head_divmod)
         is_valid = self._tile_idx < self.params.total_blocks_cluster
-        # if cute.arch.thread_idx()[0] == 0:
-        #     cute.printf("TileScheduler: tile_idx=%d, hn_idx=%d, block_idx=%d, batch_idx=%d, head_idx=%d, is_valid=%d", self._tile_idx, hn_idx, block_idx, batch_idx, head_idx, is_valid)
         return WorkTileInfo(
             (Int32(block_idx), Int32(head_idx), Int32(batch_idx), Int32(0)), is_valid
         )
@@ -228,6 +359,10 @@ class StaticPersistentTileScheduler:
             self._tile_idx += cute.arch.grid_dim()[0]
         else:
             self._tile_idx += cute.arch.cluster_dim()[0]
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
@@ -254,32 +389,41 @@ class SingleTileLPTScheduler:
         total_blocks: Int32
         num_splits: Int32
         num_block: Int32
         l2_minor: Int32
-        num_block_divmod: FastDivmodDivisor
         num_head_divmod: FastDivmodDivisor
         l2_minor_divmod: FastDivmodDivisor
         l2_major_divmod: FastDivmodDivisor
         l2_minor_residual_divmod: FastDivmodDivisor
         num_hb_quotient: Int32
         is_split_kv: cutlass.Constexpr[bool] = False
         @staticmethod
         @cute.jit
         def create(
-            args: TileSchedulerArguments, *, loc=None, ip=None
         ) -> "SingleTileLPTScheduler.Params":
-            # cute.printf(args.num_block, args.num_head, args.num_batch, args.seqlen_k, args.headdim, args.headdim_v, args.total_q, args.tile_shape_mn, args.qhead_per_kvhead_packgqa, args.element_size)
             size_one_kv_head = args.seqlen_k * (args.headdim + args.headdim_v) * args.element_size
             size_one_head = size_one_kv_head
             size_l2 = 50 * 1024 * 1024  # 50 MB for K & V
             # Swizzle is the size of each "section". Round swizzle to a power of 2
             # Need to be careful about the case where only one head will fit
             # swizzle is how many heads can fit in L2
-            # swizzle = 1 if size_l2 < size_one_head else (size_l2 // size_one_head)
-            # Seems faster if swizzle if a power of 2
             log2_floor = lambda n: 31 - clz(n)
             swizzle = 1 if size_l2 < size_one_head else (1 << log2_floor(size_l2 // size_one_head))
-            # swizzle = 1 if size_l2 < size_one_head else (size_l2 // size_one_head)
             # If we're in the last section (called residual), we don't want to divide by
             # swizzle. Instead we want to divide by the remainder.
             num_hb_quotient = (args.num_head * args.num_batch) // swizzle
@@ -287,37 +431,84 @@ class SingleTileLPTScheduler:
             return SingleTileLPTScheduler.Params(
                 total_blocks=args.num_block * args.num_head * args.num_batch,
                 num_block=args.num_block,
                 l2_minor=Int32(swizzle),
-                num_block_divmod=FastDivmodDivisor(args.num_block),
                 num_head_divmod=FastDivmodDivisor(args.num_head),
                 l2_minor_divmod=FastDivmodDivisor(swizzle),
                 l2_major_divmod=FastDivmodDivisor(swizzle * args.num_block),
-                l2_minor_residual_divmod=FastDivmodDivisor(
-                    max(num_hb_remainder, 1)
-                ),  # don't divide by 0
                 num_hb_quotient=Int32(num_hb_quotient),
                 num_splits=args.num_splits,
                 is_split_kv=args.is_split_kv,
             )
-    def __init__(self, params: Params, tile_idx: Int32, split_idx: Int32, *, loc=None, ip=None):
         self.params = params
         self._tile_idx = tile_idx
         self._split_idx = split_idx
         self._loc = loc
         self._ip = ip
     @staticmethod
-    def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
-        return SingleTileLPTScheduler.Params.create(args, loc=loc, ip=ip)
     @staticmethod
     @cute.jit
-    def create(params: Params, *, loc=None, ip=None) -> "SingleTileLPTScheduler":
         tile_idx, split_idx, _ = cute.arch.block_idx()
         return SingleTileLPTScheduler(params, tile_idx, split_idx, loc=loc, ip=ip)
-    # called by host
     @staticmethod
     def get_grid_shape(
         params: Params,
@@ -325,10 +516,40 @@ class SingleTileLPTScheduler:
         loc=None,
         ip=None,
     ) -> Tuple[Int32, Int32, Int32]:
         return (params.total_blocks, params.num_splits, Int32(1))
     @cute.jit
     def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
         params = self.params
         # Implement LPT scheduling coordinate calculation
         bidhb, l2_mod = divmod(self._tile_idx, params.l2_major_divmod)
@@ -342,25 +563,45 @@ class SingleTileLPTScheduler:
         bidhb_actual = bidhb * params.l2_minor + bidhb_residual
         batch_idx, head_idx = divmod(bidhb_actual, params.num_head_divmod)
         # Longest-processing-time-first
-        block = params.num_block - 1 - block
         is_valid = self._tile_idx < params.total_blocks
         return WorkTileInfo(
             (Int32(block), Int32(head_idx), Int32(batch_idx), Int32(self._split_idx)), is_valid
         )
     def initial_work_tile_info(self, *, loc=None, ip=None):
         return self.get_current_work(loc=loc, ip=ip)
     def prefetch_next_work(self, *, loc=None, ip=None):
-        pass
     def advance_to_next_work(self, *, loc=None, ip=None):
         # Single tile scheduler - set to invalid tile_idx to indicate no more work
         self._tile_idx = self.params.total_blocks
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
-        for obj in [self.params, self._tile_idx, self._split_idx]:
             obj_values = cutlass.extract_mlir_values(obj)
             values += obj_values
             self._values_pos.append(len(obj_values))
@@ -368,10 +609,13 @@ class SingleTileLPTScheduler:
     def __new_from_mlir_values__(self, values):
         obj_list = []
-        for obj, n_items in zip([self.params, self._tile_idx, self._split_idx], self._values_pos):
             obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
             values = values[n_items:]
-        return self.__class__(*(tuple(obj_list)), loc=self._loc)
 class SingleTileLPTBwdScheduler:
@@ -395,8 +639,8 @@ class SingleTileLPTBwdScheduler:
         ) -> "SingleTileLPTBwdScheduler.Params":
             size_l2 = 50 * 1024 * 1024
             size_one_qdo_head = args.seqlen_k * (args.headdim + args.headdim_v) * args.element_size
-            # size_one_dqaccum_head = args.seqlen_k * (args.headdim) * 4
-            size_one_dqaccum_head = 0
             size_one_head = size_one_qdo_head + size_one_dqaccum_head
             log2_floor = lambda n: 31 - clz(n)
             swizzle = 1 if size_l2 < size_one_head else (1 << log2_floor(size_l2 // size_one_head))
@@ -430,7 +674,16 @@ class SingleTileLPTBwdScheduler:
         self._ip = ip
     @staticmethod
-    def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
         return SingleTileLPTBwdScheduler.Params.create(args, loc=loc, ip=ip)
     @staticmethod
@@ -481,6 +734,7 @@ class SingleTileLPTBwdScheduler:
     def advance_to_next_work(self, *, loc=None, ip=None):
         # Single tile scheduler - set to invalid tile_idx to indicate no more work
         self._tile_idx = self.params.total_blocks
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
@@ -514,20 +768,38 @@ class SingleTileVarlenScheduler:
         is_split_kv: cutlass.Constexpr[bool] = False
         head_swizzle: cutlass.Constexpr[bool] = False
         cluster_shape_m: cutlass.Constexpr[int] = 1
         @staticmethod
         @cute.jit
         def create(
-            args: TileSchedulerArguments, *, loc=None, ip=None
         ) -> "SingleTileVarlenScheduler.Params":
             size_l2 = 50 * 1024 * 1024  # 50 MB for K & V
-            max_kvblock_in_l2 = size_l2 // (
                 (args.headdim + args.headdim_v) * args.element_size * args.tile_shape_mn[1]
             )
             assert args.mCuSeqlensQ is not None or args.mSeqUsedQ is not None, (
                 "At least one of mCuSeqlensQ or mSeqUsedQ must be provided"
             )
             assert args.cluster_shape_mn[1] == 1, "Only cluster_shape_mn[1] == 1 is supported"
             return SingleTileVarlenScheduler.Params(
                 num_head=args.num_head,
                 num_batch=args.num_batch,
@@ -542,22 +814,65 @@ class SingleTileVarlenScheduler:
                 is_split_kv=args.is_split_kv,
                 head_swizzle=args.head_swizzle,
                 cluster_shape_m=args.cluster_shape_mn[0],
             )
-    def __init__(self, params: Params, tile_idx: Int32, split_idx: Int32, *, loc=None, ip=None):
         self.params = params
         self._tile_idx = tile_idx
         self._split_idx = split_idx
         self._is_first_block = True
         self._loc = loc
         self._ip = ip
     @staticmethod
-    def to_underlying_arguments(args: TileSchedulerArguments, *, loc=None, ip=None) -> Params:
-        return SingleTileVarlenScheduler.Params.create(args, loc=loc, ip=ip)
     @staticmethod
-    def create(params: Params, *, loc=None, ip=None) -> "SingleTileVarlenScheduler":
         tile_idx, split_idx, _ = cute.arch.block_idx()
         return SingleTileVarlenScheduler(params, tile_idx, split_idx, loc=loc, ip=ip)
@@ -573,7 +888,7 @@ class SingleTileVarlenScheduler:
             params.total_q
             + params.num_batch * (params.cluster_shape_m * params.tile_shape_mn[0] - 1)
         ) // params.tile_shape_mn[0]
-        # round down to nearest multiple of cluster since odd excess is always padding
         total_blocks_max = total_blocks_max // params.cluster_shape_m * params.cluster_shape_m
         return (total_blocks_max * params.num_head, params.num_splits, Int32(1))
@@ -601,7 +916,8 @@ class SingleTileVarlenScheduler:
         )
     @cute.jit
-    def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
         params = self.params
         lane_idx = cute.arch.lane_idx()
         num_m_blocks = self._get_num_m_blocks(lane_idx, bidb_start=0)
@@ -654,6 +970,7 @@ class SingleTileVarlenScheduler:
                 num_n_blocks = (
                     num_m_blocks
                     * params.tile_shape_mn[0]
                     // params.qhead_per_kvhead_packgqa
                     // params.tile_shape_mn[1]
                 )
@@ -698,19 +1015,62 @@ class SingleTileVarlenScheduler:
         split_idx = self._split_idx if const_expr(params.is_split_kv) else Int32(0)
         return WorkTileInfo((Int32(block), Int32(head_idx), Int32(batch_idx), split_idx), is_valid)
     def initial_work_tile_info(self, *, loc=None, ip=None):
-        return self.get_current_work(loc=loc, ip=ip)
     def prefetch_next_work(self, *, loc=None, ip=None):
-        pass
     def advance_to_next_work(self, *, loc=None, ip=None):
-        # Single tile scheduler - set to invalid tile_idx to indicate no more work
         self._is_first_block = False
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
-        for obj in [self.params, self._tile_idx, self._split_idx]:
             obj_values = cutlass.extract_mlir_values(obj)
             values += obj_values
             self._values_pos.append(len(obj_values))
@@ -718,10 +1078,10 @@ class SingleTileVarlenScheduler:
     def __new_from_mlir_values__(self, values):
         obj_list = []
-        for obj, n_items in zip(
-            [self.params, self._tile_idx, self._split_idx],
-            self._values_pos,
-        ):
             obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
             values = values[n_items:]
-        return SingleTileVarlenScheduler(*(tuple(obj_list)), loc=self._loc)

 # Copyright (c) 2025, Tri Dao.
+from enum import IntEnum, auto
+from typing import Optional, Tuple, Protocol, runtime_checkable
 from dataclasses import dataclass
 try:
     from typing_extensions import override
 import cutlass
+from cutlass.pipeline import PipelineClcFetchAsync, PipelineState
 from cutlass._mlir import ir
 import cutlass.cute as cute
 from cutlass import Int32, const_expr
 from cutlass.cute import FastDivmodDivisor
+from cutlass.utils import ClcDynamicPersistentTileScheduler, ClcDynamicPersistentTileSchedulerParams
 from .quack.cute_dsl_utils import ParamsBase
+from . import utils as utils
 from .fast_math import clz
+class SchedulingMode(IntEnum):
+    # How a tile scheduler is asked to obtain its work tiles. The schedulers
+    # visible in this file assert which members they accept (STATIC, and for
+    # some schedulers also CLC).
+    # NOTE: `auto()` numbers members starting at 1, so NONE == 1 (truthy), not 0.
+    NONE = auto()
+    STATIC = auto()
+    DYNAMIC = auto()
+    CLC = auto()
+@dataclass
+class ClcState(ParamsBase):
+    """Owns the runtime state shared by CLC-capable tile schedulers.
+    `FlashAttentionForwardSm100` constructs this state because it owns the CLC
+    response buffer, mbarrier storage, and launch geometry needed to initialize
+    the hardware scheduler and async pipeline. Individual tile schedulers then
+    consume this state and map the returned hardware work tiles into their own
+    logical `WorkTileInfo` coordinates.
+    To add CLC support to a scheduler:
+    - implement `clc_problem_shape(params)` so the kernel can create the hardware scheduler
+    - accept `clc: ClcState | None` in `create(...)` / `__init__`
+    - map `clc.initial_work_tile_info()` and `clc.get_current_work()` into scheduler coordinates
+    """
+    # Hardware scheduler that is queried for work tiles.
+    _hw_scheduler: ClcDynamicPersistentTileScheduler
+    # Async pipeline driven by the producer/consumer methods below.
+    _pipeline: PipelineClcFetchAsync
+    # Pipeline position used by consumer_wait / consumer_release.
+    _consumer_state: PipelineState
+    # Pipeline position used by prefetch_next_work / producer_tail.
+    _producer_state: PipelineState
+    @staticmethod
+    def create(
+        *,
+        hw_scheduler: ClcDynamicPersistentTileScheduler,
+        pipeline: PipelineClcFetchAsync,
+        consumer_state: PipelineState,
+        producer_state: PipelineState,
+    ) -> "ClcState":
+        """Keyword-only factory; forwards the four components in field order."""
+        return ClcState(hw_scheduler, pipeline, consumer_state, producer_state)
+    def initial_work_tile_info(self):
+        """Return the hardware scheduler's initial work tile, unmodified."""
+        return self._hw_scheduler.initial_work_tile_info()
+    def get_current_work(self):
+        """Return the hardware scheduler's current work tile, unmodified."""
+        return self._hw_scheduler.get_current_work()
+    def prefetch_next_work(self, *, loc=None, ip=None):
+        """Producer side: request the next work tile.
+        Acquires the pipeline slot at the producer state, fetches that slot's
+        barrier, hands the barrier to the hardware scheduler's
+        `advance_to_next_work`, then advances the producer state.
+        """
+        self._pipeline.producer_acquire(self._producer_state, loc=loc, ip=ip)
+        mbarrier_addr = self._pipeline.producer_get_barrier(self._producer_state, loc=loc, ip=ip)
+        self._hw_scheduler.advance_to_next_work(mbarrier_addr, loc=loc, ip=ip)
+        self._producer_state.advance(loc=loc, ip=ip)
+    def consumer_wait(self, *, loc=None, ip=None):
+        """Wait on the pipeline at the current consumer state (state is not advanced)."""
+        self._pipeline.consumer_wait(self._consumer_state, loc=loc, ip=ip)
+    def consumer_release(self, *, loc=None, ip=None):
+        """Release the slot at the current consumer state, then advance that state."""
+        self._pipeline.consumer_release(self._consumer_state, loc=loc, ip=ip)
+        self._consumer_state.advance(loc=loc, ip=ip)
+    def producer_tail(self, *, loc=None, ip=None):
+        """Forward to the pipeline's `producer_tail` with the current producer state."""
+        self._pipeline.producer_tail(self._producer_state, loc=loc, ip=ip)
 class WorkTileInfo(cutlass.utils.WorkTileInfo):
     """Altered WorkTileInfo which includes four axes: (block, head, batch, split)"""
         return WorkTileInfo(new_tile_idx, new_is_valid_tile)
+# NOTE: with `@runtime_checkable`, isinstance() only checks that these method
+# names exist on the object; signatures and return types are not verified.
+@runtime_checkable
+class TileSchedulerProtocol(Protocol):
+    """Protocol defining the interface all tile schedulers must implement.
+    Schedulers are responsible for:
+    1. Coordinate mapping: linear tile index -> (m_block, head, batch, split)
+    2. Work distribution: how to get the next tile (static grid-stride vs CLC dynamic)
+    """
+    def get_current_work(self) -> WorkTileInfo:
+        """Get the current work tile coordinates."""
+        ...
+    def initial_work_tile_info(self) -> WorkTileInfo:
+        """Get the initial work tile for this CTA."""
+        ...
+    # No return annotation is declared here, but the docstring promises the next
+    # tile and the implementations visible in this file return a WorkTileInfo.
+    def advance_to_next_work(self, *, loc=None, ip=None):
+        """Consumer-side advance: move to next tile and return it.
+        For static schedulers: grid-stride increment + get_current_work.
+        For CLC schedulers: consumer wait + get_current_work + consumer release + state advance.
+        """
+        ...
+    def prefetch_next_work(self, *, loc=None, ip=None) -> None:
+        """Producer-side prefetch of next work tile (no-op for static schedulers).
+        For CLC schedulers: producer acquire + issue CLC query + producer state advance.
+        Only called by the scheduler warp.
+        """
+        ...
+    def producer_tail(self, *, loc=None, ip=None) -> None:
+        """Producer-side cleanup after the last tile.
+        No-op for static schedulers. For CLC schedulers: pipeline producer_tail.
+        """
+        ...
 @dataclass
 class TileSchedulerArguments(ParamsBase):
     num_block: Int32
     lpt: cutlass.Constexpr[bool] = False
     is_split_kv: cutlass.Constexpr[bool] = False
     head_swizzle: cutlass.Constexpr[bool] = False
+    use_cluster_idx: cutlass.Constexpr[bool] = False
 class SingleTileScheduler:
         num_splits_divmod: FastDivmodDivisor
         is_split_kv: cutlass.Constexpr[bool] = False
         cluster_shape_mn: cutlass.Constexpr[Tuple[int, int]] = (1, 1)
+        use_cluster_idx: cutlass.Constexpr[bool] = False
         @staticmethod
         def create(
                 FastDivmodDivisor(args.num_splits),
                 args.is_split_kv,
                 args.cluster_shape_mn,
+                args.use_cluster_idx,
             )
     def __init__(self, params: Params, blk_coord: cute.Coord, *, loc=None, ip=None):
         self._ip = ip
     @staticmethod
+    def to_underlying_arguments(
+        args: TileSchedulerArguments,
+        *,
+        scheduling_mode: SchedulingMode = SchedulingMode.STATIC,
+        loc=None,
+        ip=None,
+    ) -> Params:
+        """Build `SingleTileScheduler.Params` from `args`.
+        Only `SchedulingMode.STATIC` is accepted; `scheduling_mode` is checked
+        here and is not forwarded to `Params.create`.
+        """
+        assert scheduling_mode == SchedulingMode.STATIC, (
+            f"SingleTileScheduler only supports STATIC, got {scheduling_mode!r}"
+        )
         return SingleTileScheduler.Params.create(args, loc=loc, ip=ip)
     @staticmethod
+    def create(
+        params: Params, clc: ClcState | None = None, *, loc=None, ip=None
+    ) -> "SingleTileScheduler":
+        """Create the scheduler from the launch coordinates.
+        The block index is used unless the cluster holds more than one CTA and
+        `params.use_cluster_idx` is set, in which case the cluster index is used.
+        `clc` is accepted but not used by this scheduler.
+        """
+        if const_expr(cute.size(params.cluster_shape_mn) == 1 or not params.use_cluster_idx):
+            blk_coord = cute.arch.block_idx()
+        else:
+            blk_coord = cute.arch.cluster_idx()
         return SingleTileScheduler(params, blk_coord, loc=loc, ip=ip)
     # called by host
     ) -> Tuple[Int32, Int32, Int32]:
         # TODO: this hard-codes the fact that we only use cluster = (1, 1) or (2, 1)
         assert params.cluster_shape_mn[1] == 1, "Only cluster_shape_mn[1] == 1 is supported"
+        if const_expr(params.use_cluster_idx):
+            # Grid must have num_block * cluster_m physical blocks so that there are num_block clusters
+            grid_x = params.num_block * params.cluster_shape_mn[0]
+        else:
+            grid_x = cute.round_up(params.num_block, params.cluster_shape_mn[0])
         return (
+            grid_x,
             params.num_head * params.num_splits,
             params.num_batch,
         )
     def advance_to_next_work(self, *, loc=None, ip=None):
+        """Clear the first-block flag and return `get_current_work()`."""
         self._is_first_block = False
+        return self.get_current_work()
+    def producer_tail(self, *, loc=None, ip=None):
+        """No-op for this scheduler."""
+        pass
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
         self._ip = ip
     @staticmethod
+    def to_underlying_arguments(
+        args: TileSchedulerArguments,
+        *,
+        scheduling_mode: SchedulingMode = SchedulingMode.STATIC,
+        loc=None,
+        ip=None,
+    ) -> Params:
+        """Build `StaticPersistentTileScheduler.Params` from `args`.
+        Only `SchedulingMode.STATIC` is accepted; `scheduling_mode` is checked
+        here and is not forwarded to `Params.create`.
+        """
+        assert scheduling_mode == SchedulingMode.STATIC, (
+            f"StaticPersistentTileScheduler only supports STATIC, got {scheduling_mode!r}"
+        )
         return StaticPersistentTileScheduler.Params.create(args, loc=loc, ip=ip)
     @staticmethod
+    def create(
+        params: Params, clc: ClcState | None = None, *, loc=None, ip=None
+    ) -> "StaticPersistentTileScheduler":
+        """Create the scheduler with its starting tile index.
+        The start is the x component of the block index when `cluster_shape_m`
+        has size 1, otherwise the x component of the cluster index.
+        `clc` is accepted but not used by this scheduler.
+        """
         if const_expr(cute.size(params.cluster_shape_m) == 1):
             tile_idx = cute.arch.block_idx()[0]
         else:
             tile_idx = cute.arch.cluster_idx()[0]
         return StaticPersistentTileScheduler(params, tile_idx, loc=loc, ip=ip)
     @staticmethod
     def get_grid_shape(
         params: Params,
     ) -> Tuple[Int32, Int32, Int32]:
+        """Return the launch grid `(grid_x, 1, 1)`.
+        `grid_x` is the smaller of (a) the device SM count rounded down to a
+        multiple of `cluster_shape_m` and (b) `total_blocks_cluster * cluster_shape_m`.
+        """
         hardware_info = cutlass.utils.HardwareInfo()
         sm_count = hardware_info.get_device_multiprocessor_count()
+        # Round down so the CTA count is a whole number of clusters.
         max_ctas = (sm_count // params.cluster_shape_m) * params.cluster_shape_m
         grid_x = cutlass.min(max_ctas, params.total_blocks_cluster * params.cluster_shape_m)
         return (grid_x, Int32(1), Int32(1))
     def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
+        """Decode `_tile_idx` into a `WorkTileInfo` of (block, head, batch, split=0).
+        The tile is valid while `_tile_idx < total_blocks_cluster`.
+        """
+        # First split off the block index, then split the remainder into batch and head.
         hn_idx, block_idx = divmod(self._tile_idx, self.params.num_block_cluster_divmod)
         batch_idx, head_idx = divmod(hn_idx, self.params.num_head_divmod)
         is_valid = self._tile_idx < self.params.total_blocks_cluster
         return WorkTileInfo(
             (Int32(block_idx), Int32(head_idx), Int32(batch_idx), Int32(0)), is_valid
         )
             self._tile_idx += cute.arch.grid_dim()[0]
         else:
             self._tile_idx += cute.arch.cluster_dim()[0]
+        return self.get_current_work()
+    def producer_tail(self, *, loc=None, ip=None):
+        """No-op for this scheduler."""
+        pass
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
         total_blocks: Int32
         num_splits: Int32
         num_block: Int32
+        num_head: Int32
+        num_batch: Int32
         l2_minor: Int32
         num_head_divmod: FastDivmodDivisor
         l2_minor_divmod: FastDivmodDivisor
         l2_major_divmod: FastDivmodDivisor
         l2_minor_residual_divmod: FastDivmodDivisor
         num_hb_quotient: Int32
+        num_splits_divmod: FastDivmodDivisor
         is_split_kv: cutlass.Constexpr[bool] = False
+        cluster_shape_m: cutlass.Constexpr[int] = 1
+        scheduling_mode: cutlass.Constexpr[SchedulingMode] = SchedulingMode.STATIC
+        lpt: cutlass.Constexpr[bool] = True
         @staticmethod
         @cute.jit
         def create(
+            args: TileSchedulerArguments,
+            *,
+            scheduling_mode: SchedulingMode = SchedulingMode.STATIC,
+            loc=None,
+            ip=None,
         ) -> "SingleTileLPTScheduler.Params":
+            assert scheduling_mode in (SchedulingMode.STATIC, SchedulingMode.CLC), (
+                f"Only STATIC and CLC are supported, got {scheduling_mode!r}"
+            )
             size_one_kv_head = args.seqlen_k * (args.headdim + args.headdim_v) * args.element_size
             size_one_head = size_one_kv_head
             size_l2 = 50 * 1024 * 1024  # 50 MB for K & V
             # Swizzle is the size of each "section". Round swizzle to a power of 2
             # Need to be careful about the case where only one head will fit
             # swizzle is how many heads can fit in L2
+            # Seems faster if swizzle is a power of 2
             log2_floor = lambda n: 31 - clz(n)
             swizzle = 1 if size_l2 < size_one_head else (1 << log2_floor(size_l2 // size_one_head))
             # If we're in the last section (called residual), we don't want to divide by
             # swizzle. Instead we want to divide by the remainder.
             num_hb_quotient = (args.num_head * args.num_batch) // swizzle
             return SingleTileLPTScheduler.Params(
                 total_blocks=args.num_block * args.num_head * args.num_batch,
                 num_block=args.num_block,
+                num_head=args.num_head,
+                num_batch=args.num_batch,
                 l2_minor=Int32(swizzle),
                 num_head_divmod=FastDivmodDivisor(args.num_head),
                 l2_minor_divmod=FastDivmodDivisor(swizzle),
                 l2_major_divmod=FastDivmodDivisor(swizzle * args.num_block),
+                l2_minor_residual_divmod=FastDivmodDivisor(max(num_hb_remainder, 1)),
                 num_hb_quotient=Int32(num_hb_quotient),
                 num_splits=args.num_splits,
+                num_splits_divmod=FastDivmodDivisor(args.num_splits),
                 is_split_kv=args.is_split_kv,
+                cluster_shape_m=args.cluster_shape_mn[0],
+                scheduling_mode=scheduling_mode,
+                lpt=args.lpt,
             )
+    def __init__(
+        self,
+        params: Params,
+        tile_idx: Int32,
+        split_idx: Int32,
+        clc: ClcState | None = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        """Store the scheduler state.
+        `clc` may be None; the methods visible in this file only touch it when
+        `params.scheduling_mode` is `SchedulingMode.CLC`.
+        """
         self.params = params
         self._tile_idx = tile_idx
         self._split_idx = split_idx
+        self.clc = clc
         self._loc = loc
         self._ip = ip
     @staticmethod
+    def to_underlying_arguments(
+        args: TileSchedulerArguments,
+        *,
+        scheduling_mode: SchedulingMode = SchedulingMode.STATIC,
+        loc=None,
+        ip=None,
+    ) -> Params:
+        """Build `SingleTileLPTScheduler.Params`, forwarding `scheduling_mode`.
+        Validation of `scheduling_mode` is left to `Params.create`.
+        """
+        return SingleTileLPTScheduler.Params.create(
+            args, scheduling_mode=scheduling_mode, loc=loc, ip=ip
+        )
+    @staticmethod
+    def _clc_grid_shape(params: Params):
+        """Return the (x, y, z) grid used in CLC mode.
+        x is `num_block` rounded up to a multiple of `cluster_shape_m`, y is
+        `num_head`, and z is `num_batch` (times `num_splits` when split-KV is on).
+        """
+        num_batch_splits = (
+            params.num_batch * params.num_splits
+            if const_expr(params.is_split_kv)
+            else params.num_batch
+        )
+        return (
+            cute.round_up(params.num_block, params.cluster_shape_m),
+            params.num_head,
+            num_batch_splits,
+        )
     @staticmethod
     @cute.jit
+    def clc_problem_shape(params: Params):
+        """Build the hardware-scheduler params for CLC mode.
+        Tile counts come from `_clc_grid_shape(params)`; the cluster shape is
+        `(cluster_shape_m, 1, 1)`.
+        """
+        return ClcDynamicPersistentTileSchedulerParams(
+            problem_shape_ntile_mnl=SingleTileLPTScheduler._clc_grid_shape(params),
+            cluster_shape_mnk=(params.cluster_shape_m, 1, 1),
+        )
+    @staticmethod
+    @cute.jit
+    def create(
+        params: Params, clc: ClcState | None = None, *, loc=None, ip=None
+    ) -> "SingleTileLPTScheduler":
+        """Create the scheduler from the launch coordinates.
+        CLC mode: seed the tile index with the x block index, use split index 0,
+        and attach `clc`. Otherwise: take the tile and split indices from the x
+        and y block indices; `clc` is not forwarded.
+        """
+        if const_expr(params.scheduling_mode == SchedulingMode.CLC):
+            return SingleTileLPTScheduler(
+                params, cute.arch.block_idx()[0], Int32(0), clc, loc=loc, ip=ip
+            )
         tile_idx, split_idx, _ = cute.arch.block_idx()
         return SingleTileLPTScheduler(params, tile_idx, split_idx, loc=loc, ip=ip)
     @staticmethod
     def get_grid_shape(
         params: Params,
         loc=None,
         ip=None,
     ) -> Tuple[Int32, Int32, Int32]:
+        """Return the launch grid.
+        CLC mode uses `_clc_grid_shape(params)`; otherwise the grid is
+        `(total_blocks, num_splits, 1)`.
+        """
+        if const_expr(params.scheduling_mode == SchedulingMode.CLC):
+            return SingleTileLPTScheduler._clc_grid_shape(params)
         return (params.total_blocks, params.num_splits, Int32(1))
+    @cute.jit
+    def clc_work_to_coords(self, work) -> WorkTileInfo:
+        """Convert CLC response (block, head, batch_split) to WorkTileInfo.
+        CLC returns raw grid coordinates — no L2 swizzle (hardware decides order).
+        We only apply cluster division, optional LPT block reversal, and split_kv unpacking.
+        """
+        block_idx = work.tile_idx[0]
+        if const_expr(self.params.cluster_shape_m > 1):
+            block_idx = block_idx // self.params.cluster_shape_m
+        if const_expr(self.params.lpt):
+            # Longest-processing-time-first: reverse block order
+            block_idx = self.params.num_block - 1 - block_idx
+        split_idx = Int32(0)
+        if const_expr(self.params.is_split_kv):
+            # The z coordinate packs batch and split together (the grid's z extent is
+            # num_batch * num_splits); divmod by num_splits separates them.
+            batch_idx, split_idx = divmod(work.tile_idx[2], self.params.num_splits_divmod)
+        else:
+            batch_idx = work.tile_idx[2]
+        # The head index and the validity flag are passed through from the CLC response.
+        return WorkTileInfo(
+            (Int32(block_idx), Int32(work.tile_idx[1]), Int32(batch_idx), Int32(split_idx)),
+            work.is_valid_tile,
+        )
     @cute.jit
     def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            work = self.clc.get_current_work()
+            self._tile_idx = work.tile_idx[0]
+            return self.clc_work_to_coords(work)
+        # Static path: L2-swizzled coordinate mapping
         params = self.params
         # Implement LPT scheduling coordinate calculation
         bidhb, l2_mod = divmod(self._tile_idx, params.l2_major_divmod)
         bidhb_actual = bidhb * params.l2_minor + bidhb_residual
         batch_idx, head_idx = divmod(bidhb_actual, params.num_head_divmod)
         # Longest-processing-time-first
+        if const_expr(params.lpt):
+            block = params.num_block - 1 - block
         is_valid = self._tile_idx < params.total_blocks
         return WorkTileInfo(
             (Int32(block), Int32(head_idx), Int32(batch_idx), Int32(self._split_idx)), is_valid
         )
+    @cute.jit
     def initial_work_tile_info(self, *, loc=None, ip=None):
+        """Return the first work tile.
+        CLC mode: take the hardware scheduler's initial tile, record its x
+        coordinate in `_tile_idx`, and convert it with `clc_work_to_coords`.
+        Otherwise this is the same as `get_current_work`.
+        """
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            work = self.clc.initial_work_tile_info()
+            self._tile_idx = work.tile_idx[0]
+            return self.clc_work_to_coords(work)
         return self.get_current_work(loc=loc, ip=ip)
     def prefetch_next_work(self, *, loc=None, ip=None):
+        """Issue the CLC prefetch in CLC mode; do nothing otherwise."""
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            self.clc.prefetch_next_work(loc=loc, ip=ip)
     def advance_to_next_work(self, *, loc=None, ip=None):
+        """Move to the next tile and return it.
+        CLC mode: wait on the CLC pipeline, decode the response via
+        `get_current_work`, then release the pipeline slot.
+        Otherwise: mark the scheduler exhausted and return the resulting tile.
+        """
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            self.clc.consumer_wait(loc=loc, ip=ip)
+            # The response is read before the slot is released.
+            work = self.get_current_work()
+            self.clc.consumer_release(loc=loc, ip=ip)
+            return work
         # Single tile scheduler - set to invalid tile_idx to indicate no more work
         self._tile_idx = self.params.total_blocks
+        return self.get_current_work()
+    def producer_tail(self, *, loc=None, ip=None):
+        """Run the CLC pipeline's producer tail in CLC mode; do nothing otherwise."""
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            self.clc.producer_tail(loc=loc, ip=ip)
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
+        objs = [self.params, self._tile_idx, self._split_idx]
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            objs += [self.clc]
+        for obj in objs:
             obj_values = cutlass.extract_mlir_values(obj)
             values += obj_values
             self._values_pos.append(len(obj_values))
     def __new_from_mlir_values__(self, values):
+        """Rebuild a scheduler from a flat list of MLIR values.
+        Consumes `values` object by object, using the per-object counts stored
+        in `_values_pos`; `clc` takes part only in CLC mode. Only `loc` is
+        passed to the new instance (`ip` is left at its default).
+        """
         obj_list = []
+        objs = [self.params, self._tile_idx, self._split_idx]
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            objs += [self.clc]
+        for obj, n_items in zip(objs, self._values_pos):
             obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
             values = values[n_items:]
+        # obj_list order matches the positional parameters of __init__.
+        return self.__class__(*obj_list, loc=self._loc)
 class SingleTileLPTBwdScheduler:
         ) -> "SingleTileLPTBwdScheduler.Params":
             size_l2 = 50 * 1024 * 1024
             size_one_qdo_head = args.seqlen_k * (args.headdim + args.headdim_v) * args.element_size
+            size_one_dqaccum_head = args.seqlen_k * (args.headdim) * 4
+            # size_one_dqaccum_head = 0
             size_one_head = size_one_qdo_head + size_one_dqaccum_head
             log2_floor = lambda n: 31 - clz(n)
             swizzle = 1 if size_l2 < size_one_head else (1 << log2_floor(size_l2 // size_one_head))
         self._ip = ip
     @staticmethod
+    def to_underlying_arguments(
+        args: TileSchedulerArguments,
+        *,
+        scheduling_mode: SchedulingMode = SchedulingMode.STATIC,
+        loc=None,
+        ip=None,
+    ) -> Params:
+        """Build `SingleTileLPTBwdScheduler.Params` from `args`.
+        Only `SchedulingMode.STATIC` is accepted; `scheduling_mode` is checked
+        here and is not forwarded to `Params.create`.
+        """
+        assert scheduling_mode == SchedulingMode.STATIC, (
+            f"SingleTileLPTBwdScheduler only supports STATIC, got {scheduling_mode!r}"
+        )
         return SingleTileLPTBwdScheduler.Params.create(args, loc=loc, ip=ip)
     @staticmethod
     def advance_to_next_work(self, *, loc=None, ip=None):
+        """Mark the scheduler exhausted and return the resulting work tile."""
         # Single tile scheduler - set to invalid tile_idx to indicate no more work
         self._tile_idx = self.params.total_blocks
+        return self.get_current_work()
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
         is_split_kv: cutlass.Constexpr[bool] = False
         head_swizzle: cutlass.Constexpr[bool] = False
         cluster_shape_m: cutlass.Constexpr[int] = 1
+        scheduling_mode: cutlass.Constexpr[SchedulingMode] = SchedulingMode.STATIC
         @staticmethod
         @cute.jit
         def create(
+            args: TileSchedulerArguments,
+            *,
+            scheduling_mode: SchedulingMode = SchedulingMode.STATIC,
+            loc=None,
+            ip=None,
         ) -> "SingleTileVarlenScheduler.Params":
+            assert scheduling_mode in (SchedulingMode.STATIC, SchedulingMode.CLC), (
+                f"Only STATIC and CLC are supported, got {scheduling_mode!r}"
+            )
             size_l2 = 50 * 1024 * 1024  # 50 MB for K & V
+            # if backward, this is qdo block size
+            kv_block_size = (
                 (args.headdim + args.headdim_v) * args.element_size * args.tile_shape_mn[1]
             )
+            # if backward, add dqaccum block size to calculate swizzle
+            if args.head_swizzle:
+                kv_block_size += args.headdim * 4 * args.tile_shape_mn[1]
+            max_kvblock_in_l2 = size_l2 // kv_block_size
             assert args.mCuSeqlensQ is not None or args.mSeqUsedQ is not None, (
                 "At least one of mCuSeqlensQ or mSeqUsedQ must be provided"
             )
             assert args.cluster_shape_mn[1] == 1, "Only cluster_shape_mn[1] == 1 is supported"
+            # TODO: Support varlen CLC with cluster_shape_m > 1 by refactoring the
+            # flattened-tile decode so cluster unpacking semantics are explicit.
+            assert scheduling_mode != SchedulingMode.CLC or args.cluster_shape_mn[0] == 1, (
+                "Varlen CLC currently requires cluster_shape_mn[0] == 1"
+            )
             return SingleTileVarlenScheduler.Params(
                 num_head=args.num_head,
                 num_batch=args.num_batch,
                 is_split_kv=args.is_split_kv,
                 head_swizzle=args.head_swizzle,
                 cluster_shape_m=args.cluster_shape_mn[0],
+                scheduling_mode=scheduling_mode,
             )
+    def __init__(
+        self,
+        params: Params,
+        tile_idx: Int32,
+        split_idx: Int32,
+        clc: ClcState | None = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        """Store the scheduler state; `_is_first_block` always starts as True.
+        `clc` may be None; the methods visible in this file only touch it when
+        `params.scheduling_mode` is `SchedulingMode.CLC`.
+        """
         self.params = params
         self._tile_idx = tile_idx
         self._split_idx = split_idx
         self._is_first_block = True
+        self.clc = clc
         self._loc = loc
         self._ip = ip
     @staticmethod
+    def to_underlying_arguments(
+        args: TileSchedulerArguments,
+        *,
+        scheduling_mode: SchedulingMode = SchedulingMode.STATIC,
+        loc=None,
+        ip=None,
+    ) -> Params:
+        """Build `SingleTileVarlenScheduler.Params`, forwarding `scheduling_mode`.
+        Validation of `scheduling_mode` is left to `Params.create`.
+        """
+        return SingleTileVarlenScheduler.Params.create(
+            args, scheduling_mode=scheduling_mode, loc=loc, ip=ip
+        )
     @staticmethod
+    @cute.jit
+    def clc_problem_shape(params: Params):
+        """Build the hardware-scheduler params for CLC mode.
+        Tile counts are the regular launch grid from `get_grid_shape(params)`.
+        The cluster shape is fixed at (1, 1, 1); `Params.create` asserts
+        `cluster_shape_mn[0] == 1` whenever CLC is requested.
+        """
+        return ClcDynamicPersistentTileSchedulerParams(
+            problem_shape_ntile_mnl=SingleTileVarlenScheduler.get_grid_shape(params),
+            cluster_shape_mnk=(1, 1, 1),
+        )
+    @staticmethod
+    @cute.jit
+    def create(
+        params: Params, clc: ClcState | None = None, *, loc=None, ip=None
+    ) -> "SingleTileVarlenScheduler":
+        """Create the scheduler from the launch coordinates.
+        CLC mode: the tile index is the x block index, the split index is the y
+        block index when split-KV is on (else 0), and `clc` is attached.
+        Otherwise: tile and split indices come from the x and y block indices
+        and `clc` is not forwarded.
+        """
+        if const_expr(params.scheduling_mode == SchedulingMode.CLC):
+            block_idx = cute.arch.block_idx()
+            split_idx = Int32(0)
+            if const_expr(params.is_split_kv):
+                split_idx = block_idx[1]
+            return SingleTileVarlenScheduler(
+                params,
+                block_idx[0],
+                split_idx,
+                clc,
+                loc=loc,
+                ip=ip,
+            )
         tile_idx, split_idx, _ = cute.arch.block_idx()
         return SingleTileVarlenScheduler(params, tile_idx, split_idx, loc=loc, ip=ip)
             params.total_q
             + params.num_batch * (params.cluster_shape_m * params.tile_shape_mn[0] - 1)
         ) // params.tile_shape_mn[0]
+        # Round down to nearest multiple of cluster since odd excess is always padding.
         total_blocks_max = total_blocks_max // params.cluster_shape_m * params.cluster_shape_m
         return (total_blocks_max * params.num_head, params.num_splits, Int32(1))
         )
     @cute.jit
+    def _varlen_coord_map(self) -> WorkTileInfo:
+        """Map self._tile_idx to (block, head, batch) via warp-level prefix sums."""
         params = self.params
         lane_idx = cute.arch.lane_idx()
         num_m_blocks = self._get_num_m_blocks(lane_idx, bidb_start=0)
                 num_n_blocks = (
                     num_m_blocks
                     * params.tile_shape_mn[0]
+                    * params.cluster_shape_m
                     // params.qhead_per_kvhead_packgqa
                     // params.tile_shape_mn[1]
                 )
         split_idx = self._split_idx if const_expr(params.is_split_kv) else Int32(0)
         return WorkTileInfo((Int32(block), Int32(head_idx), Int32(batch_idx), split_idx), is_valid)
+    @cute.jit
+    def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
+        """Return the current work tile.
+        In CLC mode, `_tile_idx` / `_split_idx` are first refreshed from the CLC
+        response (or set to the out-of-range default below when the response is
+        invalid); the tile is then decoded by `_varlen_coord_map` in both modes.
+        """
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            clc_work = self.clc.get_current_work()
+            # Default to grid_dim (one past last valid flat index) so _varlen_coord_map
+            # returns is_valid=False when CLC is exhausted. CLC tile_idx is garbage when
+            # invalid, so we can't trust it. Local-then-assign avoids CuTe DSL structural
+            # mismatch on self inside the runtime if.
+            new_tile_idx = cute.arch.grid_dim()[0]
+            new_split_idx = Int32(0)
+            if clc_work.is_valid_tile:
+                new_tile_idx = clc_work.tile_idx[0]
+                if const_expr(self.params.is_split_kv):
+                    new_split_idx = clc_work.tile_idx[1]
+            self._tile_idx = new_tile_idx
+            self._split_idx = new_split_idx
+        return self._varlen_coord_map()
+    @cute.jit
     def initial_work_tile_info(self, *, loc=None, ip=None):
+        """Return the first work tile.
+        Same flow as `get_current_work`, except that in CLC mode the response is
+        read from `clc.initial_work_tile_info()`.
+        """
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            clc_work = self.clc.initial_work_tile_info()
+            # See get_current_work for why grid_dim and local-then-assign.
+            new_tile_idx = cute.arch.grid_dim()[0]
+            new_split_idx = Int32(0)
+            if clc_work.is_valid_tile:
+                new_tile_idx = clc_work.tile_idx[0]
+                if const_expr(self.params.is_split_kv):
+                    new_split_idx = clc_work.tile_idx[1]
+            self._tile_idx = new_tile_idx
+            self._split_idx = new_split_idx
+        return self._varlen_coord_map()
     def prefetch_next_work(self, *, loc=None, ip=None):
+        """Issue the CLC prefetch in CLC mode; do nothing otherwise."""
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            self.clc.prefetch_next_work(loc=loc, ip=ip)
     def advance_to_next_work(self, *, loc=None, ip=None):
+        """Move to the next tile and return it.
+        CLC mode: wait on the CLC pipeline, decode the response via
+        `get_current_work`, then release the pipeline slot.
+        Otherwise: clear `_is_first_block` and return `get_current_work()`.
+        """
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            self.clc.consumer_wait(loc=loc, ip=ip)
+            # The response is read before the slot is released.
+            work = self.get_current_work()
+            self.clc.consumer_release(loc=loc, ip=ip)
+            return work
         self._is_first_block = False
+        return self.get_current_work()
+    def producer_tail(self, *, loc=None, ip=None):
+        """Run the CLC pipeline's producer tail in CLC mode; do nothing otherwise."""
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            self.clc.producer_tail(loc=loc, ip=ip)
     def __extract_mlir_values__(self):
         values, self._values_pos = [], []
+        objs = [self.params, self._tile_idx, self._split_idx]
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            objs += [self.clc]
+        for obj in objs:
             obj_values = cutlass.extract_mlir_values(obj)
             values += obj_values
             self._values_pos.append(len(obj_values))
     def __new_from_mlir_values__(self, values):
+        """Rebuild a scheduler from a flat list of MLIR values.
+        Consumes `values` object by object, using the per-object counts stored
+        in `_values_pos`; `clc` takes part only in CLC mode. Only `loc` is
+        passed to the new instance, and `__init__` sets `_is_first_block` back
+        to True on it.
+        """
         obj_list = []
+        objs = [self.params, self._tile_idx, self._split_idx]
+        if const_expr(self.params.scheduling_mode == SchedulingMode.CLC):
+            objs += [self.clc]
+        for obj, n_items in zip(objs, self._values_pos):
             obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items]))
             values = values[n_items:]
+        # obj_list order matches the positional parameters of __init__.
+        return self.__class__(*obj_list, loc=self._loc)

build/torch-cuda/utils.py CHANGED Viewed

@@ -3,12 +3,14 @@
 import math
 import hashlib
 import inspect
 from typing import Type, Callable, Optional, Tuple, overload
 import cutlass
 import cutlass.cute as cute
-from cutlass import Float32, const_expr
 from cutlass.cutlass_dsl import T, dsl_user_op
 from cutlass._mlir.dialects import nvvm, llvm
 from cutlass.cute.runtime import from_dlpack
@@ -54,6 +56,17 @@ POLY_EX2 = {
     ),
 }
 def _compute_base_hash(func: Callable) -> str:
     """Compute hash from source code or bytecode and closure values."""
@@ -123,6 +136,40 @@ def create_softcap_scoremod(softcap_val):
     return scoremod_premask_fn
 def convert_from_dlpack(x, leading_dim, alignment=16, divisibility=1) -> cute.Tensor:
     return (
         from_dlpack(x, assumed_align=alignment)
@@ -215,6 +262,21 @@ def warp_reduce(
     return val
 @dsl_user_op
 def fmax(
     a: float | Float32, b: float | Float32, c: float | Float32 | None = None, *, loc=None, ip=None
@@ -429,8 +491,48 @@ def shuffle_sync(
     return val[0]
 @dsl_user_op
 def shr_u32(val: cutlass.Uint32, shift: cutlass.Uint32, *, loc=None, ip=None) -> cutlass.Uint32:
     return cutlass.Uint32(
         llvm.inline_asm(
             T.i32(),
@@ -438,7 +540,7 @@ def shr_u32(val: cutlass.Uint32, shift: cutlass.Uint32, *, loc=None, ip=None) ->
                 cutlass.Uint32(val).ir_value(loc=loc, ip=ip),
                 cutlass.Uint32(shift).ir_value(loc=loc, ip=ip),
             ],
-            "shr.s32 $0, $1, $2;",
             "=r,r,r",
             has_side_effects=False,
             is_align_stack=False,

 import math
 import hashlib
 import inspect
+import os
 from typing import Type, Callable, Optional, Tuple, overload
 import cutlass
 import cutlass.cute as cute
+from cutlass import Float32, Int32, const_expr
+from cutlass.cute import FastDivmodDivisor
 from cutlass.cutlass_dsl import T, dsl_user_op
 from cutlass._mlir.dialects import nvvm, llvm
 from cutlass.cute.runtime import from_dlpack
     ),
 }
+# Feature flags read from the environment once, when this module is imported; changing
+# the environment variables afterwards does not update these values.
+# True only when FA_CLC is exactly "1" (unset defaults to "0").
+_fa_clc_enabled: bool = os.environ.get("FA_CLC", "0") == "1"
+# True only when FA_DISABLE_2CTA is exactly "1" (unset defaults to "0").
+_fa_disable_2cta_enabled: bool = os.environ.get("FA_DISABLE_2CTA", "0") == "1"
+def _get_use_clc_scheduler_default() -> bool:
+    """Return the module-level ``_fa_clc_enabled`` flag (``FA_CLC == "1"`` at import time).
+    NOTE(review): presumably the default for selecting the CLC tile scheduler; confirm
+    against callers.
+    """
+    return _fa_clc_enabled
+def _get_disable_2cta_default() -> bool:
+    """Return the module-level ``_fa_disable_2cta_enabled`` flag.
+    The flag is True only when ``FA_DISABLE_2CTA`` was exactly "1" at import time.
+    NOTE(review): presumably the default for turning off the 2-CTA path; confirm
+    against callers.
+    """
+    return _fa_disable_2cta_enabled
 def _compute_base_hash(func: Callable) -> str:
     """Compute hash from source code or bytecode and closure values."""
     return scoremod_premask_fn
+# log2(e): change-of-base constant, since exp(x) == 2 ** (x * log2(e)).
+LOG2_E = math.log2(math.e)
+def compute_softmax_scale_log2(softmax_scale, score_mod):
+    """Compute softmax_scale_log2 and adjusted softmax_scale based on whether score_mod is used.
+    When score_mod is None, fold the log2(e) factor into softmax_scale_log2 and set softmax_scale
+    to None. When score_mod is present, keep softmax_scale separate so it can be applied before
+    the score_mod, and set softmax_scale_log2 to just the change-of-base constant.
+    Args:
+        softmax_scale: Scale factor. Multiplied by LOG2_E when score_mod is None; otherwise
+            returned unchanged as the second element.
+        score_mod: Score-modification callable, or None. Only whether it is None is
+            inspected here; it is never called.
+    Returns (softmax_scale_log2, softmax_scale).
+    """
+    # NOTE(review): const_expr presumably makes this a trace-time decision rather than a
+    # runtime branch; confirm against the CuTe DSL documentation.
+    if const_expr(score_mod is None):
+        return softmax_scale * LOG2_E, None
+    else:
+        return LOG2_E, softmax_scale
+def compute_fastdiv_mods(mQ, mK, qhead_per_kvhead, pack_gqa, aux_tensors, mPageTable=None):
+    """Compute FastDivmodDivisor pairs for aux_tensors index computation.
+    Args:
+        mQ: Query tensor; the size of its mode 0 gives the Q extent.
+        mK: Key tensor; its mode 0 gives the K extent, or the per-page extent when
+            mPageTable is supplied (presumably the page size; confirm against callers).
+        qhead_per_kvhead: Divisor applied to the Q extent when pack_gqa is set.
+        pack_gqa: When true, the Q extent is floor-divided by qhead_per_kvhead.
+        aux_tensors: Only compared against None here; None short-circuits to None.
+        mPageTable: Optional page table. When given, the K extent is
+            ``mK.shape[0] * mPageTable.shape[1]``.
+    Returns a (seqlen_q_divmod, seqlen_k_divmod) tuple, or None if aux_tensors is None.
+    """
+    if const_expr(aux_tensors is None):
+        return None
+    # NOTE(review): with pack_gqa, mode 0 of mQ presumably has the Q heads folded in, so
+    # dividing by qhead_per_kvhead recovers the sequence length; confirm with the packing code.
+    seqlen_q = cute.size(mQ.shape[0]) // (qhead_per_kvhead if const_expr(pack_gqa) else 1)
+    # Paged case: the K extent is mK's mode 0 times mode 1 of the page table.
+    seqlen_k = (
+        cute.size(mK.shape[0])
+        if const_expr(mPageTable is None)
+        else mK.shape[0] * mPageTable.shape[1]
+    )
+    return (FastDivmodDivisor(seqlen_q), FastDivmodDivisor(seqlen_k))
 def convert_from_dlpack(x, leading_dim, alignment=16, divisibility=1) -> cute.Tensor:
     return (
         from_dlpack(x, assumed_align=alignment)
     return val
+@dsl_user_op
+def smid(*, loc=None, ip=None) -> Int32:
+    """Read the PTX special register ``%smid`` and return it as an Int32.
+    Emitted as inline PTX (``mov.u32 $0, %smid;``) with no input operands.
+    NOTE(review): ``loc`` and ``ip`` are accepted but not forwarded to ``llvm.inline_asm``.
+    NOTE(review): the asm is marked ``has_side_effects=False``, so the compiler is
+    presumably free to merge or hoist repeated reads; confirm that is acceptable for
+    callers that need a fresh value.
+    """
+    return Int32(
+        llvm.inline_asm(
+            T.i32(),
+            [],
+            "mov.u32 $0, %smid;",
+            # One 32-bit register output, no inputs.
+            "=r",
+            has_side_effects=False,
+            is_align_stack=False,
+            asm_dialect=llvm.AsmDialect.AD_ATT,
+        )
+    )
 @dsl_user_op
 def fmax(
     a: float | Float32, b: float | Float32, c: float | Float32 | None = None, *, loc=None, ip=None
     return val[0]
+@dsl_user_op
+def shl_u32(val: cutlass.Uint32, shift: cutlass.Uint32, *, loc=None, ip=None) -> cutlass.Uint32:
+    """
+    Left-shift val by shift bits using PTX shl.b32 (sign-agnostic).
+    Named ``shl_u32`` (not ``shl_b32``) because python type annotations
+    distinguish signed/unsigned.
+    PTX semantics (§9.7.8.8): "Shift amounts greater than the register width N
+    are clamped to N."  So ``shl.b32 d, a, 32`` is well-defined and yields 0.
+    This differs from C/C++ and LLVM IR, where shifting by >= the type width is
+    undefined behavior.  CuTeDSL compiles through MLIR -> LLVM IR, so a plain
+    Python-level ``Uint32(x) << Uint32(n)`` inherits LLVM's UB: the optimizer
+    may treat the result as poison and eliminate dependent code.  Inline PTX
+    bypasses the LLVM IR shift entirely — the instruction is emitted verbatim
+    into PTX where clamping makes it safe for all shift amounts.
+    Args:
+        val: Value to shift; converted with ``cutlass.Uint32(...)`` before use.
+        shift: Shift amount in bits; converted with ``cutlass.Uint32(...)`` before use.
+        loc, ip: Passed to the operands' ``ir_value`` calls.
+    Returns the shifted value wrapped as ``cutlass.Uint32``.
+    """
+    return cutlass.Uint32(
+        llvm.inline_asm(
+            T.i32(),
+            [
+                cutlass.Uint32(val).ir_value(loc=loc, ip=ip),
+                cutlass.Uint32(shift).ir_value(loc=loc, ip=ip),
+            ],
+            # $0 = result, $1 = val, $2 = shift.
+            "shl.b32 $0, $1, $2;",
+            # One 32-bit register output, two 32-bit register inputs.
+            "=r,r,r",
+            has_side_effects=False,
+            is_align_stack=False,
+            asm_dialect=llvm.AsmDialect.AD_ATT,
+        )
+    )
 @dsl_user_op
 def shr_u32(val: cutlass.Uint32, shift: cutlass.Uint32, *, loc=None, ip=None) -> cutlass.Uint32:
+    """
+    Unsigned right-shift val by shift bits using PTX shr.u32 (zero-fills).
+    See ``shl_u32`` docstring for why inline PTX is used instead of plain
+    CuTeDSL shift operators (LLVM shift-by-type-width UB).
+    """
     return cutlass.Uint32(
         llvm.inline_asm(
             T.i32(),
                 cutlass.Uint32(val).ir_value(loc=loc, ip=ip),
                 cutlass.Uint32(shift).ir_value(loc=loc, ip=ip),
             ],
+            "shr.u32 $0, $1, $2;",
             "=r,r,r",
             has_side_effects=False,
             is_align_stack=False,