Kernels:

kernels-community
/

sonic-moe

Trusted publisher

Kernel card Files Files and versions

xet

Community

kernels-bot committed on Apr 22

Commit

2fe9f62

verified ·

1 Parent(s): 8b442f4

Uploaded using `kernel-builder`.

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

build/torch-cuda/__init__.py +2 -10
build/torch-cuda/_ops.py +33 -3
build/torch-cuda/functional/__init__.py +171 -218
build/torch-cuda/functional/backward.py +249 -308
build/torch-cuda/functional/forward.py +72 -120
build/torch-cuda/functional/grouped_gemm.py +0 -0
build/torch-cuda/functional/moe_config.py +0 -581
build/torch-cuda/functional/reduction_over_k_gather.py +0 -3
build/torch-cuda/functional/{topk_softmax.py → topk.py} +158 -13
build/torch-cuda/functional/utils.py +0 -25
build/torch-cuda/metadata.json +2 -0
build/torch-cuda/quack/__init__.py +2 -2
build/torch-cuda/quack/_compile_worker.py +102 -0
build/torch-cuda/quack/activation.py +108 -65
build/torch-cuda/quack/autotuner.py +184 -3
build/torch-cuda/quack/blockscaled_gemm_utils.py +752 -0
build/torch-cuda/quack/broadcast_utils.py +1 -1
build/torch-cuda/quack/cache_utils.py +195 -0
build/torch-cuda/quack/copy_utils.py +635 -66
build/torch-cuda/quack/cross_entropy.py +716 -0
build/torch-cuda/quack/cute_dsl_ptxas.py +105 -19
build/torch-cuda/quack/cute_dsl_utils.py +124 -52
build/torch-cuda/quack/epi_composable.py +187 -0
build/torch-cuda/quack/epi_ops.py +648 -0
build/torch-cuda/quack/epi_utils.py +64 -0
build/torch-cuda/quack/fast_math.py +29 -76
build/torch-cuda/quack/gemm.py +225 -137
build/torch-cuda/quack/gemm_act.py +396 -387
build/torch-cuda/quack/gemm_blockscaled_interface.py +326 -0
build/torch-cuda/quack/gemm_config.py +131 -72
build/torch-cuda/quack/gemm_dact.py +417 -124
build/torch-cuda/quack/gemm_default_epi.py +57 -204
build/torch-cuda/quack/gemm_interface.py +1318 -200
build/torch-cuda/quack/gemm_norm_act.py +400 -0
build/torch-cuda/quack/gemm_sm100.py +0 -0
build/torch-cuda/quack/gemm_sm120.py +626 -0
build/torch-cuda/quack/gemm_sm90.py +316 -355
build/torch-cuda/quack/gemm_sq_reduce.py +259 -0
build/torch-cuda/quack/gemm_symmetric.py +236 -172
build/torch-cuda/quack/gemm_tvm_ffi_utils.py +229 -0
build/torch-cuda/quack/gemm_wrapper_utils.py +0 -317
build/torch-cuda/quack/layout_utils.py +117 -28
build/torch-cuda/quack/linear.py +368 -0
build/torch-cuda/quack/linear_cross_entropy.py +275 -0
build/torch-cuda/quack/mlp.py +331 -0
build/torch-cuda/quack/mx_utils.py +269 -0
build/torch-cuda/quack/nvmmh_heuristic.py +172 -0
build/torch-cuda/quack/pipeline.py +395 -100
build/torch-cuda/quack/reduce.py +2 -2
build/torch-cuda/quack/rms_final_reduce.py +181 -0

build/torch-cuda/__init__.py CHANGED Viewed

@@ -2,23 +2,15 @@
 # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
 # ********************************************************************************
-from functools import lru_cache
-__version__ = "0.1.1"
 from .enums import KernelBackendMoE
 from .moe import MoE
-from .functional import (
-    enable_quack_gemm,
-    moe_general_routing_inputs,
-    moe_TC_softmax_topk_layer,
-)
 __all__ = [
     "KernelBackendMoE",
     "MoE",
-    "enable_quack_gemm",
     "moe_general_routing_inputs",
     "moe_TC_softmax_topk_layer",
 ]

 # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
 # ********************************************************************************
+__version__ = "0.1.2.post1"
 from .enums import KernelBackendMoE
+from .functional import moe_general_routing_inputs, moe_TC_softmax_topk_layer
 from .moe import MoE
 __all__ = [
     "KernelBackendMoE",
     "MoE",
     "moe_general_routing_inputs",
     "moe_TC_softmax_topk_layer",
 ]

build/torch-cuda/_ops.py CHANGED Viewed

@@ -1,8 +1,38 @@
 import torch
-ops = torch.ops._sonic_moe_2b49d3f
-def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_sonic_moe_2b49d3f::{op_name}"

 import torch
+def get_backend() -> str:
+    """Detect the backend by inspecting torch."""
+    import torch
+    if hasattr(torch, "neuron"):
+        # Needs to be sorted before specific Torch builds, since Neuron
+        # extension can be loaded into e.g. CUDA Torch builds.
+        return "neuron"
+    elif torch.version.cuda is not None:
+        return "cuda"
+    elif torch.version.hip is not None:
+        return "rocm"
+    elif torch.backends.mps.is_available():
+        return "metal"
+    elif hasattr(torch.version, "xpu") and torch.version.xpu is not None:
+        return "xpu"
+    else:
+        return "cpu"
+def _find_ops_name() -> str:
+    kernel_name = "sonic_moe"
+    unique_id = "a8c39a2"
+    backend = get_backend()
+    return f"_{kernel_name}_{backend}_{unique_id}"
+_OPS_NAME = _find_ops_name()
+ops = getattr(torch.ops, _OPS_NAME)
+def add_op_namespace_prefix(op_name: str) -> str:
     """
     Prefix op by namespace.
     """
+    return f"{_OPS_NAME}::{op_name}"

build/torch-cuda/functional/__init__.py CHANGED Viewed

@@ -6,50 +6,72 @@ import os
 import torch
 import torch.nn.functional as F
-from ..quack.gemm_interface import gemm
 from ..enums import ActivationType, is_glu
-from ..quack_utils import gemm_dgated, gemm_gated
 from .backward import (
     _down_projection_backward_act,
     _down_projection_backward_weight,
-    _softmax_topk_bwd,
     _token_broadcast_backward,
     _up_projection_backward_act,
     _up_projection_backward_weight,
 )
-from .forward import _down_projection_forward, _router_forward, _softmax_topk_fwd, _up_projection_forward
 from .triton_kernels import TC_topk_router_metadata_triton, general_routing_router_metadata_triton
-from .utils import enable_quack_gemm, is_using_quack_gemm
 class TC_Softmax_Topk_Router_Function(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, router_logits: torch.Tensor, E: int, K: int) -> tuple[torch.Tensor, torch.Tensor]:
         T = router_logits.size(0)
-        # change this to router_logits.dtype (bfloat16) increase another 5 tflops at fwd at the cost of numerical accuracy
         topk_router_score = torch.empty(T, K, dtype=torch.float32, device=router_logits.device)
         topk_router_indices = torch.empty(T, K, dtype=torch.int32, device=router_logits.device)
-        _softmax_topk_fwd(router_logits, topk_router_score, topk_router_indices, E, K)
-        ctx.save_for_backward(topk_router_score, topk_router_indices)
         ctx.E = E
         ctx.dtype = router_logits.dtype
         return topk_router_score, topk_router_indices
     @staticmethod
-    def backward(ctx, dtopk_score: torch.Tensor, _: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         T, K = dtopk_score.size()
-        topk_router_score, topk_router_indices = ctx.saved_tensors
         dlogits = torch.zeros(T, ctx.E, dtype=ctx.dtype, device=topk_router_score.device)
-        _softmax_topk_bwd(dlogits, None, dtopk_score, topk_router_score, topk_router_indices, K)
-        return dlogits, None, None
 class _UpProjection(torch.autograd.Function):
@@ -62,14 +84,14 @@ class _UpProjection(torch.autograd.Function):
         expert_frequency_offset: torch.Tensor,
         total_expert_freq: int,
         K: int,
-        stream_id: int,
         x_gather_idx: torch.Tensor,
         s_scatter_idx: torch.Tensor,
         s_reverse_scatter_idx: torch.Tensor,
         num_activated_expert_per_token_offset: torch.Tensor,
-        is_varlen_K: bool,
         activation_type: ActivationType,
         is_inference_mode_enabled: bool,
     ) -> torch.Tensor:
         T, H = x.shape
         I, H, E = w1.shape
@@ -78,34 +100,25 @@ class _UpProjection(torch.autograd.Function):
             I //= 2
         TK = total_expert_freq
-        if is_using_quack_gemm():
-            assert not torch.compiler.is_compiling()
-            assert is_glu_activation, "QuACK GEMM does not support non GLU activation yet"
-            z, y1 = gemm_gated(
-                x,
-                w1.permute(2, 1, 0),
-                activation="swiglu",
-                cu_seqlens_m=expert_frequency_offset,
-                A_idx=x_gather_idx,
-                dynamic_scheduler=False,
-            )
-        else:
-            z = torch.empty(TK, (2 * I if is_glu_activation else I), dtype=x.dtype, device=x.device)
-            y1 = torch.empty(TK, I, dtype=x.dtype, device=x.device)
-            _up_projection_forward(
-                x=x,
-                w1=w1,
-                z=z,
-                y1=y1,
-                b1=b1,
-                expert_frequency_offset=expert_frequency_offset,
-                expert_schedule_order=None,
-                x_gather_idx=x_gather_idx,
-                stream_id=stream_id,
-                activation_type=activation_type.value,
-                is_glu_activation=is_glu_activation,
-                is_inference_mode_enabled=is_inference_mode_enabled,
-            )
         ctx.T = T
         ctx.TK = TK
@@ -113,9 +126,9 @@ class _UpProjection(torch.autograd.Function):
         ctx.K = K
         ctx.H = H
         ctx.I = I
-        ctx.is_varlen_K = is_varlen_K
         ctx.is_glu_activation = is_glu_activation
-        ctx.stream_id = stream_id
         ctx.save_for_backward(
             x,
@@ -128,26 +141,21 @@ class _UpProjection(torch.autograd.Function):
             num_activated_expert_per_token_offset,
         )
-        ctx.mark_non_differentiable(y1)
         ctx.set_materialize_grads(False)
-        return y1, z
     @staticmethod
-    def backward(ctx, _: None, dz: torch.Tensor):
-        is_compiling = torch.compiler.is_compiling()
-        if not is_compiling:
-            assert _ is None
         T = ctx.T
         TK = ctx.TK
         E = ctx.E
         K = ctx.K
         H = ctx.H
         is_glu_activation = ctx.is_glu_activation
-        is_varlen_K = ctx.is_varlen_K
-        stream_id = ctx.stream_id
         (
             x,
@@ -160,77 +168,57 @@ class _UpProjection(torch.autograd.Function):
             num_activated_expert_per_token_offset,
         ) = ctx.saved_tensors
         dw1 = torch.empty_like(w1)
         db1 = None if b1 is None else torch.empty_like(b1)
-        if is_using_quack_gemm():
-            assert not is_compiling
-            gemm(
-                x.T,
-                dz,
-                out=dw1.permute(2, 1, 0),
-                cu_seqlens_k=expert_frequency_offset,
-                A_idx=x_gather_idx,
-                batch_idx_permute=None,
-                dynamic_scheduler=False,
-            )
-            dx_expanded = gemm(dz, w1.permute(2, 0, 1), cu_seqlens_m=expert_frequency_offset, dynamic_scheduler=False)
-        else:
-            dx_expanded = torch.empty(TK, H, dtype=dz.dtype, device=dz.device)
-            _up_projection_backward_act(
-                w1=w1,
-                dx_expanded=dx_expanded,
-                dz=dz,
-                db1=db1,
-                expert_frequency_offset=expert_frequency_offset,
-                expert_schedule_order=None,
-                x_gather_idx=x_gather_idx,
-                s_scatter_idx=s_scatter_idx,
-                is_glu_activation=is_glu_activation,
-                stream_id=stream_id,
-            )
-            _up_projection_backward_weight(
-                x=x,
-                dw1=dw1,
-                dz=dz,
-                expert_frequency_offset=expert_frequency_offset,
-                expert_schedule_order=None,
-                x_gather_idx=x_gather_idx,
-                is_glu_activation=is_glu_activation,
-                stream_id=stream_id,
-            )
-        dx_reduced = torch.empty(T, H, dtype=dz.dtype, device=dz.device)
         _token_broadcast_backward(
             dx_reduced=dx_reduced,
             dx_expanded=dx_expanded,
             s_reverse_scatter_idx=s_reverse_scatter_idx,
             num_activated_expert_per_token_offset=num_activated_expert_per_token_offset,
-            varlen_K_max=(E if is_varlen_K else K),
             H=H,
-            is_varlen_K=is_varlen_K,
         )
-        return dx_reduced, dw1, db1, *[None] * 12
 class _DownProjection(torch.autograd.Function):
     @staticmethod
     def forward(
         ctx,
-        y1: torch.Tensor,
-        z: torch.Tensor,
         w2: torch.Tensor,
         b2: torch.Tensor | None,
         topk_scores: torch.Tensor,
         expert_frequency_offset: torch.Tensor,
         T: int,
         K: int,
-        stream_id: int,
         x_gather_idx: torch.Tensor,
         s_scatter_idx: torch.Tensor,
         s_reverse_scatter_idx: torch.Tensor,
@@ -238,32 +226,24 @@ class _DownProjection(torch.autograd.Function):
         is_varlen_K: bool,
         activation_type: ActivationType,
     ) -> torch.Tensor:
-        TK = y1.size(0)
         H, I, E = w2.shape
-        if is_using_quack_gemm():
-            assert not torch.compiler.is_compiling()
-            assert b2 is None
-            y2 = gemm(y1, w2.permute(2, 1, 0), cu_seqlens_m=expert_frequency_offset)
-        else:
-            y2 = torch.empty(TK, H, dtype=y1.dtype, device=y1.device)
-            _down_projection_forward(
-                w2=w2,
-                y1=y1,
-                y2=y2,
-                b2=b2,
-                expert_frequency_offset=expert_frequency_offset,
-                expert_schedule_order=None,
-                x_gather_idx=x_gather_idx,
-                stream_id=stream_id,
-            )
-        o = torch.empty(T, H, device=z.device, dtype=z.dtype)
-        topk_scores = topk_scores.flatten()
         _router_forward(
-            y2=y2,
             o=o,
             topk_scores=topk_scores,
             s_reverse_scatter_idx=s_reverse_scatter_idx,
@@ -277,17 +257,15 @@ class _DownProjection(torch.autograd.Function):
         ctx.K = K
         ctx.is_varlen_K = is_varlen_K
         ctx.activation_type = activation_type
-        ctx.stream_id = stream_id
         ctx.save_for_backward(
-            z,
             w2,
             b2,
             topk_scores,
             expert_frequency_offset,
             x_gather_idx,
             s_scatter_idx,
-            s_reverse_scatter_idx,
         )
         return o
@@ -296,96 +274,58 @@ class _DownProjection(torch.autograd.Function):
     def backward(ctx, dout: torch.Tensor):
         T = ctx.T
         K = ctx.K
-        stream_id = ctx.stream_id
         is_varlen_K = ctx.is_varlen_K
         activation_type = ctx.activation_type
         (
-            z,
             w2,
             b2,
             topk_scores,
             expert_frequency_offset,
             x_gather_idx,
             s_scatter_idx,
-            s_reverse_scatter_idx,
         ) = ctx.saved_tensors
         dw2 = torch.empty_like(w2)
         db2 = None if b2 is None else torch.empty_like(b2)
-        dz = torch.empty_like(z)
-        if is_using_quack_gemm():
-            assert not torch.compiler.is_compiling()
-            assert is_glu(activation_type), "QuACK GEMM does not support non GLU activation yet"
-            s = topk_scores[s_scatter_idx]
-            _, y1s, ds = gemm_dgated(
-                dout,
-                w2.permute(2, 0, 1),
-                PreAct=z,
-                activation="swiglu",
-                dx_out=dz,
-                colvec_scale=s,
-                colvec_reduce=True,
-                cu_seqlens_m=expert_frequency_offset,
-                A_idx=x_gather_idx,
-                dynamic_scheduler=False,
-            )
-            gemm(
-                dout.T,
-                y1s,
-                out=dw2.permute(2, 0, 1),
-                cu_seqlens_k=expert_frequency_offset,
-                A_idx=x_gather_idx,
-                batch_idx_permute=None,
-                dynamic_scheduler=False,
-            )
-            ds = ds[s_reverse_scatter_idx]
-        else:
-            ds = torch.empty_like(topk_scores)
-            I = w2.size(1)
-            TK = x_gather_idx.size(0)
-            y1s = torch.empty(TK, I, dtype=z.dtype, device=z.device)
-            is_glu_activation = is_glu(activation_type)
-            _down_projection_backward_act(
-                dout=dout,
-                z=z,
-                w2=w2,
-                dz=dz,
-                ds=ds,
-                b2=b2,
-                db2=db2,
-                y1s=y1s,
-                topk_scores=topk_scores,
-                expert_frequency_offset=expert_frequency_offset,
-                expert_schedule_order=None,
-                x_gather_idx=x_gather_idx,
-                s_scatter_idx=s_scatter_idx,
-                is_glu_activation=is_glu_activation,
-                activation_type=activation_type.value,
-                stream_id=stream_id,
-            )
-            _down_projection_backward_weight(
-                dout=dout,
-                y1s=y1s,
-                dw2=dw2,
-                expert_frequency_offset=expert_frequency_offset,
-                expert_schedule_order=None,
-                x_gather_idx=x_gather_idx,
-                stream_id=stream_id,
-            )
         # TC top-K routing
         if not is_varlen_K:
             ds = ds.view(T, K)
-        return None, dz, dw2, db2, ds, *[None] * 10
 def moe_TC_softmax_topk_layer(
@@ -399,13 +339,18 @@ def moe_TC_softmax_topk_layer(
     stream_id: int,
     activation_type: ActivationType | str = ActivationType.SWIGLU,
     is_inference_mode_enabled: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     assert ((b1 is None) and (b2 is None)) or (
         (b1 is not None) and (b2 is not None)
     ), "b1 and b2 has to be None or not None at the same time!"
     E = router_w.size(0)
     router_logits = F.linear(x, router_w)
-    topk_scores, topk_indices = TC_Softmax_Topk_Router_Function.apply(router_logits, E, K)
     T, K = topk_indices.size()
     TK = T * K
@@ -421,43 +366,43 @@ def moe_TC_softmax_topk_layer(
         topk_indices, E, expert_frequency, expert_frequency_offset, x_gather_idx, s_scatter_idx, s_reverse_scatter_idx
     )
-    T = x.size(0)
     if type(activation_type) == str:
         activation_type = ActivationType(activation_type)
-    y1, z = _UpProjection.apply(
         x,
         w1,
         b1,
         expert_frequency_offset,
-        T * K,
         K,
-        stream_id,
         x_gather_idx,
         s_scatter_idx,
         s_reverse_scatter_idx,
         None,
-        False,  # is_varlen_K
         activation_type,
         is_inference_mode_enabled,
     )
     o = _DownProjection.apply(
-        y1,
-        z,
         w2,
         b2,
         topk_scores,
         expert_frequency_offset,
         T,
         K,
-        stream_id,
         x_gather_idx,
         s_scatter_idx,
         s_reverse_scatter_idx,
         None,
-        False,  # is_varlen_K
         activation_type,
     )
@@ -466,7 +411,9 @@ def moe_TC_softmax_topk_layer(
 # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 # Weight format requirements:
-# - w1_weight: Shape (2*I, H, E), stride order (2, 0, 1), must be interleaved [gate_row0, up_row0, gate_row1, up_row1, ...]
 # - w2_weight: Shape (H, I, E), stride order (2, 0, 1)
@@ -486,6 +433,7 @@ def moe_general_routing_inputs(
     stream_id: int,
     activation_type: ActivationType,
     is_inference_mode_enabled: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert ((b1 is None) and (b2 is None)) or (
         (b1 is not None) and (b2 is not None)
@@ -496,6 +444,9 @@ def moe_general_routing_inputs(
     E = w2.size(-1)
     device = router_scores.device
     s_scatter_idx = torch.empty(TK, dtype=torch.int32, device=device)
     s_reverse_scatter_idx = torch.empty(TK, dtype=torch.int32, device=device)
     expert_frequency = torch.empty(E, dtype=torch.int32, device=device)
@@ -516,38 +467,40 @@ def moe_general_routing_inputs(
         num_activated_expert_per_token_offset,
     )
-    y1, z = _UpProjection.apply(
         x,
         w1,
         b1,
         expert_frequency_offset,
         TK,
         None,  # K, not needed
-        stream_id,
         x_gather_idx,
         s_scatter_idx,
         s_reverse_scatter_idx,
         num_activated_expert_per_token_offset,
-        True,  # is_varlen_K
         activation_type,
         is_inference_mode_enabled,
     )
     o = _DownProjection.apply(
-        y1,
-        z,
         w2,
         b2,
         router_scores,
         expert_frequency_offset,
         T,
         None,  # K, not needed
-        stream_id,
         x_gather_idx,
         s_scatter_idx,
         s_reverse_scatter_idx,
         num_activated_expert_per_token_offset,
-        True,  # is_varlen_K
         activation_type,
     )

 import torch
 import torch.nn.functional as F
+from ..quack.gemm_interface import gemm, gemm_dgated, gemm_gated
 from ..enums import ActivationType, is_glu
 from .backward import (
     _down_projection_backward_act,
     _down_projection_backward_weight,
     _token_broadcast_backward,
+    _topk_softmax_bwd,
     _up_projection_backward_act,
     _up_projection_backward_weight,
 )
+from .forward import _down_projection_forward, _router_forward, _topk_softmax_fwd, _up_projection_forward
 from .triton_kernels import TC_topk_router_metadata_triton, general_routing_router_metadata_triton
 class TC_Softmax_Topk_Router_Function(torch.autograd.Function):
     @staticmethod
+    def forward(
+        ctx, router_logits: torch.Tensor, E: int, K: int, is_softmax_over_topk: bool, norm_topk_probs: bool
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         T = router_logits.size(0)
         topk_router_score = torch.empty(T, K, dtype=torch.float32, device=router_logits.device)
         topk_router_indices = torch.empty(T, K, dtype=torch.int32, device=router_logits.device)
+        _topk_softmax_fwd(
+            router_logits,
+            topk_router_score,
+            topk_router_indices,
+            E,
+            K,
+            is_softmax_over_topk=is_softmax_over_topk,
+            norm_topk_probs=norm_topk_probs,
+        )
+        # Save router_logits for topk(softmax()) backward (recompute full softmax).
+        # For softmax(topk()) it's unused but save unconditionally for simplicity.
+        ctx.save_for_backward(topk_router_score, topk_router_indices, router_logits)
         ctx.E = E
         ctx.dtype = router_logits.dtype
+        ctx.is_softmax_over_topk = is_softmax_over_topk
+        ctx.norm_topk_probs = norm_topk_probs
         return topk_router_score, topk_router_indices
     @staticmethod
+    def backward(ctx, dtopk_score: torch.Tensor, _: torch.Tensor):
         T, K = dtopk_score.size()
+        E = ctx.E
+        topk_router_score, topk_router_indices, router_logits = ctx.saved_tensors
         dlogits = torch.zeros(T, ctx.E, dtype=ctx.dtype, device=topk_router_score.device)
+        _topk_softmax_bwd(
+            router_logits,
+            dlogits,
+            None,
+            dtopk_score,
+            topk_router_score,
+            topk_router_indices,
+            E,
+            K,
+            is_softmax_over_topk=ctx.is_softmax_over_topk,
+            norm_topk_probs=ctx.norm_topk_probs,
+        )
+        return dlogits, None, None, None, None
 class _UpProjection(torch.autograd.Function):
         expert_frequency_offset: torch.Tensor,
         total_expert_freq: int,
         K: int,
         x_gather_idx: torch.Tensor,
         s_scatter_idx: torch.Tensor,
         s_reverse_scatter_idx: torch.Tensor,
         num_activated_expert_per_token_offset: torch.Tensor,
+        is_each_token_has_variable_activated_experts: bool,
         activation_type: ActivationType,
         is_inference_mode_enabled: bool,
+        concat_layout: bool = False,
     ) -> torch.Tensor:
         T, H = x.shape
         I, H, E = w1.shape
             I //= 2
         TK = total_expert_freq
+        a = torch.empty(TK, I, dtype=x.dtype, device=x.device)
+        h = (
+            torch.empty(TK, (2 * I if is_glu_activation else I), dtype=x.dtype, device=x.device)
+            if (not is_inference_mode_enabled)
+            else None
+        )
+        _up_projection_forward(
+            x=x,
+            w1=w1,
+            h=h,
+            a=a,
+            b1=b1,
+            expert_frequency_offset=expert_frequency_offset,
+            x_gather_idx=x_gather_idx,
+            activation_type=activation_type.value,
+            is_inference_mode_enabled=is_inference_mode_enabled,
+            concat_layout=concat_layout,
+        )
         ctx.T = T
         ctx.TK = TK
         ctx.K = K
         ctx.H = H
         ctx.I = I
+        ctx.is_each_token_has_variable_activated_experts = is_each_token_has_variable_activated_experts
         ctx.is_glu_activation = is_glu_activation
+        ctx.concat_layout = concat_layout
         ctx.save_for_backward(
             x,
             num_activated_expert_per_token_offset,
         )
+        ctx.mark_non_differentiable(a)
         ctx.set_materialize_grads(False)
+        return a, h
     @staticmethod
+    def backward(ctx, _: None, dh: torch.Tensor):
         T = ctx.T
         TK = ctx.TK
         E = ctx.E
         K = ctx.K
         H = ctx.H
         is_glu_activation = ctx.is_glu_activation
+        is_each_token_has_variable_activated_experts = ctx.is_each_token_has_variable_activated_experts
+        concat_layout = ctx.concat_layout
         (
             x,
             num_activated_expert_per_token_offset,
         ) = ctx.saved_tensors
+        dx_expanded = torch.empty(TK, H, dtype=dh.dtype, device=dh.device)
         dw1 = torch.empty_like(w1)
         db1 = None if b1 is None else torch.empty_like(b1)
+        _up_projection_backward_act(
+            w1=w1,
+            dx_expanded=dx_expanded,
+            dh=dh,
+            db1=db1,
+            expert_frequency_offset=expert_frequency_offset,
+            is_glu_activation=is_glu_activation,
+            concat_layout=concat_layout,
+        )
+        _up_projection_backward_weight(
+            x=x,
+            dw1=dw1,
+            dh=dh,
+            expert_frequency_offset=expert_frequency_offset,
+            x_gather_idx=x_gather_idx,
+            is_glu_activation=is_glu_activation,
+            concat_layout=concat_layout,
+        )
+        dx_reduced = torch.empty(T, H, dtype=dh.dtype, device=dh.device)
         _token_broadcast_backward(
             dx_reduced=dx_reduced,
             dx_expanded=dx_expanded,
             s_reverse_scatter_idx=s_reverse_scatter_idx,
             num_activated_expert_per_token_offset=num_activated_expert_per_token_offset,
+            varlen_K_max=(E if is_each_token_has_variable_activated_experts else K),
             H=H,
+            is_varlen_K=is_each_token_has_variable_activated_experts,
         )
+        return dx_reduced, dw1, db1, *[None] * 13
 class _DownProjection(torch.autograd.Function):
     @staticmethod
     def forward(
         ctx,
+        a: torch.Tensor,
+        h: torch.Tensor,
         w2: torch.Tensor,
         b2: torch.Tensor | None,
         topk_scores: torch.Tensor,
         expert_frequency_offset: torch.Tensor,
         T: int,
         K: int,
         x_gather_idx: torch.Tensor,
         s_scatter_idx: torch.Tensor,
         s_reverse_scatter_idx: torch.Tensor,
         is_varlen_K: bool,
         activation_type: ActivationType,
     ) -> torch.Tensor:
+        TK = a.size(0)
         H, I, E = w2.shape
+        y = torch.empty(TK, H, dtype=a.dtype, device=a.device)
+        _down_projection_forward(
+            w2=w2,
+            a=a,
+            y=y,
+            b2=b2,
+            expert_frequency_offset=expert_frequency_offset,
+        )
+        o = torch.empty(T, H, device=a.device, dtype=a.dtype)
+        topk_scores = topk_scores.view(-1)
         _router_forward(
+            y=y,
             o=o,
             topk_scores=topk_scores,
             s_reverse_scatter_idx=s_reverse_scatter_idx,
         ctx.K = K
         ctx.is_varlen_K = is_varlen_K
         ctx.activation_type = activation_type
         ctx.save_for_backward(
+            h,
             w2,
             b2,
             topk_scores,
             expert_frequency_offset,
             x_gather_idx,
             s_scatter_idx,
         )
         return o
     def backward(ctx, dout: torch.Tensor):
         T = ctx.T
         K = ctx.K
         is_varlen_K = ctx.is_varlen_K
         activation_type = ctx.activation_type
         (
+            h,
             w2,
             b2,
             topk_scores,
             expert_frequency_offset,
             x_gather_idx,
             s_scatter_idx,
         ) = ctx.saved_tensors
         dw2 = torch.empty_like(w2)
         db2 = None if b2 is None else torch.empty_like(b2)
+        dh = torch.empty_like(h)
+        I = w2.size(1)
+        TK = x_gather_idx.size(0)
+        a_prime = torch.empty(TK, I, dtype=h.dtype, device=h.device)
+        ds = torch.empty_like(topk_scores)
+        _down_projection_backward_act(
+            dout=dout,
+            h=h,
+            w2=w2,
+            dh=dh,
+            ds=ds,
+            b2=b2,
+            db2=db2,
+            a_prime=a_prime,
+            topk_scores=topk_scores,
+            expert_frequency_offset=expert_frequency_offset,
+            x_gather_idx=x_gather_idx,
+            s_scatter_idx=s_scatter_idx,
+            activation_type=activation_type.value,
+        )
+        _down_projection_backward_weight(
+            dout=dout,
+            a_prime=a_prime,
+            dw2=dw2,
+            expert_frequency_offset=expert_frequency_offset,
+            x_gather_idx=x_gather_idx,
+        )
         # TC top-K routing
         if not is_varlen_K:
             ds = ds.view(T, K)
+        return None, dh, dw2, db2, ds, *[None] * 10
 def moe_TC_softmax_topk_layer(
     stream_id: int,
     activation_type: ActivationType | str = ActivationType.SWIGLU,
     is_inference_mode_enabled: bool = False,
+    is_softmax_over_topk: bool = True,
+    norm_topk_probs: bool = False,
+    concat_layout: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     assert ((b1 is None) and (b2 is None)) or (
         (b1 is not None) and (b2 is not None)
     ), "b1 and b2 has to be None or not None at the same time!"
     E = router_w.size(0)
     router_logits = F.linear(x, router_w)
+    topk_scores, topk_indices = TC_Softmax_Topk_Router_Function.apply(
+        router_logits, E, K, is_softmax_over_topk, norm_topk_probs
+    )
     T, K = topk_indices.size()
     TK = T * K
         topk_indices, E, expert_frequency, expert_frequency_offset, x_gather_idx, s_scatter_idx, s_reverse_scatter_idx
     )
     if type(activation_type) == str:
         activation_type = ActivationType(activation_type)
+    assert not torch.compiler.is_compiling()
+    assert is_glu(activation_type), "QuACK GEMM does not support non GLU activation yet"
+    a, h = _UpProjection.apply(
         x,
         w1,
         b1,
         expert_frequency_offset,
+        TK,
         K,
         x_gather_idx,
         s_scatter_idx,
         s_reverse_scatter_idx,
         None,
+        False,  # is_each_token_has_variable_activated_expert
         activation_type,
         is_inference_mode_enabled,
+        concat_layout,
     )
     o = _DownProjection.apply(
+        a,
+        h,
         w2,
         b2,
         topk_scores,
         expert_frequency_offset,
         T,
         K,
         x_gather_idx,
         s_scatter_idx,
         s_reverse_scatter_idx,
         None,
+        False,  # is_each_token_has_variable_activated_expert
         activation_type,
     )
 # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 # Weight format requirements:
+# - w1_weight: Shape (2*I, H, E), stride order (2, 0, 1)
+#     concat_layout=False (default): interleaved [gate_row0, up_row0, gate_row1, up_row1, ...]
+#     concat_layout=True:            concatenated [gate_row0, ..., gate_row_{I-1}, up_row0, ..., up_row_{I-1}]
 # - w2_weight: Shape (H, I, E), stride order (2, 0, 1)
     stream_id: int,
     activation_type: ActivationType,
     is_inference_mode_enabled: bool = False,
+    concat_layout: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert ((b1 is None) and (b2 is None)) or (
         (b1 is not None) and (b2 is not None)
     E = w2.size(-1)
     device = router_scores.device
+    if router_scores.dtype != torch.float32:
+        router_scores = router_scores.float()
     s_scatter_idx = torch.empty(TK, dtype=torch.int32, device=device)
     s_reverse_scatter_idx = torch.empty(TK, dtype=torch.int32, device=device)
     expert_frequency = torch.empty(E, dtype=torch.int32, device=device)
         num_activated_expert_per_token_offset,
     )
+    assert not torch.compiler.is_compiling()
+    assert is_glu(activation_type), "QuACK GEMM does not support non GLU activation yet"
+    a, h = _UpProjection.apply(
         x,
         w1,
         b1,
         expert_frequency_offset,
         TK,
         None,  # K, not needed
         x_gather_idx,
         s_scatter_idx,
         s_reverse_scatter_idx,
         num_activated_expert_per_token_offset,
+        True,  # is_each_token_has_variable_activated_expert
         activation_type,
         is_inference_mode_enabled,
+        concat_layout,
     )
     o = _DownProjection.apply(
+        a,
+        h,
         w2,
         b2,
         router_scores,
         expert_frequency_offset,
         T,
         None,  # K, not needed
         x_gather_idx,
         s_scatter_idx,
         s_reverse_scatter_idx,
         num_activated_expert_per_token_offset,
+        True,  # is_each_token_has_variable_activated_expert
         activation_type,
     )

build/torch-cuda/functional/backward.py CHANGED Viewed

@@ -9,16 +9,10 @@ import cutlass.cute as cute
 import torch
 import triton
 import triton.language as tl
 from .._ops_compat import add_op_namespace_prefix
-from ..enums import LIBRARY_NAME, TENSORMAP, ActivationType
-from ..utils import ceil_divide, convert_torch_tensor_to_cute_tensor, get_powers_of_2
-from .moe_config import (
-    HopperWgmma_MoE_Down_proj_ActGrad_Bwd,
-    HopperWgmma_MoE_Down_proj_WeightGrad_Bwd,
-    HopperWgmma_MoE_Up_proj_ActGrad_Bwd,
-    HopperWgmma_MoE_Up_proj_WeightGrad_Bwd,
-)
 from .reduction_over_k_gather import token_gather_and_sum_varlen_K_triton
@@ -132,28 +126,29 @@ def _prune_triton_autotune_config(configs, nargs, **kw):
 )
 @triton.jit
 def db1_kernel(
-    dz_ptr,  # (T, H)
-    db1_ptr,  # (E, H),
-    expert_offset_ptr,  # (E+1,), offsets in grouped layout
     I: tl.constexpr,
     E: tl.constexpr,
-    BLOCK_I: tl.constexpr,  # Block size for H dimension
-    BLOCK_TK: tl.constexpr,  # Block size for token dimension
 ):
-    Eidx = tl.program_id(0)  # expert id
     E_count_start = tl.load(expert_offset_ptr + Eidx).to(tl.int64)
     E_count_end = tl.load(expert_offset_ptr + Eidx + 1).to(tl.int64)
     n_tokens = E_count_end - E_count_start
     NUM_I_BLOCKS: tl.constexpr = triton.cdiv(I, BLOCK_I)
     for Iidx in tl.static_range(0, NUM_I_BLOCKS, 1):
         i_offsets = Iidx * BLOCK_I + tl.arange(0, BLOCK_I)
         i_mask = i_offsets < I
         db1_acc = tl.zeros([BLOCK_I], dtype=tl.float32)
-        # Process tokens in blocks of BLOCK_TK
         for block_start in tl.range(0, n_tokens, BLOCK_TK):
             # Token offsets within this block
             tk_offsets = block_start + tl.arange(0, BLOCK_TK)
@@ -162,102 +157,52 @@ def db1_kernel(
             dz_offsets = tk_grouped[:, None] * I + i_offsets[None, :]
             dz_mask = tk_mask[:, None] & i_mask[None, :]
-            dz = tl.load(dz_ptr + dz_offsets, mask=dz_mask, other=0.0).to(tl.float32)
-            db1_acc += tl.sum(dz, axis=0)  # Sum over BLOCK_TK dimension
-        db1_offsets = Eidx.to(tl.int64) * I + i_offsets
         tl.store(db1_ptr + db1_offsets, db1_acc, mask=i_mask)
-@triton.jit
-def _colsum_smallN_kernel(
-    y_ptr,  # *mut  T, shape [M]
-    x_ptr,  # *const T, shape [M, N]
-    stride_xm: tl.constexpr,
-    stride_xn: tl.constexpr,  # strides of X
-    stride_y: tl.constexpr,  # stride of Y (usually 1)
-    N: tl.constexpr,  # sizes
-    BLOCK_N: tl.constexpr,  # tile size along N
-):
-    row = tl.program_id(0)
-    # assume BLOCK_N >= N
-    offs = tl.arange(0, BLOCK_N)
-    mask = offs < N
-    # Load a tile from the row; cast to fp32 for the reduction
-    x = tl.load(x_ptr + row * stride_xm + offs * stride_xn, mask=mask, other=0).to(tl.float32)
-    # Reduce this tile to a scalar and add
-    acc = tl.sum(x, axis=0)
-    # Store the row-sum (cast back to y dtype)
-    tl.store(y_ptr + row * stride_y, acc)
 @torch.library.custom_op(add_op_namespace_prefix("_up_projection_backward_act"), mutates_args={"dx_expanded", "db1"})
 def _up_projection_backward_act(
     w1: torch.Tensor,
     dx_expanded: torch.Tensor,
-    dz: torch.Tensor,
     db1: torch.Tensor | None,
     expert_frequency_offset: torch.Tensor,
-    expert_schedule_order: torch.Tensor | None,
-    x_gather_idx: torch.Tensor,
-    s_scatter_idx: torch.Tensor,
     is_glu_activation: bool,
-    stream_id: int,
 ) -> None:
     I, H, E = w1.size()
     if is_glu_activation:
         I //= 2
     # db1 computation
     if db1 is not None:
-        db1_kernel[(E,)](dz, db1, expert_frequency_offset, (2 * I if is_glu_activation else I), E)
-    mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
-    mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)
-    mS_scatter = convert_torch_tensor_to_cute_tensor(s_scatter_idx, (0,), 0, 4, 1, stream=stream_id)
-    mDz = convert_torch_tensor_to_cute_tensor(dz, (0, 1), 1, 16, 8, stream=stream_id)
-    mDx_expanded = convert_torch_tensor_to_cute_tensor(dx_expanded, (0, 1), 1, 16, 8, stream=stream_id)
-    mW1_trans = convert_torch_tensor_to_cute_tensor(w1.permute(1, 0, 2), (2, 1, 0), 0, 16, 8, stream=stream_id)
-    if expert_schedule_order is None:
-        mE_permute_order = None
-    else:
-        mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
-    current_stream = cuda.CUstream(stream_id)
-    compile_dx_key = ("dx", E, H, I, is_glu_activation, dx_expanded.dtype)
-    if compile_dx_key not in _up_projection_backward_act.compile_cache:
-        dx_module = HopperWgmma_MoE_Up_proj_ActGrad_Bwd(E, H, I, is_glu_activation)
-        tensormaps = [dx_module.module.generate_tensormap(None, None, None) for _ in range(2)]
-        _up_projection_backward_act.compile_cache[compile_dx_key] = cute.compile(
-            dx_module,
-            mDz,
-            mW1_trans,
-            mDx_expanded,
-            mE_offset,
-            mX_gather,
-            mS_scatter,
-            tensormaps,
-            mE_permute_order,
-            current_stream,
         )
-        _up_projection_backward_act.compile_cache[f"dx-{TENSORMAP}"] = tensormaps
-    dx_tensormaps = _up_projection_backward_act.compile_cache[f"dx-{TENSORMAP}"]
-    _up_projection_backward_act.compile_cache[compile_dx_key](
-        mDz,
-        mW1_trans,
-        mDx_expanded,
-        mE_offset,
-        mX_gather,
-        mS_scatter,
-        dx_tensormaps,
-        mE_permute_order,
-        current_stream,
-    )
 _up_projection_backward_act.compile_cache = {}
@@ -267,199 +212,87 @@ _up_projection_backward_act.compile_cache = {}
 def _up_projection_backward_weight(
     x: torch.Tensor,
     dw1: torch.Tensor,
-    dz: torch.Tensor,
     expert_frequency_offset: torch.Tensor,
-    expert_schedule_order: torch.Tensor | None,
     x_gather_idx: torch.Tensor,
     is_glu_activation: bool,
-    stream_id: int,
 ) -> None:
     I, H, E = dw1.size()
     if is_glu_activation:
         I //= 2
-    x = x.detach()
-    mDz_trans = convert_torch_tensor_to_cute_tensor(dz.T, (1, 0), 0, 16, 8, stream=stream_id)
-    mDw1_trans = convert_torch_tensor_to_cute_tensor(dw1.permute(1, 0, 2), (2, 1, 0), 0, 16, 8, stream=stream_id)
-    mX_trans = convert_torch_tensor_to_cute_tensor(x.T, (1, 0), 0, 16, 8, stream=stream_id)
-    mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
-    mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)
-    if expert_schedule_order is None:
-        mE_permute_order = None
-    else:
-        mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
-    current_stream = cuda.CUstream(stream_id)
-    compile_dw1_key = ("dw1", E, H, I, is_glu_activation, x.dtype)
-    if compile_dw1_key not in _up_projection_backward_weight.compile_cache:
-        dw1_module = HopperWgmma_MoE_Up_proj_WeightGrad_Bwd(E, H, I, is_glu_activation)
-        tensormaps = [dw1_module.module.generate_tensormap(None, None, None) for _ in range(1)]
-        _up_projection_backward_weight.compile_cache[compile_dw1_key] = cute.compile(
-            dw1_module,
-            mX_trans,
-            mDz_trans,
-            mDw1_trans,
-            mE_offset,
-            mX_gather,
-            tensormaps,
-            mE_permute_order,
-            current_stream,
-        )
-        _up_projection_backward_weight.compile_cache[f"dw1-{TENSORMAP}"] = tensormaps
-    dw1_tensormaps = _up_projection_backward_weight.compile_cache[f"dw1-{TENSORMAP}"]
-    _up_projection_backward_weight.compile_cache[compile_dw1_key](
-        mX_trans,
-        mDz_trans,
-        mDw1_trans,
-        mE_offset,
-        mX_gather,
-        dw1_tensormaps,
-        mE_permute_order,
-        current_stream,
     )
 _up_projection_backward_weight.compile_cache = {}
-@torch.library.custom_op(add_op_namespace_prefix("_down_projection_backward_act"), mutates_args={"dz", "ds", "db2", "y1s"})
 def _down_projection_backward_act(
     dout: torch.Tensor,
-    z: torch.Tensor,
     w2: torch.Tensor,
-    dz: torch.Tensor,
     ds: torch.Tensor,
     b2: torch.Tensor | None,
-    db2: torch.Tensor | None,
-    y1s: torch.Tensor,
     topk_scores: torch.Tensor,
     expert_frequency_offset: torch.Tensor,
-    expert_schedule_order: torch.Tensor | None,
     x_gather_idx: torch.Tensor,
     s_scatter_idx: torch.Tensor,
-    is_glu_activation: bool,
     activation_type: str,
-    stream_id: int,
 ) -> None:
-    H, I, E = w2.size()
-    TK = x_gather_idx.size(0)
-    dout = dout.detach()
-    w2 = w2.detach()
-    topk_scores = topk_scores.detach()
-    mDout = convert_torch_tensor_to_cute_tensor(dout, (0, 1), 1, 16, 8, stream=stream_id)
-    mW2_trans = convert_torch_tensor_to_cute_tensor(w2.permute(1, 0, 2), (2, 1, 0), 0, 16, 8, stream=stream_id)
-    mS = convert_torch_tensor_to_cute_tensor(topk_scores, (0,), 0, 4, 1, stream=stream_id)
-    if is_glu_activation:
-        mDz_kernel_input = convert_torch_tensor_to_cute_tensor(
-            dz.view(torch.float32), (0, 1), 1, 16, 8, stream=stream_id
-        )
-        mZ_kernel_input = convert_torch_tensor_to_cute_tensor(
-            z.view(torch.float32), (0, 1), 1, 16, 8, stream=stream_id
-        )
-    else:
-        mDz_kernel_input = convert_torch_tensor_to_cute_tensor(dz.detach(), (0, 1), 1, 16, 8, stream=stream_id)
-        mZ_kernel_input = convert_torch_tensor_to_cute_tensor(z.detach(), (0, 1), 1, 16, 8, stream=stream_id)
-    mY1S = convert_torch_tensor_to_cute_tensor(y1s, (0, 1), 1, 16, 8, stream=stream_id)
-    mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
-    mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)
-    mS_scatter = convert_torch_tensor_to_cute_tensor(s_scatter_idx, (0,), 0, 4, 1, stream=stream_id)
-    if expert_schedule_order is None:
-        mE_permute_order = None
-    else:
-        mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
-    current_stream = cuda.CUstream(stream_id)
-    ds_partial = None
-    compile_dz_key = ("dz", E, H, I, z.dtype, activation_type)
-    if compile_dz_key not in _down_projection_backward_act.compile_cache:
-        # I don't know why but this sync appears to fix a mysterious initialization bug??
-        torch.cuda.synchronize()
-        dz_module = HopperWgmma_MoE_Down_proj_ActGrad_Bwd(E, H, I, ActivationType(activation_type))
-        tensormaps = [dz_module.module.generate_tensormap(None, None, None) for _ in range(3)]
-        ds_partial_N = max(ceil_divide(I, dz_module.module.tile_shape_mnk[1]), 1)
-        ds_partial = torch.empty(TK, ds_partial_N, dtype=torch.float32, device=topk_scores.device)
-        mDS_partial = convert_torch_tensor_to_cute_tensor(ds_partial, (0, 1), 1, 4, 1, stream=stream_id)
-        _down_projection_backward_act.compile_cache["ds_partial_N"] = ds_partial_N
-        _down_projection_backward_act.compile_cache[compile_dz_key] = cute.compile(
-            dz_module,
-            mDout,
-            mW2_trans,
-            mZ_kernel_input,
-            mDz_kernel_input,
-            mY1S,
-            mS,
-            mDS_partial,
-            mE_offset,
-            mX_gather,
-            mS_scatter,
-            tensormaps,
-            mE_permute_order,
-            current_stream,
-        )
-        _down_projection_backward_act.compile_cache[f"dz-{TENSORMAP}"] = tensormaps
-    if ds_partial is None:
-        ds_partial_N = _down_projection_backward_act.compile_cache["ds_partial_N"]
-        ds_partial = torch.empty(TK, ds_partial_N, dtype=torch.float32, device=topk_scores.device)
-        mDS_partial = convert_torch_tensor_to_cute_tensor(ds_partial, (0, 1), 1, 4, 1, stream=stream_id)
-    dz_tensormaps = _down_projection_backward_act.compile_cache[f"dz-{TENSORMAP}"]
-    _down_projection_backward_act.compile_cache[compile_dz_key](
-        mDout,
-        mW2_trans,
-        mZ_kernel_input,
-        mDz_kernel_input,
-        mY1S,
-        mS,
-        mDS_partial,
-        mE_offset,
-        mX_gather,
-        mS_scatter,
-        dz_tensormaps,
-        mE_permute_order,
-        current_stream,
     )
     if db2 is None:
-        # we don't need to update ds
-        if ds_partial.size(1) == 1:
-            ds.copy_(ds_partial.view(-1).to(dtype=ds.dtype))
-        elif ds_partial.size(1) <= 32:
-            ds.copy_(ds_partial.sum(dim=-1, dtype=ds.dtype))
-        else:
-            M, N = ds_partial.size()
-            _colsum_smallN_kernel[M,](
-                y_ptr=ds,
-                x_ptr=ds_partial,
-                stride_xm=ds_partial.stride(0),
-                stride_xn=ds_partial.stride(1),
-                stride_y=1,
-                N=N,
-                BLOCK_N=triton.next_power_of_2(N),
-            )
     else:
-        # db2 and ds update
         BLOCK_H = min(triton.next_power_of_2(H), 2048)
         NUM_H_BLOCKS = triton.cdiv(H, BLOCK_H)
-        new_ds_partial = torch.empty(TK, NUM_H_BLOCKS, device=ds.device, dtype=torch.float32)
         db2_and_ds_kernel[(E, NUM_H_BLOCKS)](
             dout,
             topk_scores,
             new_ds_partial,
-            ds_partial,
             b2,
             db2,
             x_gather_idx,
@@ -467,9 +300,9 @@ def _down_projection_backward_act(
             expert_frequency_offset,
             H,
             E,
-            ds_partial_N,
             BLOCK_H=BLOCK_H,
-            BLOCK_OLD_DS_PARTIAL_N=triton.next_power_of_2(ds_partial_N),
         )
         if NUM_H_BLOCKS == 1:
@@ -484,47 +317,19 @@ _down_projection_backward_act.compile_cache = {}
 @torch.library.custom_op(add_op_namespace_prefix("_down_projection_backward_weight"), mutates_args={"dw2"})
 def _down_projection_backward_weight(
     dout: torch.Tensor,
-    y1s: torch.Tensor,
     dw2: torch.Tensor,
     expert_frequency_offset: torch.Tensor,
-    expert_schedule_order: torch.Tensor | None,
     x_gather_idx: torch.Tensor,
-    stream_id: int,
 ) -> None:
-    H, I, E = dw2.size()
-    mDout_trans = convert_torch_tensor_to_cute_tensor(dout.T, (1, 0), 0, 16, 8, stream=stream_id)
-    mDw2 = convert_torch_tensor_to_cute_tensor(dw2, (2, 0, 1), 1, 16, 8, stream=stream_id)
-    mY1S_trans = convert_torch_tensor_to_cute_tensor(y1s.T, (1, 0), 0, 16, 8, stream=stream_id)
-    mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
-    mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)
-    if expert_schedule_order is None:
-        mE_permute_order = None
-    else:
-        mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
-    current_stream = cuda.CUstream(stream_id)
-    compile_dw2_key = ("dw2", E, H, I, dw2.dtype)
-    if compile_dw2_key not in _down_projection_backward_weight.compile_cache:
-        dw2_module = HopperWgmma_MoE_Down_proj_WeightGrad_Bwd(E, H, I)
-        tensormaps = [dw2_module.module.generate_tensormap(None, None, None) for _ in range(1)]
-        _down_projection_backward_weight.compile_cache[compile_dw2_key] = cute.compile(
-            dw2_module,
-            mDout_trans,
-            mY1S_trans,
-            mDw2,
-            mE_offset,
-            mX_gather,
-            tensormaps,
-            mE_permute_order,
-            current_stream,
-        )
-        _down_projection_backward_weight.compile_cache[f"dw2-{TENSORMAP}"] = tensormaps
-    dw2_tensormaps = _down_projection_backward_weight.compile_cache[f"dw2-{TENSORMAP}"]
-    _down_projection_backward_weight.compile_cache[compile_dw2_key](
-        mDout_trans, mY1S_trans, mDw2, mE_offset, mX_gather, dw2_tensormaps, mE_permute_order, current_stream
     )
@@ -557,7 +362,7 @@ def _token_broadcast_backward(
 @triton.jit
-def _softmax_bwd_scatter_small_kernel(
     dlogits_ptr,
     dlogits_full_ptr,
     score_ptr,
@@ -597,35 +402,171 @@ def _softmax_bwd_scatter_small_kernel(
     tl.store(dlogits_full_ptr + indices, add_vals, mask=k_mask)
-@torch.library.custom_op(add_op_namespace_prefix("_softmax_topk_bwd"), mutates_args={"dlogits_full"})
-def _softmax_topk_bwd(
     dlogits_full: torch.Tensor,
     dlogits: Optional[torch.Tensor],
     dtopk_score: torch.Tensor,
     topk_router_score: torch.Tensor,
     topk_router_indices: torch.Tensor,
     K: int,
 ) -> None:
     T = dtopk_score.shape[0]
-    _softmax_bwd_scatter_small_kernel[T,](
-        dlogits,
-        dlogits_full,
-        topk_router_score,
-        dtopk_score,
-        topk_router_indices,
-        dlogits_full.stride(0),
-        dlogits_full.stride(1),
-        topk_router_score.stride(0),
-        topk_router_score.stride(1),
-        dtopk_score.stride(0),
-        dtopk_score.stride(1),
-        topk_router_indices.stride(0),
-        topk_router_indices.stride(1),
-        K,
-        triton.next_power_of_2(K),
-        (dlogits is None),
-    )
 @triton.jit

 import torch
 import triton
 import triton.language as tl
+from ..quack.gemm_interface import gemm, gemm_dgated
 from .._ops_compat import add_op_namespace_prefix
+from ..utils import get_powers_of_2
 from .reduction_over_k_gather import token_gather_and_sum_varlen_K_triton
 )
 @triton.jit
 def db1_kernel(
+    dh_ptr,  # (TK, I)  — always interleaved
+    db1_ptr,  # (E, I)
+    expert_offset_ptr,  # (E+1,)
     I: tl.constexpr,
     E: tl.constexpr,
+    BLOCK_I: tl.constexpr,
+    BLOCK_TK: tl.constexpr,
+    CONCAT_LAYOUT: tl.constexpr = False,
 ):
+    Eidx = tl.program_id(0)
     E_count_start = tl.load(expert_offset_ptr + Eidx).to(tl.int64)
     E_count_end = tl.load(expert_offset_ptr + Eidx + 1).to(tl.int64)
     n_tokens = E_count_end - E_count_start
     NUM_I_BLOCKS: tl.constexpr = triton.cdiv(I, BLOCK_I)
+    I_HALF: tl.constexpr = I // 2
     for Iidx in tl.static_range(0, NUM_I_BLOCKS, 1):
         i_offsets = Iidx * BLOCK_I + tl.arange(0, BLOCK_I)
         i_mask = i_offsets < I
         db1_acc = tl.zeros([BLOCK_I], dtype=tl.float32)
         for block_start in tl.range(0, n_tokens, BLOCK_TK):
             # Token offsets within this block
             tk_offsets = block_start + tl.arange(0, BLOCK_TK)
             dz_offsets = tk_grouped[:, None] * I + i_offsets[None, :]
             dz_mask = tk_mask[:, None] & i_mask[None, :]
+            dz = tl.load(dh_ptr + dz_offsets, mask=dz_mask, other=0.0).to(tl.float32)
+            db1_acc += tl.sum(dz, axis=0)
+        # Write: remap interleaved → concat if needed
+        if CONCAT_LAYOUT:
+            out_offsets = i_offsets // 2 + (i_offsets % 2) * I_HALF
+        else:
+            out_offsets = i_offsets
+        db1_offsets = Eidx.to(tl.int64) * I + out_offsets
         tl.store(db1_ptr + db1_offsets, db1_acc, mask=i_mask)
 @torch.library.custom_op(add_op_namespace_prefix("_up_projection_backward_act"), mutates_args={"dx_expanded", "db1"})
 def _up_projection_backward_act(  # up-projection backward w.r.t. activations: fills dx_expanded in place and, when db1 is given, db1
     w1: torch.Tensor,
     dx_expanded: torch.Tensor,
+    dh: torch.Tensor,
     db1: torch.Tensor | None,
     expert_frequency_offset: torch.Tensor,
     is_glu_activation: bool,
+    concat_layout: bool = False,
 ) -> None:
     I, H, E = w1.size()  # w1 must be 3-D; E (its last dim) sizes the db1 launch grid below, H is not used here
     if is_glu_activation:
         I //= 2  # halved for GLU; the db1 launch below multiplies by 2 again
+    gemm(  # gemm comes from ..quack.gemm_interface; the result is written into dx_expanded through out=
+        dh,
+        w1.permute(2, 0, 1),
+        cu_seqlens_m=expert_frequency_offset,  # presumably per-expert row offsets into dh — confirm against the gemm interface
+        dynamic_scheduler=False,
+        out=dx_expanded,
+        concat_layout=(("B",) if concat_layout else None),  # ("B",) is forwarded only when concat_layout is set
+    )
     # db1 computation
     if db1 is not None:
+        db1_kernel[(E,)](  # one Triton program per expert
+            dh,
+            db1,
+            expert_frequency_offset,
+            (2 * I if is_glu_activation else I),  # passed as the kernel's I, i.e. the row width it uses to index dh
+            E,
+            CONCAT_LAYOUT=concat_layout and is_glu_activation,  # the interleaved -> concat remap is requested only for GLU
         )
 _up_projection_backward_act.compile_cache = {}
 def _up_projection_backward_weight(  # up-projection backward w.r.t. the weight: one gemm call that writes into dw1 in place
     x: torch.Tensor,
     dw1: torch.Tensor,
+    dh: torch.Tensor,
     expert_frequency_offset: torch.Tensor,
     x_gather_idx: torch.Tensor,
     is_glu_activation: bool,
+    concat_layout: bool = False,
 ) -> None:
     I, H, E = dw1.size()  # NOTE(review): I, H and E are not read by the gemm call below; the unpack still requires dw1 to be 3-D
     if is_glu_activation:
         I //= 2
+    gemm(
+        x.T,
+        dh,
+        out=dw1.permute(2, 1, 0),  # permuted view of dw1, so the gemm result lands in dw1 itself
+        cu_seqlens_k=expert_frequency_offset,  # presumably per-expert offsets along the reduced (token) dimension — confirm
+        A_idx=x_gather_idx,  # presumably gathers rows of x for the A operand — confirm against the gemm interface
+        batch_idx_permute=None,
+        dynamic_scheduler=False,
+        concat_layout=(("out",) if concat_layout else None),  # ("out",) is forwarded only when concat_layout is set
     )
 _up_projection_backward_weight.compile_cache = {}
+@torch.library.custom_op(add_op_namespace_prefix("_down_projection_backward_act"), mutates_args={"dh", "ds", "db2", "a_prime"})
 def _down_projection_backward_act(  # down-projection backward w.r.t. activations: fills dh, a_prime and ds (and db2 via the Triton kernel when given)
     dout: torch.Tensor,
+    h: torch.Tensor,
     w2: torch.Tensor,
+    dh: torch.Tensor,
     ds: torch.Tensor,
     b2: torch.Tensor | None,
+    db2: torch.Tensor | None,  # add impl later
+    a_prime: torch.Tensor,
     topk_scores: torch.Tensor,
     expert_frequency_offset: torch.Tensor,
     x_gather_idx: torch.Tensor,
     s_scatter_idx: torch.Tensor,
     activation_type: str,
 ) -> None:
+    assert activation_type in (  # only the two GLU variants named below are accepted
+        "swiglu",
+        "geglu",
+    ), f"QuACK gemm_gated only supports glu activations, got {activation_type}"
+    s = topk_scores[s_scatter_idx]  # scores gathered through s_scatter_idx, used as the column-vector scale below
+    _, _, ds_scattered = gemm_dgated(  # only the third return value is used; dh and a_prime are filled through dx_out= / postact_out=
+        dout,
+        w2.permute(2, 0, 1),
+        PreAct=h,
+        activation=activation_type,
+        dx_out=dh,
+        postact_out=a_prime,
+        colvec_scale=s,
+        colvec_reduce=True,
+        cu_seqlens_m=expert_frequency_offset,
+        A_idx=x_gather_idx,
+        dynamic_scheduler=False,
     )
+    ds[s_scatter_idx] = ds_scattered  # unconditional scatter of the per-row reduction into ds
     if db2 is None:
+        ds[s_scatter_idx] = ds_scattered  # NOTE(review): repeats the assignment two lines above with identical operands; appears redundant
     else:
+        H = w2.size(0)
+        E = expert_frequency_offset.size(0) - 1  # the offsets tensor holds E + 1 entries
+        TK = x_gather_idx.size(0)
+        old_ds_partial = torch.empty(TK, 1, device=ds_scattered.device, dtype=ds_scattered.dtype)  # single-column partial buffer
+        old_ds_partial[s_scatter_idx, 0] = ds_scattered  # same scatter as above, into the single column
         BLOCK_H = min(triton.next_power_of_2(H), 2048)  # power-of-two tile along H, capped at 2048
         NUM_H_BLOCKS = triton.cdiv(H, BLOCK_H)
+        new_ds_partial = torch.empty(TK, NUM_H_BLOCKS, dtype=torch.float32, device=ds.device)  # one float32 column per H block
         db2_and_ds_kernel[(E, NUM_H_BLOCKS)](  # 2-D launch grid: experts x H blocks
             dout,
             topk_scores,
             new_ds_partial,
+            old_ds_partial,
             b2,
             db2,
             x_gather_idx,
             expert_frequency_offset,
             H,
             E,
+            1,  # OLD_DS_PARTIAL_N = 1
             BLOCK_H=BLOCK_H,
+            BLOCK_OLD_DS_PARTIAL_N=1,  # matches the single column of old_ds_partial
         )
         if NUM_H_BLOCKS == 1:
 @torch.library.custom_op(add_op_namespace_prefix("_down_projection_backward_weight"), mutates_args={"dw2"})
 def _down_projection_backward_weight(  # down-projection backward w.r.t. the weight: one gemm call that writes into dw2 in place
     dout: torch.Tensor,
+    a_prime: torch.Tensor,
     dw2: torch.Tensor,
     expert_frequency_offset: torch.Tensor,
     x_gather_idx: torch.Tensor,
 ) -> None:
+    gemm(  # gemm comes from ..quack.gemm_interface
+        dout.T,
+        a_prime,
+        out=dw2.permute(2, 0, 1),  # permuted view of dw2, so the gemm result lands in dw2 itself
+        cu_seqlens_k=expert_frequency_offset,  # presumably per-expert offsets along the reduced (token) dimension — confirm
+        A_idx=x_gather_idx,  # presumably gathers rows of dout for the A operand — confirm against the gemm interface
+        batch_idx_permute=None,
+        dynamic_scheduler=False,
     )
 @triton.jit
+def _softmax_over_topk_bwd_kernel(
     dlogits_ptr,
     dlogits_full_ptr,
     score_ptr,
     tl.store(dlogits_full_ptr + indices, add_vals, mask=k_mask)
+@triton.jit
+def _topk_over_softmax_bwd_kernel(
+    logits_ptr,  # (T, E) saved router logits
+    dlogits_ptr,  # (T, E) output gradient
+    dscore_ptr,  # (T, K) upstream gradient
+    idx_ptr,  # (T, K) selected indices (int32)
+    score_ptr,  # (T, K) forward scores (only used for renorm)
+    stride_lm: tl.constexpr,  # row stride of logits
+    stride_le: tl.constexpr,  # expert-axis stride of logits
+    stride_dm: tl.constexpr,  # row stride of dlogits
+    stride_dn: tl.constexpr,  # expert-axis stride of dlogits
+    stride_sm: tl.constexpr,  # row stride of dscore
+    stride_sn: tl.constexpr,  # k-axis stride of dscore
+    stride_im: tl.constexpr,  # row stride of idx
+    stride_ik: tl.constexpr,  # k-axis stride of idx
+    stride_scm: tl.constexpr,  # row stride of score
+    stride_scn: tl.constexpr,  # k-axis stride of score
+    E: tl.constexpr,  # number of valid entries per logits row
+    K: tl.constexpr,  # number of selected entries per row
+    BLOCK_E: tl.constexpr,  # tile width over E; lanes >= E are masked
+    BLOCK_K: tl.constexpr,  # tile width over K; lanes >= K are masked
+    norm_topk_probs: tl.constexpr,  # when true, also back-propagate through the renormalization
+):
+    """
+    Full topk(softmax()) backward over ALL E indices.
+    Forward: logits → p = softmax(logits) → [raw, idx] = topk(p, K)
+             → scores = raw / sum(raw)  (if norm_topk_probs)
+    Backward:
+      1. Recompute p = softmax(logits) over all E
+      2. If renorm: dp_sel = (dscore - dot_s) / S, with dot_s = Σ dscore_j * scores_j and S = Σ p_sel_j
+         Else:      dp_sel = dscore
+      3. dot = Σ dp_sel_j * p_sel_j
+      4. Scatter dp_sel into E-wide dp (zero at non-selected)
+      5. dlogits = p * (dp - dot)  for all E
+    """
+    row = tl.program_id(axis=0)  # one program per row
+    e_offs = tl.arange(0, BLOCK_E)
+    e_mask = e_offs < E
+    logits = tl.load(logits_ptr + row * stride_lm + e_offs * stride_le, mask=e_mask, other=-float("inf")).to(
+        tl.float32
+    )  # masked lanes read -inf, so their exp() below is 0 and they drop out of the softmax
+    row_max = tl.max(logits, axis=0)  # subtracted before exp for numerical stability
+    exp_vals = tl.exp(logits - row_max)
+    row_sum = tl.sum(exp_vals, axis=0)
+    p = exp_vals / row_sum  # (BLOCK_E,)
+    # --- Load K selected indices and upstream gradient ---
+    k_offs = tl.arange(0, BLOCK_K)
+    k_mask = k_offs < K
+    idx = tl.load(
+        idx_ptr + row * stride_im + k_offs * stride_ik,
+        mask=k_mask,
+        other=0,
+    ).to(tl.int32)
+    g_sel = tl.load(
+        dscore_ptr + row * stride_sm + k_offs * stride_sn,
+        mask=k_mask,
+        other=0,
+    ).to(tl.float32)
+    # p at selected indices (gather from global mem; can't index register tensor)
+    sel_logits = tl.load(
+        logits_ptr + row * stride_lm + idx * stride_le,
+        mask=k_mask,
+        other=-float("inf"),  # masked K lanes end up with p_sel == 0
+    ).to(tl.float32)
+    p_sel = tl.exp(sel_logits - row_max) / row_sum  # (BLOCK_K,)
+    # --- Backward through optional renormalization ---
+    if norm_topk_probs:
+        scores = tl.load(
+            score_ptr + row * stride_scm + k_offs * stride_scn,
+            mask=k_mask,
+            other=0,
+        ).to(tl.float32)
+        dot_s = tl.sum(g_sel * scores, axis=0)
+        S = tl.sum(p_sel, axis=0)  # sum of the selected probabilities (the renormalization denominator)
+        dp_sel = (g_sel - dot_s) / S  # masked K lanes become -dot_s / S, but their p_sel is 0 and they are never scattered below
+    else:
+        dp_sel = g_sel
+    # dot = Σ dp_sel_j * p_sel_j
+    dot = tl.sum(dp_sel * p_sel, axis=0)
+    # --- Scatter dp_sel into E-wide dp ---
+    # dp[i] = dp_sel[k] if i == idx[k], else 0
+    # Loop over K (unrolled at compile time since K is constexpr)
+    dp = tl.zeros([BLOCK_E], dtype=tl.float32)
+    for k_iter in tl.static_range(K):
+        cur_dp = tl.sum(tl.where(k_offs == k_iter, dp_sel, 0.0))  # extract lane k_iter of dp_sel as a scalar
+        cur_idx = tl.sum(tl.where(k_offs == k_iter, idx, 0))  # extract lane k_iter of idx as a scalar
+        dp = tl.where(e_offs == cur_idx, cur_dp, dp)  # overwrites rather than accumulates — assumes the K indices of a row are distinct
+    # --- dlogits = p * (dp - dot) for all E ---
+    dlogits = p * (dp - dot)
+    tl.store(  # plain store: any previous contents of the dlogits row are replaced
+        dlogits_ptr + row * stride_dm + e_offs * stride_dn,
+        dlogits,
+        mask=e_mask,
+    )
+@torch.library.custom_op(add_op_namespace_prefix("_topk_softmax_bwd"), mutates_args={"dlogits_full"})
+def _topk_softmax_bwd(  # router backward: dispatches to one of two Triton kernels, both writing into dlogits_full
+    router_logits: torch.Tensor,
     dlogits_full: torch.Tensor,
     dlogits: Optional[torch.Tensor],
     dtopk_score: torch.Tensor,
     topk_router_score: torch.Tensor,
     topk_router_indices: torch.Tensor,
+    E: int,
     K: int,
+    is_softmax_over_topk: bool = True,
+    norm_topk_probs: bool = False,
 ) -> None:
     T = dtopk_score.shape[0]  # one Triton program is launched per row of dtopk_score
+    if is_softmax_over_topk:
+        # non-selected gradient is zero.
+        _softmax_over_topk_bwd_kernel[T,](  # router_logits, E and norm_topk_probs are not used on this path
+            dlogits,
+            dlogits_full,
+            topk_router_score,
+            dtopk_score,
+            topk_router_indices,
+            dlogits_full.stride(0),
+            dlogits_full.stride(1),
+            topk_router_score.stride(0),
+            topk_router_score.stride(1),
+            dtopk_score.stride(0),
+            dtopk_score.stride(1),
+            topk_router_indices.stride(0),
+            topk_router_indices.stride(1),
+            K,
+            triton.next_power_of_2(K),  # tile width over K, rounded up to a power of two
+            (dlogits is None),  # tells the kernel whether an incoming dlogits tensor was supplied
+        )
+    else:
+        # topk(softmax(.)): non-selected gradient is -p_i * dot, NOT zero.
+        # must recompute full softmax for the complete Jacobian.
+        _topk_over_softmax_bwd_kernel[T,](  # NOTE(review): dlogits is not passed on this path and the kernel stores (not adds) into dlogits_full — confirm callers never supply dlogits here
+            router_logits,
+            dlogits_full,
+            dtopk_score,
+            topk_router_indices,
+            topk_router_score,
+            router_logits.stride(0),
+            router_logits.stride(1),
+            dlogits_full.stride(0),
+            dlogits_full.stride(1),
+            dtopk_score.stride(0),
+            dtopk_score.stride(1),
+            topk_router_indices.stride(0),
+            topk_router_indices.stride(1),
+            topk_router_score.stride(0),
+            topk_router_score.stride(1),
+            E,
+            K,
+            triton.next_power_of_2(E),  # BLOCK_E: tile width over E, rounded up to a power of two
+            triton.next_power_of_2(K),  # BLOCK_K: tile width over K, rounded up to a power of two
+            norm_topk_probs,
+        )
 @triton.jit

build/torch-cuda/functional/forward.py CHANGED Viewed

@@ -9,18 +9,21 @@ import triton
 import triton.language as tl
 from cutlass.cute.runtime import from_dlpack
 from ..quack.cute_dsl_utils import torch2cute_dtype_map
-from ..enums import LIBRARY_NAME, TENSORMAP, ActivationType
 from .._ops_compat import add_op_namespace_prefix
-from ..utils import convert_torch_tensor_to_cute_tensor
-from .moe_config import HopperWgmma_MoE_Down_proj_Fwd, HopperWgmma_MoE_Up_proj_Fwd
 from .reduction_over_k_gather import token_gather_and_sum_varlen_K_triton
-from .topk_softmax import TopK_Softmax
 @torch.library.custom_op(add_op_namespace_prefix("_topk_fwd"), mutates_args={"values", "indices"})
 def _topk_fwd(
-    x: torch.Tensor, k: int, values: torch.Tensor, indices: torch.Tensor, require_softmax_fusion: bool = True
 ) -> None:
     """Top-k forward pass.
     Args:
@@ -39,9 +42,17 @@ def _topk_fwd(
     x_tensor, values_tensor, indices_tensor = [convert_from_dlpack(tensor) for tensor in (x, values, indices)]
     current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-    compile_key = (input_dtype, output_dtype, N, k, require_softmax_fusion)
     if compile_key not in _topk_fwd.compile_cache:
-        topk_op = TopK_Softmax(input_dtype, output_dtype, N, k, require_softmax_fusion)
         _topk_fwd.compile_cache[compile_key] = cute.compile(
             topk_op, x_tensor, values_tensor, indices_tensor, current_stream
         )
@@ -51,129 +62,49 @@ def _topk_fwd(
 _topk_fwd.compile_cache = {}
-@torch.library.custom_op(add_op_namespace_prefix("_up_projection_forward"), mutates_args={"z", "y1"})
 def _up_projection_forward(
     x: torch.Tensor,
     w1: torch.Tensor,
-    z: torch.Tensor,
-    y1: torch.Tensor,
     b1: torch.Tensor | None,
     expert_frequency_offset: torch.Tensor,
-    expert_schedule_order: torch.Tensor,
     x_gather_idx: torch.Tensor,
-    stream_id: int,
     activation_type: str,
-    is_glu_activation: bool,
     is_inference_mode_enabled: bool = False,
 ) -> None:
-    I, H, E = w1.size()
-    if is_glu_activation:
-        I //= 2
-    mX = convert_torch_tensor_to_cute_tensor(x.detach(), (0, 1), 1, 16, 8, stream=stream_id)
-    mW1 = convert_torch_tensor_to_cute_tensor(w1.detach(), (2, 0, 1), 1, 16, 8, stream=stream_id)
-    mZ = convert_torch_tensor_to_cute_tensor(z, (0, 1), 1, 16, 8, stream=stream_id)
-    mY1 = convert_torch_tensor_to_cute_tensor(y1, (0, 1), 1, 16, 8, stream=stream_id)
-    mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
-    mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)
-    if expert_schedule_order is None:
-        mE_permute_order = None
-    else:
-        mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
-    if b1 is None:
-        mB1 = None
-    else:
-        mB1 = convert_torch_tensor_to_cute_tensor(b1.detach(), (0, 1), 1, 16, 8, stream=stream_id)
-    current_stream = cuda.CUstream(stream_id)
-    compile_w1_key = (E, H, I, (b1 is None), x.dtype, activation_type, is_inference_mode_enabled)
-    if compile_w1_key not in _up_projection_forward.compile_cache:
-        w1_module = HopperWgmma_MoE_Up_proj_Fwd(
-            E, H, I, activation_type=ActivationType(activation_type), inference_mode=is_inference_mode_enabled
-        )
-        tensormaps = [w1_module.module.generate_tensormap(None, None, None) for _ in range(2)]
-        _up_projection_forward.compile_cache[compile_w1_key] = cute.compile(
-            w1_module,
-            mX,
-            mW1,
-            mZ,
-            mY1,
-            mB1,
-            mE_offset,
-            mX_gather,
-            tensormaps[0],
-            tensormaps[1],
-            mE_permute_order,
-            current_stream,
-        )
-        _up_projection_forward.compile_cache[TENSORMAP] = tensormaps
-    w1_tensormaps = _up_projection_forward.compile_cache[TENSORMAP]
-    _up_projection_forward.compile_cache[compile_w1_key](
-        mX,
-        mW1,
-        mZ,
-        mY1,
-        mB1,
-        mE_offset,
-        mX_gather,
-        w1_tensormaps[0],
-        w1_tensormaps[1],
-        mE_permute_order,
-        current_stream,
     )
 _up_projection_forward.compile_cache = {}
-@torch.library.custom_op(add_op_namespace_prefix("_down_projection_forward"), mutates_args={"y2"})
 def _down_projection_forward(
     w2: torch.Tensor,
-    y1: torch.Tensor,
-    y2: torch.Tensor,
     b2: torch.Tensor | None,
     expert_frequency_offset: torch.Tensor,
-    expert_schedule_order: torch.Tensor,
-    x_gather_idx: torch.Tensor,
-    stream_id: int,
 ) -> None:
-    H, I, E = w2.size()
-    mW2 = convert_torch_tensor_to_cute_tensor(w2.detach(), (2, 0, 1), 1, 16, 8, stream=stream_id)
-    mY1 = convert_torch_tensor_to_cute_tensor(y1.detach(), (0, 1), 1, 16, 8, stream=stream_id)
-    mY2 = convert_torch_tensor_to_cute_tensor(y2, (0, 1), 1, 16, 8, stream=stream_id)
-    mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
-    mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)
-    if expert_schedule_order is None:
-        mE_permute_order = None
-    else:
-        mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
-    if b2 is None:
-        mB2 = None
-    else:
-        mB2 = convert_torch_tensor_to_cute_tensor(b2.detach(), (0, 1), 1, 16, 8, stream=stream_id)
-    current_stream = cuda.CUstream(stream_id)
-    compile_w2_key = (E, H, I, (b2 is None), w2.dtype)
-    if compile_w2_key not in _down_projection_forward.compile_cache:
-        w2_module = HopperWgmma_MoE_Down_proj_Fwd(E, H, I)
-        tensormaps = [w2_module.module.generate_tensormap(None, None, None) for _ in range(1)]
-        _down_projection_forward.compile_cache[compile_w2_key] = cute.compile(
-            w2_module, mY1, mW2, mY2, mB2, mE_offset, mX_gather, tensormaps[0], mE_permute_order, current_stream
-        )
-        _down_projection_forward.compile_cache[TENSORMAP] = tensormaps
-    w2_tensormaps = _down_projection_forward.compile_cache[TENSORMAP]
-    _down_projection_forward.compile_cache[compile_w2_key](
-        mY1, mW2, mY2, mB2, mE_offset, mX_gather, w2_tensormaps[0], mE_permute_order, current_stream
-    )
 _down_projection_forward.compile_cache = {}
@@ -181,7 +112,7 @@ _down_projection_forward.compile_cache = {}
 @torch.library.custom_op(add_op_namespace_prefix("_router_forward"), mutates_args={"o"})
 def _router_forward(
-    y2: torch.Tensor,
     o: torch.Tensor,
     topk_scores: torch.Tensor,
     s_reverse_scatter_idx: torch.Tensor,
@@ -191,7 +122,7 @@ def _router_forward(
     is_varlen_K: bool,
 ) -> None:
     token_gather_and_sum_varlen_K_triton(
-        y2,
         topk_scores,
         o,
         s_reverse_scatter_idx,
@@ -225,14 +156,35 @@ def _softmax_fwd_small_kernel(
 @torch.library.custom_op(
     add_op_namespace_prefix("_softmax_topk_fwd"), mutates_args={"topk_router_score", "topk_router_indices"}
 )
-def _softmax_topk_fwd(
-    router_logits: torch.Tensor, topk_router_score: torch.Tensor, topk_router_indices: torch.Tensor, E: int, K: int
 ) -> None:
-    # T = router_logits.shape[0]
     if E <= 4096 and K <= 16 and E % 8 == 0:
-        # fast topk-softmax fusion that covers most common MoE configs
-        _topk_fwd(router_logits, K, topk_router_score, topk_router_indices, require_softmax_fusion=True)
     else:
-        topk_results = router_logits.topk(K, dim=-1)
-        topk_router_score.copy_(topk_results.values.softmax(dim=-1, dtype=torch.float32).to(topk_router_score.dtype))
-        topk_router_indices.copy_(topk_results.indices.to(topk_router_indices.dtype))

 import triton.language as tl
 from cutlass.cute.runtime import from_dlpack
 from ..quack.cute_dsl_utils import torch2cute_dtype_map
+from ..quack.gemm_interface import gemm, gemm_gated
 from .._ops_compat import add_op_namespace_prefix
 from .reduction_over_k_gather import token_gather_and_sum_varlen_K_triton
+from .topk import Softmax_Over_TopK, TopK_Over_Softmax
 @torch.library.custom_op(add_op_namespace_prefix("_topk_fwd"), mutates_args={"values", "indices"})
 def _topk_fwd(
+    x: torch.Tensor,
+    k: int,
+    values: torch.Tensor,
+    indices: torch.Tensor,
+    is_softmax_over_topk: bool,
+    norm_topk_probs: bool,
 ) -> None:
     """Top-k forward pass.
     Args:
     x_tensor, values_tensor, indices_tensor = [convert_from_dlpack(tensor) for tensor in (x, values, indices)]
     current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+    if is_softmax_over_topk:
+        compile_key = (input_dtype, output_dtype, N, k, True)
+    else:
+        compile_key = (input_dtype, output_dtype, N, k, False, norm_topk_probs)
     if compile_key not in _topk_fwd.compile_cache:
+        if is_softmax_over_topk:
+            topk_op = Softmax_Over_TopK(input_dtype, output_dtype, N, k)
+        else:
+            topk_op = TopK_Over_Softmax(input_dtype, output_dtype, N, k, norm_topk_probs)
         _topk_fwd.compile_cache[compile_key] = cute.compile(
             topk_op, x_tensor, values_tensor, indices_tensor, current_stream
         )
 _topk_fwd.compile_cache = {}
@torch.library.custom_op(add_op_namespace_prefix("_up_projection_forward"), mutates_args={"h", "a"})
def _up_projection_forward(
    x: torch.Tensor,
    w1: torch.Tensor,
    h: torch.Tensor,
    a: torch.Tensor,
    b1: torch.Tensor | None,
    expert_frequency_offset: torch.Tensor,
    x_gather_idx: torch.Tensor,
    activation_type: str,
    is_inference_mode_enabled: bool = False,
    concat_layout: bool = False,
) -> None:
    """Run the gated up-projection GEMM, writing its results into ``h`` and ``a``.

    This is a thin wrapper around QuACK's ``gemm_gated``. Nothing is returned;
    the outputs land in the caller-provided ``h`` and ``a`` buffers, which is
    why both are declared in ``mutates_args``.

    Args:
        x: Input activations, passed as the first GEMM operand.
        w1: Up-projection weights, passed to the GEMM as ``w1.permute(2, 1, 0)``.
            NOTE(review): presumably stored as (I, H, E) -- confirm with caller.
        h: Buffer forwarded as ``preact_out``.
        a: Buffer forwarded as ``postact_out``.
        b1: Optional bias, forwarded as ``bias``.
        expert_frequency_offset: Forwarded as ``cu_seqlens_m``.
            NOTE(review): presumably cumulative per-expert token offsets -- confirm.
        x_gather_idx: Forwarded as ``A_idx``.
        activation_type: Gated activation name; must be ``"swiglu"`` or ``"geglu"``.
        is_inference_mode_enabled: When True, ``store_preact=False`` is passed
            to ``gemm_gated``.
        concat_layout: When True, ``("B", "bias")`` (or ``("B",)`` if there is
            no bias) is passed as ``gemm_gated``'s ``concat_layout``; otherwise
            ``None`` is passed.

    Raises:
        AssertionError: If ``activation_type`` is not a supported GLU variant.
    """
    if activation_type not in ("swiglu", "geglu"):
        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`. AssertionError and the message are kept
        # so existing callers observe the same failure as before.
        raise AssertionError(f"QuACK gemm_gated only supports glu activations, got {activation_type}")

    # Only name the bias in the concat layout when a bias is actually supplied.
    if not concat_layout:
        layout = None
    elif b1 is not None:
        layout = ("B", "bias")
    else:
        layout = ("B",)

    gemm_gated(
        x,
        w1.permute(2, 1, 0),
        activation=activation_type,
        cu_seqlens_m=expert_frequency_offset,
        A_idx=x_gather_idx,
        preact_out=h,
        postact_out=a,
        store_preact=(not is_inference_mode_enabled),
        bias=b1,
        concat_layout=layout,
    )


# Not read by the implementation above; kept so the attribute still exists for
# any code that references it.
_up_projection_forward.compile_cache = {}
@torch.library.custom_op(add_op_namespace_prefix("_down_projection_forward"), mutates_args={"y"})
def _down_projection_forward(
    w2: torch.Tensor,
    a: torch.Tensor,
    y: torch.Tensor,
    b2: torch.Tensor | None,
    expert_frequency_offset: torch.Tensor,
) -> None:
    """Run the down-projection GEMM, writing the result into ``y``.

    Delegates to QuACK's ``gemm``. Nothing is returned; the output lands in
    the caller-provided ``y`` buffer, which is declared in ``mutates_args``.

    Args:
        w2: Down-projection weights, passed to the GEMM as ``w2.permute(2, 1, 0)``.
            NOTE(review): presumably stored as (H, I, E) -- confirm with caller.
        a: First GEMM operand.
        y: Output buffer, forwarded as ``out``.
        b2: Optional bias, forwarded as ``bias``.
        expert_frequency_offset: Forwarded as ``cu_seqlens_m``.
    """
    permuted_w2 = w2.permute(2, 1, 0)
    gemm(
        a,
        permuted_w2,
        out=y,
        cu_seqlens_m=expert_frequency_offset,
        bias=b2,
    )


_down_projection_forward.compile_cache = {}
 @torch.library.custom_op(add_op_namespace_prefix("_router_forward"), mutates_args={"o"})
 def _router_forward(
+    y: torch.Tensor,
     o: torch.Tensor,
     topk_scores: torch.Tensor,
     s_reverse_scatter_idx: torch.Tensor,
     is_varlen_K: bool,
 ) -> None:
     token_gather_and_sum_varlen_K_triton(
+        y,
         topk_scores,
         o,
         s_reverse_scatter_idx,
 @torch.library.custom_op(
     add_op_namespace_prefix("_softmax_topk_fwd"), mutates_args={"topk_router_score", "topk_router_indices"}
 )
+def _topk_softmax_fwd(
+    router_logits: torch.Tensor,
+    topk_router_score: torch.Tensor,
+    topk_router_indices: torch.Tensor,
+    E: int,
+    K: int,
+    is_softmax_over_topk: bool,
+    norm_topk_probs: bool,
 ) -> None:
     if E <= 4096 and K <= 16 and E % 8 == 0:
+        _topk_fwd(
+            router_logits,
+            K,
+            topk_router_score,
+            topk_router_indices,
+            is_softmax_over_topk=is_softmax_over_topk,
+            norm_topk_probs=norm_topk_probs,
+        )
     else:
+        if is_softmax_over_topk:
+            topk_results = router_logits.topk(K, dim=-1)
+            vals = topk_results.values.softmax(dim=-1, dtype=torch.float32)
+            topk_router_score.copy_(vals.to(topk_router_score.dtype))
+            topk_router_indices.copy_(topk_results.indices.to(topk_router_indices.dtype))
+        else:
+            probs = router_logits.softmax(dim=-1, dtype=torch.float32)
+            topk_results = probs.topk(K, dim=-1)
+            vals = topk_results.values
+            if norm_topk_probs:
+                vals = vals / vals.sum(dim=-1, keepdim=True)
+            topk_router_score.copy_(vals.to(topk_router_score.dtype))
+            topk_router_indices.copy_(topk_results.indices.to(topk_router_indices.dtype))

build/torch-cuda/functional/grouped_gemm.py DELETED Viewed

The diff for this file is too large to render. See raw diff

build/torch-cuda/functional/moe_config.py DELETED Viewed

@@ -1,581 +0,0 @@
-# ********************************************************************************
-# Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
-# ********************************************************************************
-import math
-from dataclasses import dataclass
-import cuda.bindings.driver as cuda
-import cutlass
-import cutlass.cute as cute
-import torch
-from cutlass import const_expr
-from ..quack.tile_scheduler import RasterOrderOption
-from ..enums import ActivationType, is_glu
-from .grouped_gemm import HopperWgmma_MoE_kernel
-LIBRARY_NAME = "cutedsl_kernels"
-def ceil_div(a: int, b: int):
-    return int(math.ceil(a / b))
-@dataclass
-class HopperGEMMConfig:
-    tile_shape_mnk: cutlass.Constexpr[cute.Shape] = (128, 256, 64)
-    cluster_shape_mnk: cutlass.Constexpr[cute.Shape] = (2, 1)
-    epi_tile_size: cutlass.Constexpr[int] = 32
-    ## assume we always use persistent kernel
-    # is_persistent: cutlass.Constexpr[bool] = True
-    is_pingpong: cutlass.Constexpr[bool] = False
-    raster_order: RasterOrderOption = RasterOrderOption.Heuristic
-    L2_group_size: int = 8
-    initial_d_epi_stage: cutlass.Constexpr[int] = 4
-class HopperWgmma_MoE_Up_proj_Fwd:
-    def __init__(self, E: int, H: int, I: int, activation_type: ActivationType, inference_mode=False):
-        super().__init__()
-        is_glu_activation = is_glu(activation_type)
-        if is_glu_activation:
-            assert (
-                H % 64 == 0 and H >= 512 and I % 64 == 0
-            ), f"{LIBRARY_NAME} only supports GLU MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
-        else:
-            assert (
-                H % 64 == 0 and H >= 512 and I % 128 == 0
-            ), f"{LIBRARY_NAME} only supports non-GLU MoE with H % 64 == 0 (H >= 512) and I % 128 == 0"
-        # TODO: this assertion does not mean that the MoE impl prohibits such config.
-        # Instead, we just do not search for the best configs manually yet for small-shaped MoE
-        if (I >= 128 and is_glu_activation) or (I >= 256 and not is_glu_activation):
-            up_config = HopperGEMMConfig(
-                tile_shape_mnk=(128, 256, 64),
-                cluster_shape_mnk=(2, 1),
-                epi_tile_size=(32 if not inference_mode else 64),
-                is_pingpong=False,
-                initial_d_epi_stage=2,
-                raster_order=RasterOrderOption.AlongM,
-            )
-        elif (I == 64 and is_glu_activation) or (I == 128 and not is_glu_activation):
-            up_config = HopperGEMMConfig(
-                tile_shape_mnk=(192, 128, 64),
-                cluster_shape_mnk=(1, 1),
-                epi_tile_size=(32 if not inference_mode else 64),
-                is_pingpong=True,
-                initial_d_epi_stage=8,
-                raster_order=RasterOrderOption.AlongM,
-            )
-        else:
-            raise NotImplementedError()
-        compute_swiglu = False
-        compute_geglu = False
-        compute_reglu = False
-        compute_relu_sq = False
-        compute_silu = False
-        compute_relu = False
-        compute_gelu = False
-        if activation_type == ActivationType.SWIGLU:
-            compute_swiglu = True
-        elif activation_type == ActivationType.GEGLU:
-            compute_geglu = True
-        elif activation_type == ActivationType.REGLU:
-            compute_reglu = True
-        elif activation_type == ActivationType.RELU_SQ:
-            compute_relu_sq = True
-        elif activation_type == ActivationType.RELU:
-            compute_relu = True
-        elif activation_type == ActivationType.SILU:
-            compute_silu = True
-        elif activation_type == ActivationType.GELU:
-            compute_gelu = True
-        else:
-            raise NotImplementedError(f"Activation function {activation_type} not supported yet!")
-        self.module = HopperWgmma_MoE_kernel(
-            E,
-            cutlass.Float32,
-            up_config.tile_shape_mnk,
-            (*up_config.cluster_shape_mnk, 1),
-            pingpong=up_config.is_pingpong,
-            is_persistent=True,
-            compute_swiglu=compute_swiglu,
-            compute_reglu=compute_reglu,
-            compute_geglu=compute_geglu,
-            compute_relu_sq=compute_relu_sq,
-            compute_relu=compute_relu,
-            compute_silu=compute_silu,
-            compute_gelu=compute_gelu,
-            is_A_gather=True,
-            epi_tile_size=up_config.epi_tile_size,
-            initial_d_epi_stage=up_config.initial_d_epi_stage,
-            inference_mode=inference_mode,
-        )
-        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
-            up_config.cluster_shape_mnk[0] * up_config.cluster_shape_mnk[1]
-        )
-        self.current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-    @cute.jit
-    def __call__(
-        self, mX, mW1, mZ, mY1, mB1, mE_offset, mX_gather, mD_tensormap, mY1_tensormap, mE_permute_order, stream
-    ):
-        return self.module(
-            mX,
-            mW1,
-            None,
-            mB1,
-            mZ,
-            mY1,
-            None,
-            None,
-            mE_offset,
-            mX_gather,
-            None,
-            None,
-            None,
-            None,
-            None,
-            mD_tensormap,
-            mY1_tensormap,
-            None,
-            mE_permute_order,
-            const_expr(self.max_active_clusters),
-            stream,
-        )
-class HopperWgmma_MoE_Down_proj_Fwd:
-    def __init__(self, E: int, H: int, I: int):
-        super().__init__()
-        assert (
-            H % 64 == 0 and H >= 512 and I % 64 == 0
-        ), f"{LIBRARY_NAME} only supports MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
-        if I >= 1024:
-            down_config = HopperGEMMConfig(
-                tile_shape_mnk=(128, 256, 64),
-                cluster_shape_mnk=(2, 1),
-                epi_tile_size=32,
-                is_pingpong=False,
-                initial_d_epi_stage=4,
-                raster_order=RasterOrderOption.AlongN,
-            )
-        elif I >= 256:
-            down_config = HopperGEMMConfig(
-                tile_shape_mnk=(128, 192, 64),
-                cluster_shape_mnk=(2, 1),
-                epi_tile_size=(96 if H % 96 == 0 else 64),
-                is_pingpong=True,
-                initial_d_epi_stage=5,
-                raster_order=RasterOrderOption.AlongN,
-            )
-        elif I >= 64:
-            down_config = HopperGEMMConfig(
-                tile_shape_mnk=(128, 192, 64),
-                cluster_shape_mnk=(1, 2),
-                epi_tile_size=64,
-                is_pingpong=True,
-                initial_d_epi_stage=8,
-                raster_order=RasterOrderOption.AlongN,
-            )
-        else:
-            raise NotImplementedError()
-        self.module = HopperWgmma_MoE_kernel(
-            E,
-            cutlass.Float32,
-            down_config.tile_shape_mnk,
-            (*down_config.cluster_shape_mnk, 1),
-            pingpong=down_config.is_pingpong,
-            is_persistent=True,
-            compute_swiglu=False,
-            is_A_gather=False,
-            epi_tile_size=down_config.epi_tile_size,
-            initial_d_epi_stage=down_config.initial_d_epi_stage,
-        )
-        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
-            down_config.cluster_shape_mnk[0] * down_config.cluster_shape_mnk[1]
-        )
-    @cute.jit
-    def __call__(self, mY1, mW2, mY2, mB2, mE_offset, mX_gather, mD_tensormap, mE_permute_order, stream):
-        # we are not really using mX_gather in the Grouped GEMM,
-        # but CuTe-DSL compiler disallows dynamic flow so we still need to pass this argument
-        return self.module(
-            mY1,
-            mW2,
-            None,
-            mB2,
-            mY2,
-            None,
-            None,
-            None,
-            mE_offset,
-            mX_gather,
-            None,
-            None,
-            None,
-            None,
-            None,
-            mD_tensormap,
-            None,
-            None,
-            mE_permute_order,
-            const_expr(self.max_active_clusters),
-            stream,
-        )
-class HopperWgmma_MoE_Down_proj_ActGrad_Bwd:
-    def __init__(self, E: int, H: int, I: int, activation_type: ActivationType):
-        super().__init__()
-        is_glu_activation = is_glu(activation_type)
-        if is_glu_activation:
-            assert (
-                H % 64 == 0 and H >= 512 and I % 64 == 0
-            ), f"{LIBRARY_NAME} only supports GLU MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
-        else:
-            assert (
-                H % 64 == 0 and H >= 512 and I % 128 == 0
-            ), f"{LIBRARY_NAME} only supports non-GLU MoE with H % 64 == 0 (H >= 512) and I % 128 == 0"
-        # heavy register pressure due to pingpong + heavy epilogue
-        #   effectively no alternatives to this config
-        dz_partial_ds_config = HopperGEMMConfig(
-            tile_shape_mnk=(128, 128, 64),
-            cluster_shape_mnk=(2, 1),
-            epi_tile_size=32,
-            initial_d_epi_stage=4,
-            is_pingpong=True,
-            raster_order=RasterOrderOption.Heuristic,
-        )
-        compute_swiglu = False
-        compute_geglu = False
-        compute_reglu = False
-        compute_relu_sq = False
-        compute_silu = False
-        compute_relu = False
-        compute_gelu = False
-        if activation_type == ActivationType.SWIGLU:
-            compute_swiglu = True
-        elif activation_type == ActivationType.GEGLU:
-            compute_geglu = True
-        elif activation_type == ActivationType.REGLU:
-            compute_reglu = True
-        elif activation_type == ActivationType.RELU_SQ:
-            compute_relu_sq = True
-        elif activation_type == ActivationType.RELU:
-            compute_relu = True
-        elif activation_type == ActivationType.SILU:
-            compute_silu = True
-        elif activation_type == ActivationType.GELU:
-            compute_gelu = True
-        else:
-            raise NotImplementedError(f"Activation function {activation_type} not supported yet!")
-        self.module = HopperWgmma_MoE_kernel(
-            E,
-            cutlass.Float32,
-            dz_partial_ds_config.tile_shape_mnk,
-            (*dz_partial_ds_config.cluster_shape_mnk, 1),
-            pingpong=dz_partial_ds_config.is_pingpong,
-            is_persistent=True,
-            compute_swiglu=compute_swiglu,
-            compute_reglu=compute_reglu,
-            compute_geglu=compute_geglu,
-            compute_relu_sq=compute_relu_sq,
-            compute_relu=compute_relu,
-            compute_silu=compute_silu,
-            compute_gelu=compute_gelu,
-            compute_dz_and_partial_ds_and_y1s=True,
-            is_A_gather=True,
-            epi_tile_size=dz_partial_ds_config.epi_tile_size,
-            initial_d_epi_stage=dz_partial_ds_config.initial_d_epi_stage,
-        )
-        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
-            dz_partial_ds_config.cluster_shape_mnk[0] * dz_partial_ds_config.cluster_shape_mnk[1]
-        )
-    @cute.jit
-    def __call__(
-        self,
-        mDout,
-        mW2_trans,
-        mZ_FP32_if_GLU_else_BF16,
-        mDz_FP32_if_GLU_else_BF16,
-        mY1S,
-        mS,
-        mDS_partial,
-        mE_offset,
-        mX_gather,
-        mS_scatter,
-        tensormaps,
-        mE_permute_order,
-        stream,
-    ):
-        return self.module(
-            mDout,
-            mW2_trans,
-            mZ_FP32_if_GLU_else_BF16,
-            None,
-            mDz_FP32_if_GLU_else_BF16,
-            mY1S,
-            mS,
-            mDS_partial,
-            mE_offset,
-            mX_gather,
-            None,
-            mS_scatter,
-            None,
-            None,
-            tensormaps[0],
-            tensormaps[1],
-            tensormaps[2],
-            None,
-            mE_permute_order,
-            const_expr(self.max_active_clusters),
-            stream,
-        )
-class HopperWgmma_MoE_Down_proj_WeightGrad_Bwd:
-    def __init__(self, E: int, H: int, I: int):
-        super().__init__()
-        assert (
-            H % 64 == 0 and H >= 512 and I % 64 == 0
-        ), f"{LIBRARY_NAME} only supports MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
-        if I >= 128:
-            dw2_config = HopperGEMMConfig(
-                tile_shape_mnk=(128, 256, 64),
-                cluster_shape_mnk=(2, 1),
-                epi_tile_size=16,
-                is_pingpong=False,
-                initial_d_epi_stage=6,
-                raster_order=RasterOrderOption.AlongN,
-            )
-        elif I == 64:
-            dw2_config = HopperGEMMConfig(
-                tile_shape_mnk=(64, 192, 64),
-                cluster_shape_mnk=(2, 1),
-                epi_tile_size=32,
-                is_pingpong=True,
-                initial_d_epi_stage=6,
-                raster_order=RasterOrderOption.AlongN,
-            )
-        else:
-            raise NotImplementedError()
-        self.module = HopperWgmma_MoE_kernel(
-            E,
-            cutlass.Float32,
-            dw2_config.tile_shape_mnk,
-            (*dw2_config.cluster_shape_mnk, 1),
-            pingpong=dw2_config.is_pingpong,
-            is_persistent=True,
-            compute_swiglu=False,
-            compute_weight_gradient=True,
-            compute_dz_and_partial_ds_and_y1s=False,
-            is_A_gather=True,
-            epi_tile_size=dw2_config.epi_tile_size,
-            initial_d_epi_stage=dw2_config.initial_d_epi_stage,
-        )
-        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
-            dw2_config.cluster_shape_mnk[0] * dw2_config.cluster_shape_mnk[1]
-        )
-    @cute.jit
-    def __call__(self, mDout_trans, mY1S_trans, mDw2, mE_offset, mX_gather, tensormaps, mE_permute_order, stream):
-        return self.module(
-            mDout_trans,
-            mY1S_trans,
-            None,
-            None,
-            mDw2,
-            None,
-            None,
-            None,
-            mE_offset,
-            mX_gather,
-            None,
-            None,
-            None,
-            tensormaps[0],
-            None,
-            None,
-            None,
-            None,
-            mE_permute_order,
-            const_expr(self.max_active_clusters),
-            stream,
-        )
-class HopperWgmma_MoE_Up_proj_ActGrad_Bwd:
-    def __init__(self, E: int, H: int, I: int, is_glu_activation: bool):
-        super().__init__()
-        if is_glu_activation:
-            assert (
-                H % 64 == 0 and H >= 512 and I % 64 == 0
-            ), f"{LIBRARY_NAME} only supports GLU MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
-        else:
-            assert (
-                H % 64 == 0 and H >= 512 and I % 128 == 0
-            ), f"{LIBRARY_NAME} only supports non-GLU MoE with H % 64 == 0 (H >= 512) and I % 128 == 0"
-        if (I >= 512 and is_glu_activation) or (I >= 1024 and not is_glu_activation):
-            dx_config = HopperGEMMConfig(
-                tile_shape_mnk=(128, 256, 64),
-                cluster_shape_mnk=(2, 1),
-                epi_tile_size=32,
-                is_pingpong=False,
-                initial_d_epi_stage=4,
-                raster_order=RasterOrderOption.AlongN,
-            )
-        elif (I >= 64 and is_glu_activation) or (I >= 128 and not is_glu_activation):
-            dx_config = HopperGEMMConfig(
-                tile_shape_mnk=(128, 192, 64),
-                cluster_shape_mnk=(2, 1),
-                epi_tile_size=64,
-                is_pingpong=True,
-                initial_d_epi_stage=8,
-                raster_order=RasterOrderOption.AlongN,
-            )
-        else:
-            raise NotImplementedError()
-        self.module = HopperWgmma_MoE_kernel(
-            E,
-            cutlass.Float32,
-            dx_config.tile_shape_mnk,
-            (*dx_config.cluster_shape_mnk, 1),
-            pingpong=dx_config.is_pingpong,
-            is_persistent=True,
-            compute_swiglu=False,
-            compute_dz_and_partial_ds_and_y1s=False,
-            is_A_gather=False,
-            epi_tile_size=dx_config.epi_tile_size,
-        )
-        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
-            dx_config.cluster_shape_mnk[0] * dx_config.cluster_shape_mnk[1]
-        )
-        self.current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-    @cute.jit
-    def __call__(
-        self, mDz, mW1_trans, mDx_expanded, mE_offset, mX_gather, mS_scatter, tensormaps, mE_permute_order, stream
-    ):
-        return self.module(
-            mDz,
-            mW1_trans,
-            None,
-            None,
-            mDx_expanded,
-            None,
-            None,
-            None,
-            mE_offset,
-            mX_gather,
-            None,
-            mS_scatter,
-            None,
-            None,
-            None,
-            tensormaps[0],
-            tensormaps[1],
-            None,
-            mE_permute_order,
-            const_expr(self.max_active_clusters),
-            stream,
-        )
-class HopperWgmma_MoE_Up_proj_WeightGrad_Bwd:
-    def __init__(self, E: int, H: int, I: int, is_glu_activation: bool):
-        super().__init__()
-        if is_glu_activation:
-            assert (
-                H % 64 == 0 and H >= 512 and I % 64 == 0
-            ), f"{LIBRARY_NAME} only supports GLU MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
-        else:
-            assert (
-                H % 64 == 0 and H >= 512 and I % 128 == 0
-            ), f"{LIBRARY_NAME} only supports non-GLU MoE with H % 64 == 0 (H >= 512) and I % 128 == 0"
-        if (I >= 128 and is_glu_activation) or (I >= 256 and not is_glu_activation):
-            dw1_config = HopperGEMMConfig(
-                tile_shape_mnk=(128, 256, 64),
-                cluster_shape_mnk=(2, 1),
-                epi_tile_size=16,
-                is_pingpong=False,
-                initial_d_epi_stage=6,
-                raster_order=RasterOrderOption.Heuristic,
-            )
-        elif (I == 64 and is_glu_activation) or (I == 128 and not is_glu_activation):
-            dw1_config = HopperGEMMConfig(
-                tile_shape_mnk=(256, 128, 64),
-                cluster_shape_mnk=(2, 1),
-                epi_tile_size=16,
-                is_pingpong=False,
-                initial_d_epi_stage=6,
-                raster_order=RasterOrderOption.AlongN,
-            )
-        else:
-            raise NotImplementedError()
-        self.module = HopperWgmma_MoE_kernel(
-            E,
-            cutlass.Float32,
-            dw1_config.tile_shape_mnk,
-            (*dw1_config.cluster_shape_mnk, 1),
-            pingpong=dw1_config.is_pingpong,
-            is_persistent=True,
-            compute_swiglu=False,
-            compute_weight_gradient=True,
-            compute_dz_and_partial_ds_and_y1s=False,
-            is_A_gather=True,
-            epi_tile_size=dw1_config.epi_tile_size,
-        )
-        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
-            dw1_config.cluster_shape_mnk[0] * dw1_config.cluster_shape_mnk[1]
-        )
-    @cute.jit
-    def __call__(self, mX_trans, mDz_trans, mDw1_trans, mE_offset, mX_gather, tensormaps, mE_permute_order, stream):
-        return self.module(
-            mX_trans,
-            mDz_trans,
-            None,
-            None,
-            mDw1_trans,
-            None,
-            None,
-            None,
-            mE_offset,
-            mX_gather,
-            None,
-            None,
-            None,
-            tensormaps[0],
-            None,
-            None,
-            None,
-            None,
-            mE_permute_order,
-            const_expr(self.max_active_clusters),
-            stream,
-        )

build/torch-cuda/functional/reduction_over_k_gather.py CHANGED Viewed

@@ -11,9 +11,6 @@ import triton.language as tl
 from ..utils import get_powers_of_2
-### This triton impl is equivalent as the cute-dsl impl shown above,
-# and also achieves similar memory bandwidth on H100 for large K and H.
-# However, for small K and H, this impl is better by autotuning so we use it as the default.
 def _get_triton_autotune_configs() -> list[triton.Config]:
     configs = []
     for BLOCK_H in get_powers_of_2(256, 4096):

 from ..utils import get_powers_of_2
 def _get_triton_autotune_configs() -> list[triton.Config]:
     configs = []
     for BLOCK_H in get_powers_of_2(256, 4096):

build/torch-cuda/functional/{topk_softmax.py → topk.py} RENAMED Viewed

@@ -4,12 +4,14 @@
 # this impl is adapted from QuACK's topk https://github.com/Dao-AILab/quack/blob/main/quack/topk.py
 import math
 from typing import Type
 import cuda.bindings.driver as cuda
 import cutlass
 import cutlass.cute as cute
-from ..quack import utils
 from cutlass import const_expr
 from ..quack.sort.bitonic_sort import bitonic_topk
 from triton import next_power_of_2
@@ -17,14 +19,23 @@ from triton import next_power_of_2
 from ..utils import domain_offset_i64
-class TopK_Softmax:
     def __init__(
         self,
         input_dtype: Type[cutlass.Numeric],
         output_dtype: Type[cutlass.Numeric],
         N: int,
         k: int,
-        require_softmax_fusion: bool = True,
     ):
         self.input_dtype = input_dtype
         self.output_dtype = output_dtype
@@ -38,11 +49,13 @@ class TopK_Softmax:
         assert N <= 4096 and N % 8 == 0
         assert input_dtype.width <= output_dtype.width, "input bitwidth must <= output bitwidth"
-        self.require_softmax_fusion = require_softmax_fusion
     def _calculate_threads_per_row(self):
-        # we want num_elems_per_thread >= self.k
-        # and each thread can handle at most 64 elements
         N = self.next_power_of_2_N
         num_threads_per_row = max(min(N // self.k, 32, N // 64), 1)
         return num_threads_per_row
@@ -78,7 +91,7 @@ class TopK_Softmax:
         output_tiler_mn, output_tv_layout = self._get_tv_layout(self.output_vecsize)
         num_threads = cute.size(input_tv_layout, mode=[0])
-        self.kernel(mX, mValues, mIndices, input_tv_layout, input_tiler_mn, output_tv_layout, output_tiler_mn).launch(
             grid=[cute.ceil_div(mX.shape[0], input_tiler_mn[0]), 1, 1],
             block=[num_threads, 1, 1],
             stream=stream,
@@ -93,7 +106,6 @@ class TopK_Softmax:
         input_tv_layout: cute.Layout,
         input_tiler_mn: cute.Shape,
         output_tv_layout: cute.Layout,
-        output_tiler_mn: cute.Shape,
     ):
         tidx, _, _ = cute.arch.thread_idx()
         bidx, _, _ = cute.arch.block_idx()
@@ -106,7 +118,6 @@ class TopK_Softmax:
         gX = cute.local_tile(mX, input_tiler_mn, (0, 0))
         cX = cute.local_tile(idX, input_tiler_mn, (bidx, 0))
-        # declare the atoms which will be used later for memory copy
         copy_atom_load_X = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gX.element_type, num_bits_per_copy=128)
         thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, input_tv_layout, input_tiler_mn).get_slice(tidx)
         tXgX = thr_copy_X.partition_S(gX)
@@ -117,7 +128,7 @@ class TopK_Softmax:
         is_even_N = const_expr(shape[1] == input_tiler_mn[1])
         tXpX = (
-            utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
             if const_expr((not is_even_N) or (self.N != self.next_power_of_2_N))
             else None
         )
@@ -126,7 +137,67 @@ class TopK_Softmax:
         tXrX_f32 = cute.make_rmem_tensor(tXrX.shape, cutlass.Float32)
         tXrX_f32.store(tXrX.load().to(cutlass.Float32))
-        # Encode the indices into the bottom bits of values.
         log_N = int(math.log2(self.next_power_of_2_N))
         idx_mask = const_expr((1 << log_N) - 1)
         input_vecsize = cutlass.const_expr(input_tv_layout.shape[1][0])
@@ -162,7 +233,8 @@ class TopK_Softmax:
             col_idx = ~encoded_idx if topk_vals[i] >= 0 else encoded_idx
             topk_indices[i] = cutlass.Int32(col_idx & idx_mask)
-        if const_expr(self.require_softmax_fusion):
             topk_vals_max = -cutlass.Float32.inf
             for i in cutlass.range_constexpr(self.k):
                 topk_vals_max = cute.arch.fmax(topk_vals[i], topk_vals_max)
@@ -175,7 +247,18 @@ class TopK_Softmax:
             for i in cutlass.range_constexpr(self.k):
                 topk_vals[i] = topk_vals[i] / topk_exp_sum
-        # Convert cleaned values to output type
         topk_vals_out = cute.make_rmem_tensor_like(topk_indices, mValues.element_type)
         for i in cutlass.range_constexpr(self.k):
             topk_vals_out[i] = topk_vals[i].to(mValues.element_type)
@@ -193,3 +276,65 @@ class TopK_Softmax:
             for i in cutlass.range_constexpr(cute.size(topk_vals_out_store.shape, [1])):
                 cute.autovec_copy(topk_vals_out_store[None, i], mValues_store[None, i])
                 cute.autovec_copy(topk_indices_store[None, i], mIndices_store[None, i])

 # this impl is adapted from QuACK's topk https://github.com/Dao-AILab/quack/blob/main/quack/topk.py
 import math
+from enum import Enum
 from typing import Type
 import cuda.bindings.driver as cuda
 import cutlass
 import cutlass.cute as cute
+from ..quack import copy_utils as copy_utils
+from ..quack import utils as utils
 from cutlass import const_expr
 from ..quack.sort.bitonic_sort import bitonic_topk
 from triton import next_power_of_2
 from ..utils import domain_offset_i64
+class _TopKMode(Enum):
+    """Selects how (and whether) softmax is fused with the top-k selection in `_TopK`."""
+    SOFTMAX_OVER_TOPK = "softmax_over_topk"  # most common choice: softmax(topk(x))
+    TOPK_OVER_SOFTMAX = "topk_over_softmax"  # Qwen3:              topk(softmax(x))
+    TOPK_NO_FUSION = "topk"  # raw top-k, no softmax (mode used by the TopK subclass)
+class _TopK:
+    """Private base class. Use Softmax_Over_TopK, TopK_Over_Softmax, or TopK instead."""
     def __init__(
         self,
         input_dtype: Type[cutlass.Numeric],
         output_dtype: Type[cutlass.Numeric],
         N: int,
         k: int,
+        mode: _TopKMode,
+        norm_topk_prob: bool = False,
     ):
         self.input_dtype = input_dtype
         self.output_dtype = output_dtype
         assert N <= 4096 and N % 8 == 0
         assert input_dtype.width <= output_dtype.width, "input bitwidth must <= output bitwidth"
+        self.mode = mode
+        if norm_topk_prob:
+            assert mode == _TopKMode.TOPK_OVER_SOFTMAX, "`norm_topk_prob` only works with softmax-then-topk"
+        self.norm_topk_prob = norm_topk_prob
     def _calculate_threads_per_row(self):
         N = self.next_power_of_2_N
         num_threads_per_row = max(min(N // self.k, 32, N // 64), 1)
         return num_threads_per_row
         output_tiler_mn, output_tv_layout = self._get_tv_layout(self.output_vecsize)
         num_threads = cute.size(input_tv_layout, mode=[0])
+        self.kernel(mX, mValues, mIndices, input_tv_layout, input_tiler_mn, output_tv_layout).launch(
             grid=[cute.ceil_div(mX.shape[0], input_tiler_mn[0]), 1, 1],
             block=[num_threads, 1, 1],
             stream=stream,
         input_tv_layout: cute.Layout,
         input_tiler_mn: cute.Shape,
         output_tv_layout: cute.Layout,
     ):
         tidx, _, _ = cute.arch.thread_idx()
         bidx, _, _ = cute.arch.block_idx()
         gX = cute.local_tile(mX, input_tiler_mn, (0, 0))
         cX = cute.local_tile(idX, input_tiler_mn, (bidx, 0))
         copy_atom_load_X = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gX.element_type, num_bits_per_copy=128)
         thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, input_tv_layout, input_tiler_mn).get_slice(tidx)
         tXgX = thr_copy_X.partition_S(gX)
         is_even_N = const_expr(shape[1] == input_tiler_mn[1])
         tXpX = (
+            copy_utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
             if const_expr((not is_even_N) or (self.N != self.next_power_of_2_N))
             else None
         )
         tXrX_f32 = cute.make_rmem_tensor(tXrX.shape, cutlass.Float32)
         tXrX_f32.store(tXrX.load().to(cutlass.Float32))
+        # ------------------------------------------------------------------
+        # Softmax-then-TopK: full-row softmax → in-place log-prob transform.
+        # ------------------------------------------------------------------
+        if const_expr(self.mode == _TopKMode.TOPK_OVER_SOFTMAX):
+            if const_expr((not is_even_N) or (self.N != self.next_power_of_2_N)):
+                utils.fill_oob(tXrX_f32, tXpX, -tXrX_f32.element_type.inf)
+            threads_per_row_red = const_expr(self._calculate_threads_per_row())
+            num_threads_cta = const_expr(128 if self.next_power_of_2_N <= 16384 else 256)
+            # ---- thread-local (max, sum_exp) pair ----
+            local_max = -cutlass.Float32.inf
+            for i in cutlass.range_constexpr(cute.size(tXrX_f32)):
+                local_max = cute.arch.fmax(tXrX_f32[i], local_max)
+            local_sum = cutlass.Float32(0.0)
+            for i in cutlass.range_constexpr(cute.size(tXrX_f32)):
+                local_sum = local_sum + cute.math.exp(tXrX_f32[i] - local_max)
+            if const_expr(threads_per_row_red == 1):
+                row_max = local_max
+                row_sum = local_sum
+            else:
+                smem = cutlass.utils.SmemAllocator()
+                smem_layout = cute.make_ordered_layout((num_threads_cta,), order=(0,))
+                smem_max = smem.allocate_tensor(
+                    cutlass.Float32,
+                    smem_layout,
+                    byte_alignment=16,
+                )
+                smem_sum = smem.allocate_tensor(
+                    cutlass.Float32,
+                    smem_layout,
+                    byte_alignment=16,
+                )
+                row_in_blk = tidx // threads_per_row_red
+                smem_max[tidx] = local_max
+                smem_sum[tidx] = local_sum
+                cute.arch.barrier()
+                # Peel first partner: no exp needed
+                base = row_in_blk * threads_per_row_red
+                row_max = smem_max[base]
+                row_sum = smem_sum[base]
+                for p in cutlass.range_constexpr(1, self._calculate_threads_per_row()):
+                    p_max = smem_max[base + p]
+                    p_sum = smem_sum[base + p]
+                    if p_max > row_max:
+                        row_sum = row_sum * cute.math.exp(row_max - p_max) + p_sum
+                        row_max = p_max
+                    else:
+                        row_sum = row_sum + p_sum * cute.math.exp(p_max - row_max)
+            # In-place logit → log-probability
+            log_normalizer = row_max + cute.math.log(row_sum)
+            for i in cutlass.range_constexpr(cute.size(tXrX_f32)):
+                tXrX_f32[i] = tXrX_f32[i] - log_normalizer
+        # Encode indices into mantissa low bits.
         log_N = int(math.log2(self.next_power_of_2_N))
         idx_mask = const_expr((1 << log_N) - 1)
         input_vecsize = cutlass.const_expr(input_tv_layout.shape[1][0])
             col_idx = ~encoded_idx if topk_vals[i] >= 0 else encoded_idx
             topk_indices[i] = cutlass.Int32(col_idx & idx_mask)
+        # TopK-then-Softmax
+        if const_expr(self.mode == _TopKMode.SOFTMAX_OVER_TOPK):
             topk_vals_max = -cutlass.Float32.inf
             for i in cutlass.range_constexpr(self.k):
                 topk_vals_max = cute.arch.fmax(topk_vals[i], topk_vals_max)
             for i in cutlass.range_constexpr(self.k):
                 topk_vals[i] = topk_vals[i] / topk_exp_sum
+        # Softmax-then-TopK: recover probabilities from log-probs.
+        if const_expr(self.mode == _TopKMode.TOPK_OVER_SOFTMAX):
+            for i in cutlass.range_constexpr(self.k):
+                topk_vals[i] = cute.math.exp(topk_vals[i])
+            if const_expr(self.norm_topk_prob):
+                topk_sum = cutlass.Float32(0.0)
+                for i in cutlass.range_constexpr(self.k):
+                    topk_sum = topk_sum + topk_vals[i]
+                for i in cutlass.range_constexpr(self.k):
+                    topk_vals[i] = topk_vals[i] / topk_sum
         topk_vals_out = cute.make_rmem_tensor_like(topk_indices, mValues.element_type)
         for i in cutlass.range_constexpr(self.k):
             topk_vals_out[i] = topk_vals[i].to(mValues.element_type)
             for i in cutlass.range_constexpr(cute.size(topk_vals_out_store.shape, [1])):
                 cute.autovec_copy(topk_vals_out_store[None, i], mValues_store[None, i])
                 cute.autovec_copy(topk_indices_store[None, i], mIndices_store[None, i])
+class Softmax_Over_TopK(_TopK):
+    """Top-k selection followed by a softmax over the k selected values: softmax(topk(x)).
+
+    Thin constructor wrapper that pins the base class to `_TopKMode.SOFTMAX_OVER_TOPK`.
+    """
+    def __init__(
+        self,
+        input_dtype: Type[cutlass.Numeric],
+        output_dtype: Type[cutlass.Numeric],
+        N: int,
+        k: int,
+    ):
+        # Forward the caller's arguments untouched; only the mode is fixed here.
+        super().__init__(
+            input_dtype,
+            output_dtype,
+            N,
+            k,
+            mode=_TopKMode.SOFTMAX_OVER_TOPK,
+        )
+class TopK_Over_Softmax(_TopK):
+    """Softmax over the full row, then top-k selection: topk(softmax(x)) (Qwen3 style).
+
+    With norm_topk_prob=True (the default of this class) the k selected
+    probabilities are rescaled so that they sum to 1.
+    """
+    def __init__(
+        self,
+        input_dtype: Type[cutlass.Numeric],
+        output_dtype: Type[cutlass.Numeric],
+        N: int,
+        k: int,
+        norm_topk_prob: bool = True,
+    ):
+        # Forward the caller's arguments untouched; only the mode is fixed here.
+        super().__init__(
+            input_dtype,
+            output_dtype,
+            N,
+            k,
+            mode=_TopKMode.TOPK_OVER_SOFTMAX,
+            norm_topk_prob=norm_topk_prob,
+        )
+class TopK(_TopK):
+    """Plain top-k selection with no softmax applied.
+
+    Thin constructor wrapper that pins the base class to `_TopKMode.TOPK_NO_FUSION`.
+    """
+    def __init__(
+        self,
+        input_dtype: Type[cutlass.Numeric],
+        output_dtype: Type[cutlass.Numeric],
+        N: int,
+        k: int,
+    ):
+        # Forward the caller's arguments untouched; only the mode is fixed here.
+        super().__init__(
+            input_dtype,
+            output_dtype,
+            N,
+            k,
+            mode=_TopKMode.TOPK_NO_FUSION,
+        )

build/torch-cuda/functional/utils.py DELETED Viewed

@@ -1,25 +0,0 @@
-# ********************************************************************************
-# Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
-# ********************************************************************************
-import os
-from contextlib import contextmanager
-_IS_USING_QUACK_GEMM = os.getenv("USE_QUACK_GEMM", "0") == "1"
-@contextmanager
-def enable_quack_gemm(enable: bool = True):
-    global _IS_USING_QUACK_GEMM
-    previous_value = _IS_USING_QUACK_GEMM
-    _IS_USING_QUACK_GEMM = enable
-    yield
-    _IS_USING_QUACK_GEMM = previous_value
-def is_using_quack_gemm() -> bool:
-    return _IS_USING_QUACK_GEMM

build/torch-cuda/metadata.json CHANGED Viewed

@@ -1,7 +1,9 @@
 {
   "version": 1,
   "license": "Apache-2.0",
   "python-depends": [
     "nvidia-cutlass-dsl"
   ],
   "backend": {

 {
+  "id": "_sonic_moe_cuda_a8c39a2",
   "version": 1,
   "license": "Apache-2.0",
   "python-depends": [
+    "tvm-ffi",
     "nvidia-cutlass-dsl"
   ],
   "backend": {

build/torch-cuda/quack/__init__.py CHANGED Viewed

@@ -1,8 +1,8 @@
-__version__ = "0.2.5"
 import os
 if os.environ.get("CUTE_DSL_PTXAS_PATH", None) is not None:
-    from . import cute_dsl_ptxas
     cute_dsl_ptxas.patch()

+__version__ = "0.3.11"
 import os
 if os.environ.get("CUTE_DSL_PTXAS_PATH", None) is not None:
+    from . import cute_dsl_ptxas  # noqa: F401
     cute_dsl_ptxas.patch()

build/torch-cuda/quack/_compile_worker.py ADDED Viewed

	@@ -0,0 +1,102 @@

+# Copyright (c) 2025, Tri Dao.
+# Persistent subprocess worker for parallel autotuning pre-compilation.
+# Receives length-prefixed pickled tasks on stdin, creates FakeTensors
+# matching the parent's tensor metadata, and compiles with COMPILE_ONLY=True.
+# Stays alive to process multiple configs (amortizes import overhead).
+import importlib
+import pickle
+import struct
+import sys
+import torch
+from torch._subclasses.fake_tensor import FakeTensorMode
+from . import cache_utils
+# Import-time side effect: set the COMPILE_ONLY flag on cache_utils for this worker
+# process (what the flag changes is defined in cache_utils, not here).
+cache_utils.COMPILE_ONLY = True
+# Maps the dtype name carried in a task's tensor metadata to the torch dtype object.
+# _make_fake_tensor indexes this table directly, so a name missing here raises KeyError.
+_dtype_map = {
+    "torch.float16": torch.float16,
+    "torch.bfloat16": torch.bfloat16,
+    "torch.float32": torch.float32,
+    "torch.float64": torch.float64,
+    "torch.int32": torch.int32,
+    "torch.int64": torch.int64,
+    "torch.int8": torch.int8,
+    "torch.uint8": torch.uint8,
+    "torch.bool": torch.bool,
+}
+def _make_fake_tensor(meta):
+    """Allocate an uninitialized CUDA tensor matching the description in ``meta``.
+
+    ``meta`` must provide "shape", "stride" and "dtype" entries; the "dtype" name is
+    resolved through ``_dtype_map`` (KeyError for names not listed there).
+    """
+    return torch.empty_strided(
+        meta["shape"],
+        meta["stride"],
+        dtype=_dtype_map[meta["dtype"]],
+        device="cuda",
+    )
+def _recv(stream):
+    """Read one length-prefixed pickled message from ``stream``.
+
+    Wire format: a 4-byte little-endian unsigned length, followed by that many
+    bytes of pickle data.
+
+    Returns the unpickled object, or None when the stream has ended: short
+    header, zero-length frame, or a payload cut off before ``length`` bytes
+    arrived.
+    """
+    header = stream.read(4)
+    if len(header) < 4:
+        return None
+    (length,) = struct.unpack("<I", header)
+    if length == 0:
+        return None
+    # read() may hand back fewer bytes than requested; keep reading until the
+    # frame is complete. An empty read means the peer closed the stream
+    # mid-message, which is reported as EOF rather than passing a truncated
+    # buffer to pickle (that would raise out of the caller's receive loop).
+    data = bytearray()
+    while len(data) < length:
+        chunk = stream.read(length - len(data))
+        if not chunk:
+            return None
+        data += chunk
+    # NOTE(review): pickle.loads can execute arbitrary code from the payload; this
+    # is only safe if the stream is written by a trusted peer — confirm against caller.
+    return pickle.loads(bytes(data))
+def _send(stream, msg):
+    """Pickle ``msg`` and write it to ``stream`` as one length-prefixed frame.
+
+    The frame is a 4-byte little-endian unsigned length followed by the pickle
+    bytes; the stream is flushed afterwards so the peer sees it immediately.
+    """
+    payload = pickle.dumps(msg)
+    header = struct.pack("<I", len(payload))
+    stream.write(header)
+    stream.write(payload)
+    stream.flush()
+def main():
+    """Run the worker loop: read tasks from stdin, call the target, reply on stdout.
+
+    Each task is a dict with "fn_module", "fn_qualname", "tensor_meta", "kwargs"
+    and "config_kwargs". The reply to every task is the string "OK", or
+    "ERR:<message>" if the call raised. The loop ends when _recv returns None.
+    """
+    # Binary views of the standard streams; all framing is done on bytes.
+    stdin = sys.stdin.buffer
+    stdout = sys.stdout.buffer
+    # Tell the parent the worker is up before waiting for the first task.
+    _send(stdout, "READY")
+    # Resolved callables keyed by (module, qualname), so each is imported only once.
+    fn_cache = {}
+    while True:
+        payload = _recv(stdin)
+        if payload is None:
+            break
+        fn_module = payload["fn_module"]
+        fn_qualname = payload["fn_qualname"]
+        fn_key = (fn_module, fn_qualname)
+        if fn_key not in fn_cache:
+            mod = importlib.import_module(fn_module)
+            obj = mod
+            # Walk the dotted qualname attribute by attribute, starting at the module.
+            for part in fn_qualname.split("."):
+                obj = getattr(obj, part)
+            # Use obj.fn when it exists, otherwise obj itself.
+            # NOTE(review): presumably unwraps a decorator/autotuner wrapper — confirm.
+            fn_cache[fn_key] = getattr(obj, "fn", obj)
+        fn = fn_cache[fn_key]
+        tensor_meta = payload["tensor_meta"]
+        kwargs = payload["kwargs"]
+        config_kwargs = payload["config_kwargs"]
+        with FakeTensorMode():
+            # Rebuild the positional arguments: dict entries that carry a "shape" key
+            # become tensors created under FakeTensorMode, anything else is passed as-is.
+            fake_args = []
+            for meta in tensor_meta:
+                if isinstance(meta, dict) and "shape" in meta:
+                    fake_args.append(_make_fake_tensor(meta))
+                else:
+                    fake_args.append(meta)
+            try:
+                fn(*fake_args, **kwargs, **config_kwargs)
+                _send(stdout, "OK")
+            except Exception as e:
+                # Report the failure to the parent and keep serving further tasks.
+                _send(stdout, f"ERR:{e}")
+if __name__ == "__main__":
+    main()

build/torch-cuda/quack/activation.py CHANGED Viewed

@@ -2,18 +2,24 @@
 import math
 from typing import Tuple
 import cutlass.cute as cute
 from cutlass import Float32, Boolean, const_expr
 from cutlass.cutlass_dsl import T, dsl_user_op
-from cutlass._mlir.dialects import llvm
-from . import utils as utils
 F32_or_F32x2 = Float32 | Tuple[Float32, Float32]
 @dsl_user_op
 def tanh(a: float | Float32, *, loc=None, ip=None) -> Float32:
     return Float32(
@@ -24,7 +30,6 @@ def tanh(a: float | Float32, *, loc=None, ip=None) -> Float32:
             "=f,f",
             has_side_effects=False,
             is_align_stack=False,
-            asm_dialect=llvm.AsmDialect.AD_ATT,
         )
     )
@@ -35,9 +40,9 @@ def sigmoid(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
         # return 0.5 + 0.5 * cute.math.tanh(0.5 * x, fastmath=True)
         return 0.5 + 0.5 * tanh(0.5 * x)
     else:
-        x_half = utils.mul_packed_f32x2((0.5, 0.5), x)
         tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
-        return utils.fma_packed_f32x2(tanh_x_half, (0.5, 0.5), (0.5, 0.5))
 @dsl_user_op
@@ -75,7 +80,7 @@ def relu_sq(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
         return cute.arch.fmax(x, Float32(0.0)) * x
     else:
         relu_x = (cute.arch.fmax(x[0], Float32(0.0)), cute.arch.fmax(x[1], Float32(0.0)))
-        return utils.mul_packed_f32x2(relu_x, x)
 @dsl_user_op
@@ -98,8 +103,8 @@ def drelu_sq(
         return dx, relu_sq_out
     else:
         relu_x = relu(x)
-        relu_sq_out = utils.mul_packed_f32x2(relu_x, x)
-        dx = utils.mul_packed_f32x2((2.0, 2.0), utils.mul_packed_f32x2(dout, relu_x))
         return dx, relu_sq_out
@@ -119,14 +124,14 @@ def gelu_tanh_approx(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
             * (1.0 + tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x))))
         )
     else:
-        x_sq = utils.mul_packed_f32x2(x, x)
-        x_sq_scaled = utils.fma_packed_f32x2(
             x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
         )
-        z = utils.mul_packed_f32x2(x, x_sq_scaled)
         tanh_z = (tanh(z[0]), tanh(z[1]))
-        x_tanh_z = utils.fma_packed_f32x2(tanh_z, x, x)
-        return utils.mul_packed_f32x2((0.5, 0.5), x_tanh_z)
 @dsl_user_op
@@ -167,28 +172,28 @@ def dgelu_tanh_approx(
         return dx, gelu_out
     else:
         # Compute z = x * (c1 + c2 * x^2)
-        x_sq = utils.mul_packed_f32x2(x, x)
-        x_sq_scaled = utils.fma_packed_f32x2(
             x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
         )
-        z = utils.mul_packed_f32x2(x, x_sq_scaled)
         tanh_z = (tanh(z[0]), tanh(z[1]))
-        half_tanh_z_plus_one = utils.fma_packed_f32x2(tanh_z, (0.5, 0.5), (0.5, 0.5))
-        gelu_out = utils.mul_packed_f32x2(x, half_tanh_z_plus_one)
         # Compute gradient
         # sech^2(z) = 1 - tanh^2(z)
-        sech2_z = utils.fma_packed_f32x2(tanh_z, (-tanh_z[0], -tanh_z[1]), (1.0, 1.0))
         # dz/dx = c1 + 3 * c2 * x^2
-        dz_dx = utils.fma_packed_f32x2(
             x_sq, (sqrt_2_over_pi_coeff_3, sqrt_2_over_pi_coeff_3), (sqrt_2_over_pi, sqrt_2_over_pi)
         )
         # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
-        sech2_dz_dx = utils.mul_packed_f32x2(sech2_z, dz_dx)
-        x_sech2_dz_dx = utils.mul_packed_f32x2(x, sech2_dz_dx)
-        dgelu = utils.fma_packed_f32x2(x_sech2_dz_dx, (0.5, 0.5), half_tanh_z_plus_one)
-        dx = utils.mul_packed_f32x2(dout, dgelu)
         return dx, gelu_out
@@ -204,15 +209,15 @@ def softplus(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
         )
     else:
         log2_e = math.log2(math.e)
-        x_log2e = utils.mul_packed_f32x2(x, (log2_e, log2_e))
         x_exp = (cute.math.exp(x_log2e[0], fastmath=True), cute.math.exp(x_log2e[1], fastmath=True))
-        x_exp_p1 = utils.add_packed_f32x2(x_exp, (1.0, 1.0))
         log_x_exp_p1 = (
             cute.math.log2(x_exp_p1[0], fastmath=True),
             cute.math.log2(x_exp_p1[1], fastmath=True),
         )
         ln2 = math.log(2.0)
-        softplus_x = utils.mul_packed_f32x2(log_x_exp_p1, (ln2, ln2))
         use_linear_0 = Boolean(x[0] > 20.0)
         use_linear_1 = Boolean(x[1] > 20.0)
         return (
@@ -241,9 +246,9 @@ def silu(x: F32_or_F32x2, *, already_halved: bool = False, loc=None, ip=None) ->
         # return x_half * cute.math.tanh(x_half, fastmath=True) + x_half
         return x_half * tanh(x_half) + x_half
     else:
-        x_half = utils.mul_packed_f32x2((0.5, 0.5), x) if const_expr(not already_halved) else x
         tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
-        return utils.fma_packed_f32x2(x_half, tanh_x_half, x_half)
 @dsl_user_op
@@ -251,7 +256,7 @@ def swiglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32
     if const_expr(not isinstance(x, tuple)):
         return silu(x) * y
     else:
-        return utils.mul_packed_f32x2(silu(x), y)
 @dsl_user_op
@@ -301,20 +306,22 @@ def dswiglu(
         # Compute sigmoid(x) and silu(x)
         if const_expr(not already_halved):
             sigmoid_x = sigmoid(x)
-            silu_x = utils.mul_packed_f32x2(x, sigmoid_x)
         else:
             tanh_x = (tanh(x[0]), tanh(x[1]))
-            sigmoid_x = utils.fma_packed_f32x2(tanh_x, (0.5, 0.5), (0.5, 0.5))
-            silu_x = utils.fma_packed_f32x2(x, tanh_x, x)
-        silu_x_dout = utils.mul_packed_f32x2(silu_x, dout)
         # d_silu(x) * dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
-        sigmoid_x_minus_silu_x_sigmoid_x = utils.fma_packed_f32x2(
             sigmoid_x, (-silu_x[0], -silu_x[1]), sigmoid_x
         )
-        d_silu_x_dout = utils.fma_packed_f32x2(sigmoid_x_minus_silu_x_sigmoid_x, dout, silu_x_dout)
-        dx = utils.mul_packed_f32x2(d_silu_x_dout, y)
         dy = silu_x_dout
-        swiglu_out = utils.mul_packed_f32x2(silu_x, y)
         return dx, dy, swiglu_out
@@ -334,11 +341,11 @@ def swiglu_oai(
         silu_x = x_half * tanh(alpha * x_half) + x_half
         return silu_x * y + silu_x
     else:
-        x_half = utils.mul_packed_f32x2((0.5, 0.5), x)
-        alpha_x_half = utils.mul_packed_f32x2((alpha, alpha), x_half)
         tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
-        silu_x = utils.fma_packed_f32x2(x_half, tanh_alpha_x_half, x_half)
-        return utils.fma_packed_f32x2(silu_x, y, silu_x)
 @dsl_user_op
@@ -370,22 +377,22 @@ def dswiglu_oai(
         return dx, dy, swiglu_out
     else:
         # Compute sigmoid(alpha * x)
-        alpha_x_half = utils.mul_packed_f32x2(((0.5 * alpha), (0.5 * alpha)), x)
         tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
-        sigmoid_alpha_x = utils.fma_packed_f32x2(tanh_alpha_x_half, (0.5, 0.5), (0.5, 0.5))
-        silu_x = utils.mul_packed_f32x2(x, sigmoid_alpha_x)
-        silu_x_dout = utils.mul_packed_f32x2(silu_x, dout)
         # d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
-        silu_x_minus_product = utils.fma_packed_f32x2(
             silu_x, (-sigmoid_alpha_x[0], -sigmoid_alpha_x[1]), silu_x
         )
-        sigmoid_plus_alpha_diff = utils.fma_packed_f32x2(
             (alpha, alpha), silu_x_minus_product, sigmoid_alpha_x
         )
-        d_silu_x_dout = utils.mul_packed_f32x2(sigmoid_plus_alpha_diff, dout)
-        dx = utils.fma_packed_f32x2(d_silu_x_dout, y, d_silu_x_dout)
         dy = silu_x_dout
-        swiglu_out = utils.fma_packed_f32x2(silu_x, y, silu_x)
         return dx, dy, swiglu_out
@@ -400,7 +407,7 @@ def glu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
         return sigmoid_x * y  # FMUL
     else:
         sigmoid_x = sigmoid(x)
-        return utils.mul_packed_f32x2(sigmoid_x, y)
 @dsl_user_op
@@ -430,11 +437,11 @@ def dglu(
         return dx, dy, glu_out
     else:
         sigmoid_x = sigmoid(x)
-        sigmoid_x_dout = utils.mul_packed_f32x2(sigmoid_x, dout)
-        glu_out = utils.mul_packed_f32x2(sigmoid_x, y)
         # dx = (y - glu_out) * sigmoid_x_dout
-        y_minus_glu_out = utils.sub_packed_f32x2(y, glu_out)
-        dx = utils.mul_packed_f32x2(y_minus_glu_out, sigmoid_x_dout)
         dy = sigmoid_x_dout
         return dx, dy, glu_out
@@ -448,7 +455,7 @@ def reglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x
         return cute.arch.fmax(x, Float32(0.0)) * y
     else:
         relu_x = relu(x)
-        return utils.mul_packed_f32x2(relu_x, y)
 @dsl_user_op
@@ -475,10 +482,10 @@ def dreglu(
         x0_pos = Boolean(x[0] > 0)
         x1_pos = Boolean(x[1] > 0)
         relu_x = relu(x)
-        dout_y = utils.mul_packed_f32x2(dout, y)
         dx = ((dout_y[0] if x0_pos else Float32(0.0)), (dout_y[1] if x1_pos else Float32(0.0)))
-        dy = utils.mul_packed_f32x2(dout, relu_x)
-        reglu_out = utils.mul_packed_f32x2(relu_x, y)
         return dx, dy, reglu_out
@@ -491,7 +498,7 @@ def geglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x
     if const_expr(not isinstance(x, tuple)):
         return gelu_tanh_approx(x) * y
     else:
-        return utils.mul_packed_f32x2(gelu_tanh_approx(x), y)
 @dsl_user_op
@@ -518,7 +525,43 @@ def dgeglu(
         # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
         dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
         # Compute gradients for geglu
-        dx = utils.mul_packed_f32x2(dgelu_x_dout, y)
-        dy = utils.mul_packed_f32x2(gelu_x, dout)
-        geglu_out = utils.mul_packed_f32x2(gelu_x, y)
         return dx, dy, geglu_out

 import math
 from typing import Tuple
+from functools import partial
 import cutlass.cute as cute
 from cutlass import Float32, Boolean, const_expr
 from cutlass.cutlass_dsl import T, dsl_user_op
+from cutlass._mlir.dialects import llvm, nvvm
 F32_or_F32x2 = Float32 | Tuple[Float32, Float32]
+# Packed two-lane float32 subtraction. The other packed ops in this file
+# (mul/fma/add) are called through cute.arch.*_packed_f32x2; subtraction is
+# bound here by hand from cute.arch.calc_packed_f32x2_op with the NVVM
+# sub_packed_f32x2 op and no third operand (src_c=None).
+sub_packed_f32x2 = partial(
+    cute.arch.calc_packed_f32x2_op,
+    src_c=None,
+    calc_func=nvvm.sub_packed_f32x2,
+)
 @dsl_user_op
 def tanh(a: float | Float32, *, loc=None, ip=None) -> Float32:
     return Float32(
             "=f,f",
             has_side_effects=False,
             is_align_stack=False,
         )
     )
         # return 0.5 + 0.5 * cute.math.tanh(0.5 * x, fastmath=True)
         return 0.5 + 0.5 * tanh(0.5 * x)
     else:
+        x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x)
         tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
+        return cute.arch.fma_packed_f32x2(tanh_x_half, (0.5, 0.5), (0.5, 0.5))
 @dsl_user_op
         return cute.arch.fmax(x, Float32(0.0)) * x
     else:
         relu_x = (cute.arch.fmax(x[0], Float32(0.0)), cute.arch.fmax(x[1], Float32(0.0)))
+        return cute.arch.mul_packed_f32x2(relu_x, x)
 @dsl_user_op
         return dx, relu_sq_out
     else:
         relu_x = relu(x)
+        relu_sq_out = cute.arch.mul_packed_f32x2(relu_x, x)
+        dx = cute.arch.mul_packed_f32x2((2.0, 2.0), cute.arch.mul_packed_f32x2(dout, relu_x))
         return dx, relu_sq_out
             * (1.0 + tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x))))
         )
     else:
+        x_sq = cute.arch.mul_packed_f32x2(x, x)
+        x_sq_scaled = cute.arch.fma_packed_f32x2(
             x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
         )
+        z = cute.arch.mul_packed_f32x2(x, x_sq_scaled)
         tanh_z = (tanh(z[0]), tanh(z[1]))
+        x_tanh_z = cute.arch.fma_packed_f32x2(tanh_z, x, x)
+        return cute.arch.mul_packed_f32x2((0.5, 0.5), x_tanh_z)
 @dsl_user_op
         return dx, gelu_out
     else:
         # Compute z = x * (c1 + c2 * x^2)
+        x_sq = cute.arch.mul_packed_f32x2(x, x)
+        x_sq_scaled = cute.arch.fma_packed_f32x2(
             x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
         )
+        z = cute.arch.mul_packed_f32x2(x, x_sq_scaled)
         tanh_z = (tanh(z[0]), tanh(z[1]))
+        half_tanh_z_plus_one = cute.arch.fma_packed_f32x2(tanh_z, (0.5, 0.5), (0.5, 0.5))
+        gelu_out = cute.arch.mul_packed_f32x2(x, half_tanh_z_plus_one)
         # Compute gradient
         # sech^2(z) = 1 - tanh^2(z)
+        sech2_z = cute.arch.fma_packed_f32x2(tanh_z, (-tanh_z[0], -tanh_z[1]), (1.0, 1.0))
         # dz/dx = c1 + 3 * c2 * x^2
+        dz_dx = cute.arch.fma_packed_f32x2(
             x_sq, (sqrt_2_over_pi_coeff_3, sqrt_2_over_pi_coeff_3), (sqrt_2_over_pi, sqrt_2_over_pi)
         )
         # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
+        sech2_dz_dx = cute.arch.mul_packed_f32x2(sech2_z, dz_dx)
+        x_sech2_dz_dx = cute.arch.mul_packed_f32x2(x, sech2_dz_dx)
+        dgelu = cute.arch.fma_packed_f32x2(x_sech2_dz_dx, (0.5, 0.5), half_tanh_z_plus_one)
+        dx = cute.arch.mul_packed_f32x2(dout, dgelu)
         return dx, gelu_out
         )
     else:
         log2_e = math.log2(math.e)
+        x_log2e = cute.arch.mul_packed_f32x2(x, (log2_e, log2_e))
         x_exp = (cute.math.exp(x_log2e[0], fastmath=True), cute.math.exp(x_log2e[1], fastmath=True))
+        x_exp_p1 = cute.arch.add_packed_f32x2(x_exp, (1.0, 1.0))
         log_x_exp_p1 = (
             cute.math.log2(x_exp_p1[0], fastmath=True),
             cute.math.log2(x_exp_p1[1], fastmath=True),
         )
         ln2 = math.log(2.0)
+        softplus_x = cute.arch.mul_packed_f32x2(log_x_exp_p1, (ln2, ln2))
         use_linear_0 = Boolean(x[0] > 20.0)
         use_linear_1 = Boolean(x[1] > 20.0)
         return (
         # return x_half * cute.math.tanh(x_half, fastmath=True) + x_half
         return x_half * tanh(x_half) + x_half
     else:
+        x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x) if const_expr(not already_halved) else x
         tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
+        return cute.arch.fma_packed_f32x2(x_half, tanh_x_half, x_half)
 @dsl_user_op
     if const_expr(not isinstance(x, tuple)):
         return silu(x) * y
     else:
+        return cute.arch.mul_packed_f32x2(silu(x), y)
 @dsl_user_op
         # Compute sigmoid(x) and silu(x)
         if const_expr(not already_halved):
             sigmoid_x = sigmoid(x)
+            silu_x = cute.arch.mul_packed_f32x2(x, sigmoid_x)
         else:
             tanh_x = (tanh(x[0]), tanh(x[1]))
+            sigmoid_x = cute.arch.fma_packed_f32x2(tanh_x, (0.5, 0.5), (0.5, 0.5))
+            silu_x = cute.arch.fma_packed_f32x2(x, tanh_x, x)
+        silu_x_dout = cute.arch.mul_packed_f32x2(silu_x, dout)
         # d_silu(x) * dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
+        sigmoid_x_minus_silu_x_sigmoid_x = cute.arch.fma_packed_f32x2(
             sigmoid_x, (-silu_x[0], -silu_x[1]), sigmoid_x
         )
+        d_silu_x_dout = cute.arch.fma_packed_f32x2(
+            sigmoid_x_minus_silu_x_sigmoid_x, dout, silu_x_dout
+        )
+        dx = cute.arch.mul_packed_f32x2(d_silu_x_dout, y)
         dy = silu_x_dout
+        swiglu_out = cute.arch.mul_packed_f32x2(silu_x, y)
         return dx, dy, swiglu_out
         silu_x = x_half * tanh(alpha * x_half) + x_half
         return silu_x * y + silu_x
     else:
+        x_half = cute.arch.mul_packed_f32x2((0.5, 0.5), x)
+        alpha_x_half = cute.arch.mul_packed_f32x2((alpha, alpha), x_half)
         tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
+        silu_x = cute.arch.fma_packed_f32x2(x_half, tanh_alpha_x_half, x_half)
+        return cute.arch.fma_packed_f32x2(silu_x, y, silu_x)
 @dsl_user_op
         return dx, dy, swiglu_out
     else:
         # Compute sigmoid(alpha * x)
+        alpha_x_half = cute.arch.mul_packed_f32x2(((0.5 * alpha), (0.5 * alpha)), x)
         tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
+        sigmoid_alpha_x = cute.arch.fma_packed_f32x2(tanh_alpha_x_half, (0.5, 0.5), (0.5, 0.5))
+        silu_x = cute.arch.mul_packed_f32x2(x, sigmoid_alpha_x)
+        silu_x_dout = cute.arch.mul_packed_f32x2(silu_x, dout)
         # d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
+        silu_x_minus_product = cute.arch.fma_packed_f32x2(
             silu_x, (-sigmoid_alpha_x[0], -sigmoid_alpha_x[1]), silu_x
         )
+        sigmoid_plus_alpha_diff = cute.arch.fma_packed_f32x2(
             (alpha, alpha), silu_x_minus_product, sigmoid_alpha_x
         )
+        d_silu_x_dout = cute.arch.mul_packed_f32x2(sigmoid_plus_alpha_diff, dout)
+        dx = cute.arch.fma_packed_f32x2(d_silu_x_dout, y, d_silu_x_dout)
         dy = silu_x_dout
+        swiglu_out = cute.arch.fma_packed_f32x2(silu_x, y, silu_x)
         return dx, dy, swiglu_out
         return sigmoid_x * y  # FMUL
     else:
         sigmoid_x = sigmoid(x)
+        return cute.arch.mul_packed_f32x2(sigmoid_x, y)
 @dsl_user_op
         return dx, dy, glu_out
     else:
         sigmoid_x = sigmoid(x)
+        sigmoid_x_dout = cute.arch.mul_packed_f32x2(sigmoid_x, dout)
+        glu_out = cute.arch.mul_packed_f32x2(sigmoid_x, y)
         # dx = (y - glu_out) * sigmoid_x_dout
+        y_minus_glu_out = sub_packed_f32x2(y, glu_out)
+        dx = cute.arch.mul_packed_f32x2(y_minus_glu_out, sigmoid_x_dout)
         dy = sigmoid_x_dout
         return dx, dy, glu_out
         return cute.arch.fmax(x, Float32(0.0)) * y
     else:
         relu_x = relu(x)
+        return cute.arch.mul_packed_f32x2(relu_x, y)
 @dsl_user_op
         x0_pos = Boolean(x[0] > 0)
         x1_pos = Boolean(x[1] > 0)
         relu_x = relu(x)
+        dout_y = cute.arch.mul_packed_f32x2(dout, y)
         dx = ((dout_y[0] if x0_pos else Float32(0.0)), (dout_y[1] if x1_pos else Float32(0.0)))
+        dy = cute.arch.mul_packed_f32x2(dout, relu_x)
+        reglu_out = cute.arch.mul_packed_f32x2(relu_x, y)
         return dx, dy, reglu_out
     if const_expr(not isinstance(x, tuple)):
         return gelu_tanh_approx(x) * y
     else:
+        return cute.arch.mul_packed_f32x2(gelu_tanh_approx(x), y)
 @dsl_user_op
         # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
         dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
         # Compute gradients for geglu
+        dx = cute.arch.mul_packed_f32x2(dgelu_x_dout, y)
+        dy = cute.arch.mul_packed_f32x2(gelu_x, dout)
+        geglu_out = cute.arch.mul_packed_f32x2(gelu_x, y)
         return dx, dy, geglu_out
+# ============================================================================
+# Activation name -> function maps
+# ============================================================================
+# Element-wise activations, keyed by name. The None key maps to None.
+act_fn_map = {
+    None: None,
+    "silu": silu,
+    "relu": relu,
+    "relu_sq": relu_sq,
+    "gelu_tanh_approx": gelu_tanh_approx,
+}
+# Backward counterparts of act_fn_map, keyed by the same names.
+# NOTE(review): act_fn_map has a "silu" entry but this map does not, so a
+# lookup of "silu" here raises KeyError — confirm the omission is intentional.
+dact_fn_map = {
+    None: None,
+    "relu": drelu,
+    "relu_sq": drelu_sq,
+    "gelu_tanh_approx": dgelu_tanh_approx,
+}
+# Gated activations taking two inputs (x, y), keyed by name.
+gate_fn_map = {
+    "swiglu": swiglu,
+    "swiglu_oai": swiglu_oai,
+    "reglu": reglu,
+    "geglu": geglu,
+    "glu": glu,
+}
+# Backward counterparts of gate_fn_map; the keys match gate_fn_map one-to-one.
+dgate_fn_map = {
+    "swiglu": dswiglu,
+    "swiglu_oai": dswiglu_oai,
+    "reglu": dreglu,
+    "geglu": dgeglu,
+    "glu": dglu,
+}

build/torch-cuda/quack/autotuner.py CHANGED Viewed

@@ -25,6 +25,29 @@ PACKAGE_NAME = "quack"
 VERSION = __version__
 def get_home_dir():
     return os.getenv(f"{PACKAGE_NAME.upper()}_HOME", Path.home())
@@ -52,6 +75,22 @@ def _base32(key):
     return base64.b32encode(bytes.fromhex(key)).decode("utf-8").rstrip("=")
 class Autotuner:
     def __init__(
         self,
@@ -124,6 +163,146 @@ class Autotuner:
             return partial(triton.testing.do_bench, warmup=5, rep=25)
         return self._do_bench
     def _bench(self, *args, config, **meta):
         verbose = os.environ.get(f"{PACKAGE_NAME.upper()}_PRINT_AUTOTUNING", None) == "1"
         if verbose:
@@ -227,6 +406,8 @@ class Autotuner:
                 @torch.compiler.disable  # Don't want any tracing here
                 def benchmark():
                     bench_start = time.time()
                     timings = {
                         config: self._bench(*args, config=config, **kwargs)
@@ -316,11 +497,11 @@ class AutotuneConfig:
         return ", ".join(res)
     def __hash__(self):
-        return hash(tuple(*self.all_kwargs().items()))
     def __eq__(self, other):
-        self_tuple = tuple(*self.all_kwargs().items())
-        other_tuple = tuple(*other.all_kwargs().items())
         return self_tuple == other_tuple

 VERSION = __version__
+def _get_current_cuda_device() -> str | None:
+    """Identify the physical CUDA device behind the current logical device.
+    The logical index from ``torch.cuda.current_device()`` is looked up in
+    ``CUDA_VISIBLE_DEVICES`` when that variable is set. The matching entry is
+    returned verbatim (whitespace-stripped), so it can be used directly as a
+    standalone ``CUDA_VISIBLE_DEVICES`` value whatever identifier form it uses.
+    Without the variable, the logical index itself is returned as a string.
+    Returns ``None`` when CUDA is unavailable or not yet initialized, or when
+    the logical index has no corresponding entry in ``CUDA_VISIBLE_DEVICES``.
+    """
+    cuda_ready = torch.cuda.is_available() and torch.cuda.is_initialized()
+    if not cuda_ready:
+        return None
+    index = torch.cuda.current_device()
+    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
+    if visible is None:
+        # No remapping in effect: the logical index is used as-is.
+        return str(index)
+    entries = [entry.strip() for entry in visible.split(",")]
+    return entries[index] if index < len(entries) else None
 def get_home_dir():
     return os.getenv(f"{PACKAGE_NAME.upper()}_HOME", Path.home())
     return base64.b32encode(bytes.fromhex(key)).decode("utf-8").rstrip("=")
+def _gpu_warmup(duration_ms=200):
+    """Saturate the GPU to reach thermal steady-state before benchmarking.
+    Without this, the first autotuning config gets artificially good numbers
+    because the GPU hasn't been power-throttled yet.
+    Args:
+        duration_ms: minimum time to keep the GPU busy, in milliseconds. The
+            loop only checks the clock after each batch of 100 matmuls, so the
+            actual duration can overshoot.
+    """
+    a = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
+    torch.cuda.synchronize()
+    target = duration_ms / 1000
+    # perf_counter is monotonic; time.time() can jump if the wall clock is
+    # adjusted, which would cut the warm-up short or stretch it.
+    t0 = time.perf_counter()
+    while time.perf_counter() - t0 < target:
+        for _ in range(100):
+            # Keep the operand fixed instead of feeding the product back in
+            # (a = a @ a): repeated squaring of a 4096-wide randn matrix
+            # overflows bf16 to inf/NaN after about six products, so nearly all
+            # of the warm-up would otherwise run on degenerate data.
+            _ = a @ a
+        torch.cuda.synchronize()
 class Autotuner:
     def __init__(
         self,
             return partial(triton.testing.do_bench, warmup=5, rep=25)
         return self._do_bench
+    def _precompile(self, *args, configs, **kwargs):
+        """Pre-compile all configs in parallel subprocesses to populate .o cache.
+        cute.compile() is not thread-safe (MLIR thread-local state) and fork after
+        CUDA init causes segfaults. So we spawn persistent subprocess workers: each
+        has its own CUDA context, creates FakeTensors matching the parent's tensor
+        metadata, and compiles with COMPILE_ONLY=True. Workers stay alive to amortize
+        import overhead across multiple configs. The parent then loads instantly from
+        the .o cache during benchmarking.
+        This step is a best-effort optimization: it returns early when the cache
+        is disabled, when at most one worker would be used, when the first config
+        already loads quickly, or when no worker reports ready. Worker results are
+        read but discarded; only the populated cache matters.
+        Args:
+            *args: positional arguments of the tuned function. Tensors are sent to
+                workers as shape/stride/dtype metadata; other values are pickled.
+            configs: the configs to compile (each must provide ``all_kwargs()``).
+            **kwargs: keyword arguments of the tuned function, pickled as-is.
+        """
+        from .cache_utils import CACHE_ENABLED
+        if not CACHE_ENABLED:
+            return
+        try:
+            worker_limit = int(os.getenv("QUACK_COMPILE_WORKERS", "8"))
+        except ValueError:
+            # A malformed override must not abort autotuning; use the default.
+            worker_limit = 8
+        max_workers = min(len(configs), worker_limit)
+        if max_workers <= 1:
+            return
+        # Quick check: compile first config in-process. If it loads from .o cache
+        # (<0.5s), the rest are likely cached too — skip spawning workers.
+        t_check = time.time()
+        try:
+            current = dict(kwargs, **configs[0].all_kwargs())
+            self.fn(*args, **current)
+        except Exception:
+            # Deliberate best-effort: this call only probes cache latency, so a
+            # failing config is not treated as an error here.
+            pass
+        if time.time() - t_check < 0.5:
+            return
+        verbose = os.getenv(f"{PACKAGE_NAME.upper()}_PRINT_AUTOTUNING", None) == "1"
+        if verbose:
+            print(f"Pre-compiling {len(configs)} configs with {max_workers} workers")
+        t0 = time.time()
+        import pickle
+        import struct
+        import subprocess
+        import sys
+        # Wire format: 4-byte little-endian length followed by a pickle payload.
+        def _send(stream, msg):
+            data = pickle.dumps(msg)
+            stream.write(struct.pack("<I", len(data)))
+            stream.write(data)
+            stream.flush()
+        def _recv(stream):
+            header = stream.read(4)
+            if len(header) < 4:
+                # EOF or truncated header: the peer has gone away.
+                return None
+            length = struct.unpack("<I", header)[0]
+            # Only ever fed by our own worker subprocesses, not external input.
+            return pickle.loads(stream.read(length)) if length else None
+        # Serialize tensor metadata
+        tensor_meta = [
+            {
+                "shape": list(arg.shape),
+                "stride": list(arg.stride()),
+                "dtype": str(arg.dtype),
+            }
+            if isinstance(arg, Tensor)
+            else arg
+            for arg in args
+        ]
+        fn_module = self.fn.__module__
+        fn_qualname = self.fn.__qualname__
+        # Restrict worker subprocesses to the parent's current CUDA device.
+        # Without this, all workers default to cuda:0 and their CUDA context
+        # initialization can OOM when many ranks share a node.
+        worker_env = os.environ.copy()
+        current_device = _get_current_cuda_device()
+        if current_device is not None:
+            worker_env["CUDA_VISIBLE_DEVICES"] = current_device
+        # Launch persistent worker pool. When vendored under sonic_moe (loaded
+        # via kernels.get_kernel), the quack package isn't importable as a
+        # top-level module, so invoke the worker via its fully-qualified dotted
+        # path and inject PYTHONPATH so the subprocess can import it.
+        worker_module = __package__ + "._compile_worker" if __package__ else "quack._compile_worker"
+        if __package__:
+            import importlib.util
+            spec = importlib.util.find_spec(__package__.split(".")[0])
+            if spec is not None and spec.submodule_search_locations:
+                pkg_parent = os.path.dirname(list(spec.submodule_search_locations)[0])
+                existing_pp = worker_env.get("PYTHONPATH", "")
+                worker_env["PYTHONPATH"] = (
+                    f"{pkg_parent}{os.pathsep}{existing_pp}" if existing_pp else pkg_parent
+                )
+        spawned = []  # every process we started, so cleanup can reach all of them
+        workers = []  # the subset that reported READY
+        try:
+            for _ in range(max_workers):
+                p = subprocess.Popen(
+                    [sys.executable, "-m", worker_module],
+                    stdin=subprocess.PIPE,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.DEVNULL if not verbose else None,
+                    env=worker_env,
+                )
+                spawned.append(p)
+                ready = _recv(p.stdout)
+                if ready != "READY":
+                    p.kill()
+                    p.wait()  # reap the killed worker so it does not linger as a zombie
+                    continue
+                workers.append(p)
+            if not workers:
+                return
+            # Round-robin dispatch configs to workers
+            pending = [0] * len(workers)
+            for i, config in enumerate(configs):
+                wi = i % len(workers)
+                try:
+                    _send(
+                        workers[wi].stdin,
+                        {
+                            "fn_module": fn_module,
+                            "fn_qualname": fn_qualname,
+                            "tensor_meta": tensor_meta,
+                            "kwargs": kwargs,
+                            "config_kwargs": config.all_kwargs(),
+                        },
+                    )
+                except OSError:
+                    # The worker died (broken pipe). Pre-compilation is only an
+                    # optimization, so skip this config instead of failing.
+                    continue
+                pending[wi] += 1
+            # Collect all results
+            for wi, w in enumerate(workers):
+                for _ in range(pending[wi]):
+                    _recv(w.stdout)
+        finally:
+            # Shutdown workers (close stdin → worker exits). Runs on every exit
+            # path so an exception above cannot leak subprocesses.
+            for p in spawned:
+                try:
+                    p.stdin.close()
+                except OSError:
+                    pass  # unflushed data for a worker that has already exited
+                p.wait()
+        if verbose:
+            print(f"Pre-compilation done in {time.time() - t0:.1f}s")
     def _bench(self, *args, config, **meta):
         verbose = os.environ.get(f"{PACKAGE_NAME.upper()}_PRINT_AUTOTUNING", None) == "1"
         if verbose:
                 @torch.compiler.disable  # Don't want any tracing here
                 def benchmark():
+                    self._precompile(*args, configs=pruned_configs, **kwargs)
+                    _gpu_warmup()
                     bench_start = time.time()
                     timings = {
                         config: self._bench(*args, config=config, **kwargs)
         return ", ".join(res)
     def __hash__(self):
+        # Hash the same (key, value) tuple that __eq__ compares, so configs
+        # that compare equal also hash equal.
+        return hash(tuple(self.all_kwargs().items()))
     def __eq__(self, other):
+        """Compare two configs by their ``all_kwargs()`` items, in order.
+        Returns ``NotImplemented`` for objects that are not ``AutotuneConfig``
+        instances, so ``config == 5`` evaluates to ``False`` instead of raising
+        ``AttributeError`` on the missing ``all_kwargs`` method.
+        """
+        if not isinstance(other, AutotuneConfig):
+            return NotImplemented
+        self_tuple = tuple(self.all_kwargs().items())
+        other_tuple = tuple(other.all_kwargs().items())
         return self_tuple == other_tuple

build/torch-cuda/quack/blockscaled_gemm_utils.py ADDED Viewed

	@@ -0,0 +1,752 @@

+# Copyright (c) 2026, Tri Dao.
+import itertools
+from functools import partial
+from typing import Callable, Optional, Type, Tuple
+import torch
+import cutlass
+import cutlass.cute as cute
+from .compile_utils import make_fake_tensor as fake_tensor
+from .cute_dsl_utils import get_device_capacity, get_max_active_clusters
+from .gemm_default_epi import GemmDefaultSm100
+from .gemm_tvm_ffi_utils import div_for_dtype, make_scheduler_args
+from .mx_utils import (
+    to_mx_compiled,
+    to_mxfp4_compiled,
+    to_nvfp4_compiled,
+)
+from .varlen_utils import VarlenArguments
+# cutlass numeric type -> torch dtype; looked up by torch_dtype_for_cutlass.
+TORCH_DTYPE_MAP = {
+    cutlass.Float4E2M1FN: torch.float4_e2m1fn_x2,
+    cutlass.Float16: torch.float16,
+    cutlass.BFloat16: torch.bfloat16,
+    cutlass.Float32: torch.float32,
+    cutlass.Float8E4M3FN: torch.float8_e4m3fn,
+    cutlass.Float8E5M2: torch.float8_e5m2,
+    cutlass.Float8E8M0FNU: torch.float8_e8m0fnu,
+}
+# torch float8 dtypes. create_blockscaled_operand_tensor generates data for
+# these in bfloat16 and converts to the target dtype afterwards.
+FLOAT8_DTYPES = {
+    torch.float8_e4m3fn,
+    torch.float8_e5m2,
+    torch.float8_e8m0fnu,
+}
+# Value of each 4-bit FP4 E2M1 code, indexed by the code itself: bits 0-2
+# select the magnitude (entries 0-7), bit 3 is the sign (entries 8-15).
+FP4_E2M1FN_VALUES = (
+    0.0,
+    0.5,
+    1.0,
+    1.5,
+    2.0,
+    3.0,
+    4.0,
+    6.0,
+    -0.0,
+    -0.5,
+    -1.0,
+    -1.5,
+    -2.0,
+    -3.0,
+    -4.0,
+    -6.0,
+)
+def ceil_div(a: int, b: int) -> int:
+    """Return ``a`` divided by ``b``, rounded up, using integer arithmetic only."""
+    bumped = a + b - 1  # push any non-zero remainder over the next multiple of b
+    return bumped // b
+def torch_dtype_for_cutlass(dtype: Type[cutlass.Numeric]) -> torch.dtype:
+    """Translate a cutlass numeric type into its torch dtype via TORCH_DTYPE_MAP.
+    Raises:
+        TypeError: if ``dtype`` has no entry in TORCH_DTYPE_MAP.
+    """
+    if dtype in TORCH_DTYPE_MAP:
+        return TORCH_DTYPE_MAP[dtype]
+    raise TypeError(f"Unsupported dtype: {dtype}")
+def _make_fake_tensor_like(tensor: torch.Tensor, dtype: Type[cutlass.Numeric]) -> cute.Tensor:
+    """Build a cute fake tensor of element type ``dtype`` that reuses the shape
+    and strides of ``tensor``, created with ``assumed_align=16``."""
+    layout_kwargs = {"stride": tensor.stride(), "assumed_align": 16}
+    return cute.runtime.make_fake_tensor(dtype, tensor.shape, **layout_kwargs)
+def _leading_dim_from_stride(tensor: torch.Tensor) -> int:
+    """Return the index of the first dimension of ``tensor`` whose stride is 1.
+    Raises:
+        ValueError: if no dimension has unit stride.
+    """
+    unit_dims = [dim for dim, step in enumerate(tensor.stride()) if step == 1]
+    if not unit_dims:
+        raise ValueError(
+            f"Tensor has no unit stride dimension: shape={tensor.shape}, stride={tensor.stride()}"
+        )
+    return unit_dims[0]
+def _make_compile_tensor_like(
+    tensor: torch.Tensor, dtype: Type[cutlass.Numeric], dynamic_layout: bool = False
+) -> cute.Tensor:
+    """Wrap ``tensor`` via DLPack as a cute tensor with element type ``dtype``.
+    With ``dynamic_layout=True`` the layout is additionally marked dynamic,
+    using the first unit-stride dimension of ``tensor`` as the leading dim.
+    """
+    wrapped = cute.runtime.from_dlpack(tensor)
+    wrapped.element_type = dtype
+    if not dynamic_layout:
+        return wrapped
+    dynamic = wrapped.mark_layout_dynamic(leading_dim=_leading_dim_from_stride(tensor))
+    # A None result from mark_layout_dynamic leaves the existing wrapper in use.
+    return wrapped if dynamic is None else dynamic
+def _make_fake_compact_tensor(
+    shape: Tuple[int, ...], dtype: Type[cutlass.Numeric], leading_dim: int
+) -> cute.Tensor:
+    """Create a fake tensor for ``shape`` with ``leading_dim`` as leading dimension.
+    For Float4E2M1FN the extent along ``leading_dim`` is doubled before the
+    fake tensor is built; all other dtypes use ``shape`` unchanged.
+    """
+    dims = list(shape)
+    if dtype == cutlass.Float4E2M1FN:
+        dims[leading_dim] = dims[leading_dim] * 2
+    divisibility = div_for_dtype(dtype)
+    return fake_tensor(dtype, tuple(dims), leading_dim=leading_dim, divisibility=divisibility)
+def _fp4_e2m1fn_value_table(device: torch.device) -> torch.Tensor:
+    """Return FP4_E2M1FN_VALUES as a float32 tensor on ``device`` (index = 4-bit code)."""
+    table = torch.tensor(FP4_E2M1FN_VALUES, device=device, dtype=torch.float32)
+    return table
+def _pack_fp4_e2m1fn_codes(codes: torch.Tensor) -> torch.Tensor:
+    """Pack logical FP4 codes into torch.float4_e2m1fn_x2 storage.
+    Codes are paired along dim 1: even positions fill the low nibble of each
+    byte and odd positions the high nibble. When dim 1 has odd length, the
+    last byte's high nibble is zero.
+    Raises:
+        TypeError: if ``codes`` is not a uint8 tensor.
+    """
+    if codes.dtype != torch.uint8:
+        raise TypeError(f"Expected uint8 FP4 codes, got {codes.dtype}")
+    even = codes[:, 0::2, :]
+    odd = codes[:, 1::2, :]
+    # odd may be one entry shorter than even; the missing slot stays zero.
+    high = torch.zeros_like(even)
+    high[:, : odd.shape[1], :] = odd
+    out_shape = (codes.shape[0], ceil_div(codes.shape[1], 2), codes.shape[2])
+    packed = torch.empty(out_shape, dtype=torch.float4_e2m1fn_x2, device=codes.device)
+    # Write the combined bytes through a uint8 view of the packed storage.
+    packed.view(torch.uint8).copy_(even | (high << 4))
+    return packed
+def _create_fp4_operand_tensor(
+    l: int,
+    mode0: int,
+    mode1: int,
+    is_mode0_major: bool,
+    *,
+    init: str,
+) -> Tuple[Optional[torch.Tensor], torch.Tensor]:
+    # Create a packed FP4 (float4_e2m1fn_x2) operand on CUDA, two codes per
+    # byte along mode1. Returns (ref, tensor): ref holds the float32 value of
+    # every logical code with shape (mode0, mode1, l), or is None when
+    # init == "empty" (the tensor is then all-zero bytes).
+    # Raises ValueError for is_mode0_major=True or an init other than
+    # "empty"/"normal".
+    # NOTE(review): the tensor is allocated contiguous as (mode0, ceil(mode1/2), l),
+    # so l is its unit-stride dimension when l > 1, unlike the permuted views
+    # built by create_blockscaled_operand_tensor — confirm this is the intended
+    # "K-major" layout.
+    if is_mode0_major:
+        raise ValueError("Float4E2M1FN blockscaled operands must be K-major")
+    tensor = torch.empty(
+        (mode0, ceil_div(mode1, 2), l), dtype=torch.float4_e2m1fn_x2, device="cuda"
+    )
+    # Zero the raw bytes so the "empty" result is deterministic.
+    tensor.view(torch.uint8).zero_()
+    if init == "empty":
+        return None, tensor
+    if init != "normal":
+        raise ValueError(f"Unsupported init: {init}")
+    # Random 3-bit magnitude index plus a random sign bit per logical element.
+    magnitudes = torch.randint(0, 8, (mode0, mode1, l), device="cuda", dtype=torch.uint8)
+    signs = torch.randint(0, 2, (mode0, mode1, l), device="cuda", dtype=torch.uint8)
+    # Move the sign to bit 3, but never set it for magnitude 0, so the -0.0
+    # code (8) is not generated.
+    signs = torch.where(magnitudes == 0, torch.zeros_like(signs), signs << 3)
+    codes = magnitudes | signs
+    tensor.copy_(_pack_fp4_e2m1fn_codes(codes))
+    # Reference values come from a table lookup on the unpacked codes.
+    ref = _fp4_e2m1fn_value_table(tensor.device)[codes.long()]
+    return ref, tensor
+def create_blockscaled_operand_tensor(
+    l: int,
+    mode0: int,
+    mode1: int,
+    is_mode0_major: bool,
+    dtype: Type[cutlass.Numeric],
+    *,
+    init: str = "normal",
+) -> Tuple[Optional[torch.Tensor], torch.Tensor]:
+    """Create a CUDA operand tensor viewed as (mode0, mode1, l).
+    Float4E2M1FN is delegated to ``_create_fp4_operand_tensor``. For every
+    other dtype a contiguous batch-first buffer is allocated (in bfloat16 for
+    float8 targets, converted afterwards) and returned as a permuted view whose
+    unit stride lies on mode0 when ``is_mode0_major`` is set, else on mode1.
+    Returns:
+        ``(ref, tensor)`` where ``ref`` is ``tensor`` converted to float32, or
+        ``None`` when ``init == "empty"``.
+    Raises:
+        ValueError: if ``init`` is neither ``"normal"`` nor ``"empty"``.
+    """
+    if dtype == cutlass.Float4E2M1FN:
+        return _create_fp4_operand_tensor(l, mode0, mode1, is_mode0_major, init=init)
+    if is_mode0_major:
+        shape, permute_order = (l, mode1, mode0), (2, 1, 0)
+    else:
+        shape, permute_order = (l, mode0, mode1), (1, 2, 0)
+    torch_dtype = torch_dtype_for_cutlass(dtype)
+    gen_dtype = torch.bfloat16 if torch_dtype in FLOAT8_DTYPES else torch_dtype
+    storage = torch.empty(shape, dtype=gen_dtype, device="cuda")
+    if init not in ("normal", "empty"):
+        raise ValueError(f"Unsupported init: {init}")
+    if init == "normal":
+        storage.normal_(std=mode1 ** (-0.5))
+    # Keep the result a permuted VIEW of the batch-first buffer. Calling
+    # .contiguous() after .permute() would re-materialize with L innermost and
+    # break K-majorness / N-majorness for l > 1; the view already has stride 1
+    # on the intended contiguous dim.
+    operand = storage.to(torch_dtype).permute(permute_order)
+    ref = None if init == "empty" else operand.float()
+    return ref, operand
+def _pack_blockscaled_scales(ref_blocks: torch.Tensor) -> torch.Tensor:
+    """Rearrange (mn, sf_k, l) scales into the (l, rm, rk, 512) blocked layout."""
+    # The result is float32 on ref_blocks' device, with rm = ceil(mn / 128) and
+    # rk = ceil(sf_k / 4).
+    mn, sf_k, l = ref_blocks.shape
+    rm = ceil_div(mn, 128)
+    rk = ceil_div(sf_k, 4)
+    # Zero-initialised, so slots with no source scale (mn or sf_k not a
+    # multiple of 128 / 4) stay 0.
+    packed_6d = torch.zeros((l, rm, rk, 32, 4, 4), dtype=torch.float32, device=ref_blocks.device)
+    # Permuted view of the same storage: writes below land in packed_6d.
+    packed_view = packed_6d.permute(3, 4, 1, 5, 2, 0)  # (32, 4, rm, 4, rk, l)
+    m_idx = torch.arange(mn, device=ref_blocks.device)
+    k_idx = torch.arange(sf_k, device=ref_blocks.device)
+    l_idx = torch.arange(l, device=ref_blocks.device)
+    # Scatter via broadcast advanced indexing: row m goes to
+    # (m % 32, (m // 32) % 4) inside tile m // 128; column k goes to slot
+    # k % 4 inside tile k // 4.
+    packed_view[
+        m_idx[:, None, None] % 32,
+        (m_idx[:, None, None] // 32) % 4,
+        m_idx[:, None, None] // 128,
+        k_idx[None, :, None] % 4,
+        k_idx[None, :, None] // 4,
+        l_idx[None, None, :],
+    ] = ref_blocks
+    # Flatten each 32 x 4 x 4 tile into 512 consecutive entries.
+    return packed_6d.view(l, rm, rk, 512)
+def create_blockscaled_scale_tensor(
+    l: int,
+    mn: int,
+    k: int,
+    sf_vec_size: int,
+    dtype: Type[cutlass.Numeric],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Create random block scales on CUDA, in reference and packed form.
+    One scale is drawn per ``sf_vec_size`` elements along k. For Float8E8M0FNU
+    the scales are 2**e with e in {0, 1}; for any other dtype they are integers
+    in {1, 2, 3}.
+    Returns:
+        ``(ref, packed)``: ``ref`` is float32 of shape (mn, k, l) with each
+        scale repeated across its block; ``packed`` holds the same scales in
+        the (l, rm, rk, 512) blocked layout, converted to ``dtype``.
+    """
+    sf_k = ceil_div(k, sf_vec_size)
+    block_shape = (mn, sf_k, l)
+    if dtype == cutlass.Float8E8M0FNU:
+        exponents = torch.randint(0, 2, block_shape, device="cuda", dtype=torch.int32)
+        ref_blocks = torch.pow(2.0, exponents.float())
+    else:
+        ref_blocks = torch.randint(1, 4, block_shape, device="cuda", dtype=torch.int32).float()
+    packed_f32 = _pack_blockscaled_scales(ref_blocks)
+    packed = torch.empty_like(packed_f32, dtype=torch_dtype_for_cutlass(dtype))
+    packed.copy_(packed_f32)
+    # Broadcast every scale over its sf_vec_size elements, batch-first ...
+    batch_first = ref_blocks.permute(2, 0, 1).unsqueeze(-1)
+    expanded = batch_first.expand(l, mn, sf_k, sf_vec_size).reshape(l, mn, sf_k * sf_vec_size)
+    # ... then return to (mn, k, l), trimming the tail when k is not a multiple
+    # of sf_vec_size.
+    ref = expanded.permute(1, 2, 0)[:, :k, :]
+    return ref, packed
+def pack_scale_2d_to_blocked_contig(scale_2d: torch.Tensor) -> torch.Tensor:
+    """Rearrange a (l, mn, sf_k) or (mn, sf_k) e8m0 scale tensor into the
+    contiguous (l, rm, rk, 512) blocked layout shared by the quack kernel and
+    cuBLAS's block-scaling. Each 512 B inner block holds one 128 MN × 4 K
+    swizzled tile. Pads `mn` to a multiple of 128 and `sf_k` to a multiple of
+    4 with zeros.
+    The data is shuffled as raw bytes and the result is viewed back as the
+    input dtype, so only one-byte element types are accepted.
+    Raises:
+        AssertionError: if the input is not 2-D or 3-D.
+        TypeError: if the element size of ``scale_2d`` is not one byte.
+    """
+    if scale_2d.dim() == 2:
+        scale_2d = scale_2d.unsqueeze(0)
+    assert scale_2d.dim() == 3, f"expected (l, mn, sf_k), got shape {tuple(scale_2d.shape)}"
+    if scale_2d.element_size() != 1:
+        # .view(torch.uint8) on a wider dtype would silently widen the last dim
+        # and only fail later with an opaque shape error.
+        raise TypeError(
+            f"expected a 1-byte scale dtype (e.g. float8_e8m0fnu), got {scale_2d.dtype}"
+        )
+    orig_dtype = scale_2d.dtype
+    l, mn, sf_k = scale_2d.shape
+    rm = ceil_div(mn, 128)
+    rk = ceil_div(sf_k, 4)
+    mn_pad = rm * 128
+    sf_k_pad = rk * 4
+    u8 = scale_2d.contiguous().view(torch.uint8)
+    if mn_pad != mn or sf_k_pad != sf_k:
+        padded = torch.zeros(l, mn_pad, sf_k_pad, device=scale_2d.device, dtype=torch.uint8)
+        padded[:, :mn, :sf_k] = u8
+    else:
+        padded = u8
+    # (l, mn_pad, sf_k_pad) -> (l, rm, 128, rk, 4) -> (l, rm, rk, 128, 4)
+    blocks = padded.view(l, rm, 128, rk, 4).permute(0, 1, 3, 2, 4)
+    # split 128 into (4 outer, 32 inner), then swap to (32, 4)
+    blocks = blocks.reshape(l, rm, rk, 4, 32, 4).transpose(3, 4).contiguous()
+    return blocks.view(l, rm, rk, 512).view(orig_dtype)
+def scale_view_for_kernel(scale_contig: torch.Tensor, mn: int, sf_k: int, l: int) -> torch.Tensor:
+    """Check a (l, rm, rk, 512) scale tensor and hand it back untouched.
+    Two properties are asserted: the shape equals
+    (l, ceil(mn / 128), ceil(sf_k / 4), 512), and the innermost dimension has
+    stride 1. The outer (L, rm, rk) strides are not checked, so a slice/view
+    of a larger buffer can be passed without a copy.
+    Raises:
+        AssertionError: if either property does not hold.
+    """
+    rm, rk = ceil_div(mn, 128), ceil_div(sf_k, 4)
+    shape_ok = scale_contig.shape == (l, rm, rk, 512)
+    assert shape_ok, (
+        f"expected (l, rm, rk, 512) = ({l}, {rm}, {rk}, 512), got {tuple(scale_contig.shape)}"
+    )
+    inner_is_unit_stride = scale_contig.stride(-1) == 1
+    assert inner_is_unit_stride, (
+        f"innermost 512-B dim must be unit-stride, got stride {scale_contig.stride(-1)}"
+    )
+    return scale_contig
+def scale_blocked_for_cublas(
+    scale_contig: torch.Tensor, mn: int, sf_k: int, l_idx: int = 0
+) -> torch.Tensor:
+    """Return batch slice ``l_idx`` of a contiguous 4-D (l, rm, rk, 512) scale
+    tensor, flattened to the 1-D swizzled layout torch._scaled_mm expects.
+    ``mn`` and ``sf_k`` are accepted but not used by the function body.
+    Raises:
+        AssertionError: if ``scale_contig`` is not contiguous or not 4-D.
+    """
+    is_contiguous_4d = scale_contig.is_contiguous() and scale_contig.dim() == 4
+    assert is_contiguous_4d
+    batch_slice = scale_contig[l_idx]
+    return batch_slice.reshape(-1)
+_FP4_E2M1_CODE_TO_VALUE = torch.tensor(FP4_E2M1FN_VALUES, dtype=torch.float32)
+def _fp4_unpacked_to_value(codes_u8: torch.Tensor) -> torch.Tensor:
+    """Map FP4 E2M1 codes in [0, 16) to their signed float32 values.
+    Code layout: bit 3 = sign, bits 0-2 = magnitude index into
+    {0, .5, 1, 1.5, 2, 3, 4, 6}. The lookup table is moved to the device of
+    ``codes_u8`` on every call.
+    """
+    lookup = _FP4_E2M1_CODE_TO_VALUE.to(codes_u8.device)
+    indices = codes_u8.long()
+    return lookup[indices]
+def _blockscaled_format_of(ab_dtype, sf_dtype, sf_vec_size) -> str:
+    """Name the blockscaled format matching an (operand dtype, scale dtype,
+    scale vector size) triple: ``"mxfp8"``, ``"mxfp4"`` or ``"nvfp4"``.
+    Raises:
+        ValueError: if the triple is not one of the three supported combinations.
+    """
+    supported = (
+        ("mxfp8", cutlass.Float8E4M3FN, cutlass.Float8E8M0FNU, 32),
+        ("mxfp4", cutlass.Float4E2M1FN, cutlass.Float8E8M0FNU, 32),
+        ("nvfp4", cutlass.Float4E2M1FN, cutlass.Float8E4M3FN, 16),
+    )
+    for name, ab, sf, vec in supported:
+        if ab_dtype == ab and sf_dtype == sf and sf_vec_size == vec:
+            return name
+    raise ValueError(
+        f"init=quant does not support (ab={ab_dtype}, sf={sf_dtype}, vec={sf_vec_size}). "
+        f"Supported: MXFP8 (e4m3+e8m0+32), MXFP4 (e2m1+e8m0+32), NVFP4 (e2m1+e4m3+16)."
+    )
+def create_blockscaled_operand_quantized(
+    l: int,
+    mn: int,
+    k: int,
+    is_mn_major: bool,
+    sf_vec_size: int = 32,
+    ab_dtype: Type[cutlass.Numeric] = cutlass.Float8E4M3FN,
+    sf_dtype: Type[cutlass.Numeric] = cutlass.Float8E8M0FNU,
+    *,
+    randn_std: Optional[float] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Draw a random bf16 operand on CUDA, quantize it, and return test inputs.
+    The format (MXFP8 / MXFP4 / NVFP4) is chosen from (ab_dtype, sf_dtype,
+    sf_vec_size). `randn_std` defaults to `k**-0.5` when None.
+    Returns (ref_mkl, q_mkl, scale_contig):
+    ref_mkl: (mn, k, l) float32 dequantized reference (quantized values * scales).
+    q_mkl: (mn, k, l) operand tensor in the layout the quack kernel consumes
+           (fp8 formats: the quantizer's output tensor; fp4 formats: packed
+           nibbles viewed as torch.float4_e2m1fn_x2 with shape (mn, k/2, l)).
+    scale_contig: (l, rm, rk, 512) contiguous scale storage. Each 512 B
+           inner block is one 128 MN × 4 K swizzled tile. Byte layout matches
+           cuBLAS `to_blocked`. Pass directly to the quack kernel, or use
+           `scale_blocked_for_cublas` for cuBLAS.
+    Raises NotImplementedError when is_mn_major is requested for an fp4 format.
+    """
+    fmt = _blockscaled_format_of(ab_dtype, sf_dtype, sf_vec_size)
+    if fmt != "mxfp8" and is_mn_major:
+        raise NotImplementedError(
+            f"is_mn_major=True is only supported for MXFP8 (tcgen05 MMA requires "
+            f"K-major for MXFP4/NVFP4 operands); got fmt={fmt}"
+        )
+    assert k % sf_vec_size == 0, f"k ({k}) must be divisible by sf_vec_size ({sf_vec_size})"
+    sf_k = k // sf_vec_size
+    std = k**-0.5 if randn_std is None else randn_std
+    x_hp = (torch.randn(l, mn, k, dtype=torch.bfloat16, device="cuda") * std).contiguous()
+    x_flat = x_hp.view(l * mn, k)
+    if fmt == "mxfp8":
+        q_flat, scale_2d = to_mx_compiled(x_flat, sf_vec_size)  # (l*mn, k), (l*mn, sf_k)
+        q_lmk = q_flat.view(l, mn, k)
+        if is_mn_major:
+            # MN-major operand: materialize (l, k, mn), then expose it as
+            # (mn, k, l) with strides (1, mn, mn*k).
+            q_mkl = q_lmk.transpose(1, 2).contiguous().permute(2, 1, 0)
+        else:
+            # K-major operand: a permuted VIEW of the (l, mn, k) buffer with
+            # strides (k, 1, mn*k). Calling .contiguous() AFTER the permute
+            # would make L the stride-1 dim and break K-majorness for l > 1.
+            q_mkl = q_lmk.contiguous().permute(1, 2, 0)
+        q_values = q_flat.float().view(l, mn, k)
+    else:
+        # fp4 formats: the quantizers return uint8 with two 4-bit codes per byte
+        # (low nibble = even K, high nibble = odd K).
+        if fmt == "mxfp4":
+            q_packed, scale_2d = to_mxfp4_compiled(x_flat, sf_vec_size)  # (l*mn, k/2), (l*mn, sf_k)
+        else:
+            q_packed, scale_2d, _pts = to_nvfp4_compiled(x_flat, sf_vec_size, None)
+        half_k = k // 2
+        # Decode each nibble to its fp4 value via the lookup helper, then
+        # interleave low/high back into K order.
+        low_vals = _fp4_unpacked_to_value((q_packed & 0x0F).view(l, mn, half_k))
+        high_vals = _fp4_unpacked_to_value(((q_packed >> 4) & 0x0F).view(l, mn, half_k))
+        q_values = torch.stack([low_vals, high_vals], dim=-1).reshape(l, mn, k)
+        # Kernel operand: (mn, k/2, l) K-major view (no post-permute contiguous!).
+        q_mkl = (
+            q_packed.view(l, mn, half_k).contiguous().permute(1, 2, 0).view(torch.float4_e2m1fn_x2)
+        )
+    # Shared tail: broadcast one scale over its sf_vec_size elements and dequantize.
+    scale_vals = scale_2d.float().view(l, mn, sf_k).repeat_interleave(sf_vec_size, dim=-1)
+    ref_mkl = (q_values * scale_vals).permute(1, 2, 0).contiguous()
+    scale_contig = pack_scale_2d_to_blocked_contig(scale_2d.view(l, mn, sf_k))
+    return ref_mkl, q_mkl, scale_contig
+def create_blockscaled_varlen_m_operands(
+    num_experts: int,
+    m_per: int,
+    n: int,
+    k: int,
+    sf_vec_size: int,
+    ab_dtype: Type[cutlass.Numeric] = cutlass.Float8E4M3FN,
+    sf_dtype: Type[cutlass.Numeric] = cutlass.Float8E8M0FNU,
+    *,
+    randn_std: Optional[float] = None,
+    seqlens_m: Optional[list] = None,
+    b_major: str = "k",
+):
+    """Create random quantized operands for a varlen_m blockscaled GEMM (MXFP8 only).
+    `seqlens_m` gives each expert's row count and need not be a multiple of
+    128; when None every expert gets `m_per` rows. `randn_std` defaults to
+    `k**-0.5`.
+    A is stored packed (no per-expert padding). Only its scale factors are
+    padded, dQaccum-style: expert `i`'s scale rows start at row
+    `(cu_seqlens_m[i] // 128 + i) * 128` of the padded scale buffer. The
+    original notes say the kernel decodes with the same formula via
+    `VarlenManager.offset_batch_SFA`.
+    Returns (a_ref, b_ref, qa, qb, a_sc_contig, b_sc_contig, cu_seqlens_m):
+      a_ref: (total_m, k) fp32 dequantized
+      b_ref: (num_experts, n, k) fp32 dequantized
+      qa:   (total_m, k) 2D K-major quantized operand
+      qb:   (n, k, num_experts) quantized operand; stride (k, 1, n*k) for
+        b_major="k", stride (1, n, n*k) for b_major="n"
+      a_sc_contig: (1, total_padded_rm, rk, 512) — dQaccum-padded SFA.
+        total_padded_rm = ceildiv(total_m, 128) + (num_experts - 1).
+      b_sc_contig: (num_experts, rn, rk, 512) — regular per-expert SFB.
+      cu_seqlens_m: (num_experts+1,) int32
+    Raises NotImplementedError for any format other than MXFP8.
+    """
+    assert k % sf_vec_size == 0
+    if seqlens_m is None:
+        seqlens_m = [m_per] * num_experts
+    assert len(seqlens_m) == num_experts, (
+        f"seqlens_m length {len(seqlens_m)} != num_experts {num_experts}"
+    )
+    total_m = int(sum(seqlens_m))
+    std = k**-0.5 if randn_std is None else randn_std
+    sf_k = k // sf_vec_size
+    is_mxfp8 = (
+        ab_dtype == cutlass.Float8E4M3FN and sf_dtype == cutlass.Float8E8M0FNU and sf_vec_size == 32
+    )
+    if not is_mxfp8:
+        raise NotImplementedError(
+            f"varlen_m currently only supports MXFP8 (got ab={ab_dtype}, sf={sf_dtype}, vec={sf_vec_size}). "
+            "FP4 support pending."
+        )
+    from .mx_utils import to_mx_compiled
+    # Quantize A: (total_m, k) bf16 -> (total_m, k) fp8 plus (total_m, sf_k) scales.
+    a_hp = (torch.randn(total_m, k, dtype=torch.bfloat16, device="cuda") * std).contiguous()
+    qa, sa_2d = to_mx_compiled(a_hp, sf_vec_size)
+    a_ref = qa.float() * sa_2d.float().repeat_interleave(sf_vec_size, dim=-1)
+    # Padded SFA storage: `ceildiv(total_m, 128) + (L - 1)` row tiles. This
+    # equals `total_m // 128 + L` when total_m % 128 > 0 and is one tile
+    # smaller when total_m is an exact multiple of 128.
+    tile = 128
+    total_padded_rm = (total_m + tile - 1) // tile + (num_experts - 1)
+    total_padded_m = total_padded_rm * tile
+    sa_2d_padded = torch.zeros(total_padded_m, sf_k, dtype=sa_2d.dtype, device=sa_2d.device)
+    # Exclusive prefix sums of the per-expert row counts; reused for cu_seqlens_m.
+    row_starts = [0] + list(itertools.accumulate(seqlens_m))
+    for i, (src_row, m_i) in enumerate(zip(row_starts, seqlens_m)):
+        # Expert i's scales land at padded tile index `src_row // 128 + i`.
+        dst_row = (src_row // tile + i) * tile
+        sa_2d_padded[dst_row : dst_row + m_i] = sa_2d[src_row : src_row + m_i]
+    a_sc_contig = pack_scale_2d_to_blocked_contig(sa_2d_padded.view(1, total_padded_m, sf_k))
+    # Quantize B: (num_experts, n, k) bf16 -> (n, k, num_experts) in the requested majorness.
+    assert b_major in ("k", "n"), f"b_major must be 'k' or 'n', got {b_major!r}"
+    b_hp = (torch.randn(num_experts, n, k, dtype=torch.bfloat16, device="cuda") * std).contiguous()
+    qb_flat, sb_2d = to_mx_compiled(b_hp.view(num_experts * n, k), sf_vec_size)
+    qb_enk = qb_flat.view(num_experts, n, k)
+    if b_major == "n":
+        qb = qb_enk.transpose(1, 2).contiguous().permute(2, 1, 0)  # stride (1, n, n*k)
+    else:
+        qb = qb_enk.contiguous().permute(1, 2, 0)  # stride (k, 1, n*k)
+    sb_2d = sb_2d.view(num_experts, n, sf_k)
+    b_sc_contig = pack_scale_2d_to_blocked_contig(sb_2d)
+    b_scale_vals = sb_2d.float().repeat_interleave(sf_vec_size, dim=-1)
+    b_ref = qb_flat.float().view(num_experts, n, k) * b_scale_vals
+    cu_seqlens_m = torch.tensor(row_starts, dtype=torch.int32, device="cuda")
+    return a_ref, b_ref, qa, qb, a_sc_contig, b_sc_contig, cu_seqlens_m
+def create_blockscaled_varlen_k_operands(
+    num_experts: int,
+    k_per: int,
+    m: int,
+    n: int,
+    sf_vec_size: int,
+    ab_dtype: Type[cutlass.Numeric] = cutlass.Float8E4M3FN,
+    sf_dtype: Type[cutlass.Numeric] = cutlass.Float8E8M0FNU,
+    *,
+    randn_std: Optional[float] = None,
+    seqlens_k: Optional[list] = None,
+):
+    """Generate bf16 randn + quantize for a varlen_k blockscaled GEMM (MXFP8 only).
+    `seqlens_k` gives each expert's K extent; when None every expert gets
+    `k_per`. Per-expert `k_i` must be a multiple of `sf_vec_size`
+    (quantization chunk) but NOT necessarily a multiple of `sf_vec_size * 4`
+    (= 128 for MXFP8). `randn_std` defaults to `max(seqlens_k) ** -0.5`.
+    The SF buffers use dQaccum-style K padding: expert `i`'s scales start at
+    source-K offset `(cu_seqlens_k[i] // 128 + i) * 128`, i.e. every expert
+    begins on a fresh 128-wide K tile. A and B operand data stay packed and
+    unpadded along K — only their SF buffers pad.
+    Returns (a_ref_list, b_ref_list, qa, qb, a_sc_contig, b_sc_contig, cu_seqlens_k):
+      a_ref_list: list of per-expert (m, k_i) fp32 dequantized A.
+      b_ref_list: list of per-expert (n, k_i) fp32 dequantized B.
+      qa:  (m, total_k) fp8, M-major (stride (1, m)).
+      qb:  (n, total_k) fp8, N-major (stride (1, n)).
+      a_sc_contig: (1, rm, total_padded_rk, 512) dQaccum-padded SFA.
+      b_sc_contig: (1, rn, total_padded_rk, 512) dQaccum-padded SFB.
+      cu_seqlens_k: (num_experts+1,) int32.
+    Raises NotImplementedError for any format other than MXFP8.
+    """
+    if not (
+        ab_dtype == cutlass.Float8E4M3FN and sf_dtype == cutlass.Float8E8M0FNU and sf_vec_size == 32
+    ):
+        raise NotImplementedError(
+            f"varlen_k currently only supports MXFP8 (got ab={ab_dtype}, sf={sf_dtype}, "
+            f"vec={sf_vec_size}). FP4 is k-major-only and not wired up."
+        )
+    if seqlens_k is None:
+        seqlens_k = [k_per] * num_experts
+    assert len(seqlens_k) == num_experts, (
+        f"seqlens_k length {len(seqlens_k)} != num_experts {num_experts}"
+    )
+    for i, k_i in enumerate(seqlens_k):
+        assert k_i % sf_vec_size == 0, (
+            f"seqlens_k[{i}]={k_i} must be divisible by sf_vec_size={sf_vec_size}"
+        )
+    total_k = int(sum(seqlens_k))
+    std = randn_std if randn_std is not None else (max(seqlens_k)) ** -0.5
+    from .mx_utils import to_mx_compiled
+    a_q_list, a_sc_list, a_ref_list = [], [], []
+    b_q_list, b_sc_list, b_ref_list = [], [], []
+    for k_i in seqlens_k:
+        # A slice: (m, k_i) bf16 -> fp8, scales (m, k_i // sf_vec_size).
+        a_hp = (torch.randn(m, k_i, dtype=torch.bfloat16, device="cuda") * std).contiguous()
+        a_q, a_sc = to_mx_compiled(a_hp, sf_vec_size)
+        a_q_list.append(a_q)
+        a_sc_list.append(a_sc)
+        a_ref_list.append(a_q.float() * a_sc.float().repeat_interleave(sf_vec_size, dim=-1))
+        # B slice: (n, k_i) bf16 -> fp8, scales (n, k_i // sf_vec_size).
+        b_hp = (torch.randn(n, k_i, dtype=torch.bfloat16, device="cuda") * std).contiguous()
+        b_q, b_sc = to_mx_compiled(b_hp, sf_vec_size)
+        b_q_list.append(b_q)
+        b_sc_list.append(b_sc)
+        b_ref_list.append(b_q.float() * b_sc.float().repeat_interleave(sf_vec_size, dim=-1))
+    # Pack operand data along K: (m, total_k), (n, total_k). varlen_k's
+    # ragged TMA descriptors are built for MN-major operands (stride 1 on
+    # M/N), so store M-major A and N-major B.
+    # cat gives K-major; transpose → contiguous → transpose to get M-major.
+    qa = torch.cat(a_q_list, dim=1).t().contiguous().t()  # (m, total_k) stride (1, m)
+    qb = torch.cat(b_q_list, dim=1).t().contiguous().t()  # (n, total_k) stride (1, n)
+    assert qa.stride() == (1, qa.shape[0])
+    assert qb.stride() == (1, qb.shape[0])
+    # Pad SFA/SFB per-expert to multiples of 128 source-K (= 4 scales).
+    # offset_tile = cu_seqlens[i] // 128 + i (same formula the kernel uses).
+    # Allocation = ceildiv(total_k, 128) + (L - 1) tiles (tighter than
+    # total_k//128 + L when total_k is a multiple of 128; same otherwise).
+    tile = 128  # sf_vec_size * 4
+    total_padded_rk = (total_k + tile - 1) // tile + (num_experts - 1)
+    total_padded_k = total_padded_rk * tile
+    total_padded_sf_k = total_padded_k // sf_vec_size
+    sa_2d_padded = torch.zeros(m, total_padded_sf_k, dtype=a_sc_list[0].dtype, device="cuda")
+    sb_2d_padded = torch.zeros(n, total_padded_sf_k, dtype=b_sc_list[0].dtype, device="cuda")
+    k_offset = 0
+    for i, k_i in enumerate(seqlens_k):
+        sf_k_i = k_i // sf_vec_size
+        # Start of expert i's padded K tile, first in source-K units, then in scale columns.
+        k_offset_padded = (k_offset // tile + i) * tile
+        sf_k_offset_padded = k_offset_padded // sf_vec_size
+        sa_2d_padded[:, sf_k_offset_padded : sf_k_offset_padded + sf_k_i] = a_sc_list[i]
+        sb_2d_padded[:, sf_k_offset_padded : sf_k_offset_padded + sf_k_i] = b_sc_list[i]
+        k_offset += k_i
+    a_sc_contig = pack_scale_2d_to_blocked_contig(sa_2d_padded.view(1, m, total_padded_sf_k))
+    b_sc_contig = pack_scale_2d_to_blocked_contig(sb_2d_padded.view(1, n, total_padded_sf_k))
+    cu_seqlens_k = torch.tensor(
+        [0] + list(itertools.accumulate(seqlens_k)), dtype=torch.int32, device="cuda"
+    )
+    return a_ref_list, b_ref_list, qa, qb, a_sc_contig, b_sc_contig, cu_seqlens_k
+def compile_blockscaled_gemm_tvm_ffi(
+    ab_dtype: Type[cutlass.Numeric],
+    sf_dtype: Type[cutlass.Numeric],
+    sf_vec_size: int,
+    d_dtype: Type[cutlass.Numeric],
+    mma_tiler_mn: Tuple[int, int],
+    cluster_shape_mn: Tuple[int, int],
+    mA: torch.Tensor,
+    mB: torch.Tensor,
+    mD: torch.Tensor,
+    mSFA: torch.Tensor,
+    mSFB: torch.Tensor,
+    *,
+    use_clc_persistence: bool = True,
+    varlen_m: bool = False,
+    varlen_k: bool = False,
+) -> Callable:
+    """Compile the SM100 blockscaled GEMM and return a launcher function.
+    The example tensors are only used to derive compile-time stand-ins; each
+    operand's stride-1 dimension is detected from the tensor actually passed.
+    Dense: the returned run(a, b, d, sfa, sfb) launches the kernel.
+    varlen_m: mA is (total_m, k), mD is (total_m, n), mB is (n, k, l);
+    run(...) takes an extra cu_seqlens_m tensor.
+    varlen_k: mA is (m, total_k), mB is (n, total_k), mD is (m, n, l);
+    run(...) takes an extra cu_seqlens_k tensor.
+    Raises RuntimeError when the device's major capability is not 10 or 11.
+    """
+    sm_major = get_device_capacity(mA.device)[0]
+    if sm_major not in (10, 11):
+        raise RuntimeError("Blockscaled SM100 GEMM requires SM100/SM110")
+    assert not (varlen_m and varlen_k), "Only one of varlen_m / varlen_k"
+    gemm = GemmDefaultSm100(
+        cutlass.Float32,
+        ab_dtype,
+        mma_tiler_mn,
+        (*cluster_shape_mn, 1),
+        sf_vec_size=sf_vec_size,
+        use_clc_persistence=use_clc_persistence,
+    )
+    compile_epi_args = gemm.EpilogueArguments()
+    cluster_size = cluster_shape_mn[0] * cluster_shape_mn[1]
+    scheduler_args = make_scheduler_args(
+        get_max_active_clusters(cluster_size),
+        max_swizzle_size=8,
+        tile_count_semaphore=None,
+        batch_idx_permute=None,
+    )
+    stream = cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True)
+    from .gemm_tvm_ffi_utils import make_fake_varlen_args
+    varlen_args_fake = make_fake_varlen_args(varlen_m, varlen_k, False, None) or VarlenArguments()
+    def _symbolic_fake(like: torch.Tensor, dtype, shape):
+        # Symbolic-shape stand-in whose leading (stride-1) dim is read off `like`.
+        return fake_tensor(
+            dtype,
+            shape,
+            leading_dim=_leading_dim_from_stride(like),
+            divisibility=div_for_dtype(dtype),
+        )
+    if varlen_m:
+        # A and D share the packed total_m extent; B keeps a per-expert batch dim.
+        total_m_sym = cute.sym_int()
+        n_sym, k_sym, l_sym = cute.sym_int(), cute.sym_int(), cute.sym_int()
+        fake_mA = _symbolic_fake(mA, ab_dtype, (total_m_sym, k_sym))
+        fake_mB = _symbolic_fake(mB, ab_dtype, (n_sym, k_sym, l_sym))
+        fake_mD = _symbolic_fake(mD, d_dtype, (total_m_sym, n_sym))
+    elif varlen_k:
+        # A and B share the packed total_k extent; D keeps the batch dim.
+        total_k_sym = cute.sym_int()
+        m_sym, n_sym, l_sym = cute.sym_int(), cute.sym_int(), cute.sym_int()
+        fake_mA = _symbolic_fake(mA, ab_dtype, (m_sym, total_k_sym))
+        fake_mB = _symbolic_fake(mB, ab_dtype, (n_sym, total_k_sym))
+        fake_mD = _symbolic_fake(mD, d_dtype, (m_sym, n_sym, l_sym))
+    else:
+        # Dense case: compact stand-ins with the example tensors' own shapes.
+        fake_mA, fake_mB, fake_mD = (
+            _make_fake_compact_tensor(
+                example.shape, dtype, leading_dim=_leading_dim_from_stride(example)
+            )
+            for example, dtype in ((mA, ab_dtype), (mB, ab_dtype), (mD, d_dtype))
+        )
+    @cute.jit
+    def runner(
+        a: cute.Tensor,
+        b: cute.Tensor,
+        d: cute.Tensor,
+        sfa: cute.Tensor,
+        sfb: cute.Tensor,
+        varlen_args,
+        stream,
+    ):
+        gemm(a, b, d, None, compile_epi_args, scheduler_args, varlen_args, stream, sfa, sfb, None)
+    fake_sfa = _make_compile_tensor_like(mSFA, sf_dtype, dynamic_layout=True)
+    fake_sfb = _make_compile_tensor_like(mSFB, sf_dtype, dynamic_layout=True)
+    compiled = cute.compile(
+        runner,
+        fake_mA,
+        fake_mB,
+        fake_mD,
+        fake_sfa,
+        fake_sfb,
+        varlen_args_fake,
+        stream,
+        options="--enable-tvm-ffi",
+    )
+    if not (varlen_m or varlen_k):
+        def run(a, b, d, sfa, sfb):
+            compiled(a, b, d, sfa, sfb, VarlenArguments())
+        return run
+    def run(a, b, d, sfa, sfb, cu_seqlens):
+        # The single cu_seqlens argument feeds whichever varlen mode was compiled.
+        varlen_args = VarlenArguments(
+            mCuSeqlensM=cu_seqlens if varlen_m else None,
+            mCuSeqlensK=cu_seqlens if varlen_k else None,
+        )
+        compiled(a, b, d, sfa, sfb, varlen_args)
+    return run
+def blockscaled_gemm_reference(
+    a_ref: torch.Tensor,
+    b_ref: torch.Tensor,
+    sfa_ref: torch.Tensor,
+    sfb_ref: torch.Tensor,
+) -> torch.Tensor:
+    """Reference blockscaled GEMM: scale A and B elementwise, then contract over K.
+    a_ref/sfa_ref are indexed (m, k, l), b_ref/sfb_ref are indexed (n, k, l);
+    the result is indexed (m, n, l), with l acting as a batch dimension.
+    """
+    a_scaled = torch.einsum("mkl,mkl->mkl", a_ref, sfa_ref)
+    b_scaled = torch.einsum("nkl,nkl->nkl", b_ref, sfb_ref)
+    return torch.einsum("mkl,nkl->mnl", a_scaled, b_scaled)

build/torch-cuda/quack/broadcast_utils.py CHANGED Viewed

@@ -11,7 +11,7 @@ from .layout_utils import make_acc_tensor_mn_view
 @cute.jit
 def vec_op(tCrC: cute.Tensor, tCrVec: cute.Tensor, op: Callable, is_colvec: bool) -> None:
     if const_expr(tCrC.element_type != Float32):  # Convert to f32
-        tCrC_f32 = cute.make_fragment(tCrC.shape, Float32)
         tCrC_f32.store(tCrC.load().to(Float32))
     else:
         tCrC_f32 = tCrC

 @cute.jit
 def vec_op(tCrC: cute.Tensor, tCrVec: cute.Tensor, op: Callable, is_colvec: bool) -> None:
     if const_expr(tCrC.element_type != Float32):  # Convert to f32
+        tCrC_f32 = cute.make_rmem_tensor(tCrC.shape, Float32)
         tCrC_f32.store(tCrC.load().to(Float32))
     else:
         tCrC_f32 = tCrC

build/torch-cuda/quack/cache_utils.py ADDED Viewed

	@@ -0,0 +1,195 @@

+# Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+"""Persistent .o cache for CuTe DSL compiled kernels.
+Compiled kernels are exported as object files (.o) via export_to_c.
+On subsequent runs the .o is loaded via tvm_ffi (~1ms) instead of
+re-generating IR + re-JIT'ing (~100ms per kernel).
+Controls:
+  QUACK_CACHE_ENABLED=0       — disable persistent .o cache (default: enabled)
+  QUACK_CACHE_DIR=path        — override default cache directory
+"""
+import fcntl
+import functools
+import hashlib
+import os
+import pickle
+import sys
+import tempfile
+import time
+from collections import namedtuple
+from getpass import getuser
+from pathlib import Path
+import cutlass
+import cutlass.cute as cute
+import tvm_ffi
+# The persistent .o cache is on unless QUACK_CACHE_ENABLED is set to anything other than "1".
+CACHE_ENABLED: bool = os.environ.get("QUACK_CACHE_ENABLED", "1") == "1"
+# Optional cache directory override; None selects the default under the temp dir.
+CACHE_DIR: str | None = os.environ.get("QUACK_CACHE_DIR")
+# When True, jit_cache returns a no-op in place of the compiled kernel.
+COMPILE_ONLY: bool = False
+# Downstream projects can append directories here to include their sources
+# in the cache fingerprint. Must be set before the first jit_cache call.
+EXTRA_SOURCE_DIRS: list[Path] = []
+# Function name used when exporting a kernel to, and loading it from, an object file.
+EXPORT_FUNC_NAME = "func"
+# Seconds jit_cache waits for a cache file lock.
+LOCK_TIMEOUT = 60
+CacheInfo = namedtuple("CacheInfo", ("hits", "misses", "maxsize", "currsize"))
+def _noop_kernel(*args, **kwargs):
+    """Accept any arguments, do nothing, and return None."""
+    return None
+def get_cache_path() -> Path:
+    """Return the cache root directory, creating it (and any parents) if missing.
+    Uses CACHE_DIR when set; otherwise `<tempdir>/<user>/quack_cache`.
+    """
+    if CACHE_DIR is None:
+        root = Path(tempfile.gettempdir(), getuser(), "quack_cache")
+    else:
+        root = Path(CACHE_DIR)
+    root.mkdir(parents=True, exist_ok=True)
+    return root
+def _hash_source_dir(h, root: Path) -> None:
+    """Feed every `*.py` file found recursively under *root* into hash object *h*.
+    Files are visited in sorted path order. Each contributes its POSIX-style
+    path relative to *root*, its size as an 8-byte little-endian integer, and
+    its raw bytes.
+    """
+    python_files = (path for path in sorted(root.rglob("*.py")) if path.is_file())
+    for path in python_files:
+        relative_name = path.relative_to(root).as_posix()
+        h.update(relative_name.encode())
+        data = path.read_bytes()
+        # The length prefix marks where this file's bytes end within the stream.
+        h.update(len(data).to_bytes(8, "little"))
+        h.update(data)
+@functools.lru_cache(maxsize=1)
+def _compute_source_fingerprint() -> str:
+    """Return a SHA-256 hex digest of the runtime version stamps and source trees.
+    Covers the Python major.minor version, the cutlass and tvm_ffi versions,
+    every `*.py` file under this module's directory, and every directory in
+    EXTRA_SOURCE_DIRS. The result is memoized, so EXTRA_SOURCE_DIRS is only
+    read on the first call.
+    """
+    digest = hashlib.sha256()
+    version_stamps = (
+        f"py{sys.version_info[0]}.{sys.version_info[1]}",
+        f"cutlass={cutlass.__version__}",
+        f"tvm_ffi={tvm_ffi.__version__}",
+    )
+    for stamp in version_stamps:
+        digest.update(stamp.encode())
+    source_roots = [Path(__file__).resolve().parent]
+    source_roots.extend(Path(extra).resolve() for extra in EXTRA_SOURCE_DIRS)
+    for source_root in source_roots:
+        _hash_source_dir(digest, source_root)
+    return digest.hexdigest()
+def _key_to_hash(key: tuple) -> str:
+    """Return the SHA-256 hex digest of the pickled *key*."""
+    serialized = pickle.dumps(key)
+    return hashlib.sha256(serialized).hexdigest()
+# ---------------------------------------------------------------------------
+# File locking
+# ---------------------------------------------------------------------------
+class FileLock:
+    """Advisory file lock using fcntl.flock with timeout.
+    Use as a context manager. `exclusive=True` takes LOCK_EX, otherwise
+    LOCK_SH. The lock file is created if missing. Acquisition polls with
+    non-blocking flock calls every 0.1 s and raises RuntimeError once
+    `timeout` seconds have elapsed; at least one attempt is always made, so
+    `timeout=0` acts as a single non-blocking try.
+    """
+    def __init__(self, lock_path: Path, exclusive: bool, timeout: float = 15):
+        self.lock_path = lock_path
+        self.exclusive = exclusive
+        self.timeout = timeout
+        # Descriptor of the lock file while the lock is held; -1 otherwise.
+        self._fd: int = -1
+    def __enter__(self) -> "FileLock":
+        flags = (os.O_WRONLY if self.exclusive else os.O_RDONLY) | os.O_CREAT
+        lock_type = fcntl.LOCK_EX if self.exclusive else fcntl.LOCK_SH
+        fd = os.open(str(self.lock_path), flags)
+        deadline = time.monotonic() + self.timeout
+        try:
+            while True:
+                try:
+                    fcntl.flock(fd, lock_type | fcntl.LOCK_NB)
+                except OSError:
+                    # Lock is busy (or flock failed): retry until the deadline passes.
+                    if time.monotonic() >= deadline:
+                        raise RuntimeError(
+                            f"Timed out waiting for lock: {self.lock_path}"
+                        ) from None
+                    time.sleep(0.1)
+                else:
+                    self._fd = fd
+                    return self
+        except BaseException:
+            # Never leak the descriptor, whether we timed out or were
+            # interrupted (e.g. KeyboardInterrupt during the sleep).
+            os.close(fd)
+            raise
+    def __exit__(self, *exc) -> None:
+        fd, self._fd = self._fd, -1
+        if fd >= 0:
+            try:
+                fcntl.flock(fd, fcntl.LOCK_UN)
+            finally:
+                # Close even if the unlock call raises.
+                os.close(fd)
+# ---------------------------------------------------------------------------
+# JIT cache decorator
+# ---------------------------------------------------------------------------
+def jit_cache(fn):
+    """Decorator that caches compiled CuTe DSL kernels in-memory and on disk.
+    The decorated function should return a compiled kernel (i.e. call cute.compile).
+    The in-memory key is the positional args plus the sorted kwargs items, so
+    all arguments must be hashable. The disk cache key is
+    (fn.__qualname__, *args, **sorted_kwargs), hashed into a file name under
+    a directory named after the source fingerprint.
+    Disk-cache problems never fail the call: a lock timeout on read falls
+    through to compilation, and an export failure is reported with print.
+    When COMPILE_ONLY is set the wrapper returns `_noop_kernel` instead of
+    the kernel. The wrapper also exposes `cache`, `cache_clear()` and
+    `cache_info()`.
+    """
+    cache = {}
+    hits = 0
+    misses = 0
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        nonlocal hits, misses
+        cache_key = args + tuple(sorted(kwargs.items())) if kwargs else args
+        # 1. In-memory hit
+        if cache_key in cache:
+            hits += 1
+            return _noop_kernel if COMPILE_ONLY else cache[cache_key]
+        # Read the switch once so the store step below can rely on the paths
+        # computed here even if CACHE_ENABLED is flipped while compiling.
+        use_disk = CACHE_ENABLED
+        # 2. Disk hit
+        if use_disk:
+            sha = _key_to_hash((fn.__qualname__,) + cache_key)
+            cache_path = get_cache_path() / _compute_source_fingerprint()
+            cache_path.mkdir(parents=True, exist_ok=True)
+            o_path = cache_path / f"{sha}.o"
+            lock_path = cache_path / f"{sha}.lock"
+            try:
+                with FileLock(lock_path, exclusive=False, timeout=LOCK_TIMEOUT):
+                    if o_path.exists():
+                        m = cute.runtime.load_module(str(o_path), enable_tvm_ffi=True)
+                        loaded = m[EXPORT_FUNC_NAME]
+                        cache[cache_key] = loaded
+                        hits += 1
+                        return _noop_kernel if COMPILE_ONLY else loaded
+            except RuntimeError:
+                # Covers the lock timeout raised by FileLock: fall through and compile.
+                pass
+        # 3. Compile
+        misses += 1
+        compiled_fn = fn(*args, **kwargs)
+        # 4. Store
+        cache[cache_key] = compiled_fn
+        if use_disk:
+            try:
+                with FileLock(lock_path, exclusive=True, timeout=LOCK_TIMEOUT):
+                    if not o_path.exists():
+                        o_path.parent.mkdir(parents=True, exist_ok=True)
+                        try:
+                            compiled_fn.export_to_c(
+                                object_file_path=str(o_path),
+                                function_name=EXPORT_FUNC_NAME,
+                            )
+                        except BaseException:
+                            # Do not leave a partial object file behind: later
+                            # runs would treat it as a valid cache entry.
+                            try:
+                                o_path.unlink(missing_ok=True)
+                            except OSError:
+                                pass
+                            raise
+            except Exception as e:
+                print(f"quack cache: export failed for key {sha}: {e}")
+        return _noop_kernel if COMPILE_ONLY else compiled_fn
+    def cache_clear():
+        # Resets the in-memory cache and counters only; files on disk are kept.
+        nonlocal hits, misses
+        cache.clear()
+        hits = 0
+        misses = 0
+    def cache_info():
+        return CacheInfo(hits=hits, misses=misses, maxsize=None, currsize=len(cache))
+    wrapper.cache = cache
+    wrapper.cache_clear = cache_clear
+    wrapper.cache_info = cache_info
+    return wrapper

build/torch-cuda/quack/copy_utils.py CHANGED Viewed

@@ -1,15 +1,25 @@
 # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
-import re
-from typing import Optional, Type, Tuple, Callable
 import cutlass
 import cutlass.cute as cute
-from cutlass import Int32, Boolean, const_expr
-from cutlass.cute.nvgpu import cpasync, warpgroup
 from cutlass.cutlass_dsl import dsl_user_op
 import cutlass.pipeline
 @dsl_user_op
@@ -26,7 +36,7 @@ def cvt_copy(
 ) -> None:
     assert isinstance(src.iterator, cute.Pointer) and src.memspace == cute.AddressSpace.rmem
     if const_expr(src.element_type != dst.element_type):
-        src_cvt = cute.make_fragment_like(src, dst.element_type)
         src_cvt.store(src.load().to(dst.element_type))
         src = src_cvt
     if const_expr(retile):
@@ -34,9 +44,33 @@ def cvt_copy(
     cute.copy(tiled_copy, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
 @dsl_user_op
 def load_s2r(src: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
-    dst = cute.make_fragment_like(src, src.element_type, loc=loc, ip=ip)
     cute.autovec_copy(src, dst, loc=loc, ip=ip)
     return dst
@@ -52,13 +86,23 @@ def load_s2r_retile(
 ) -> cute.Tensor:
     # Will also accept dst_shape being a tensor, in which case we write into that tensor
     if const_expr(not isinstance(dst_shape, cute.Tensor)):
-        dst = cute.make_fragment(dst_shape, src.element_type, loc=loc, ip=ip)
     else:
         dst = dst_shape
     cute.copy(tiled_copy, src, tiled_copy.retile(dst), loc=loc, ip=ip)
     return dst
 @dsl_user_op
 def get_copy_atom(
     dtype: Type[cutlass.Numeric], num_copy_elems: int, is_async: bool = False, *, loc=None, ip=None
@@ -117,7 +161,7 @@ def tiled_copy_2d(
 @cute.jit
 def predicate_k(tAcA: cute.Tensor, limit: Int32) -> cute.Tensor:
     # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if"
-    tApA = cute.make_fragment(
         cute.make_layout(
             (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
             stride=(cute.size(tAcA, mode=[2]), 0, 1),
@@ -147,28 +191,108 @@ def predicate_k(tAcA: cute.Tensor, limit: Int32) -> cute.Tensor:
 #     return cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)
-def parse_swizzle_from_pointer(ptr: cute.Pointer) -> Tuple[int, int, int]:
-    """Extract swizzle parameters from a pointer's swizzle_type.
-    The swizzle_type string has the form '!cute.swizzle<"S<b,m,s>">' where
-    b, m, s are the swizzle parameters (bits, base, shift).
-    Returns:
-        A cute.Swizzle object constructed from the extracted parameters
-    Raises:
-        ValueError: If the swizzle_type string cannot be parsed
-    """
-    # Ideally there should be a better API to get swizzle parameters, but we'll just parse
-    # the string here.
-    swizzle_str = str(ptr.type.swizzle_type)
-    # Extract the inner part "S<b,m,s>"
-    match = re.search(r"S<(\d+),(\d+),(\d+)>", swizzle_str)
-    if match:
-        b, m, s = int(match.group(1)), int(match.group(2)), int(match.group(3))
-        return b, m, s
     else:
-        raise ValueError(f"Could not parse swizzle_type: {swizzle_str}")
 def swizzle_int(ptr_int: Int32, b: int, m: int, s: int) -> Int32:
@@ -178,15 +302,16 @@ def swizzle_int(ptr_int: Int32, b: int, m: int, s: int) -> Int32:
 def swizzle_ptr(ptr: cute.Pointer):
-    b, m, s = parse_swizzle_from_pointer(ptr)
-    ptr_int = swizzle_int(ptr.toint(), b, m, s)
     return cute.make_ptr(ptr.dtype, ptr_int, ptr.memspace, assumed_align=ptr.alignment)
 def as_position_independent_swizzle_tensor(tensor: cute.Tensor) -> cute.Tensor:
     outer = tensor.layout
     width = tensor.element_type.width
-    inner = cute.make_swizzle(*parse_swizzle_from_pointer(tensor.iterator))
     # Need to recast the swizzle from byte (e.g. <3, 4, 3> to element units (e.g. <3, 3, 3> for
     # for 16 bits and <3, 2, 3> for 32 bits)
     new_layout = cute.recast_layout(
@@ -242,15 +367,16 @@ def sm90_get_smem_load_op(
         raise TypeError(f"elem_ty_c must be a Numeric, but got {elem_ty_c}")
     is_m_major = layout_c.is_m_major_c()
     if elem_ty_c.width == 16:
-        return cute.make_copy_atom(
-            cute.nvgpu.warp.LdMatrix8x8x16bOp(is_m_major, 4), elem_ty_c, loc=loc, ip=ip
-        )
     else:
         return cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), elem_ty_c, loc=loc, ip=ip)
 def get_smem_store_atom(
-    arch: cutlass.Constexpr[int], element_type: Type[cute.Numeric], transpose: bool = False
 ) -> cute.CopyAtom:
     if const_expr(arch < 90 or element_type.width != 16):
         return cute.make_copy_atom(
@@ -259,14 +385,22 @@ def get_smem_store_atom(
             num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
         )
     else:
         return cute.make_copy_atom(
-            cute.nvgpu.warp.StMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
             element_type,
         )
 def get_smem_load_atom(
-    arch: cutlass.Constexpr[int], element_type: Type[cute.Numeric], transpose: bool = False
 ) -> cute.CopyAtom:
     if const_expr(arch < 90 or element_type.width != 16):
         return cute.make_copy_atom(
@@ -275,8 +409,13 @@ def get_smem_load_atom(
             num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
         )
     else:
         return cute.make_copy_atom(
-            cute.nvgpu.warp.LdMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
             element_type,
         )
@@ -288,9 +427,10 @@ def get_smem_store_C(
     arch: int,
     transpose: bool = False,
     position_independent=False,
 ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
     dtype = sC.element_type
-    copy_atom = get_smem_store_atom(arch, dtype, transpose)
     tiled_copy = cute.make_tiled_copy_C(copy_atom, tiled_mma)
     thr_copy = tiled_copy.get_slice(tidx)
     if const_expr(not position_independent):
@@ -298,8 +438,9 @@ def get_smem_store_C(
     else:
         tRS_sC = partition_D_position_independent(thr_copy, sC)
-    def copy_fn(src: cute.Tensor, dst_idx: Int32, **new_kwargs):
-        cvt_copy(tiled_copy, src, tRS_sC[None, None, None, dst_idx], retile=True, **new_kwargs)
     return copy_fn, thr_copy, tRS_sC
@@ -324,14 +465,55 @@ def get_smem_load_C(
     thr_copy_RS = cute.make_tiled_copy_C(copy_atom_RS, tiled_mma).get_slice(tidx)
     tRS_shape = thr_copy_RS.partition_S(cute.make_identity_tensor(sC.shape[:2])).shape
-    def copy_fn(src_idx: Int32, **new_kwargs):
-        return load_s2r_retile(
-            tiled_copy, tSR_sC[None, None, None, src_idx], dst_shape=tRS_shape, **new_kwargs
-        )
     return copy_fn, thr_copy, tSR_sC
 def get_smem_store_A(
     tiled_mma: cute.TiledMma, sA: cute.Tensor, tidx: Int32, arch: int, position_independent=False
 ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
@@ -368,8 +550,6 @@ def get_smem_load_A(
         tSR_sA = thr_copy.partition_S(sA)
     else:
         tSR_sA = partition_S_position_independent(thr_copy, sA)
-    copy_atom_RS = get_smem_store_atom(arch, dtype, transpose)
-    thr_copy_RS = cute.make_tiled_copy_C(copy_atom_RS, tiled_mma).get_slice(tidx)
     tRS_shape = tiled_mma.partition_shape_A(sA.shape[:2])
     def copy_fn(src_idx: Int32, **new_kwargs):
@@ -383,6 +563,195 @@ def get_smem_load_A(
     return copy_fn if not with_dst_tensor else copy_fn_w_dst_tensor, thr_copy, tSR_sA
 def tma_get_copy_fn(
     atom: cute.CopyAtom,
     cta_coord: cute.Coord,
@@ -391,6 +760,9 @@ def tma_get_copy_fn(
     dst_tensor: cute.Tensor,
     filter_zeros: bool = False,
     single_stage: bool = False,
     **kwargs,
 ) -> Callable:
     src_is_smem = const_expr(
@@ -407,17 +779,23 @@ def tma_get_copy_fn(
         cta_layout,
         cute.group_modes(smem_tensor, 0, group_rank_smem),
         cute.group_modes(gmem_tensor, 0, group_rank_gmem),
     )
     if const_expr(filter_zeros):
         s = cute.filter_zeros(s)
         g = cute.filter_zeros(g)
     src, dst = (s, g) if src_is_smem else (g, s)
-    def copy_tma(src_idx, dst_idx, **new_kwargs):
-        cute.copy(atom, src[None, src_idx], dst[None, dst_idx], **new_kwargs, **kwargs)
-    def copy_tma_single_stage(**new_kwargs):
-        cute.copy(atom, src, dst, **new_kwargs, **kwargs)
     return (copy_tma if const_expr(not single_stage) else copy_tma_single_stage), s, g
@@ -438,22 +816,22 @@ def tma_producer_copy_fn(copy: Callable, pipeline: cutlass.pipeline.PipelineAsyn
 def gather_m_get_copy_fn(
     thr_copy_A: cute.ThrCopy,
     mA: cute.Tensor,  # (whatever, K)
-    sA: cute.Tensor,  # (tile_M, tile_N, STAGE)
     gsAIdx: cute.Tensor,  # (tile_M), either gmem or smem
     limit_m: Int32,
     limit_k: Int32,
 ) -> Callable:
-    tile_shape_mk = (cute.size(sA, mode=[0]), cute.size(sA, mode=[1]))
-    tAsA = thr_copy_A.partition_D(sA)
     # k-major
     assert tAsA.shape[2] == 1
     tAsA = cute.group_modes(cute.slice_(tAsA, (None, None, 0, None)), 0, 2)
-    is_even_m_smem = tile_shape_mk[0] % thr_copy_A.tiler_mn[0].shape == 0
     if const_expr(not is_even_m_smem):
-        limit_m = min(limit_m, tile_shape_mk[0])
     elems_per_load = cute.size(tAsA.shape[0][0])
-    cA = cute.make_identity_tensor(tile_shape_mk)
     tAcA = thr_copy_A.partition_S(cA)
     t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
     # Instead of comparing tAcA to limit_m, we instead compare t0AcA to limit_m - tAcA[0][0]
@@ -464,10 +842,10 @@ def gather_m_get_copy_fn(
     # Read and cache indices for A
     rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
     cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
-    tApA_m = cute.make_fragment(rows_per_thread, Boolean)
     for m in cutlass.range(rows_per_thread, unroll_full=True):
         tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
-    m_idx = cute.make_fragment(rows_per_thread, Int32)
     for m in cutlass.range(rows_per_thread, unroll_full=True):
         row_idx = tAcA[0, m, 0][0]
         if tApA_m[m]:
@@ -475,13 +853,13 @@ def gather_m_get_copy_fn(
         else:
             m_idx[m] = 0  # It's ok to load row 0 in the case of OOB
-    mA_k = cute.logical_divide(mA, (None, tile_shape_mk[1]))
     def copy_fn(src_idx, dst_idx, pred: bool = False):
         tApA_k = None
         if const_expr(pred):
-            tApA_k = cute.make_fragment(cols_per_thread, Boolean)
-            limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
         mA_cur = mA_k[None, (None, src_idx)]
@@ -506,7 +884,7 @@ def gather_m_get_copy_fn(
 def gather_k_get_copy_fn(
     thr_copy_A: cute.ThrCopy,
     mA: cute.Tensor,  # (tile_M, whatever)
-    sA: cute.Tensor,  # (tile_M, tile_N, STAGE)
     gsAIdx: cute.Tensor,  # (tile_K, RestK), either gmem or smem
     limit_m: Int32,
     limit_k: Int32,
@@ -538,7 +916,7 @@ def gather_k_get_copy_fn(
     # Read and cache indices for A
     rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
     cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
-    tApA_m = cute.make_fragment(rows_per_thread, Boolean)
     for m in cutlass.range(rows_per_thread, unroll_full=True):
         tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
     threads_per_col = const_expr(thr_copy_A.tiler_mn[0].shape // elems_per_load)
@@ -554,12 +932,12 @@ def gather_k_get_copy_fn(
         # Prefetch mAIdx early, even before smem is free
         tApA_k = None
         if const_expr(pred):
-            tApA_k = cute.make_fragment(cols_per_thread, Boolean)
             limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
         gAIdx_cur = gAIdx[None, src_idx]
-        k_idx = cute.make_fragment(cols_per_thread, Int32)
         for k in cutlass.range(cols_per_thread):
             col_idx = tAcA[0, 0, k][1]
             if const_expr(not pred):
@@ -576,13 +954,13 @@ def gather_k_get_copy_fn(
     ) -> Tuple[cute.Tensor, cute.Tensor]:
         tApA_k = None
         if const_expr(pred):
-            tApA_k = cute.make_fragment(cols_per_thread, Boolean)
             limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
         a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
         sAIdx_cur = sAIdx[None, dst_idx]
-        k_idx = cute.make_fragment(cols_per_thread, Int32)
         for k in cutlass.range(cols_per_thread):
             col_idx = tAcA[0, 0, k][1]
             k_idx[k] = sAIdx_cur[col_idx]
@@ -612,3 +990,194 @@ def gather_k_get_copy_fn(
     return copy_fn, prefetch_from_gmem_fn if const_expr(
         gAIdx is not None
     ) else prefetch_from_smem_fn

 # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+from typing import Optional, Type, Tuple, Callable, Sequence
+from functools import partial
 import cutlass
 import cutlass.cute as cute
+from cutlass import Int32, Int16, Boolean, const_expr
+from cutlass.cute.nvgpu import cpasync, warp, warpgroup
+from cutlass.cute.nvgpu.tcgen05.mma import CtaGroup  # noqa
 from cutlass.cutlass_dsl import dsl_user_op
 import cutlass.pipeline
+from cutlass._mlir.dialects import llvm
+from cutlass._mlir import ir
+from cutlass._mlir.dialects import cute_nvgpu as _cute_nvgpu_ir
+from . import layout_utils
+from .utils import make_vector
+Sm100MmaPeerBitMask = 0xFEFFFFFF
 @dsl_user_op
 ) -> None:
     assert isinstance(src.iterator, cute.Pointer) and src.memspace == cute.AddressSpace.rmem
     if const_expr(src.element_type != dst.element_type):
+        src_cvt = cute.make_rmem_tensor_like(src, dst.element_type)
         src_cvt.store(src.load().to(dst.element_type))
         src = src_cvt
     if const_expr(retile):
     cute.copy(tiled_copy, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
+@dsl_user_op
+def sr_cvt_copy(
+    tiled_copy: cute.TiledCopy,
+    src: cute.Tensor,
+    dst: cute.Tensor,
+    seed: Int32,
+    tidx: Int32,
+    *,
+    loc=None,
+    ip=None,
+) -> None:
+    """Like cvt_copy but uses stochastic rounding for FP32 -> BF16 conversion."""
     # The source must be a pointer-backed tensor living in register memory (rmem).
+    assert isinstance(src.iterator, cute.Pointer) and src.memspace == cute.AddressSpace.rmem
     # Both helpers are imported at call time rather than at module import time.
+    from .rounding import convert_f32_to_bf16_sr
+    from cutlass.cute.tensor import TensorSSA
     # Staging tensor modelled on src but holding dst's element type.
+    src_cvt = cute.make_rmem_tensor_like(src, dst.element_type)
+    src_vec = src.load()
     # seed and tidx are handed to the rounding helper unchanged; its raw result is re-wrapped
     # as a TensorSSA with src's shape and dst's element type before being stored.
+    raw_vec = convert_f32_to_bf16_sr(src_vec, seed, tidx, loc=loc, ip=ip)
+    src_cvt.store(TensorSSA(raw_vec, src_vec.shape, dst.element_type))
+    src = src_cvt
     # The conversion above is unconditional (no element-type comparison is made here), and
     # this copy is issued without a predicate or a retile step.
+    cute.copy(tiled_copy, src, dst, loc=loc, ip=ip)
 @dsl_user_op
 def load_s2r(src: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
     # Copies src into a newly allocated rmem tensor modelled on src (same element type) and
     # returns that tensor. The memory space of src is not checked here.
+    dst = cute.make_rmem_tensor_like(src, src.element_type, loc=loc, ip=ip)
     cute.autovec_copy(src, dst, loc=loc, ip=ip)
     return dst
 ) -> cute.Tensor:
     # Will also accept dst_shape being a tensor, in which case we write into that tensor
     if const_expr(not isinstance(dst_shape, cute.Tensor)):
+        dst = cute.make_rmem_tensor(dst_shape, src.element_type, loc=loc, ip=ip)
     else:
         dst = dst_shape
     cute.copy(tiled_copy, src, tiled_copy.retile(dst), loc=loc, ip=ip)
     return dst
+@dsl_user_op
+def load_t2r(
+    thr_copy: cute.ThrCopy, shape: cute.Shape, src: cute.Tensor, *, loc=None, ip=None
+) -> cute.Tensor:
     # Copies src through thr_copy into a newly allocated rmem tensor and returns that tensor.
     # The destination takes its shape from this thread's partition_D of an identity tensor
     # over `shape`, and its element type from src.
+    cDst = cute.make_identity_tensor(shape)
+    dst = cute.make_rmem_tensor(thr_copy.partition_D(cDst).shape, src.element_type, loc=loc, ip=ip)
+    cute.copy(thr_copy, src, dst, loc=loc, ip=ip)
+    return dst
 @dsl_user_op
 def get_copy_atom(
     dtype: Type[cutlass.Numeric], num_copy_elems: int, is_async: bool = False, *, loc=None, ip=None
 @cute.jit
 def predicate_k(tAcA: cute.Tensor, limit: Int32) -> cute.Tensor:
     # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if"
+    tApA = cute.make_rmem_tensor(
         cute.make_layout(
             (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
             stride=(cute.size(tAcA, mode=[2]), 0, 1),
 #     return cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)
+# Ragged tensor trick for TMA: encodes variable-length sequences into a higher-rank
+# tensor so that TMA's out-of-bounds checking handles sequence boundaries.
+#
+# Given a tensor T with a ragged dimension (variable-length across batches), we create
+# a higher-rank tensor where the ragged dim is replaced with a fixed size `big_int`, and
+# extra dim(s) are appended. When indexing into a specific sequence at (offset, length),
+# `offset_ragged_tensor` computes coordinates such that:
+#   ragged_coord = big_int - length   (OOB check clamps reads past the sequence end)
+#   extra_coord(s) = f(offset, length) (selects the correct memory region)
+#
+# ptr_shift=True: 1-extra-dim approach (adds 1 dim, supports up to 4D input):
+#   Shape:  (*before, big_int, *after, max_int)
+#   Stride: (*original_strides, stride_r)     where stride_r = T.stride[ragged_dim]
+#   Pointer shifted backward by big_int * stride_r elements.
+#   Address for coords (big_int - length) in ragged dim, (offset + length) in extra dim:
+#     addr = (base - big_int * s_r) + (big_int - length) * s_r + (offset + length) * s_r
+#          = base + offset * s_r                                                      [correct]
+#   Works for epilogue TMA store. Does NOT work for TMA load with large big_int
+#   — the shifted pointer must land in physically mapped GPU memory.
+#
+# ptr_shift=False: 2-extra-dim approach (adds 2 dims, supports up to 3D input):
+#   Shape:  (*before, big_int, *after, max_int, max_int)
+#   Stride: (*before_strides, stride_r, *after_strides, 2^34 - stride_r, stride_r)
+#   No pointer shift. Uses 64-bit address wraparound to cancel the ragged offset.
+#   Let W = 2^34 - stride_r. Address for coords (big_int - length) in ragged dim,
+#   big_int in extra dim 0, (offset + length) in extra dim 1:
+#     addr = base + (big_int - length) * s_r + big_int * W + (offset + length) * s_r
+#          = base + big_int * (s_r + W) - length * s_r + (offset + length) * s_r
+#          = base + big_int * 2^34 + offset * s_r
+#   Since big_int = 2^30: big_int * 2^34 = 2^64 ≡ 0 (mod 2^64), so:
+#     addr = base + offset * s_r                                                      [correct]
+#   Works for all TMA paths since the base pointer is never shifted.
+#
+# This ragged-tensor scheme is adapted from Triton's implementation; in addition, it offers
+# an option (ptr_shift=True) that needs only 1 extra dimension instead of 2.
+# https://github.com/triton-lang/triton/blob/main/python/triton/tools/ragged_tma.py
+BIG_INT = 2**30  # extent substituted for the ragged dimension
+MAX_INT = 2**31 - 1  # extent given to the appended extra dimension(s)
+BIG_INT_INV = 2**64 // BIG_INT  # == 2**34, so BIG_INT * BIG_INT_INV == 2**64
+@dsl_user_op
+def create_ragged_tensor_for_tma(
+    T: cute.Tensor,
+    ragged_dim: int = 0,
+    ptr_shift: bool = False,
+    *,
+    loc=None,
+    ip=None,
+) -> cute.Tensor:
     # Returns a higher-rank view of T: mode `ragged_dim` gets extent BIG_INT and one
     # (ptr_shift=True) or two (ptr_shift=False) extra modes are appended at the end.
     # loc/ip are accepted but not referenced in this body.
+    rank = cute.rank(T)
     # Allow Python-style negative indexing of the ragged mode.
+    if ragged_dim < 0:
+        ragged_dim += rank
+    if ptr_shift:
+        assert rank <= 4, "ptr_shift ragged tensor only supports up to 4 dimensions"
+        new_shape = T.shape[:ragged_dim] + (BIG_INT,) + T.shape[ragged_dim + 1 :] + (MAX_INT,)
         # The single appended mode reuses the ragged mode's stride.
+        new_stride = T.stride + (T.stride[ragged_dim],)
         # Move the iterator back by BIG_INT positions along the ragged mode only.
+        ptr_offset = (None,) * ragged_dim + (-BIG_INT,) + (None,) * (rank - ragged_dim - 1)
+        new_ptr = cute.domain_offset(ptr_offset, T).iterator
+        return cute.make_tensor(new_ptr, cute.make_layout(new_shape, stride=new_stride))
+    else:
+        assert rank <= 3, "non-ptr_shift ragged tensor only supports up to 3 dimensions"
+        stride_r = T.stride[ragged_dim]
+        new_shape = (
+            T.shape[:ragged_dim] + (BIG_INT,) + T.shape[ragged_dim + 1 :] + (MAX_INT, MAX_INT)
+        )
         # The two appended modes get strides (BIG_INT_INV - stride_r, stride_r); the original
         # iterator is reused without any shift.
+        new_stride = (
+            T.stride[:ragged_dim]
+            + (stride_r,)
+            + T.stride[ragged_dim + 1 :]
+            + (BIG_INT_INV - stride_r, stride_r)
+        )
+        return cute.make_tensor(T.iterator, cute.make_layout(new_shape, stride=new_stride))
+@dsl_user_op
+def offset_ragged_tensor(
+    T: cute.Tensor,
+    offset: Int32,
+    length: Int32,
+    ragged_dim: int = 0,
+    ptr_shift: bool = False,
+    *,
+    loc=None,
+    ip=None,
+) -> cute.Tensor:
     # Selects the (offset, length) sequence out of a ragged tensor. T is presumably the
     # output of create_ragged_tensor_for_tma built with the same ragged_dim / ptr_shift --
     # confirm at call sites. loc/ip are accepted but not referenced in this body.
+    rank = cute.rank(T)
     # NOTE(review): a negative ragged_dim is resolved against the rank of T as passed in,
     # which already includes the extra mode(s); the same negative value therefore names a
     # different mode here than it did in create_ragged_tensor_for_tma -- confirm callers
     # only pass non-negative values.
+    if ragged_dim < 0:
+        ragged_dim += rank
     # The extent of the ragged mode is used as big_int; the window starts `length` before it.
+    big_int = cute.size(T, mode=[ragged_dim])
+    offset_val = big_int - length
+    if ptr_shift:
+        # 1-extra-dim: rank = original_rank + 1
+        assert rank >= ragged_dim + 2
+        offset_tuple = (None,) * ragged_dim + (offset_val,) + (None,) * (rank - ragged_dim - 2)
+        index_tuple = (None,) * (rank - 1) + (offset + length,)
     else:
+        # 2-extra-dim: rank = original_rank + 2, last 2 modes are the wraparound dims
+        assert rank >= ragged_dim + 3
+        offset_tuple = (None,) * ragged_dim + (offset_val,) + (None,) * (rank - ragged_dim - 3)
+        index_tuple = (None,) * (rank - 2) + (big_int, offset + length)
     # Fix the extra mode(s) through index_tuple, then shift the ragged mode by offset_val.
+    return cute.domain_offset(offset_tuple, T[index_tuple])
 def swizzle_int(ptr_int: Int32, b: int, m: int, s: int) -> Int32:
 def swizzle_ptr(ptr: cute.Pointer):
     # Rebuilds ptr from an integer address that has been passed through swizzle_int, using
     # the swizzle parameters (num_bits, num_base, num_shift) recorded on the pointer's type.
+    swz = ptr.type.swizzle_type
+    ptr_int = swizzle_int(ptr.toint(), swz.num_bits, swz.num_base, swz.num_shift)
     # dtype, memory space and alignment are carried over from the original pointer.
     return cute.make_ptr(ptr.dtype, ptr_int, ptr.memspace, assumed_align=ptr.alignment)
 def as_position_independent_swizzle_tensor(tensor: cute.Tensor) -> cute.Tensor:
     outer = tensor.layout
     width = tensor.element_type.width
+    swizzle_type = tensor.iterator.type.swizzle_type
+    inner = cute.make_swizzle(swizzle_type.num_bits, swizzle_type.num_base, swizzle_type.num_shift)
     # Need to recast the swizzle from byte (e.g. <3, 4, 3> to element units (e.g. <3, 3, 3> for
     # for 16 bits and <3, 2, 3> for 32 bits)
     new_layout = cute.recast_layout(
         raise TypeError(f"elem_ty_c must be a Numeric, but got {elem_ty_c}")
     is_m_major = layout_c.is_m_major_c()
     if elem_ty_c.width == 16:
+        return cute.make_copy_atom(warp.LdMatrix8x8x16bOp(is_m_major, 4), elem_ty_c, loc=loc, ip=ip)
     else:
         return cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), elem_ty_c, loc=loc, ip=ip)
 def get_smem_store_atom(
+    arch: cutlass.Constexpr[int],
+    element_type: Type[cute.Numeric],
+    transpose: bool = False,
+    major_mode_size: Optional[int] = None,
 ) -> cute.CopyAtom:
     if const_expr(arch < 90 or element_type.width != 16):
         return cute.make_copy_atom(
             num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
         )
     else:
+        num_matrices = (
+            4
+            if major_mode_size is None or major_mode_size % 16 == 0
+            else (2 if major_mode_size % 8 == 0 else 1)
+        )
         return cute.make_copy_atom(
+            warp.StMatrix8x8x16bOp(transpose=transpose, num_matrices=num_matrices),
             element_type,
         )
 def get_smem_load_atom(
+    arch: cutlass.Constexpr[int],
+    element_type: Type[cute.Numeric],
+    transpose: bool = False,
+    major_mode_size: Optional[int] = None,
 ) -> cute.CopyAtom:
     if const_expr(arch < 90 or element_type.width != 16):
         return cute.make_copy_atom(
             num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
         )
     else:
+        num_matrices = (
+            4
+            if major_mode_size is None or major_mode_size % 16 == 0
+            else (2 if major_mode_size % 8 == 0 else 1)
+        )
         return cute.make_copy_atom(
+            warp.LdMatrix8x8x16bOp(transpose=transpose, num_matrices=num_matrices),
             element_type,
         )
     arch: int,
     transpose: bool = False,
     position_independent=False,
+    major_mode_size: Optional[int] = None,
 ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
     dtype = sC.element_type
+    copy_atom = get_smem_store_atom(arch, dtype, transpose, major_mode_size=major_mode_size)
     tiled_copy = cute.make_tiled_copy_C(copy_atom, tiled_mma)
     thr_copy = tiled_copy.get_slice(tidx)
     if const_expr(not position_independent):
     else:
         tRS_sC = partition_D_position_independent(thr_copy, sC)
+    def copy_fn(src: cute.Tensor, dst_idx: Optional[Int32] = None, **new_kwargs):
+        dst_tensor = tRS_sC if const_expr(dst_idx is None) else tRS_sC[None, None, None, dst_idx]
+        cvt_copy(tiled_copy, src, dst_tensor, retile=True, **new_kwargs)
     return copy_fn, thr_copy, tRS_sC
     thr_copy_RS = cute.make_tiled_copy_C(copy_atom_RS, tiled_mma).get_slice(tidx)
     tRS_shape = thr_copy_RS.partition_S(cute.make_identity_tensor(sC.shape[:2])).shape
+    def copy_fn(src_idx: Optional[Int32] = None, **new_kwargs):
+        src_tensor = tSR_sC if const_expr(src_idx is None) else tSR_sC[None, None, None, src_idx]
+        return load_s2r_retile(tiled_copy, src_tensor, dst_shape=tRS_shape, **new_kwargs)
     return copy_fn, thr_copy, tSR_sC
+def epilog_smem_copy_atom(
+    tiled_mma: cute.TiledMma, epi_tile: cute.Shape, transpose: bool = False
+) -> cute.TiledCopy:
     # Returns cute.make_tiled_copy_C_atom(...) for tiled_mma, built from a StMatrix8x8x16bOp
     # copy atom. 4 matrices are used when epi_tile[1] is a multiple of 16, otherwise 2.
+    copy_atom_C = cute.make_copy_atom(
+        warp.StMatrix8x8x16bOp(transpose, num_matrices=4 if epi_tile[1] % 16 == 0 else 2),
+        cutlass.Float16,  # this is just to get the right source layout
+    )
+    tiled_copy_C_atom = cute.make_tiled_copy_C_atom(copy_atom_C, tiled_mma)
+    return tiled_copy_C_atom
+def get_smem_store_epi(
+    tiled_mma: cute.TiledMma,
+    epi_tile: cute.Shape,
+    sC: Optional[cute.Tensor],
+    tidx: Int32,
+    arch: int,
+    transpose: bool = False,
+    position_independent=False,
+) -> Tuple[Callable, cute.TiledCopy, cute.Tensor, cute.Tensor]:
     # Sets up the epilogue store path into sC for thread `tidx`.
     # Returns (copy_fn, thr_copy, tRS_sC, tRS_rC); copy_fn and tRS_sC are None when sC is None.
     # Without an sC tensor, Float16 stands in as the element type.
+    dtype = sC.element_type if const_expr(sC is not None) else cutlass.Float16
     # NOTE(review): `transpose` reaches get_smem_store_atom below but is not passed to
     # epilog_smem_copy_atom, which therefore uses its default (False) -- confirm intended.
+    tiled_copy_C_atom = epilog_smem_copy_atom(tiled_mma, epi_tile)
+    copy_atom = get_smem_store_atom(arch, dtype, transpose)
+    tiled_copy = cute.make_tiled_copy_S(copy_atom, tiled_copy_C_atom)
+    thr_copy = tiled_copy.get_slice(tidx)
+    tRS_sC = None
+    if const_expr(sC is not None):
+        if const_expr(not position_independent):
+            tRS_sC = thr_copy.partition_D(sC)
+        else:
+            tRS_sC = partition_D_position_independent(thr_copy, sC)
     # Fall back to epi_tile as the 2D shape when there is no sC to take it from.
+    sC_shape = sC.shape[:2] if sC is not None else epi_tile
+    # (R2S, R2S_M, R2S_N, PIPE_C)
+    tRS_rC_shape = thr_copy.partition_S(cute.make_identity_tensor(sC_shape)).shape
     # Register tensor in the MMA accumulator dtype, shaped like this thread's source partition.
+    tRS_rC = cute.make_rmem_tensor(tRS_rC_shape, tiled_mma.op.acc_dtype)
     # copy_fn sends src through cvt_copy into slot dst_idx of tRS_sC (indexed on its 4th mode).
+    def copy_fn(src: cute.Tensor, dst_idx: Int32, **new_kwargs):
+        cvt_copy(tiled_copy, src, tRS_sC[None, None, None, dst_idx], **new_kwargs)
+    return copy_fn if const_expr(sC is not None) else None, thr_copy, tRS_sC, tRS_rC
 def get_smem_store_A(
     tiled_mma: cute.TiledMma, sA: cute.Tensor, tidx: Int32, arch: int, position_independent=False
 ) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
         tSR_sA = thr_copy.partition_S(sA)
     else:
         tSR_sA = partition_S_position_independent(thr_copy, sA)
     tRS_shape = tiled_mma.partition_shape_A(sA.shape[:2])
     def copy_fn(src_idx: Int32, **new_kwargs):
     return copy_fn if not with_dst_tensor else copy_fn_w_dst_tensor, thr_copy, tSR_sA
+@dsl_user_op
+def cpasync_reduce_bulk_add_f32(
+    smem_ptr: cute.Pointer,
+    gmem_ptr: cute.Pointer,
+    store_bytes: int | Int32,
+    *,
+    loc=None,
+    ip=None,
+):
+    smem_ptr_i32 = smem_ptr.toint(loc=loc, ip=ip).ir_value()
+    # cache_hint = cutlass.Int64(0x14F0000000000000)  # EVICT_LAST
+    llvm.inline_asm(
+        None,
+        [gmem_ptr.llvm_ptr, smem_ptr_i32, Int32(store_bytes).ir_value()],
+        "cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [$0], [$1], $2;",
+        "l,r,r",
+        # [gmem_ptr.llvm_ptr, smem_ptr_i32, Int32(store_bytes).ir_value(), cache_hint.ir_value()],
+        # "cp.reduce.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.add.f32 [$0], [$1], $2, $3;",
+        # "l,r,r,l",
+        has_side_effects=True,
+        is_align_stack=False,
+    )
+@dsl_user_op
+def get_tma_desc_addr(tma_atom: cute.CopyAtom, *, loc=None, ip=None) -> cute.Pointer:
+    """
+    Get the address of the TMA descriptor embedded in a TMA Copy Atom.
+    Extracts the constant memory address of the TMA descriptor for use with
+    custom PTX instructions.
+    :param tma_atom: TMA Copy Atom from make_tiled_tma_atom
+    :return: Pointer to TMA descriptor in constant memory
+    Example:
+        >>> desc_ptr = get_tma_desc_addr(tma_atom)
+    """
     # Turn the atom's trait value into an executable TMA atom, then query its descriptor
     # address as a generic-space pointer type declared with 128-byte alignment.
+    exec_atom = _cute_nvgpu_ir.atom_make_exec_tma(tma_atom._trait.value, loc=loc, ip=ip)
+    tma_desc_ptr_type = ir.Type.parse(
+        "!cute.ptr<!cute_nvgpu.tma_descriptor_tiled, generic, align<128>>"
+    )
+    return _cute_nvgpu_ir.get_tma_desc_addr(tma_desc_ptr_type, exec_atom, loc=loc, ip=ip)
+@dsl_user_op
+def tma_gather4_load(
+    tma_desc_ptr: cute.Pointer,
+    dst_smem_ptr: cute.Pointer,
+    mbarrier_ptr: cute.Pointer,
+    col_idx: Int32,
+    row_indices: Sequence[Int32],
+    *,
+    num_cta: int = 1,
+    multicast_mask=None,
+    loc=None,
+    ip=None,
+) -> None:
+    """
+    Perform TMA gather4 load from global memory to shared memory.
+    Issues PTX instruction:
+    cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::<num_cta>
+        [dstMem], [tensorMap, {col_idx, row0, row1, row2, row3}], [smem_bar];
+    This loads 4 rows (specified by row_indices) from a 2D tensor at the given
+    column index into shared memory, using the TMA descriptor.
+    :param tma_desc_ptr: Pointer to TMA descriptor in constant memory (128-byte aligned)
+    :type tma_desc_ptr:  Pointer
+    :param dst_smem_ptr: Destination address in shared memory
+    :type dst_smem_ptr:  Pointer
+    :param mbarrier_ptr: Pointer to mbarrier in shared memory for completion tracking
+    :type mbarrier_ptr:  Pointer
+    :param col_idx:      Column index
+    :type col_idx:       Int32
+    :param row_indices:  Sequence of exactly 4 row indices
+    :type row_indices:   Sequence[Int32]
+    :param num_cta:      Number of CTAs participating (default: 1); emitted as the
+                         cta_group suffix, and when > 1 the mbarrier address is masked
+                         with Sm100MmaPeerBitMask
+    :type num_cta:       int
+    :param multicast_mask: Optional multicast mask (not supported yet; must be None)
+    :type multicast_mask: Int16
+    :raises ValueError:  If row_indices does not contain exactly 4 elements
+    Requirements:
+        - row_indices must contain exactly 4 elements
+        - Compute capability >= SM_100 (Blackwell)
+        - TMA descriptor must be properly initialized for 2D tensor
+    Example:
+        >>> from cutlass.cute.nvgpu import cpasync
+        >>> from cutlass.cute import core
+        >>>
+        >>> # Create TMA descriptor
+        >>> tma_atom, tma_tensor = cpasync.make_tiled_tma_atom(...)
+        >>> tma_desc_ptr = get_tma_desc_addr(tma_atom)
+        >>>
+        >>> # Compute indices (typically from kernel logic)
+        >>> col_idx = core.get(...) or 5  # Int32 value
+        >>> row_indices = [core.get(...) for _ in range(4)]  # 4 Int32 values
+        >>>
+        >>> # Gather 4 rows at computed column
+        >>> tma_gather4_load(
+        ...     tma_desc_ptr=tma_desc_ptr,
+        ...     dst_smem_ptr=smem_ptr,
+        ...     mbarrier_ptr=barrier_ptr,
+        ...     col_idx=col_idx,
+        ...     row_indices=row_indices
+        ... )
+    """
+    if len(row_indices) != 4:
+        raise ValueError(f"gather4 requires exactly 4 row indices, got {len(row_indices)}")
+    col_val = Int32(col_idx).ir_value()
+    row_vals = [Int32(row_idx).ir_value() for row_idx in row_indices]
+    # Convert pointers to integer addresses
+    desc_addr = tma_desc_ptr.toint(loc=loc, ip=ip).ir_value()
+    dst_addr = dst_smem_ptr.toint(loc=loc, ip=ip).ir_value()
+    mbar_addr = mbarrier_ptr.toint(loc=loc, ip=ip)
+    if num_cta > 1:
+        # Executed by both CTAs. Set peer bit to 0 so that the
+        # transaction bytes will update CTA0's barrier.
+        mbar_addr = mbar_addr & Sm100MmaPeerBitMask
+    mbar_addr = mbar_addr.ir_value()
+    # Handle multicast_mask - may already be ir.Value or Python int
+    multicast_mask_val = None
+    if multicast_mask is not None:
+        multicast_mask_val = Int16(multicast_mask).ir_value()
     # Any non-None multicast_mask is rejected here, after it has been converted above.
+    assert multicast_mask_val is None, "multicast is not supported yet"
+    # Emit inline PTX for TMA gather4
+    # PTX: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes
+    #      [dstMem], [tensorMap, {col, row0, row1, row2, row3}], [smem_bar];
+    ptx = (
+        f"cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::{num_cta} "
+        "[$0], [$1, {$2, $3, $4, $5, $6}], [$7];"
+    )
+    llvm.inline_asm(
+        None,
+        [
+            dst_addr,
+            desc_addr,
+            col_val,
+            row_vals[0],
+            row_vals[1],
+            row_vals[2],
+            row_vals[3],
+            mbar_addr,
+        ],
+        ptx,
+        "r,l,r,r,r,r,r,r",  # constraints: register, long, 6x register
+        has_side_effects=True,
+        is_align_stack=False,
+        loc=loc,
+        ip=ip,
+    )
+def cpasync_bulk_get_copy_fn(
+    src_tensor: cute.Tensor,
+    dst_tensor: cute.Tensor,
+    single_stage: bool = False,
+    **kwargs,
+) -> Callable:
     # Returns a closure that copies src_tensor into dst_tensor with a cpasync.CopyBulkG2SOp
     # atom. **kwargs given here are forwarded to every cute.copy call the closure makes.
     # In the multi-stage case the last mode of each tensor is left ungrouped so that it can
     # be indexed per call; in the single-stage case all modes are grouped together.
+    group_rank_src = const_expr(cute.rank(src_tensor) - (1 if not single_stage else 0))
+    group_rank_dst = const_expr(cute.rank(dst_tensor) - (1 if not single_stage else 0))
+    # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
+    src = cute.group_modes(src_tensor, 0, group_rank_src)
+    dst = cute.group_modes(dst_tensor, 0, group_rank_dst)
     # Multi-stage variant: src_idx / dst_idx select the slice along the ungrouped last mode.
+    def copy_bulk(src_idx, dst_idx, tma_bar_ptr: cute.Pointer, **new_kwargs):
+        atom = cute.make_copy_atom(cpasync.CopyBulkG2SOp(), src.element_type)
         # The copy is issued under cute.arch.elect_one(), with tma_bar_ptr as its mbar_ptr.
+        with cute.arch.elect_one():
+            cute.copy(
+                atom,
+                src[None, src_idx],
+                dst[None, dst_idx],
+                mbar_ptr=tma_bar_ptr,
+                **new_kwargs,
+                **kwargs,
+            )
     # Single-stage variant: copies the whole of src into dst, with no stage indices.
+    def copy_bulk_single_stage(tma_bar_ptr: cute.Pointer, **new_kwargs):
+        atom = cute.make_copy_atom(cpasync.CopyBulkG2SOp(), src.element_type)
+        with cute.arch.elect_one():
+            cute.copy(atom, src, dst, mbar_ptr=tma_bar_ptr, **new_kwargs, **kwargs)
+    return copy_bulk if const_expr(not single_stage) else copy_bulk_single_stage
+@dsl_user_op
 def tma_get_copy_fn(
     atom: cute.CopyAtom,
     cta_coord: cute.Coord,
     dst_tensor: cute.Tensor,
     filter_zeros: bool = False,
     single_stage: bool = False,
+    *,
+    loc=None,
+    ip=None,
     **kwargs,
 ) -> Callable:
     src_is_smem = const_expr(
         cta_layout,
         cute.group_modes(smem_tensor, 0, group_rank_smem),
         cute.group_modes(gmem_tensor, 0, group_rank_gmem),
+        loc=loc,
+        ip=ip,
     )
     if const_expr(filter_zeros):
         s = cute.filter_zeros(s)
         g = cute.filter_zeros(g)
     src, dst = (s, g) if src_is_smem else (g, s)
+    @dsl_user_op
+    def copy_tma(src_idx, dst_idx, *, loc=None, ip=None, **new_kwargs):
+        cute.copy(
+            atom, src[None, src_idx], dst[None, dst_idx], **new_kwargs, **kwargs, loc=loc, ip=ip
+        )
+    @dsl_user_op
+    def copy_tma_single_stage(*, loc=None, ip=None, **new_kwargs):
+        cute.copy(atom, src, dst, **new_kwargs, **kwargs, loc=loc, ip=ip)
     return (copy_tma if const_expr(not single_stage) else copy_tma_single_stage), s, g
 def gather_m_get_copy_fn(
     thr_copy_A: cute.ThrCopy,
     mA: cute.Tensor,  # (whatever, K)
+    sA: cute.Tensor,  # (tile_M, tile_K, STAGE)
     gsAIdx: cute.Tensor,  # (tile_M), either gmem or smem
     limit_m: Int32,
     limit_k: Int32,
 ) -> Callable:
+    tile_M, tile_K = cute.size(sA, mode=[0]), cute.size(sA, mode=[1])
+    tAsA = partition_D_position_independent(thr_copy_A, sA)
     # k-major
     assert tAsA.shape[2] == 1
     tAsA = cute.group_modes(cute.slice_(tAsA, (None, None, 0, None)), 0, 2)
+    is_even_m_smem = tile_M % thr_copy_A.tiler_mn[0].shape == 0
     if const_expr(not is_even_m_smem):
+        limit_m = min(limit_m, tile_M)
     elems_per_load = cute.size(tAsA.shape[0][0])
+    cA = cute.make_identity_tensor((tile_M, tile_K))
     tAcA = thr_copy_A.partition_S(cA)
     t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
     # Instead of comparing tAcA to limit_m, we instead compare t0AcA to limit_m - tAcA[0][0]
     # Read and cache indices for A
     rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
     cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
+    tApA_m = cute.make_rmem_tensor(rows_per_thread, Boolean)
     for m in cutlass.range(rows_per_thread, unroll_full=True):
         tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
+    m_idx = cute.make_rmem_tensor(rows_per_thread, Int32)
     for m in cutlass.range(rows_per_thread, unroll_full=True):
         row_idx = tAcA[0, m, 0][0]
         if tApA_m[m]:
         else:
             m_idx[m] = 0  # It's ok to load row 0 in the case of OOB
+    mA_k = cute.logical_divide(mA, (None, tile_K))
     def copy_fn(src_idx, dst_idx, pred: bool = False):
         tApA_k = None
         if const_expr(pred):
+            tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
+            limit_k_cur = limit_k - src_idx * tile_K
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
         mA_cur = mA_k[None, (None, src_idx)]
 def gather_k_get_copy_fn(
     thr_copy_A: cute.ThrCopy,
     mA: cute.Tensor,  # (tile_M, whatever)
+    sA: cute.Tensor,  # (tile_M, tile_K, STAGE)
     gsAIdx: cute.Tensor,  # (tile_K, RestK), either gmem or smem
     limit_m: Int32,
     limit_k: Int32,
     # Read and cache indices for A
     rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
     cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
+    tApA_m = cute.make_rmem_tensor(rows_per_thread, Boolean)
     for m in cutlass.range(rows_per_thread, unroll_full=True):
         tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
     threads_per_col = const_expr(thr_copy_A.tiler_mn[0].shape // elems_per_load)
         # Prefetch mAIdx early, even before smem is free
         tApA_k = None
         if const_expr(pred):
+            tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
             limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
         gAIdx_cur = gAIdx[None, src_idx]
+        k_idx = cute.make_rmem_tensor(cols_per_thread, Int32)
         for k in cutlass.range(cols_per_thread):
             col_idx = tAcA[0, 0, k][1]
             if const_expr(not pred):
     ) -> Tuple[cute.Tensor, cute.Tensor]:
         tApA_k = None
         if const_expr(pred):
+            tApA_k = cute.make_rmem_tensor(cols_per_thread, Boolean)
             limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
             for k in cutlass.range(cols_per_thread, unroll_full=True):
                 tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
         a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
         sAIdx_cur = sAIdx[None, dst_idx]
+        k_idx = cute.make_rmem_tensor(cols_per_thread, Int32)
         for k in cutlass.range(cols_per_thread):
             col_idx = tAcA[0, 0, k][1]
             k_idx[k] = sAIdx_cur[col_idx]
     return copy_fn, prefetch_from_gmem_fn if const_expr(
         gAIdx is not None
     ) else prefetch_from_smem_fn
+@cute.jit
+def gather_m_get_tma_copy_fn(
+    tma_atom: cute.CopyAtom,
+    mA: cute.Tensor,  # (whatever, K)
+    sA: cute.Tensor,  # ((4, 32), (64, 1), STAGE)
+    sAIdx: cute.Tensor,  # (tile_M),
+    warp_idx: Int32,
+    num_warps: int,
+    num_cta: int = 1,
+) -> Callable:
+    """Build a copy closure that loads A through `tma_gather4_load`, gathering rows along M.
+    The row indices in `sAIdx` are read into registers once, when this function runs;
+    the returned closure reuses them on every call.
+    Args:
+        tma_atom: copy atom whose descriptor address is passed to every gather4 call.
+        mA: not referenced in this function body.
+        sA: destination tensor; its last mode is indexed by `dst_idx`.
+        sAIdx: row indices, partitioned across warps in vectors of 4 Int32.
+        warp_idx: selects this warp's slice of the index/destination partition.
+        num_warps: size of the thread layout used for the partition.
+        num_cta: forwarded to `tma_gather4_load` as `num_cta`.
+    Returns:
+        copy_fn(src_idx, dst_idx, tma_bar_ptr), which issues one gather4 call per group of
+        4 row indices held by this warp, at column offset `tile_K * src_idx`.
+    """
+    tile_M = cute.size(sAIdx, mode=[0])
+    # sA[None, None, 0] is one stage of the buffer; dividing its size by tile_M gives tile_K.
+    tile_K = cute.size(sA[None, None, 0]) // tile_M
+    # Indices are consumed 4 at a time below.
+    assert tile_M % 4 == 0
+    # cta_group = 1 if tma_atom.op.cta_group == CtaGroup.ONE else 2
+    cta_group = num_cta  # Somehow all tma_atom has CtaGroup.ONE inside the kernel
+    # 128-bit copies of Int32, i.e. 4 indices per vector, one "thread" slot per warp.
+    copy_AIdx_s2r = cute.make_tiled_copy_tv(
+        cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Int32, num_bits_per_copy=128),
+        cute.make_layout(num_warps),  # thr_layout
+        cute.make_layout(4),  # val_layout
+    )
+    # NOTE(review): unlike gather_k_get_tma_copy_fn, warp_idx is not passed through
+    # cute.arch.make_warp_uniform here; confirm whether that is intentional.
+    warp_copy_AIdx_s2r = copy_AIdx_s2r.get_slice(warp_idx)
+    tSR_sAIdx = warp_copy_AIdx_s2r.partition_S(sAIdx)
+    # ((4, 1), 8, (64, 1), STAGE)
+    tSR_sA = warp_copy_AIdx_s2r.partition_S(sA)
+    # Load this warp's row indices once, outside the returned closure.
+    tSR_rAIdx = load_s2r(tSR_sAIdx)
+    tma_desc_ptr = get_tma_desc_addr(tma_atom)
+    tma_gather4_load_fn = partial(tma_gather4_load, tma_desc_ptr, num_cta=cta_group)
+    def copy_fn(src_idx, dst_idx, tma_bar_ptr: cute.Pointer):
+        # Destination view for stage dst_idx.
+        tSR_sA_cur = tSR_sA[None, None, None, dst_idx]
+        # Column offset of the src_idx-th K tile.
+        col_idx = tile_K * src_idx
+        for m in cutlass.range(cute.size(tSR_rAIdx, mode=[1]), unroll_full=True):
+            # The 4 row indices of the m-th index vector.
+            row_indices = [tSR_rAIdx[v, m] for v in range(4)]
+            smem_ptr = tSR_sA_cur[None, m, None].iterator
+            # Only the elected thread issues the load.
+            with cute.arch.elect_one():
+                tma_gather4_load_fn(smem_ptr, tma_bar_ptr, col_idx, row_indices)
+    return copy_fn
+@cute.jit
+def gather_k_get_tma_copy_fn(
+    tma_atom: cute.CopyAtom,
+    sA: cute.Tensor,  # ((4, tile_K/4), (tile_M,), STAGE) — K-grouped load layout
+    sAIdx: cute.Tensor,  # (tile_K, a_prefetch_stage) — K indices in smem
+    col_idx: Int32,  # M offset in global tensor (contiguous dim for M-major)
+    warp_idx: Int32,
+    num_warps: int,
+    num_cta: int = 1,
+) -> Tuple[Callable, Callable]:
+    """Build a copy function for TMA gather4 in K dimension (M-major A).
+    Each gather4 instruction loads 4 K-columns × tile_M contiguous M-elements.
+    col_idx is the absolute M position in the global tensor.
+    K indices come from sAIdx (prefetched to smem by the scheduler warp).
+    Returns a pair (copy_fn, prefetch_from_smem_fn):
+      prefetch_from_smem_fn(a_prefetch_pipeline, src_idx, dst_idx, a_prefetch_consumer_state)
+        waits on the prefetch pipeline, loads this warp's K indices for stage dst_idx
+        into registers, releases the pipeline stage, and returns the register tensor.
+      copy_fn(src_idx, dst_idx, tSR_rAIdx, tma_bar_ptr)
+        Issues gather4 calls with those K indices as row_indices
+    """
+    tile_K = cute.size(sAIdx, mode=[0])
+    # Indices are consumed 4 at a time below.
+    assert tile_K % 4 == 0
+    cta_group = num_cta
+    # Tiled copy for loading K indices from smem to registers (4 per vector, across warps)
+    copy_AIdx_s2r = cute.make_tiled_copy_tv(
+        cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Int32, num_bits_per_copy=128),
+        cute.make_layout(num_warps),  # thr_layout
+        cute.make_layout(4),  # val_layout — 4 K indices per gather4
+    )
+    warp_idx = cute.arch.make_warp_uniform(warp_idx)
+    warp_copy_AIdx_s2r = copy_AIdx_s2r.get_slice(warp_idx)
+    tSR_sAIdx = warp_copy_AIdx_s2r.partition_S(sAIdx)  # (((4,1),4,4))
+    # ((4,1),4,(64,2),(1,4)):((64,0),1024,(1,4096),(0,8192))
+    tSR_sA = warp_copy_AIdx_s2r.partition_S(layout_utils.transpose_view(sA))
+    tma_desc_ptr = get_tma_desc_addr(tma_atom)
+    tma_gather4_load_fn = partial(tma_gather4_load, tma_desc_ptr, num_cta=cta_group)
+    def prefetch_from_smem_fn(
+        a_prefetch_pipeline,
+        src_idx,
+        dst_idx,
+        a_prefetch_consumer_state,
+    ) -> cute.Tensor:
+        # src_idx is not used here; only the dst_idx stage of the index buffer is read.
+        a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
+        tSR_rAIdx = load_s2r(tSR_sAIdx[None, None, dst_idx])
+        # Sync the warp before a single elected thread releases the stage.
+        cute.arch.sync_warp()
+        with cute.arch.elect_one():
+            a_prefetch_pipeline.consumer_release(a_prefetch_consumer_state)
+        return tSR_rAIdx
+    def copy_fn(src_idx, dst_idx, tSR_rAIdx, tma_bar_ptr: cute.Pointer):
+        # src_idx is not used here; the K positions come entirely from tSR_rAIdx.
+        # Issue gather4: col_idx = M position, row_indices = 4 K positions
+        tSR_sA_cur = tSR_sA[None, None, None, dst_idx]
+        gather_dim = cute.size(tSR_sA_cur, mode=[2, 0])  # Typically 64
+        for k in cutlass.range(cute.size(tSR_rAIdx, mode=[1]), unroll_full=True):
+            # The 4 K indices of the k-th index vector.
+            row_indices = [tSR_rAIdx[v, k] for v in range(4)]
+            # One call per chunk of gather_dim elements along M.
+            for m in cutlass.range(cute.size(tSR_sA_cur, mode=[2, 1]), unroll_full=True):
+                smem_ptr = tSR_sA_cur[None, k, (None, m)].iterator
+                with cute.arch.elect_one():
+                    tma_gather4_load_fn(
+                        smem_ptr, tma_bar_ptr, col_idx + m * gather_dim, row_indices
+                    )
+    return copy_fn, prefetch_from_smem_fn
+# ---------------------------------------------------------------------------
+# Store helpers
+# ---------------------------------------------------------------------------
+@dsl_user_op
+@cute.jit
+def store(
+    ptr: cute.Pointer,
+    val,
+    pred: Optional[Boolean] = None,
+    cop: cutlass.Constexpr = None,
+    *,
+    loc=None,
+    ip=None,
+):
+    """Store a scalar value via cute.arch.store.
+    ptr:  cute.Pointer (any address space).
+    val:  DSL Numeric value.
+    pred: None → unconditional.  DSL Boolean → skipped when pred == 0.
+    cop:  Cache operator — "wb" (default), "cg", "cs" (streaming), "wt".
+    loc, ip: forwarded unchanged to cute.arch.store.
+    """
+    # `pred is None` is resolved at trace time, so the unpredicated path emits no branch.
+    if const_expr(pred is None):
+        # type(val)(val) re-wraps the value in its own type before storing
+        # (presumably to normalize it to a DSL value; confirm).
+        cute.arch.store(ptr.llvm_ptr, type(val)(val), cop=cop, loc=loc, ip=ip)
+    else:
+        # Store only when the predicate holds.
+        if pred:
+            cute.arch.store(ptr.llvm_ptr, type(val)(val), cop=cop, loc=loc, ip=ip)
+@dsl_user_op
+@cute.jit
+def store_v2(
+    ptr: cute.Pointer,
+    v0,
+    v1,
+    pred: Optional[Boolean] = None,
+    cop: cutlass.Constexpr = None,
+    *,
+    loc=None,
+    ip=None,
+):
+    """Vectorized store of 2 elements via cute.arch.store.
+    Packs v0, v1 into an MLIR <2 x T> vector.
+    ptr:  cute.Pointer (any address space, must be aligned for vector width).
+    v0, v1: the two elements; the vector element type is taken from type(v0).
+    pred: None → unconditional.  Otherwise the store is issued only when pred is true.
+    cop:  Cache operator — "wb" (default), "cg", "cs" (streaming), "wt".
+    loc, ip: forwarded unchanged to make_vector and cute.arch.store.
+    """
+    # The vector is built before the predicate check, so it is formed on both paths.
+    vec = make_vector(type(v0), v0, v1, loc=loc, ip=ip)
+    # `pred is None` is resolved at trace time.
+    if const_expr(pred is None):
+        cute.arch.store(ptr.llvm_ptr, vec, cop=cop, loc=loc, ip=ip)
+    else:
+        if pred:
+            cute.arch.store(ptr.llvm_ptr, vec, cop=cop, loc=loc, ip=ip)
+@dsl_user_op
+@cute.jit
+def store_v4(
+    ptr: cute.Pointer,
+    v0,
+    v1,
+    v2,
+    v3,
+    pred: Optional[Boolean] = None,
+    cop: cutlass.Constexpr = None,
+    *,
+    loc=None,
+    ip=None,
+):
+    """Vectorized store of 4 elements via cute.arch.store.
+    Packs v0–v3 into an MLIR <4 x T> vector.
+    ptr:  cute.Pointer (any address space, must be aligned for vector width).
+    v0..v3: the four elements; the vector element type is taken from type(v0).
+    pred: None → unconditional.  Otherwise the store is issued only when pred is true.
+    cop:  Cache operator — "wb" (default), "cg", "cs" (streaming), "wt".
+    loc, ip: forwarded unchanged to make_vector and cute.arch.store.
+    """
+    # The vector is built before the predicate check, so it is formed on both paths.
+    vec = make_vector(type(v0), v0, v1, v2, v3, loc=loc, ip=ip)
+    # `pred is None` is resolved at trace time.
+    if const_expr(pred is None):
+        cute.arch.store(ptr.llvm_ptr, vec, cop=cop, loc=loc, ip=ip)
+    else:
+        if pred:
+            cute.arch.store(ptr.llvm_ptr, vec, cop=cop, loc=loc, ip=ip)

build/torch-cuda/quack/cross_entropy.py ADDED Viewed

	@@ -0,0 +1,716 @@

+# Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+import math
+from functools import partial
+from typing import Optional, Type, Literal
+import torch
+from ._ops_compat import add_quack_op_namespace_prefix
+from torch import Tensor
+import cuda.bindings.driver as cuda
+import cutlass
+import cutlass.cute as cute
+from cutlass import Int32, Int64, Float32, Boolean, const_expr
+from . import utils as utils
+from . import copy_utils as copy_utils
+from . import layout_utils as layout_utils
+from .compile_utils import make_fake_tensor as fake_tensor
+from .reduce import row_reduce, online_softmax_reduce
+from .reduction_base import ReductionBase
+from .cache_utils import jit_cache
+from .cute_dsl_utils import torch2cute_dtype_map
+from cutlass.base_dsl import Arch
+class CrossEntropy(ReductionBase):
+    """Row-wise cross-entropy forward kernel over an (M, N) logits tensor.
+    For every row it writes the loss, optionally the log-sum-exp, and optionally the
+    gradient with respect to the logits. Two reduction strategies are available:
+    a two-pass max-then-sum reduction, or a single online-softmax reduction.
+    """
+    def __init__(self, dtype: Type[cutlass.Numeric], N: int, online_softmax: bool = True):
+        """Configure the reduction for element type `dtype` and row length `N`.
+        With `online_softmax` a single reduction stage with an Int64 reduction dtype is
+        used; otherwise two Float32 stages are used.
+        """
+        self.online_softmax = online_softmax
+        # 2 stages: 1 for max, 1 for sum
+        super().__init__(
+            dtype,
+            N,
+            stage=2 if not self.online_softmax else 1,
+            reduction_dtype=Float32 if not self.online_softmax else Int64,
+        )
+        # Only the two-pass path with N > 16384 re-reads x from smem between the passes.
+        self.reload_from = None if N <= 16384 or self.online_softmax else "smem"
+    def _threads_per_row(self):
+        """Return the number of threads cooperating on one row, chosen from N."""
+        N = self.N
+        # First (limit, threads) entry whose limit covers N wins.
+        for limit, threads in [(64, 8), (128, 16), (3072, 32), (6144, 64), (16384, 128)]:
+            if N <= limit:
+                return threads
+        return 256
+    def _set_cluster_n(self):
+        """Set `self.cluster_n` from the target architecture, dtype width and N."""
+        arch = cutlass.base_dsl.BaseDSL._get_dsl().get_arch_enum()
+        # SM8x (Ampere/Ada) lacks cluster support
+        if arch < Arch.sm_90:
+            self.cluster_n = 1
+            return
+        # SM12x supports cluster up to 8
+        max_cluster = 8 if arch.major == 12 else 16
+        N = self.N
+        if arch.major == 12 and const_expr(self.dtype.width >= 32):
+            # SM12x 99 KB SMEM: fp32 needs tighter clustering (same limits as fp16)
+            thresholds = [(16 * 1024, 1), (32 * 1024, 2), (64 * 1024, 4), (128 * 1024, 8)]
+        elif const_expr(self.dtype.width == 16):
+            thresholds = [(16 * 1024, 1), (32 * 1024, 2), (64 * 1024, 4), (128 * 1024, 8)]
+        else:
+            thresholds = [(16 * 1024, 1), (64 * 1024, 2), (128 * 1024, 4), (256 * 1024, 8)]
+        # First (limit, cluster) entry whose limit covers N wins.
+        for limit, cluster in thresholds:
+            if N <= limit:
+                self.cluster_n = cluster
+                return
+        # N exceeds every threshold: use the architecture maximum.
+        self.cluster_n = max_cluster
+    @cute.jit
+    def __call__(
+        self,
+        mX: cute.Tensor,  # (M, N)
+        mTarget: cute.Tensor,  # (M,)
+        mTargetLogit: Optional[cute.Tensor],  # (M, K) or (M,). If None, we use mX
+        mLoss: cute.Tensor,  # (M,)
+        mLSE: Optional[cute.Tensor],  # (M,)
+        mdX: Optional[cute.Tensor],  # (M, N) - if provided, compute gradient
+        ignore_index: Int32,  # Index to ignore in loss computation
+        stream: cuda.CUstream,
+    ):
+        """Pick the launch configuration and launch `kernel` on `stream`."""
+        assert mX.element_type == self.dtype
+        # Without a separate target-logit tensor, the target logit is read from mX itself.
+        if const_expr(mTargetLogit is None):
+            mTargetLogit = mX
+        if const_expr(mdX is not None):
+            assert mdX.element_type == self.dtype
+        self._set_cluster_n()
+        # Vector width: at most 128 bits of the widest element type, and a divisor of N.
+        largest_dtype_width = const_expr(mX.element_type.width)
+        if const_expr(mdX is not None):
+            largest_dtype_width = const_expr(max(largest_dtype_width, mdX.element_type.width))
+        vecsize = math.gcd(self.N, 128 // largest_dtype_width)
+        tiled_copy, tiler_mn, threads_per_row = self._get_tiled_copy(vecsize=vecsize)
+        num_threads = tiled_copy.size
+        # Grid: x covers row tiles, y covers the cluster_n column slices of each row.
+        self.kernel(
+            mX,
+            mTarget,
+            mTargetLogit,
+            mLoss,
+            mLSE,
+            mdX,
+            ignore_index,
+            tiler_mn,
+            tiled_copy,
+            threads_per_row,
+        ).launch(
+            grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), self.cluster_n, 1],
+            block=[num_threads, 1, 1],
+            cluster=[1, self.cluster_n, 1] if const_expr(self.cluster_n > 1) else None,
+            stream=stream,
+        )
+    @cute.kernel
+    def kernel(
+        self,
+        mX: cute.Tensor,  # (M, N)
+        mTarget: cute.Tensor,  # (M,)
+        mTargetLogit: cute.Tensor,  # (M, K) or (M,)
+        mLoss: cute.Tensor,  # (M,)
+        mLSE: Optional[cute.Tensor],  # (M,)
+        mdX: Optional[cute.Tensor],  # (M, N) - if provided, compute gradient
+        ignore_index: Int32,  # Index to ignore in loss computation
+        tiler_mn: cute.Shape,
+        tiled_copy: cute.TiledCopy,
+        threads_per_row: cutlass.Constexpr[int],
+    ):
+        """Device kernel: per-row loss, optional LSE and optional gradient."""
+        tidx, _, _ = cute.arch.thread_idx()
+        bidx, _, _ = cute.arch.block_idx()
+        # Column-tile index: 0 without clusters, otherwise the block's y index.
+        cluster_y = const_expr(0) if const_expr(self.cluster_n == 1) else cute.arch.block_idx()[1]
+        tv_layout = tiled_copy.layout_tv_tiled
+        shape = mX.shape
+        # Identity tensor: each element holds its own (row, col) coordinate.
+        idX = cute.make_identity_tensor(shape)
+        # slice for CTAs
+        gX, cX = [cute.local_tile(mT, tiler_mn, (bidx, cluster_y)) for mT in (mX, idX)]
+        smem = cutlass.utils.SmemAllocator()
+        sX = smem.allocate_tensor(
+            mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
+        )
+        reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+        thr_copy = tiled_copy.get_slice(tidx)
+        tXgX = thr_copy.partition_S(gX)
+        tXsX = thr_copy.partition_D(sX)
+        tXcX = thr_copy.partition_S(cX)[(0, None), None, None]
+        tXrX = cute.make_rmem_tensor_like(tXgX)
+        # Column predication is only needed when the tiles do not cover N exactly.
+        is_even_N = const_expr(shape[1] == tiler_mn[1] * self.cluster_n)
+        tXpX = (
+            None if is_even_N else copy_utils.predicate_k(thr_copy.partition_S(cX), limit=shape[1])
+        )
+        copy = partial(copy_utils.copy, pred=tXpX)
+        num_warps = cute.size(tiled_copy) // cute.arch.WARP_SIZE
+        self._initialize_cluster(tidx, mbar_ptr, num_warps)
+        # Row coordinate handled by this thread.
+        row = tXcX[0][0]
+        target = Int32.zero
+        if row < shape[0]:
+            target = Int32(mTarget[row])
+        # Rows past M are not loaded.
+        if row < shape[0]:
+            copy(tXgX, tXsX, is_async=True)
+        cute.arch.cp_async_commit_group()
+        cute.arch.cp_async_wait_group(0)
+        # Fill OOB values with -inf
+        if const_expr(not is_even_N):
+            utils.fill_oob(tXsX, tXpX, -tXsX.element_type.inf)
+        cute.autovec_copy(tXsX, tXrX)
+        x = tXrX.load().to(Float32)
+        target_logit = Float32.zero
+        should_ignore = Boolean(target == ignore_index)
+        # Only the thread whose first column coordinate is 0 reads the target logit.
+        if row < shape[0] and tXcX[0][1] == 0 and not should_ignore:
+            # Only load target logit if not ignoring this index
+            if const_expr(cute.rank(mTargetLogit.shape) == 2):
+                target_logit = Float32(mTargetLogit[row, target])
+            else:
+                assert cute.rank(mTargetLogit.shape) == 1
+                target_logit = Float32(mTargetLogit[row])
+        if const_expr(not self.online_softmax):
+            # Two-pass path: row max first, then sum of exp(x - max).
+            max_x = row_reduce(
+                x,
+                cute.ReductionOp.MAX,
+                threads_per_row,
+                reduction_buffer[None, None, 0],
+                mbar_ptr + 0 if const_expr(self.cluster_n > 1) else None,
+                init_val=-Float32.inf,
+                hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
+            )
+            if const_expr(self.reload_from == "smem"):
+                cute.autovec_copy(tXsX, tXrX)
+                x = tXrX.load().to(Float32)
+            log2_e = math.log2(math.e)
+            # This would use ffma instead of fadd then fmul
+            exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=False)
+            denom = row_reduce(
+                exp_x,
+                cute.ReductionOp.ADD,
+                threads_per_row,
+                reduction_buffer[None, None, 1],
+                mbar_ptr + 1 if const_expr(self.cluster_n > 1) else None,
+                init_val=0.0,
+            )
+        else:
+            # Online path: max and sum come from one combined reduction.
+            max_x, denom, exp_x = online_softmax_reduce(
+                x,
+                threads_per_row,
+                reduction_buffer[None, None, 0],
+                mbar_ptr,
+                hook_fn=cute.arch.cluster_wait if const_expr(self.cluster_n > 1) else None,
+                return_exp_x=const_expr(mdX is not None),
+            )
+        # Write loss and lse to gmem
+        # One writer per row: column coordinate 0, valid row, first block of the cluster.
+        if (
+            tXcX[0][1] == 0
+            and row < shape[0]
+            and (self.cluster_n == 1 or cute.arch.block_idx_in_cluster() == 0)
+        ):
+            lse = max_x + cute.math.log(denom, fastmath=True)
+            # Set loss to 0 if this index should be ignored, otherwise compute normally
+            loss_val = (lse - target_logit) if not should_ignore else Float32.zero
+            mLoss[row] = mLoss.element_type(loss_val)
+            if const_expr(mLSE is not None):
+                mLSE[row] = lse
+        # Compute gradient if mdX is provided
+        if const_expr(mdX is not None):
+            # Compute probabilities: exp(x) / sum(exp(x))
+            # If ignored, gradient should be zero
+            # A zero or NaN denominator also yields a zero scale (denom != denom tests NaN).
+            denom_inv = (
+                # 1.0 / denom
+                cute.arch.rcp_approx(denom)
+                if not (denom == 0.0 or denom != denom or should_ignore)
+                else Float32.zero
+            )
+            probs = exp_x * denom_inv
+            gdX = cute.local_tile(mdX, tiler_mn, (bidx, cluster_y))
+            tXgdX = thr_copy.partition_D(gdX)
+            tXrdX = cute.make_rmem_tensor_like(tXgdX)
+            tXcFull = thr_copy.partition_S(cX)
+            # Compute gradient: probs for all classes, (probs - 1) for target class
+            # If ignored, gradient is already zero
+            tXrdX_f32 = cute.make_rmem_tensor_like(tXrX, Float32)
+            tXrdX_f32.store(probs)
+            if not should_ignore:
+                for i in cutlass.range(cute.size(tXrX), unroll_full=True):
+                    tXrdX_f32[i] = tXrdX_f32[i] if tXcFull[i][1] != target else tXrdX_f32[i] - 1.0
+            tXrdX.store(tXrdX_f32.load().to(tXrdX.element_type))
+            # Rows past M are not written.
+            if row < shape[0]:
+                copy(tXrdX, tXgdX)
+@jit_cache
+def _compile_cross_entropy_fwd(
+    dtype, target_dtype, target_logit_dtype, N, has_lse, has_dx, target_logit_ndim
+):
+    batch_sym = cute.sym_int()
+    div = math.gcd(128 // dtype.width, N)
+    x_cute = fake_tensor(dtype, (batch_sym, N), div)
+    dx_cute = fake_tensor(dtype, (batch_sym, N), div) if has_dx else None
+    target_cute = fake_tensor(target_dtype, (batch_sym,))
+    if target_logit_dtype is not None:
+        if target_logit_ndim == 2:
+            target_logit_cute = fake_tensor(target_logit_dtype, (batch_sym, cute.sym_int()), div)
+        else:
+            target_logit_cute = fake_tensor(target_logit_dtype, (batch_sym,))
+    else:
+        target_logit_cute = None
+    loss_cute = fake_tensor(Float32, (batch_sym,))
+    lse_cute = fake_tensor(Float32, (batch_sym,)) if has_lse else None
+    # If there's dx, it's faster to not use online softmax since we want the exp(x - max)
+    cross_entropy_op = CrossEntropy(dtype, N, online_softmax=not has_dx)
+    return cute.compile(
+        cross_entropy_op,
+        x_cute,
+        target_cute,
+        target_logit_cute,
+        loss_cute,
+        lse_cute,
+        dx_cute,
+        Int32(0),  # ignore_index, just for compilation
+        cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True),
+        options="--enable-tvm-ffi",
+    )
+@torch.library.custom_op(add_quack_op_namespace_prefix("cross_entropy_fwd_out"), mutates_args={"loss", "lse", "dx"})
+def cross_entropy_fwd_out(
+    x: Tensor,
+    target: Tensor,
+    target_logit: Optional[Tensor],
+    loss: Tensor,
+    lse: Optional[Tensor],
+    dx: Optional[Tensor],
+    ignore_index: int = -100,
+) -> None:
+    """Cross entropy forward pass.
+    Args:
+        x: Input logits tensor of shape (M, N)
+        target: Target class indices tensor of shape (M,)
+        target_logit: (M, K) or (M,).
+            If provided, the target logit will be read from this tensor instead of x.
+        loss: Output loss tensor of shape (M,)
+        lse: Optional output log-sum-exp tensor of shape (M,)
+        dx: Optional output gradient tensor of shape (M, N)
+        ignore_index: Index to ignore in loss computation
+    Returns:
+        None (mutates loss, lse, and optionally dx in-place)
+    """
+    assert x.dim() == 2, "Input must be 2D"
+    assert target.dim() == 1, "Target must be 1D"
+    assert x.is_cuda and target.is_cuda, "Tensors must be on CUDA device"
+    assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported input dtype"
+    assert target.dtype in [torch.int32, torch.int64], "Target must be int32 or int64"
+    if target_logit is not None:
+        assert target_logit.is_cuda, "Target logits must be on CUDA device"
+        assert target_logit.dtype in [torch.float16, torch.bfloat16, torch.float32]
+    if dx is not None:
+        assert dx.is_cuda, "dx must be on CUDA device"
+    N = x.size(1)
+    dtype = torch2cute_dtype_map[x.dtype]
+    target_dtype = torch2cute_dtype_map[target.dtype]
+    target_logit_dtype = (
+        torch2cute_dtype_map[target_logit.dtype] if target_logit is not None else None
+    )
+    target_logit_ndim = target_logit.ndim if target_logit is not None else None
+    _compile_cross_entropy_fwd(
+        dtype,
+        target_dtype,
+        target_logit_dtype,
+        N,
+        lse is not None,
+        dx is not None,
+        target_logit_ndim,
+    )(x, target, target_logit, loss, lse, dx, Int32(ignore_index))
+@cross_entropy_fwd_out.register_fake
+def _cross_entropy_fwd_out_fake(
+    x: Tensor,
+    target: Tensor,
+    target_logit: Optional[Tensor],
+    loss: Tensor,
+    lse: Optional[Tensor],
+    dx: Optional[Tensor],
+    ignore_index: int = -100,
+) -> None:
+    # See softmax.py _softmax_fwd_fake for why register_fake is needed.
+    from .cache_utils import COMPILE_ONLY
+    if COMPILE_ONLY and not isinstance(x.size(1), torch.SymInt):
+        N = x.size(1)
+        dtype = torch2cute_dtype_map[x.dtype]
+        target_dtype = torch2cute_dtype_map[target.dtype]
+        target_logit_dtype = (
+            torch2cute_dtype_map[target_logit.dtype] if target_logit is not None else None
+        )
+        target_logit_ndim = target_logit.ndim if target_logit is not None else None
+        _compile_cross_entropy_fwd(
+            dtype,
+            target_dtype,
+            target_logit_dtype,
+            N,
+            lse is not None,
+            dx is not None,
+            target_logit_ndim,
+        )
+        _compile_cross_entropy_backward(dtype, target_dtype, N)
+def cross_entropy_fwd(
+    x: torch.Tensor,
+    target: torch.Tensor,
+    target_logit: Optional[torch.Tensor] = None,
+    ignore_index: int = -100,
+    return_lse: bool = False,
+    return_dx: bool = False,
+    inplace_backward: bool = False,
+) -> torch.Tensor | tuple[torch.Tensor]:
+    M = x.size(0)
+    device = x.device
+    loss = torch.empty(M, device=device, dtype=torch.float32)
+    lse = torch.empty(M, device=device, dtype=torch.float32) if return_lse else None
+    dx = (torch.empty_like(x) if not inplace_backward else x) if return_dx else None
+    cross_entropy_fwd_out(x, target, target_logit, loss, lse, dx, ignore_index)
+    if return_lse and return_dx:
+        return loss, lse, dx
+    elif return_lse:
+        return loss, lse
+    elif return_dx:
+        return loss, dx
+    else:
+        return loss
+class CrossEntropyBackward:
+    """Cross-entropy backward kernel: dx = (softmax(x) - onehot(target)) * dloss.
+    The softmax is recomputed from x and the saved per-row log-sum-exp. Rows whose
+    target equals `ignore_index` get a zero gradient.
+    """
+    def __init__(self, dtype: Type[cutlass.Numeric], N: int):
+        """Record the element type and row length this instance is specialized for."""
+        self.dtype = dtype
+        self.N = N
+        # Number of elements in a 128-bit vector of this dtype.
+        self.vecsize = 128 // dtype.width
+    def _threads_per_row(self):
+        """Return the number of threads cooperating on one row tile."""
+        N = min(self.N, 16384)  # We split by blocks of 16k
+        for limit, threads in [(64, 8), (128, 16), (3072, 32), (6144, 64), (16384, 128)]:
+            if N <= limit:
+                return threads
+        # Not reached as written: N is clamped to 16384, which the last entry covers.
+        return 256
+    def _get_tiled_copy(self, vecsize: int):
+        """Build the tiled copy and tile shape for a given vector width.
+        Returns (tiled_copy, tiler_mn, threads_per_row).
+        """
+        assert self.N % vecsize == 0, f"Input N {self.N} is not divisible by vector size {vecsize}"
+        N = min(self.N, 16384)
+        # N is already clamped to 16384, so this always selects 128 as written.
+        num_threads = 128 if N <= 16384 else 256
+        threads_per_row = self._threads_per_row()
+        cols_per_block = num_threads // threads_per_row
+        num_blocks_N = cute.ceil_div(N // vecsize, threads_per_row)
+        # Tile: cols_per_block rows by a column count rounded up to whole vectors per thread.
+        tiler_mn = (cols_per_block, vecsize * num_blocks_N * threads_per_row)
+        tiled_copy = copy_utils.tiled_copy_2d(
+            self.dtype, threads_per_row, num_threads, num_copy_elems=vecsize
+        )
+        return tiled_copy, tiler_mn, threads_per_row
+    @cute.jit
+    def __call__(
+        self,
+        mX: cute.Tensor,
+        mTarget: cute.Tensor,
+        mDLoss: cute.Tensor,
+        mdX: cute.Tensor,
+        mLSE: cute.Tensor,
+        ignore_index: Int32,  # Index to ignore in gradient computation
+        stream: cuda.CUstream,
+    ):
+        """Pick the launch configuration and launch `kernel` on `stream`."""
+        assert mX.element_type == self.dtype
+        assert mdX.element_type == self.dtype
+        # e.g. if self.N isn't divisible by 8 for bf16, we might use 64 bits (4 elements) copy
+        vecsize = math.gcd(self.N, 128 // self.dtype.width)
+        tiled_copy, tiler_mn, threads_per_row = self._get_tiled_copy(vecsize=vecsize)
+        num_threads = tiled_copy.size
+        # (M,) -> (M, N) with stride 0 in the N dimension
+        mDLoss, mTarget, mLSE = [
+            layout_utils.expand(X, dim=1, size=self.N) for X in (mDLoss, mTarget, mLSE)
+        ]
+        # 2D grid: x covers row tiles, y covers column tiles.
+        self.kernel(
+            mX,
+            mTarget,
+            mDLoss,
+            mdX,
+            mLSE,
+            ignore_index,
+            mX.shape,
+            tiler_mn,
+            tiled_copy,
+            threads_per_row,
+        ).launch(
+            grid=[
+                cute.ceil_div(mX.shape[0], tiler_mn[0]),
+                cute.ceil_div(mX.shape[1], tiler_mn[1]),
+                1,
+            ],
+            block=[num_threads, 1, 1],
+            stream=stream,
+        )
+    @cute.kernel
+    def kernel(
+        self,
+        mX: cute.Tensor,  # (M, N)
+        mTarget: cute.Tensor,  # (M,)
+        mDLoss: cute.Tensor,  # (M,)
+        mdX: cute.Tensor,  # (M, N)
+        mLSE: cute.Tensor,  # (M,)
+        ignore_index: Int32,  # Index to ignore in gradient computation
+        shape: cute.Shape,
+        tiler_mn: cute.Shape,
+        tiled_copy: cute.TiledCopy,
+        threads_per_row: cutlass.Constexpr[int],
+    ):
+        """Device kernel: compute one (row tile, column tile) of dx."""
+        tidx, _, _ = cute.arch.thread_idx()
+        bidx, bidy, _ = cute.arch.block_idx()
+        smem = cutlass.utils.SmemAllocator()
+        sX = smem.allocate_tensor(
+            mX.element_type, cute.make_ordered_layout(tiler_mn, order=(1, 0)), byte_alignment=16
+        )
+        # Identity tensor: each element holds its own (row, col) coordinate.
+        idX = cute.make_identity_tensor(shape)
+        gX, gdX, cX = [cute.local_tile(mT, tiler_mn, (bidx, bidy)) for mT in (mX, mdX, idX)]
+        thr_copy = tiled_copy.get_slice(tidx)
+        tXgX = thr_copy.partition_S(gX)
+        tXsX = thr_copy.partition_D(sX)
+        tXcX = thr_copy.partition_S(cX)[(0, None), None, None]
+        tXcFull = thr_copy.partition_S(cX)
+        tXgdX = thr_copy.partition_D(gdX)
+        tXrX, tXrdX = [cute.make_rmem_tensor_like(thr) for thr in (tXgX, tXgdX)]
+        # Column predication is only needed when N is not a multiple of the tile width.
+        is_even_N = const_expr(shape[1] % tiler_mn[1] == 0)
+        tXpX = (
+            None if is_even_N else copy_utils.predicate_k(thr_copy.partition_S(cX), limit=shape[1])
+        )
+        copy = partial(copy_utils.copy, pred=tXpX)
+        # Row coordinate handled by this thread.
+        row = tXcX[0][0]
+        # Rows past M are not loaded.
+        if row < shape[0]:
+            copy(tXgX, tXsX, is_async=True)
+        cute.arch.cp_async_commit_group()
+        cute.arch.cp_async_wait_group(0)
+        # Out-of-bounds columns become -inf, so their exp2 below evaluates to 0.
+        if const_expr(not is_even_N):
+            utils.fill_oob(tXsX, tXpX, -tXsX.element_type.inf)
+        cute.autovec_copy(tXsX, tXrX)
+        x = tXrX.load().to(Float32)
+        target = Int32.zero
+        dloss = Float32.zero
+        lse = Float32.zero
+        if row < shape[0]:
+            target = Int32(mTarget[row])
+            should_ignore = Boolean(target == ignore_index)
+            # Set dloss to 0 if this index should be ignored
+            if not should_ignore:
+                dloss = Float32(mDLoss[row])
+            lse = Float32(mLSE[row])
+        log2_e = math.log2(math.e)
+        # softmax(x) = exp(x - lse), evaluated through exp2.
+        probs = cute.math.exp2(x * log2_e - (lse * log2_e), fastmath=True)
+        prob_shifted = probs - 1.0
+        # mask[i] is true where the element's column coordinate equals the target class.
+        mask = cute.make_rmem_tensor_like(tXrX, Boolean)
+        for i in cutlass.range(cute.size(tXcFull), unroll_full=True):
+            mask[i] = tXcFull[i][1] == target
+        # probs - 1 at the target column, probs elsewhere, scaled by the upstream gradient.
+        grad = cute.where(mask.load(), prob_shifted, probs)
+        grad = grad * dloss
+        tXrdX.store(grad.to(tXrdX.element_type))
+        # Rows past M are not written.
+        if row < shape[0]:
+            copy(tXrdX, tXgdX)
+@jit_cache
+def _compile_cross_entropy_backward(dtype, target_dtype, N):
+    batch_sym = cute.sym_int()
+    div = math.gcd(128 // dtype.width, N)
+    x_cute, dx_cute = [fake_tensor(dtype, (batch_sym, N), div)] * 2
+    target_cute = fake_tensor(target_dtype, (batch_sym,))
+    dloss_cute, lse_cute = [fake_tensor(Float32, (batch_sym,))] * 2
+    cross_entropy_backward_op = CrossEntropyBackward(dtype, N)
+    return cute.compile(
+        cross_entropy_backward_op,
+        x_cute,
+        target_cute,
+        dloss_cute,
+        dx_cute,
+        lse_cute,
+        Int32(0),  # ignore_index, just for compilation
+        cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True),
+        options="--enable-tvm-ffi",
+    )
+def _cross_entropy_backward(
+    x: torch.Tensor,
+    target: torch.Tensor,
+    dloss: torch.Tensor,
+    lse: torch.Tensor,
+    dx: torch.Tensor,
+    ignore_index=-100,
+) -> None:
+    """Cross entropy backward pass.
+    Args:
+        x: Input logits tensor of shape (M, N)
+        target: Target class indices tensor of shape (M,)
+        dloss: Upstream gradients tensor of shape (M,)
+        lse: Log-sum-exp values tensor of shape (M,)
+    Returns:
+        Input gradients tensor of shape (M, N)
+    """
+    assert x.dim() == 2, "Input must be 2D"
+    assert target.dim() == 1, "Target must be 1D"
+    assert dloss.dim() == 1, "dloss must be 1D"
+    assert lse.dim() == 1, "lse must be 1D"
+    assert x.shape[0] == target.shape[0], "Batch dimensions must match"
+    assert x.shape[0] == dloss.shape[0], "Batch dimensions must match"
+    assert x.shape[0] == lse.shape[0], "Batch dimensions must match"
+    assert x.is_cuda and target.is_cuda and dloss.is_cuda and lse.is_cuda, (
+        "Tensors must be on CUDA device"
+    )
+    assert x.dtype in [torch.float16, torch.bfloat16, torch.float32], "Unsupported input dtype"
+    assert target.dtype in [torch.int32, torch.int64], "Target must be int32 or int64"
+    N = x.size(1)
+    dtype = torch2cute_dtype_map[x.dtype]
+    target_dtype = torch2cute_dtype_map[target.dtype]
+    _compile_cross_entropy_backward(dtype, target_dtype, N)(
+        x, target, dloss, dx, lse, Int32(ignore_index)
+    )
+@torch.library.custom_op(add_quack_op_namespace_prefix("cross_entropy_bwd_out"), mutates_args={"dx"})
+def cross_entropy_bwd_out(
+    x: torch.Tensor,
+    target: torch.Tensor,
+    dloss: torch.Tensor,
+    lse: torch.Tensor,
+    dx: torch.Tensor,
+    ignore_index: int = -100,
+) -> None:
+    """Custom-op entry point that forwards to ``_cross_entropy_backward``.
+    ``dx`` is declared in ``mutates_args`` and receives the gradients; nothing is returned.
+    """
+    _cross_entropy_backward(x, target, dloss, lse, dx, ignore_index)
+@cross_entropy_bwd_out.register_fake
+def _cross_entropy_bwd_out_fake(
+    x: torch.Tensor,
+    target: torch.Tensor,
+    dloss: torch.Tensor,
+    lse: torch.Tensor,
+    dx: torch.Tensor,
+    ignore_index: int = -100,
+) -> None:
+    """Fake implementation registered for ``cross_entropy_bwd_out``.
+    It returns nothing and launches no kernel. When ``COMPILE_ONLY`` is truthy and the
+    second dimension of ``x`` is a concrete size (not a ``torch.SymInt``), it calls
+    ``_compile_cross_entropy_backward`` for this (dtype, target dtype, N) and discards
+    the result.
+    """
+    # See softmax.py _softmax_fwd_fake for why register_fake is needed.
+    from .cache_utils import COMPILE_ONLY
+    if COMPILE_ONLY and not isinstance(x.size(1), torch.SymInt):
+        N = x.size(1)
+        dtype = torch2cute_dtype_map[x.dtype]
+        target_dtype = torch2cute_dtype_map[target.dtype]
+        _compile_cross_entropy_backward(dtype, target_dtype, N)
+def cross_entropy_bwd(
+    x: torch.Tensor,
+    target: torch.Tensor,
+    dloss: torch.Tensor,
+    lse: torch.Tensor,
+    ignore_index: int = -100,
+    inplace_backward: bool = False,
+) -> None:
+    if inplace_backward and not torch.compiler.is_compiling():
+        dx = x
+        _cross_entropy_backward(
+            x=x, target=target, dloss=dloss, lse=lse, dx=x, ignore_index=ignore_index
+        )
+    else:
+        dx = torch.empty_like(x)
+        cross_entropy_bwd_out(
+            x=x, target=target, dloss=dloss, lse=lse, dx=dx, ignore_index=ignore_index
+        )
+    return dx
+class CrossEntropyFunction(torch.autograd.Function):
+    """Autograd function pairing ``cross_entropy_fwd`` with ``cross_entropy_bwd``."""
+    @staticmethod
+    def forward(ctx, x, target, lse_partial=None, ignore_index=-100, inplace_backward=False):
+        """Compute the loss and save ``x``, ``target`` and ``lse`` for the backward pass.
+        When ``lse_partial`` is given it is passed to ``cross_entropy_fwd`` in place of the
+        logits, and ``x`` is passed as ``target_logit``.
+        """
+        if lse_partial is None:
+            loss, lse = cross_entropy_fwd(x, target, ignore_index=ignore_index, return_lse=True)
+        else:
+            # if we already compute partial lse, then to compute the final lse we treat
+            # @lse_partial as @x and @x as @target_logit
+            loss, lse = cross_entropy_fwd(
+                lse_partial, target, target_logit=x, ignore_index=ignore_index, return_lse=True
+            )
+        ctx.save_for_backward(x, target, lse)
+        ctx.ignore_index = ignore_index
+        ctx.inplace_backward = inplace_backward
+        return loss
+    @staticmethod
+    def backward(ctx, dloss):
+        """Return the gradient for ``x`` and None for the other four forward inputs.
+        With ``inplace_backward`` the saved ``x`` may be overwritten by ``cross_entropy_bwd``.
+        """
+        x, target, lse = ctx.saved_tensors
+        dx = cross_entropy_bwd(
+            x, target, dloss, lse, ctx.ignore_index, inplace_backward=ctx.inplace_backward
+        )
+        # One entry per forward argument: x, target, lse_partial, ignore_index, inplace_backward.
+        return dx, None, None, None, None
+def cross_entropy(
+    x: torch.Tensor,
+    target: torch.Tensor,
+    lse_partial: Optional[torch.Tensor] = None,
+    ignore_index: int = -100,
+    reduction: Literal["none", "mean", "sum"] = "mean",
+    inplace_backward: bool = False,
+) -> torch.Tensor:
+    """Cross entropy loss with automatic differentiation support.
+    Args:
+        x: Input logits tensor of shape (M, N)
+        target: Target class indices tensor of shape (M,)
+        lse_partial: Optional precomputed log-sum-exp partial results
+        reduction: Specifies the reduction to apply to the output:
+            'none': no reduction will be applied (default)
+            'mean': the sum of the output will be divided by the number of elements
+            'sum': the output will be summed
+        inplace_backward: Whether to perform backward pass in-place
+        ignore_index: Index to ignore in loss computation (loss will be 0 for these indices)
+    Returns:
+        Cross entropy loss tensor:
+            - If reduction='none': tensor of shape (M,) with per-example losses
+            - If reduction='mean': scalar tensor with mean loss
+            - If reduction='sum': scalar tensor with sum of losses
+    """
+    loss = CrossEntropyFunction.apply(x, target, lse_partial, ignore_index, inplace_backward)
+    if reduction == "mean":
+        return loss.sum() / (target != ignore_index).sum().float()
+    elif reduction == "sum":
+        return loss.sum()
+    elif reduction == "none":
+        return loss
+    else:
+        raise ValueError(
+            f"Invalid reduction mode: {reduction}. Expected one of 'none', 'mean', or 'sum'"
+        )

build/torch-cuda/quack/cute_dsl_ptxas.py CHANGED Viewed

@@ -1,8 +1,16 @@
 """
 System ptxas replacement for CUTLASS DSL.
 Environment variables:
     CUTE_DSL_PTXAS_PATH    - Path to ptxas (e.g., /usr/local/cuda/bin/ptxas)
     CUTE_DSL_PTXAS_VERBOSE - Set to 1 for verbose output
 """
 import os
@@ -16,29 +24,81 @@ import cutlass
 CUTE_DSL_PTXAS_PATH = os.environ.get("CUTE_DSL_PTXAS_PATH", None)
 VERBOSE = os.environ.get("CUTE_DSL_PTXAS_VERBOSE", "0") == "1"
 _original_load_cuda_library = None
 _user_wanted_ptx = False  # True if user originally set CUTE_DSL_KEEP_PTX=1
-def _log(msg):
     if VERBOSE:
         print(f"[ptxas] {msg}", file=sys.stderr)
 def _get_ptx(compiled_func) -> tuple[str, Path] | None:
-    """Find and read PTX file, stripping null bytes."""
     func_name = getattr(compiled_func, "function_name", None)
     if not func_name:
         return None
-    dump_dir = os.environ.get("CUTE_DSL_DUMP_DIR", Path.cwd())
-    for ptx_path in Path(dump_dir).glob(f"*{func_name}*.ptx"):
-        content = ptx_path.read_text().rstrip("\x00")
-        if ".entry " in content and content.rstrip().endswith("}"):
-            _log(f"Found PTX: {ptx_path}")
             return content, ptx_path
     return None
@@ -102,13 +162,15 @@ def _patched_load_cuda_library(self):
         _log(f"cudaLibraryLoadData failed ({err}), falling back to embedded ptxas")
         return _original_load_cuda_library(self)
-    # Register kernels on all devices
     _, cuda_load_to_device = self._get_cuda_init_and_load()
-    lib_ptr = ctypes.c_void_p(int(library))
     dev_id = ctypes.c_int32(0)
     err_val = ctypes.c_int32(0)
     args = (ctypes.c_void_p * 3)(
-        ctypes.cast(ctypes.pointer(lib_ptr), ctypes.c_void_p),
         ctypes.cast(ctypes.pointer(dev_id), ctypes.c_void_p),
         ctypes.cast(ctypes.pointer(err_val), ctypes.c_void_p),
     )
@@ -126,26 +188,50 @@ def _patched_load_cuda_library(self):
     if not _user_wanted_ptx:
         ptx_path.unlink(missing_ok=True)
-    return [cuda_runtime.cudaLibrary_t(lib_ptr.value)]
 def patch():
     """Install system ptxas hook. Call before importing cutlass."""
-    global _original_load_cuda_library, _user_wanted_ptx
     assert CUTE_DSL_PTXAS_PATH is not None
     if not os.path.isfile(CUTE_DSL_PTXAS_PATH) or not os.access(CUTE_DSL_PTXAS_PATH, os.X_OK):
         raise RuntimeError(f"ptxas not found: {CUTE_DSL_PTXAS_PATH}")
-    # Track if user originally wanted PTX kept
     _user_wanted_ptx = os.environ.get("CUTE_DSL_KEEP_PTX", "0") == "1"
-    # os.environ['CUTE_DSL_KEEP_PTX'] = '1'
     assert os.environ.get("CUTE_DSL_KEEP_PTX", "0") == "1", (
         "Require CUTE_DSL_KEEP_PTX=1 to use system's ptxas"
     )
-    cls = cutlass.cutlass_dsl.cuda_jit_executor.CudaDialectJitCompiledFunction
-    _original_load_cuda_library = cls._load_cuda_library
-    cls._load_cuda_library = _patched_load_cuda_library
-    _log("Patch applied")
-    return

 """
 System ptxas replacement for CUTLASS DSL.
+Usage::
+    CUTE_DSL_KEEP_PTX=1 CUTE_DSL_PTXAS_PATH=/usr/local/cuda/bin/ptxas pytest tests/
 Environment variables:
     CUTE_DSL_PTXAS_PATH    - Path to ptxas (e.g., /usr/local/cuda/bin/ptxas)
+    CUTE_DSL_KEEP_PTX      - Must be set to 1 before cutlass is imported
     CUTE_DSL_PTXAS_VERBOSE - Set to 1 for verbose output
+    CUTE_DSL_DUMP_DIR      - Directory for dumped PTX files (default: cwd)
+    CUTE_DSL_KEEP_CUBIN    - Set to 1 to save compiled cubin files
 """
 import os
 CUTE_DSL_PTXAS_PATH = os.environ.get("CUTE_DSL_PTXAS_PATH", None)
+# Setting a ptxas path switches PTX dumping on for this process.
+# NOTE(review): this overwrites whatever the user put in CUTE_DSL_KEEP_PTX, so any later
+# read of that variable (e.g. to fill _user_wanted_ptx) sees "1" — confirm this is intended.
+if CUTE_DSL_PTXAS_PATH:
+    os.environ["CUTE_DSL_KEEP_PTX"] = "1"
 VERBOSE = os.environ.get("CUTE_DSL_PTXAS_VERBOSE", "0") == "1"
+# Original (unpatched) methods, filled in by patch().
 _original_load_cuda_library = None
+_original_create_tvm_ffi_function = None
 _user_wanted_ptx = False  # True if user originally set CUTE_DSL_KEEP_PTX=1
+def _log(msg: str):
     if VERBOSE:
         print(f"[ptxas] {msg}", file=sys.stderr)
+def _read_ptx(ptx_path: Path) -> str | None:
+    try:
+        return ptx_path.read_bytes().decode("utf-8", errors="ignore").rstrip("\x00")
+    except OSError as exc:
+        _log(f"Failed to read {ptx_path}: {exc}")
+        return None
+def _read_complete_ptx(ptx_path: Path) -> str | None:
+    content = _read_ptx(ptx_path)
+    if content is None or not content.rstrip().endswith("}"):
+        return None
+    return content
 def _get_ptx(compiled_func) -> tuple[str, Path] | None:
+    """Find dumped PTX for the compiled function."""
     func_name = getattr(compiled_func, "function_name", None)
     if not func_name:
+        _log("Compiled function is missing function_name")
         return None
+    dump_dir = Path(os.environ.get("CUTE_DSL_DUMP_DIR", Path.cwd()))
+    dump_dir.mkdir(parents=True, exist_ok=True)
+    ptx_paths = sorted(
+        dump_dir.rglob("*.ptx"), key=lambda path: path.stat().st_mtime_ns, reverse=True
+    )
+    _log(f"Searching dumped PTX for {func_name} in {dump_dir}")
+    _log(f"Found {len(ptx_paths)} PTX candidate files in {dump_dir}")
+    # Strategy 1: match by filename
+    filename_matches = [ptx_path for ptx_path in ptx_paths if func_name in ptx_path.name]
+    if filename_matches:
+        _log(f"Found {len(filename_matches)} filename matches for {func_name}")
+        for ptx_path in filename_matches:
+            content = _read_complete_ptx(ptx_path)
+            if content is None:
+                continue
+            _log(f"Using PTX filename match for {func_name}: {ptx_path}")
+            return content, ptx_path
+    # Strategy 2: match by .entry directive inside PTX
+    entry_pattern = re.compile(rf"\.entry\s+{re.escape(func_name)}(?:\s|\()", re.MULTILINE)
+    for ptx_path in ptx_paths:
+        content = _read_complete_ptx(ptx_path)
+        if content is None:
+            continue
+        if entry_pattern.search(content):
+            _log(f"Found PTX for {func_name}: {ptx_path}")
             return content, ptx_path
+    # Strategy 3: use sole candidate as fallback
+    if len(ptx_paths) == 1:
+        content = _read_complete_ptx(ptx_paths[0])
+        if content is not None:
+            _log(f"Using sole PTX candidate for {func_name}: {ptx_paths[0]}")
+            return content, ptx_paths[0]
+    _log(f"No PTX found for function {func_name} in {dump_dir}")
     return None
         _log(f"cudaLibraryLoadData failed ({err}), falling back to embedded ptxas")
         return _original_load_cuda_library(self)
+    # Register kernels on all devices (must match cuda_load_to_device's void*** convention)
     _, cuda_load_to_device = self._get_cuda_init_and_load()
+    lib_handle = ctypes.c_void_p(int(library))
+    ptr_to_lib = ctypes.pointer(lib_handle)
+    ptr_to_ptr_to_lib = ctypes.pointer(ptr_to_lib)
     dev_id = ctypes.c_int32(0)
     err_val = ctypes.c_int32(0)
     args = (ctypes.c_void_p * 3)(
+        ctypes.cast(ptr_to_ptr_to_lib, ctypes.c_void_p),
         ctypes.cast(ctypes.pointer(dev_id), ctypes.c_void_p),
         ctypes.cast(ctypes.pointer(err_val), ctypes.c_void_p),
     )
     if not _user_wanted_ptx:
         ptx_path.unlink(missing_ok=True)
+    return [cuda_runtime.cudaLibrary_t(lib_handle.value)]
+def _patched_create_tvm_ffi_function(self):
+    """Replacement for ``_create_tvm_ffi_function`` that loads the CUDA library first.
+    The result of ``self._load_cuda_library()`` is stored on ``self._ptxas_cuda_library``;
+    the load is skipped when that attribute already holds a non-None value. The original
+    method is then called and its result returned.
+    """
+    # Ensure CUDA library is loaded before TVM FFI creation
+    if getattr(self, "_ptxas_cuda_library", None) is None:
+        self._ptxas_cuda_library = self._load_cuda_library()
+        _log(
+            f"Loaded {len(self._ptxas_cuda_library)} CUDA libraries before creating TVM FFI function"
+        )
+    return _original_create_tvm_ffi_function(self)
 def patch():
+    """Install the system ptxas hooks on the cutlass JIT classes.
+    Replaces ``CudaDialectJitCompiledFunction._load_cuda_library`` and
+    ``TVMFFIJitCompiledFunctionBase._create_tvm_ffi_function`` with the patched versions
+    from this module and keeps the originals in module globals. A hook that is already
+    installed is not wrapped a second time.
+    Raises:
+        AssertionError: If ``CUTE_DSL_PTXAS_PATH`` is unset or ``CUTE_DSL_KEEP_PTX`` is not "1".
+        RuntimeError: If ``CUTE_DSL_PTXAS_PATH`` is not an executable file.
+    """
+    # NOTE(review): the previous docstring said "Call before importing cutlass", yet this
+    # function dereferences the module-level ``cutlass`` name — confirm the intended call order.
+    global _original_load_cuda_library, _original_create_tvm_ffi_function, _user_wanted_ptx
     assert CUTE_DSL_PTXAS_PATH is not None
     if not os.path.isfile(CUTE_DSL_PTXAS_PATH) or not os.access(CUTE_DSL_PTXAS_PATH, os.X_OK):
         raise RuntimeError(f"ptxas not found: {CUTE_DSL_PTXAS_PATH}")
+    # NOTE(review): the assert just below requires the same variable to be "1", so whenever
+    # patch() gets past it _user_wanted_ptx is True — confirm PTX clean-up is meant to be off.
     _user_wanted_ptx = os.environ.get("CUTE_DSL_KEEP_PTX", "0") == "1"
     assert os.environ.get("CUTE_DSL_KEEP_PTX", "0") == "1", (
         "Require CUTE_DSL_KEEP_PTX=1 to use system's ptxas"
     )
+    patched = False
+    cuda_jit_function_cls = cutlass.cutlass_dsl.cuda_jit_executor.CudaDialectJitCompiledFunction
+    # Identity checks keep a repeated call from saving the patched method as the "original".
+    if cuda_jit_function_cls._load_cuda_library is not _patched_load_cuda_library:
+        _original_load_cuda_library = cuda_jit_function_cls._load_cuda_library
+        cuda_jit_function_cls._load_cuda_library = _patched_load_cuda_library
+        patched = True
+    from cutlass.cutlass_dsl.tvm_ffi_provider import TVMFFIJitCompiledFunctionBase
+    if (
+        TVMFFIJitCompiledFunctionBase._create_tvm_ffi_function
+        is not _patched_create_tvm_ffi_function
+    ):
+        _original_create_tvm_ffi_function = TVMFFIJitCompiledFunctionBase._create_tvm_ffi_function
+        TVMFFIJitCompiledFunctionBase._create_tvm_ffi_function = _patched_create_tvm_ffi_function
+        patched = True
+    if patched:
+        _log(f"Installed system ptxas patch with {CUTE_DSL_PTXAS_PATH}")
+    else:
+        _log("System ptxas patch already installed")

build/torch-cuda/quack/cute_dsl_utils.py CHANGED Viewed

@@ -1,9 +1,12 @@
 # Copyright (c) 2025, Tri Dao.
-from typing import Tuple
 from functools import lru_cache
 from dataclasses import dataclass, fields
 import torch
 try:
@@ -14,7 +17,7 @@ except ImportError:
 import cutlass
 import cutlass.cute as cute
 from cutlass import Int32, Int64, Float16, BFloat16, Float32
-from cutlass.base_dsl.typing import JitArgument
 from cutlass.cutlass_dsl import NumericMeta
@@ -25,6 +28,31 @@ load_cubin_module_data_og = cutlass.base_dsl.runtime.cuda.load_cubin_module_data
 cute_compile_og = cute.compile
 torch2cute_dtype_map = {
     torch.float16: Float16,
     torch.bfloat16: BFloat16,
@@ -39,66 +67,110 @@ def get_max_active_clusters(cluster_size):
     return cutlass.utils.HardwareInfo().get_max_active_clusters(cluster_size=cluster_size)
 @lru_cache
-def get_device_capacity(device: torch.device = None) -> Tuple[int, int]:
     return torch.cuda.get_device_capability(device)
 @dataclass
 class ParamsBase:
     def __extract_mlir_values__(self):
-        all_fields = [getattr(self, field.name) for field in fields(self)]
-        non_constexpr_fields = [f for f in all_fields if not isinstance(f, StaticTypes)]
         values, self._values_pos = [], []
-        for obj in non_constexpr_fields:
             obj_values = cutlass.extract_mlir_values(obj)
             values += obj_values
             self._values_pos.append(len(obj_values))
         return values
-    def __new_from_mlir_values__(self, values):
-        all_fields = {field.name: getattr(self, field.name) for field in fields(self)}
-        constexpr_fields = {n: f for n, f in all_fields.items() if isinstance(f, StaticTypes)}
-        non_constexpr_fields = {
-            n: f for n, f in all_fields.items() if not isinstance(f, StaticTypes)
-        }
-        for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
-            non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, values[:n_items])
-            values = values[n_items:]
-        return self.__class__(**non_constexpr_fields, **constexpr_fields)
-@dataclass
-class ArgumentsBase(JitArgument):
-    def __c_pointers__(self):
-        all_fields = [getattr(self, field.name) for field in fields(self)]
-        non_constexpr_fields = [f for f in all_fields if not isinstance(f, StaticTypes)]
-        c_ptrs = []
-        for obj in non_constexpr_fields:
-            if hasattr(obj, "__c_pointers__"):
-                c_ptrs.extend(obj.__c_pointers__())
-        return c_ptrs
-    def __get_mlir_types__(self):
-        all_fields = [getattr(self, field.name) for field in fields(self)]
-        non_constexpr_fields = [f for f in all_fields if not isinstance(f, StaticTypes)]
-        types, self._values_pos = [], []
-        for obj in non_constexpr_fields:
-            if hasattr(obj, "__get_mlir_types__"):
-                obj_types = obj.__get_mlir_types__()
-                types.extend(obj_types)
-                self._values_pos.append(len(obj_types))
-            else:
-                self._values_pos.append(0)
-        return types
-    def __new_from_mlir_values__(self, values):
-        all_fields = {field.name: getattr(self, field.name) for field in fields(self)}
-        constexpr_fields = {n: f for n, f in all_fields.items() if isinstance(f, StaticTypes)}
-        non_constexpr_fields = {
-            n: f for n, f in all_fields.items() if not isinstance(f, StaticTypes)
-        }
-        for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
-            non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, values[:n_items])
-            values = values[n_items:]
-        return self.__class__(**non_constexpr_fields, **constexpr_fields)

 # Copyright (c) 2025, Tri Dao.
+from typing import Tuple, get_origin
 from functools import lru_cache
 from dataclasses import dataclass, fields
+import os
+import re
 import torch
 try:
 import cutlass
 import cutlass.cute as cute
 from cutlass import Int32, Int64, Float16, BFloat16, Float32
+from cutlass.base_dsl.tvm_ffi_builder import spec
 from cutlass.cutlass_dsl import NumericMeta
 cute_compile_og = cute.compile
+# Patch TVM-FFI converter to handle Constexpr type annotations as compile-time constants.
+# Fields annotated with cutlass.Constexpr[T] are emitted as ConstNone (not runtime args).
+# At call time, pass None for these fields; the compile-time value is baked in.
+import cutlass.cute._tvm_ffi_args_spec_converter as _converter_module  # noqa
+_original_convert_single_arg = _converter_module._convert_single_arg
+def _patched_convert_single_arg(arg, arg_name, arg_type, ctx):
+    """Wrap the original ``_convert_single_arg`` with two special cases.
+    * If ``arg_type`` is a ``cutlass.Constexpr[...]`` annotation, return
+      ``spec.ConstNone(arg_name)`` without calling the original converter.
+    * If ``arg`` is a NamedTuple instance but ``arg_type`` is None or has no ``_fields``,
+      call the original converter with ``type(arg)`` in place of ``arg_type``.
+    Every other call is forwarded to the original converter unchanged.
+    """
+    if arg_type is not None and get_origin(arg_type) is cutlass.Constexpr:
+        return spec.ConstNone(arg_name)
+    # If arg is a NamedTuple but arg_type doesn't have _fields (e.g. annotated as tuple),
+    # redirect so the converter uses the NamedTuple's own type hints.
+    if (
+        isinstance(arg, tuple)
+        and hasattr(type(arg), "_fields")
+        and (arg_type is None or not hasattr(arg_type, "_fields"))
+    ):
+        return _original_convert_single_arg(arg, arg_name, type(arg), ctx)
+    return _original_convert_single_arg(arg, arg_name, arg_type, ctx)
+_converter_module._convert_single_arg = _patched_convert_single_arg
 torch2cute_dtype_map = {
     torch.float16: Float16,
     torch.bfloat16: BFloat16,
     return cutlass.utils.HardwareInfo().get_max_active_clusters(cluster_size=cluster_size)
+def _parse_arch_str(arch_str: str) -> Tuple[int, int]:
+    """Parse arch string (e.g. 'sm_90', 'sm90', '90', 'sm_100a') to (major, minor) tuple."""
+    match = re.match(r"^(?:sm_?)?(\d+)(\d)([af]?)$", arch_str.strip(), re.IGNORECASE)
+    if not match:
+        raise ValueError(f"Invalid QUACK_ARCH format: {arch_str!r} (expected e.g. '90', 'sm_90')")
+    major, minor, _ = match.groups()
+    return int(major), int(minor)
 @lru_cache
+def _get_device_capacity_cached(device: torch.device = None) -> Tuple[int, int]:
+    """Return (major, minor) device capability.
+    Override with QUACK_ARCH (e.g. 'sm_90' or '90') for CPU-only compilation
+    without a GPU present.
+    """
+    arch_override = os.environ.get("QUACK_ARCH")
+    if arch_override is not None:
+        return _parse_arch_str(arch_override)
     return torch.cuda.get_device_capability(device)
+def get_device_capacity(
+    device: torch.device | torch.Tensor | None = None,
+) -> Tuple[int, int]:
+    """Return (major, minor) device capability.
+    Override with QUACK_ARCH (e.g. 'sm_90' or '90') for CPU-only compilation
+    without a GPU present.
+    Accepts either a ``torch.device`` or a tensor and canonicalizes to the
+    underlying device before consulting the cached helper. This avoids leaking
+    tensors through the LRU cache key.
+    """
+    if isinstance(device, torch.Tensor):
+        device = device.device
+    return _get_device_capacity_cached(device)
+def _partition_fields(obj):
+    """Split dataclass fields into (constexpr_dict, non_constexpr_dict) by type."""
+    all_fields = {field.name: getattr(obj, field.name) for field in fields(obj)}
+    constexpr = {n: f for n, f in all_fields.items() if isinstance(f, StaticTypes)}
+    non_constexpr = {n: f for n, f in all_fields.items() if not isinstance(f, StaticTypes)}
+    return constexpr, non_constexpr
+def _new_from_mlir_values(self, values):
+    constexpr_fields, non_constexpr_fields = _partition_fields(self)
+    for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
+        non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, values[:n_items])
+        values = values[n_items:]
+    return self.__class__(**non_constexpr_fields, **constexpr_fields)
+def _namedtuple_new_from_mlir_values(self, values):
+    """Generic __new_from_mlir_values__ for NamedTuples.
+    Applied to NamedTuple classes via the ``@mlir_namedtuple`` decorator.
+    Fields that are None or Constexpr (StaticTypes) are preserved from ``self`` (the compile-time
+    template). Only non-static fields consume MLIR values. Multi-value fields (e.g. cute.Tensor)
+    consume the correct number of values via ``cutlass.new_from_mlir_values``.
+    Constexpr fields (annotated ``cutlass.Constexpr[T]``) are baked into the compiled kernel via
+    a converter patch (see above). At call time, pass None for these fields.
+    """
+    from cutlass.base_dsl.typing import get_mlir_types
+    values = list(values)
+    new_fields = []
+    for field_val in self:
+        if field_val is None or isinstance(field_val, StaticTypes):
+            new_fields.append(field_val)
+        else:
+            n_items = len(get_mlir_types(field_val))
+            new_fields.append(cutlass.new_from_mlir_values(field_val, values[:n_items]))
+            values = values[n_items:]
+    return self.__class__(*new_fields)
+def mlir_namedtuple(cls):
+    """Decorator that adds MLIR value reconstruction to a NamedTuple class.
+    Usage::
+        @mlir_namedtuple
+        class MyArgs(NamedTuple):
+            tensor_arg: cute.Tensor
+            const_arg: cutlass.Constexpr[int] = 0
+    """
+    cls.__new_from_mlir_values__ = _namedtuple_new_from_mlir_values
+    return cls
 @dataclass
 class ParamsBase:
+    """Dataclass base that can be flattened to MLIR values and rebuilt from them.
+    Fields whose values are ``StaticTypes`` instances are treated as constexpr and are
+    left out of the flattened value list.
+    """
     def __extract_mlir_values__(self):
+        """Return the MLIR values of all non-constexpr fields, concatenated in field order.
+        Also records in ``self._values_pos`` how many values each field contributed, which
+        ``__new_from_mlir_values__`` reads when rebuilding.
+        """
+        _, non_constexpr_fields = _partition_fields(self)
         values, self._values_pos = [], []
+        for obj in non_constexpr_fields.values():
             obj_values = cutlass.extract_mlir_values(obj)
             values += obj_values
             self._values_pos.append(len(obj_values))
         return values
+    # Shared module-level implementation; relies on _values_pos set above.
+    __new_from_mlir_values__ = _new_from_mlir_values

build/torch-cuda/quack/epi_composable.py ADDED Viewed

	@@ -0,0 +1,187 @@

+# Copyright (c) 2025, Tri Dao.
+"""ComposableEpiMixin: composes EpiOps into epilogue hook methods.
+Subclasses declare _epi_ops as a tuple of EpiOp instances. The mixin auto-generates
+epi_smem_bytes_per_stage, epi_get_smem_struct, epi_get_smem_tensors, epi_begin,
+epi_begin_loop, epi_end, and EpilogueParams by querying each op.
+epi_begin and epi_begin_loop return dicts keyed by op name, so epi_visit_subtile
+can access values by name (e.g. epi_loop_tensors["alpha"]).
+EpilogueParams is auto-generated from _epi_ops (via param_fields()) plus any
+_extra_param_fields declared on the subclass. Subclasses still define
+EpilogueArguments and epi_to_underlying_arguments manually.
+"""
+from dataclasses import make_dataclass, MISSING
+import cutlass.cute as cute
+from cutlass import const_expr
+from .epi_ops import EpiContext, Scalar
+def _compute_smem_map(ops):
+    """Pre-compute name → smem tensor index for each non-Scalar op."""
+    smem_map = {}
+    idx = 0
+    for op in ops:
+        if not isinstance(op, Scalar):
+            smem_map[op.name] = idx
+            idx += 1
+    return smem_map
+def _make_epi_params(epi_ops, extra_fields, bases):
+    """Build EpilogueParams dataclass from epi_ops + extra fields.
+    Required fields (default=MISSING) are placed first, then optional fields.
+    """
+    required, optional = [], []
+    for op in epi_ops:
+        for name, typ, default in op.param_fields():
+            (required if default is MISSING else optional).append((name, typ, default))
+    for name, typ, default in extra_fields:
+        (required if default is MISSING else optional).append((name, typ, default))
+    fields = [(n, t) for n, t, _ in required] + [(n, t, d) for n, t, d in optional]
+    return make_dataclass("EpilogueParams", fields, bases=bases)
+class ComposableEpiMixin:
+    """Base mixin that composes EpiOps into the standard epilogue hooks."""
+    _epi_ops = ()
+    _extra_param_fields = ()  # [(name, type, default), ...] for non-op params (e.g. act_fn)
+    _epi_param_bases = ()  # Base classes for EpilogueParams (e.g. (ParamsBase,))
+    # The two attributes below are recomputed per subclass in __init_subclass__.
+    _epi_smem_map = {}
+    _epi_has_async_ops = False
+    def __init_subclass__(cls, **kwargs):
+        """Precompute per-class epilogue metadata for subclasses that declare ``_epi_ops``.
+        Sets ``_epi_smem_map`` and ``_epi_has_async_ops``, and builds ``EpilogueParams``
+        unless the subclass defines it directly in its own class body.
+        """
+        super().__init_subclass__(**kwargs)
+        if cls._epi_ops:
+            cls._epi_smem_map = _compute_smem_map(cls._epi_ops)
+            cls._epi_has_async_ops = any(op.needs_async_fence() for op in cls._epi_ops)
+            # Auto-generate EpilogueParams if not explicitly defined on this class
+            if "EpilogueParams" not in cls.__dict__:
+                cls.EpilogueParams = _make_epi_params(
+                    cls._epi_ops, cls._extra_param_fields, cls._epi_param_bases
+                )
+    # --- Host-side: args → params ---
+    def _epi_ops_to_params_dict(self, args):
+        """Merge each op's to_params into a single dict. Subclasses call this,
+        add custom fields, then construct self.EpilogueParams(**d)."""
+        d = {}
+        for op in self._epi_ops:
+            d.update(op.to_params(self, args))
+        return d
+    # --- Host-side: smem allocation (queried from ops) ---
+    @classmethod
+    def epi_smem_bytes_per_stage(cls, args, cta_tile_shape_mnk, epi_tile):
+        """Return the sum of ``smem_bytes`` over all ops.
+        Each op receives the attribute of ``args`` named after the op, or None if absent.
+        """
+        return sum(
+            op.smem_bytes(getattr(args, op.name, None), cta_tile_shape_mnk, epi_tile)
+            for op in cls._epi_ops
+        )
+    def epi_get_smem_struct(self, params):
+        """Build a ``cute.struct`` whose fields are the ops' ``smem_struct_field`` results.
+        Ops that return None contribute no field.
+        """
+        fields = {}
+        for op in self._epi_ops:
+            result = op.smem_struct_field(self, params)
+            if result is not None:
+                name, ftype = result
+                fields[name] = ftype
+        # The struct class is created dynamically from the collected annotations.
+        EpiSharedStorage = type("EpiSharedStorage", (), {"__annotations__": fields})
+        return cute.struct(EpiSharedStorage)
+    def epi_get_smem_tensors(self, params, storage):
+        """Return one smem tensor per non-Scalar op, in ``_epi_ops`` order.
+        This is the same filter and order ``_compute_smem_map`` uses to assign indices.
+        """
+        return tuple(
+            op.get_smem_tensor(self, params, storage.epi)
+            for op in self._epi_ops
+            if not isinstance(op, Scalar)
+        )
+    def epi_get_tma_atoms(self, params, *, loc=None, ip=None):
+        """Return the concatenated ``tma_atoms`` of all ops (``loc`` and ``ip`` are unused here)."""
+        atoms = []
+        for op in self._epi_ops:
+            atoms.extend(op.tma_atoms(self, params))
+        return atoms
+    # --- Device-side: kernel execution (delegates to ops) ---
+    @cute.jit
+    def epi_begin(
+        self,
+        params,
+        epi_smem_tensors,
+        epi_tile,
+        tiled_copy_t2r,
+        tiled_copy_r2s,
+        tile_coord_mnkl,
+        varlen_manager,
+        epilogue_barrier,
+        tidx,
+    ):
+        """Run every op's ``begin`` and return the results as a dict keyed by op name.
+        Each op gets its own attribute of ``params`` (or None) and its smem tensor, or None
+        when the op has no entry in ``_epi_smem_map``. If the class has ops that need an
+        async fence and at least one of them has a non-None param, a cp.async group is
+        committed and waited on, followed by ``epilogue_barrier.arrive_and_wait()``.
+        """
+        ctx = EpiContext(
+            self,
+            epi_tile,
+            tiled_copy_t2r,
+            tiled_copy_r2s,
+            tile_coord_mnkl,
+            varlen_manager,
+            epilogue_barrier,
+            tidx,
+        )
+        smem_map = self._epi_smem_map
+        results = {
+            op.name: op.begin(
+                self,
+                getattr(params, op.name, None),
+                epi_smem_tensors[smem_map[op.name]] if op.name in smem_map else None,
+                ctx,
+            )
+            for op in self._epi_ops
+        }
+        if const_expr(self._epi_has_async_ops):
+            has_async_data = any(
+                getattr(params, op.name, None) is not None
+                for op in self._epi_ops
+                if op.needs_async_fence()
+            )
+            if const_expr(has_async_data):
+                cute.arch.cp_async_commit_group()
+                cute.arch.cp_async_wait_group(0)
+                epilogue_barrier.arrive_and_wait()
+        return results
+    def epi_begin_loop(self, params, epi_tensors, epi_coord):
+        """Run every op's ``begin_loop`` on its entry of ``epi_tensors``; return a dict by op name."""
+        return {
+            op.name: op.begin_loop(self, epi_tensors[op.name], epi_coord) for op in self._epi_ops
+        }
+    @cute.jit
+    def epi_end(
+        self,
+        params,
+        epi_tensors,
+        epi_tile,
+        tiled_copy_t2r,
+        tiled_copy_r2s,
+        tile_coord_mnkl,
+        varlen_manager,
+        tidx,
+    ):
+        """Call every op's ``end`` hook in ``_epi_ops`` order; nothing is returned."""
+        for op in self._epi_ops:
+            op.end(
+                self,
+                getattr(params, op.name, None),
+                epi_tensors[op.name],
+                epi_tile,
+                tiled_copy_t2r,
+                tiled_copy_r2s,
+                tile_coord_mnkl,
+                varlen_manager,
+                tidx,
+            )

build/torch-cuda/quack/epi_ops.py ADDED Viewed

	@@ -0,0 +1,648 @@

+# Copyright (c) 2025, Tri Dao.
+"""Composable epilogue operations (EpiOps) for GEMM kernels.
+Each EpiOp encapsulates a single tensor kind's behavior across the epilogue lifecycle:
+smem allocation, begin (one-time per-tile setup), begin_loop (per-subtile extraction),
+end (cleanup).
+The ops are composed via ComposableEpiMixin which iterates over a static _epi_ops tuple
+to generate epi_smem_bytes_per_stage, epi_get_smem_struct, epi_get_smem_tensors,
+epi_begin, and epi_begin_loop automatically.
+"""
+import math
+import operator
+from functools import partial
+import cutlass
+import cutlass.cute as cute
+from cutlass import Boolean, Float32, const_expr
+from .epi_utils import assume_stride_divisibility, setup_epi_tensor
+from .sm90_utils import partition_for_epilogue
+from . import utils as utils
+from . import copy_utils as copy_utils
+from . import layout_utils as layout_utils
+class EpiContext:
+    """Shared context passed to EpiOp.begin methods. Bundles common arguments."""
+    __slots__ = (
+        "epi_tile",
+        "tiled_copy_t2r",
+        "tiled_copy_r2s",
+        "tile_coord_mnkl",
+        "varlen_manager",
+        "epilogue_barrier",
+        "tidx",
+        "partition_for_epilogue_fn",
+        "num_epi_threads",
+        "batch_idx",
+        "tile_M",
+        "tile_N",
+    )
+    def __init__(
+        self,
+        gemm,
+        epi_tile,
+        tiled_copy_t2r,
+        tiled_copy_r2s,
+        tile_coord_mnkl,
+        varlen_manager,
+        epilogue_barrier,
+        tidx,
+    ):
+        self.epi_tile = epi_tile
+        self.tiled_copy_t2r = tiled_copy_t2r
+        self.tiled_copy_r2s = tiled_copy_r2s
+        self.tile_coord_mnkl = tile_coord_mnkl
+        self.varlen_manager = varlen_manager
+        self.epilogue_barrier = epilogue_barrier
+        self.tidx = tidx
+        self.tile_M = gemm.cta_tile_shape_mnk[0]
+        self.tile_N = gemm.cta_tile_shape_mnk[1]
+        self.batch_idx = tile_coord_mnkl[3]
+        self.num_epi_threads = gemm.num_epi_warps * cute.arch.WARP_SIZE
+        self.partition_for_epilogue_fn = partial(
+            partition_for_epilogue,
+            epi_tile=epi_tile,
+            tiled_copy=tiled_copy_t2r if tiled_copy_t2r is not None else tiled_copy_r2s,
+            tidx=tidx,
+            reference_src=tiled_copy_t2r is None,
+        )
+def _get_lane_warp_layouts(tiled_copy, reference_src=True):
+    """Derive lane and warp layouts along M and N from the epilogue tiled_copy.
+    Follows the CUTLASS Sm90RowReduction / Sm90ColReduction pattern.
+    Uses layout_src_tv_tiled (SM90, reference_src=True) or
+    layout_dst_tv_tiled (SM100, reference_src=False), matching the C++ impl's
+    get_layoutS_TV / get_layoutD_TV selection.
+    Returns (lane_layout_MN, warp_layout_MN) where each is a 2D layout (M, N):
+      lane_layout_MN[0] = lane_M: (lanes_in_M):(lane_stride_M) — e.g. 8:4
+      lane_layout_MN[1] = lane_N: (lanes_in_N):(lane_stride_N) — e.g. 4:1
+      warp_layout_MN[0] = warp_M: (warps_in_M):(warp_stride_M) — e.g. 4:1
+      warp_layout_MN[1] = warp_N: (warps_in_N):(warp_stride_N) — e.g. 1:0
+    For RowVecReduce (reduce along M): shuffle across lane_M, smem reduce across warp_M.
+    For ColVecReduce (reduce along N): shuffle across lane_N, direct write (warps_in_N == 1).
+    """
+    # right_inverse of the TV layout gives tile_element_idx -> tv_idx.
+    # SM90: use src (register) layout; SM100: use dst (smem) layout.
+    layout_tv = tiled_copy.layout_src_tv_tiled if reference_src else tiled_copy.layout_dst_tv_tiled
+    ref_layout = cute.right_inverse(layout_tv)
+    tile_M_size, tile_N_size = cute.size(tiled_copy.tiler_mn[0]), cute.size(tiled_copy.tiler_mn[1])
+    ref_layout_MN = cute.composition(
+        ref_layout, cute.make_layout((tile_M_size, tile_N_size))
+    )  # (tile_M, tile_N) -> tv_idx
+    num_warps = cute.size(tiled_copy) // cute.arch.WARP_SIZE
+    # tv2lane: tv_idx -> lane_idx  (lane = tv_idx % 32)
+    tv2lane = cute.make_layout((cute.arch.WARP_SIZE, num_warps, 1), stride=(1, 0, 0))
+    ref2lane = cute.composition(tv2lane, ref_layout_MN)  # (tile_M, tile_N) -> lane_idx
+    # select mode [0] = M part, [1] = N part; filter removes stride-0
+    lane_M = cute.filter(cute.select(ref2lane, [0]))  # lane_m -> lane_idx
+    lane_N = cute.filter(cute.select(ref2lane, [1]))  # lane_n -> lane_idx
+    lane_layout_MN = layout_utils.concat_layout(lane_M, lane_N)  # (lane_M, lane_N) -> lane_idx
+    # tv2warp: tv_idx -> warp_idx  (warp = tv_idx / 32)
+    tv2warp = cute.make_layout((cute.arch.WARP_SIZE, num_warps, 1), stride=(0, 1, 0))
+    ref2warp = cute.composition(tv2warp, ref_layout_MN)  # (tile_M, tile_N) -> warp_idx
+    warp_M = cute.filter(cute.select(ref2warp, [0]))  # warp_m -> warp_idx
+    warp_N = cute.filter(cute.select(ref2warp, [1]))  # warp_n -> warp_idx
+    warp_layout_MN = layout_utils.concat_layout(warp_M, warp_N)  # (warp_M, warp_N) -> warp_idx
+    return lane_layout_MN, warp_layout_MN
+class EpiOp:
+    """Base class for composable epilogue operations."""
+    # Every hook below is a default that does nothing useful on its own: it returns
+    # an empty container, 0, None, or the incoming state unchanged. Subclasses
+    # override only the hooks they need.
+    def __init__(self, name):
+        # Subclasses in this file use `name` as the attribute name of this op's
+        # field on the args / params objects (via getattr).
+        self.name = name
+    # --- Host-side: args → params ---
+    def param_fields(self):
+        """Return [(field_name, type, default), ...] for auto-generating EpilogueParams.
+        Must match the keys returned by to_params()."""
+        return []
+    def to_params(self, gemm, args):
+        """Convert this op's arg field(s) to param dict entries.
+        Returns dict of {param_name: value}. Like EVT's to_underlying_arguments."""
+        return {}
+    # --- Host-side: smem allocation ---
+    def smem_bytes(self, arg_tensor, cta_tile_shape_mnk, epi_tile):
+        """Bytes of smem needed per stage. arg_tensor is the EpilogueArguments field."""
+        return 0
+    def smem_struct_field(self, gemm, params):
+        """Return (field_name, field_type) for @cute.struct, or None if no smem needed.
+        params is the full EpilogueParams object."""
+        return None
+    def get_smem_tensor(self, gemm, params, storage_epi):
+        """Extract smem tensor from storage.epi. Returns tensor or None.
+        params is the full EpilogueParams object."""
+        return None
+    def tma_atoms(self, gemm, params):
+        """Return list of TMA atoms for this op."""
+        return []
+    # --- Device-side: kernel execution ---
+    @cute.jit
+    def begin(self, gemm, param, smem_tensor, ctx):
+        """One-time per-tile setup. Returns state for begin_loop."""
+        return None
+    def begin_loop(self, gemm, state, epi_coord):
+        """Per-subtile extraction. Returns value for epi_visit_subtile."""
+        # Default: hand the per-tile state through untouched.
+        return state
+    def needs_async_fence(self):
+        """Whether this op issues async copies that need a fence."""
+        return False
+    def end(
+        self,
+        gemm,
+        param,
+        state,
+        epi_tile,
+        tiled_copy_t2r,
+        tiled_copy_r2s,
+        tile_coord_mnkl,
+        varlen_manager,
+        tidx,
+    ):
+        """Cleanup after all subtiles (reductions, direct writes)."""
+        # Unlike begin(), end() receives the individual arguments rather than an EpiContext.
+        pass
+class Scalar(EpiOp):
+    """Loads a scalar value or device pointer once per tile. No smem."""
+    def __init__(self, name, dtype=None):
+        super().__init__(name)
+        # Optional dtype forwarded to utils.load_scalar_or_pointer; when None the
+        # helper is called without a dtype argument.
+        self.dtype = dtype
+    def param_fields(self):
+        # One optional param field named after the op, defaulting to None.
+        return [(self.name, object, None)]
+    def to_params(self, gemm, args):
+        # The arg value is passed through to params unchanged.
+        return {self.name: getattr(args, self.name)}
+    @cute.jit
+    def begin(self, gemm, param, smem_tensor, ctx):
+        # Returns None when no param was supplied, otherwise the value produced by
+        # utils.load_scalar_or_pointer. Both conditions are wrapped in const_expr.
+        result = None
+        if const_expr(param is not None):
+            result = (
+                utils.load_scalar_or_pointer(param, dtype=self.dtype)
+                if const_expr(self.dtype is not None)
+                else utils.load_scalar_or_pointer(param)
+            )
+        return result
+class VecLoad(EpiOp):
+    """Base class for broadcast vector loads (row or col) via cp_async.
+    Subclasses set `dim` to 0 (M/col) or 1 (N/row) and override `_get_gmem_vec`
+    for varlen handling.
+    """
+    dim = None  # 0 for col (M), 1 for row (N)
+    def param_fields(self):
+        # One optional param field named after the op, defaulting to None.
+        return [(self.name, object, None)]
+    def to_params(self, gemm, args):
+        # The tensor is re-wrapped with stride-divisibility assumptions
+        # (assume_stride_divisibility returns None for a None input).
+        return {self.name: assume_stride_divisibility(getattr(args, self.name))}
+    def _tile_size(self, cta_tile_shape_mnk):
+        # CTA tile extent along this op's dimension (index `dim` of the MNK shape).
+        return cta_tile_shape_mnk[self.dim]
+    def _broadcast_stride(self):
+        # Row: stride (0,1) — broadcast along M. Col: stride (1,0) — broadcast along N.
+        return (0, 1) if self.dim == 1 else (1, 0)
+    def _tile_dim(self, ctx):
+        # Same extent as _tile_size, but read from the EpiContext.
+        return ctx.tile_N if self.dim == 1 else ctx.tile_M
+    def _coord_idx(self):
+        # Index into tile_coord_mnkl of the tile coordinate along this op's dimension.
+        return 1 if self.dim == 1 else 0
+    def smem_bytes(self, arg_tensor, cta_tile_shape_mnk, epi_tile):
+        if arg_tensor is None:
+            return 0
+        # One element per position along the tile dimension; width is in bits.
+        return self._tile_size(cta_tile_shape_mnk) * (arg_tensor.element_type.width // 8)
+    def smem_struct_field(self, gemm, params):
+        tensor = getattr(params, self.name, None)
+        if tensor is None:
+            # The field is still declared when the vector is absent, with zero
+            # elements and a placeholder Float32 dtype.
+            size, dtype = 0, Float32
+        else:
+            size = self._tile_size(gemm.cta_tile_shape_mnk)
+            dtype = tensor.element_type
+        return (f"s_{self.name}", cute.struct.Align[cute.struct.MemRange[dtype, size], 16])
+    def get_smem_tensor(self, gemm, params, storage_epi):
+        if getattr(params, self.name, None) is None:
+            return None
+        # 1D smem tensor with one element per position along the tile dimension.
+        return getattr(storage_epi, f"s_{self.name}").get_tensor(
+            cute.make_layout(self._tile_size(gemm.cta_tile_shape_mnk))
+        )
+    def needs_async_fence(self):
+        # begin() issues its copy through a tiled copy created with is_async=True.
+        return True
+    def _get_gmem_vec(self, param, ctx):
+        """Get the global memory vector for this tile. Override for varlen."""
+        return param[ctx.batch_idx, None]
+    @cute.jit
+    def begin(self, gemm, param, smem_tensor, ctx):
+        # Returns None when no vector was supplied; otherwise the smem vector viewed
+        # as a broadcast (tile_M, tile_N) tensor and partitioned for the epilogue.
+        tDsV = None
+        if const_expr(param is not None):
+            dtype = param.element_type
+            # Elements per copy: 32 // width for types up to 32 bits wide, 1 for wider types.
+            num_copy_elems = const_expr(max(32, dtype.width)) // dtype.width
+            thr_copy = copy_utils.tiled_copy_1d(
+                dtype, ctx.num_epi_threads, num_copy_elems, is_async=True
+            ).get_slice(ctx.tidx)
+            mVec = self._get_gmem_vec(param, ctx)
+            tile_dim = self._tile_dim(ctx)
+            coord_idx = ctx.tile_coord_mnkl[self._coord_idx()]
+            # Slice out the part of the vector that belongs to this tile.
+            gVec = cute.local_tile(mVec, (tile_dim,), (coord_idx,))
+            tVgV = thr_copy.partition_S(gVec)
+            tVsV = thr_copy.partition_D(smem_tensor)
+            # Identity tensor partitioned the same way gives each copy's in-tile index.
+            tVcV = thr_copy.partition_S(cute.make_identity_tensor(tile_dim))
+            # Number of in-bounds elements in this tile (remaining length, capped at tile_dim).
+            limit = min(cute.size(mVec, mode=[0]) - coord_idx * tile_dim, tile_dim)
+            # One predicate per copy, taken from the index of its first element.
+            pred = cute.make_rmem_tensor((1, cute.size(tVsV.shape[1])), Boolean)
+            for m in cutlass.range(cute.size(tVsV.shape[1]), unroll_full=True):
+                pred[0, m] = tVcV[0, m] < limit
+            cute.copy(thr_copy, tVgV, tVsV, pred=pred)
+            # Re-view the 1D smem vector as (tile_M, tile_N) with a zero stride along
+            # the broadcast dimension, then partition it like the epilogue tile.
+            tDsV = ctx.partition_for_epilogue_fn(
+                cute.make_tensor(
+                    smem_tensor.iterator,
+                    cute.make_layout((ctx.tile_M, ctx.tile_N), stride=self._broadcast_stride()),
+                )
+            )
+            # When a t2r copy exists the partition above used it, so retile the
+            # result for the r2s copy.
+            if const_expr(ctx.tiled_copy_t2r is not None):
+                tDsV = ctx.tiled_copy_r2s.retile(tDsV)
+        return tDsV
+    @cute.jit
+    def begin_loop(self, gemm, state, epi_coord):
+        # Returns None when begin() produced no state; otherwise a register tensor
+        # in gemm.acc_dtype holding this subtile's slice of the vector.
+        tDrV_cvt = None
+        if const_expr(state is not None):
+            # Group modes 3.. into one mode and index it with the subtile coordinate.
+            tDsV_cur = cute.group_modes(state, 3, cute.rank(state))[None, None, None, epi_coord]
+            tDrV = cute.make_rmem_tensor(tDsV_cur.layout, tDsV_cur.element_type)
+            # Copy through filter_zeros views of both tensors (presumably so the
+            # stride-0 broadcast modes are not copied repeatedly — TODO confirm).
+            cute.autovec_copy(cute.filter_zeros(tDsV_cur), cute.filter_zeros(tDrV))
+            # Convert to the accumulator dtype.
+            tDrV_cvt = cute.make_rmem_tensor_like(tDrV, gemm.acc_dtype)
+            tDrV_cvt.store(tDrV.load().to(gemm.acc_dtype))
+        return tDrV_cvt
+class RowVecLoad(VecLoad):
+    """Loads a row vector (N,) via cp_async, broadcasts along M with stride (0,1)."""
+    # dim == 1 selects the N extent / N tile coordinate in the VecLoad helpers;
+    # everything else is inherited unchanged.
+    dim = 1
+class ColVecLoad(VecLoad):
+    """Loads a col vector (M,) via cp_async, broadcasts along N with stride (1,0).
+    Optimization: with N-major subtile loop, consecutive epi_n iterations for the same
+    epi_m share the same column data. The smem→register copy only runs when epi_n == 0.
+    Supports varlen_m via domain_offset.
+    """
+    # dim == 0 selects the M extent / M tile coordinate in the VecLoad helpers.
+    dim = 0
+    @cute.jit
+    def _get_gmem_vec(self, param, ctx):
+        if const_expr(not ctx.varlen_manager.varlen_m):
+            # Batched case: select this batch's vector.
+            mVec = param[ctx.batch_idx, None]
+        else:
+            # varlen_m case: param is not indexed by batch; it is offset by this
+            # batch's cu_seqlens_m entry instead.
+            mVec = cute.domain_offset(
+                (ctx.varlen_manager.params.cu_seqlens_m[ctx.batch_idx],), param
+            )
+        return mVec
+    @cute.jit
+    def begin(self, gemm, param, smem_tensor, ctx):
+        # Same gmem -> smem load as VecLoad.begin, except that the bound comes from
+        # varlen_manager.len_m and a register tensor is pre-allocated.
+        # Always returns a two-element list [tDsV, tDrV_cvt]; both entries are None
+        # when no vector was supplied.
+        tDsV = None
+        tDrV_cvt = None
+        if const_expr(param is not None):
+            dtype = param.element_type
+            # Elements per copy: 32 // width for types up to 32 bits wide, 1 for wider types.
+            num_copy_elems = const_expr(max(32, dtype.width)) // dtype.width
+            thr_copy = copy_utils.tiled_copy_1d(
+                dtype, ctx.num_epi_threads, num_copy_elems, is_async=True
+            ).get_slice(ctx.tidx)
+            mVec = self._get_gmem_vec(param, ctx)
+            tile_dim = self._tile_dim(ctx)
+            coord_idx = ctx.tile_coord_mnkl[self._coord_idx()]
+            # Slice out the part of the vector that belongs to this tile.
+            gVec = cute.local_tile(mVec, (tile_dim,), (coord_idx,))
+            tVgV = thr_copy.partition_S(gVec)
+            tVsV = thr_copy.partition_D(smem_tensor)
+            # Identity tensor partitioned the same way gives each copy's in-tile index.
+            tVcV = thr_copy.partition_S(cute.make_identity_tensor(tile_dim))
+            # ColVec uses varlen-aware limit
+            limit = min(
+                ctx.varlen_manager.len_m(ctx.batch_idx) - coord_idx * tile_dim,
+                tile_dim,
+            )
+            # One predicate per copy, taken from the index of its first element.
+            pred = cute.make_rmem_tensor((1, cute.size(tVsV.shape[1])), Boolean)
+            for m in cutlass.range(cute.size(tVsV.shape[1]), unroll_full=True):
+                pred[0, m] = tVcV[0, m] < limit
+            cute.copy(thr_copy, tVgV, tVsV, pred=pred)
+            # Re-view the 1D smem vector as (tile_M, tile_N) with a zero stride along
+            # N, then partition it like the epilogue tile.
+            tDsV = ctx.partition_for_epilogue_fn(
+                cute.make_tensor(
+                    smem_tensor.iterator,
+                    cute.make_layout((ctx.tile_M, ctx.tile_N), stride=self._broadcast_stride()),
+                )
+            )
+            if const_expr(ctx.tiled_copy_t2r is not None):
+                tDsV = ctx.tiled_copy_r2s.retile(tDsV)
+            # Pre-allocate register tensor reused across begin_loop calls
+            tDsV_sub = cute.group_modes(tDsV, 3, cute.rank(tDsV))[None, None, None, 0]
+            tDrV_cvt = cute.make_rmem_tensor(tDsV_sub.layout, gemm.acc_dtype)
+        return [tDsV, tDrV_cvt]
+    @cute.jit
+    def begin_loop(self, gemm, state, epi_coord):
+        # `state` is the [tDsV, tDrV_cvt] list from begin(). The same tDrV_cvt
+        # register tensor is returned on every call and refreshed only when epi_n == 0.
+        tDsV, tDrV_cvt = state[0], state[1]
+        if const_expr(tDsV is not None):
+            # Col vector is constant across N subtiles — only copy on first N subtile.
+            # Assumes N-major epi subtile order: epi_tile_layout = ordered_layout(..., order=(1,0))
+            epi_n = epi_coord[1]
+            if epi_n == 0:
+                tDsV_cur = cute.group_modes(tDsV, 3, cute.rank(tDsV))[None, None, None, epi_coord]
+                tDrV = cute.make_rmem_tensor(tDsV_cur.layout, tDsV_cur.element_type)
+                cute.autovec_copy(cute.filter_zeros(tDsV_cur), cute.filter_zeros(tDrV))
+                # Convert to the accumulator dtype into the pre-allocated registers.
+                tDrV_cvt.store(tDrV.load().to(gemm.acc_dtype))
+        return tDrV_cvt
+class TileStore(EpiOp):
+    """Tile-sized output tensor stored via TMA (e.g. postact).
+    Args:
+        name: field name in EpilogueArguments/Params (e.g. "mPostAct")
+        epi_tile_fn: optional (gemm, epi_tile) -> epi_tile for half-tile (GemmGated)
+    """
+    def __init__(self, name, epi_tile_fn=None):
+        super().__init__(name)
+        self.epi_tile_fn = epi_tile_fn
+    def _tma_atom_key(self):
+        return f"tma_atom_{self.name}"
+    def _smem_layout_key(self):
+        return f"epi_{self.name}_smem_layout_staged"
+    def _epi_tile_key(self):
+        return f"epi_tile_{self.name}"
+    def param_fields(self):
+        from dataclasses import MISSING
+        return [
+            (self._tma_atom_key(), object, MISSING),
+            (self.name, object, MISSING),
+            (self._smem_layout_key(), object, MISSING),
+            (self._epi_tile_key(), object, MISSING),
+        ]
+    def to_params(self, gemm, args):
+        tensor = getattr(args, self.name)
+        epi_tile = self.epi_tile_fn(gemm, gemm.epi_tile) if self.epi_tile_fn else None
+        tma_atom, tma_tensor, smem_layout, epi_tile_out = setup_epi_tensor(
+            gemm, tensor, epi_tile=epi_tile
+        )
+        return {
+            self._tma_atom_key(): tma_atom,
+            self.name: tma_tensor,
+            self._smem_layout_key(): smem_layout,
+            self._epi_tile_key(): epi_tile_out,
+        }
+    def smem_bytes(self, arg_tensor, cta_tile_shape_mnk, epi_tile):
+        if arg_tensor is None:
+            return 0
+        if self.epi_tile_fn is not None:
+            epi_tile = self.epi_tile_fn(None, epi_tile)
+        return cute.size(cute.shape(epi_tile)) * (arg_tensor.element_type.width // 8)
+    def smem_struct_field(self, gemm, params):
+        smem_layout_key = self._smem_layout_key()
+        if not hasattr(params, smem_layout_key):
+            return (f"s_{self.name}", cute.struct.MemRange[Float32, 0])
+        return (
+            f"s_{self.name}",
+            cute.struct.Align[
+                cute.struct.MemRange[
+                    gemm.postact_dtype,
+                    cute.cosize(getattr(params, smem_layout_key)),
+                ],
+                gemm.buffer_align_bytes,
+            ],
+        )
+    def get_smem_tensor(self, gemm, params, storage_epi):
+        smem_layout_key = self._smem_layout_key()
+        if not hasattr(params, smem_layout_key):
+            return None
+        smem_layout = getattr(params, smem_layout_key)
+        return getattr(storage_epi, f"s_{self.name}").get_tensor(
+            smem_layout.outer,
+            swizzle=smem_layout.inner,
+        )
+    def tma_atoms(self, gemm, params):
+        tma_key = self._tma_atom_key()
+        if hasattr(params, tma_key):
+            return [getattr(params, tma_key)]
+        return []
+@cute.jit
+def vec_multiply(gemm, tRS_rD, tDrColVec, tDrRowVec):
+    """Multiply tRS_rD by colvec and/or rowvec in-place. Uses packed f32x2 on SM100+."""
+    # Either vector may be None, in which case that multiply is skipped.
+    # The scalar path (gemm.arch < 100) loops over the size of the vector, while
+    # the packed path loops over half the size of tRS_rD and handles elements
+    # 2*i and 2*i+1 together.
+    if const_expr(tDrColVec is not None):
+        if const_expr(gemm.arch < 100):
+            for i in cutlass.range(cute.size(tDrColVec), unroll_full=True):
+                tRS_rD[i] *= tDrColVec[i]
+        else:
+            # NOTE(review): an odd-sized tRS_rD would leave its last element
+            # unscaled here — presumably sizes are always even; confirm.
+            for i in cutlass.range(cute.size(tRS_rD) // 2, unroll_full=True):
+                tRS_rD[2 * i], tRS_rD[2 * i + 1] = cute.arch.mul_packed_f32x2(
+                    (tRS_rD[2 * i], tRS_rD[2 * i + 1]),
+                    (tDrColVec[2 * i], tDrColVec[2 * i + 1]),
+                )
+    if const_expr(tDrRowVec is not None):
+        if const_expr(gemm.arch < 100):
+            for i in cutlass.range(cute.size(tDrRowVec), unroll_full=True):
+                tRS_rD[i] *= tDrRowVec[i]
+        else:
+            for i in cutlass.range(cute.size(tRS_rD) // 2, unroll_full=True):
+                tRS_rD[2 * i], tRS_rD[2 * i + 1] = cute.arch.mul_packed_f32x2(
+                    (tRS_rD[2 * i], tRS_rD[2 * i + 1]),
+                    (tDrRowVec[2 * i], tDrRowVec[2 * i + 1]),
+                )
+@cute.jit
+def colvec_reduce_accumulate(gemm, tDrReduce, tRS_rInput, transform_fn=None, rScale=None):
+    """Accumulate transform_fn(input) or input * rScale into a ColVecReduce buffer.
+    If transform_fn is provided, accumulates transform_fn(input[i]).
+    If rScale is provided, accumulates input[i] * rScale[i] (uses mul/fma for SM100).
+    If neither, accumulates input directly (identity).
+    """
+    # When both transform_fn and rScale are given, the transformed value is what
+    # gets multiplied by rScale.
+    # transform_fn is called with a single element on gemm.arch < 100 but with a
+    # 2-tuple of adjacent elements on the packed path, so it must accept whichever
+    # form the target arch uses.
+    # Nothing happens when tDrReduce is None.
+    if const_expr(tDrReduce is not None):
+        if const_expr(transform_fn is None):
+            transform_fn = lambda x: x
+        if const_expr(gemm.arch < 100):
+            # Scalar path: element-wise accumulate into the reduce buffer.
+            for i in cutlass.range(cute.size(tDrReduce), unroll_full=True):
+                val = transform_fn(tRS_rInput[i])
+                tDrReduce[i] += val * rScale[i] if const_expr(rScale is not None) else val
+        else:
+            # Packed path: re-view all operands as 2D tensors indexed [m, n] using
+            # tDrReduce's layout as the reference.
+            tDrReduce_mn = layout_utils.convert_layout_zero_stride(tDrReduce, tDrReduce.layout)
+            tRS_rInput_mn = layout_utils.convert_layout_zero_stride(tRS_rInput, tDrReduce.layout)
+            if const_expr(rScale is not None):
+                rScale_mn = layout_utils.convert_layout_zero_stride(rScale, tDrReduce.layout)
+            for m in cutlass.range(cute.size(tDrReduce_mn, mode=[0]), unroll_full=True):
+                # Pair n consists of elements 2n and 2n+1 of row m.
+                inp = lambda n: (tRS_rInput_mn[m, 2 * n], tRS_rInput_mn[m, 2 * n + 1])
+                # The first pair initializes the packed running sum; the loop
+                # below starts at pair 1.
+                val0 = transform_fn(inp(0))
+                if const_expr(rScale is not None):
+                    row_sum = cute.arch.mul_packed_f32x2(val0, (rScale_mn[m, 0], rScale_mn[m, 1]))
+                else:
+                    row_sum = val0
+                for n in cutlass.range(1, cute.size(tDrReduce_mn, mode=[1]) // 2, unroll_full=True):
+                    val = transform_fn(inp(n))
+                    if const_expr(rScale is not None):
+                        row_sum = cute.arch.fma_packed_f32x2(
+                            val, (rScale_mn[m, 2 * n], rScale_mn[m, 2 * n + 1]), row_sum
+                        )
+                    else:
+                        row_sum = cute.arch.add_packed_f32x2(val, row_sum)
+                # Fold the two packed halves and add them to column 0 of the row.
+                tDrReduce_mn[m, 0] += row_sum[0] + row_sum[1]
+class ColVecReduce(EpiOp):
+    """Column vector reduction: accumulates across N subtiles in registers,
+    then warp-reduces and writes to gmem in epi_end.
+    No smem. The accumulation itself happens in epi_visit_subtile (user code).
+    This op handles the register allocation (begin), per-subtile slicing (begin_loop),
+    and final warp reduction + gmem write (end).
+    """
+    def param_fields(self):
+        # One optional param field named after the op, defaulting to None.
+        return [(self.name, object, None)]
+    def to_params(self, gemm, args):
+        # The output tensor is re-wrapped with stride-divisibility assumptions.
+        return {self.name: assume_stride_divisibility(getattr(args, self.name))}
+    @cute.jit
+    def begin(self, gemm, param, smem_tensor, ctx):
+        # Returns None when no output tensor was supplied, otherwise a zero-filled
+        # Float32 register tensor.
+        tDrReduce = None
+        if const_expr(param is not None):
+            # (tile_M, tile_N) layout with stride 0 along N.
+            colvec_mma_layout = cute.make_layout((ctx.tile_M, ctx.tile_N), stride=(1, 0))
+            # Partition a tensor of that layout only to obtain this thread's layout.
+            tDrReduce_layout = ctx.partition_for_epilogue_fn(
+                cute.make_rmem_tensor(colvec_mma_layout, Float32)
+            ).layout
+            tDrReduce = cute.make_rmem_tensor(tDrReduce_layout, Float32)
+            cute.filter_zeros(tDrReduce).fill(0.0)
+        return tDrReduce
+    @cute.jit
+    def begin_loop(self, gemm, state, epi_coord):
+        # Slice of the accumulator for the current subtile (None when inactive).
+        result = None
+        if const_expr(state is not None):
+            result = cute.group_modes(state, 3, cute.rank(state))[None, None, None, epi_coord]
+        return result
+    @cute.jit
+    def end(
+        self,
+        gemm,
+        param,
+        state,
+        epi_tile,
+        tiled_copy_t2r,
+        tiled_copy_r2s,
+        tile_coord_mnkl,
+        varlen_manager,
+        tidx,
+    ):
+        """Intra-warp shuffle reduction across N lanes, then direct gmem write."""
+        if const_expr(param is not None):
+            tDrReduce = state
+            # Same copy / reference selection as EpiContext uses for partitioning.
+            tiled_copy = tiled_copy_t2r if tiled_copy_t2r is not None else tiled_copy_r2s
+            reference_src = tiled_copy_t2r is None
+            # ── Derive lane layout from tiled_copy ──
+            lane_layout_MN, warp_layout_MN = _get_lane_warp_layouts(tiled_copy, reference_src)
+            # For ColVecReduce: reduce across N lanes (lanes_in_N threads share same M row)
+            lanes_in_N = cute.size(lane_layout_MN, mode=[1])
+            # Typically lanes_in_N is 4 for Sm90
+            assert lanes_in_N == 1 << int(math.log2(lanes_in_N)), (
+                "lanes_in_N must be a power of 2 for butterfly reduction"
+            )
+            # ── Intra-warp shuffle reduction across N lanes ──
+            if const_expr(lanes_in_N > 1):
+                # The grouped reduction below requires the N lanes to have stride 1.
+                assert lane_layout_MN.stride[1] == 1
+                tDrReduce_flt = cute.filter_zeros(tDrReduce)
+                for i in cutlass.range(cute.size(tDrReduce_flt), unroll_full=True):
+                    tDrReduce_flt[i] = cute.arch.warp_reduction(
+                        tDrReduce_flt[i], operator.add, threads_in_group=lanes_in_N
+                    )
+            warp_N = warp_layout_MN[1]
+            assert cute.size(warp_N) == 1, (
+                "ColVecReduce assumes all reduction cols are within the same warp"
+            )
+            # ── Direct gmem write (no inter-warp reduction needed: warps_in_N == 1) ──
+            partition_for_epilogue_fn = partial(
+                partition_for_epilogue,
+                epi_tile=epi_tile,
+                tiled_copy=tiled_copy,
+                tidx=tidx,
+                reference_src=tiled_copy_t2r is None,
+            )
+            tile_M, tile_N = gemm.cta_tile_shape_mnk[:2]
+            batch_idx = tile_coord_mnkl[3]
+            # The last mode of param is indexed by the N tile coordinate below, so
+            # its extent bounds which N tiles are allowed to write.
+            limit_n = param.shape[2] if not varlen_manager.varlen_m else param.shape[1]
+            if tile_coord_mnkl[1] < limit_n:
+                if const_expr(not varlen_manager.varlen_m):
+                    # Batched: param is indexed [batch, m, n_tile].
+                    mColVec = param[batch_idx, None, tile_coord_mnkl[1]]
+                else:
+                    # varlen_m: param is indexed [m, n_tile], offset by this
+                    # batch's cu_seqlens_m entry.
+                    mColVec = cute.domain_offset(
+                        (varlen_manager.params.cu_seqlens_m[batch_idx],),
+                        param[None, tile_coord_mnkl[1]],
+                    )
+                gColVec = cute.local_tile(mColVec, (tile_M,), (tile_coord_mnkl[0],))
+                # Number of in-bounds rows in this M tile.
+                limit_m = min(
+                    varlen_manager.len_m(batch_idx) - tile_coord_mnkl[0] * tile_M,
+                    tile_M,
+                )
+                # Identity tensor partitioned like the data gives each register's
+                # (row, col) coordinate within the tile.
+                tDcD = partition_for_epilogue_fn(cute.make_identity_tensor((tile_M, tile_N)))
+                # Keep only column 0 of the 2D views: one value / coordinate per row.
+                tDrReduce_m = layout_utils.convert_layout_zero_stride(tDrReduce, tDrReduce.layout)[
+                    None, 0
+                ]
+                tDcD_m = layout_utils.convert_layout_zero_stride(tDcD, tDrReduce.layout)[None, 0]
+                # Only threads whose first coordinate lies in tile column 0 write
+                # (presumably so each row has a single writer — TODO confirm).
+                if tDcD_m[0][1] == 0:
+                    for m in cutlass.range(cute.size(tDcD_m, mode=[0])):
+                        row_idx = tDcD_m[m][0]
+                        # Skip rows past the end of the (possibly variable-length) M extent.
+                        if row_idx < limit_m:
+                            gColVec[row_idx] = tDrReduce_m[m]

build/torch-cuda/quack/epi_utils.py ADDED Viewed

	@@ -0,0 +1,64 @@

+# Copyright (c) 2025, Tri Dao.
+"""Epilogue utilities: shared helpers for epilogue mixin classes."""
+import cutlass
+import cutlass.cute as cute
+import cutlass.utils.blackwell_helpers as sm100_utils
+from . import sm90_utils as sm90_utils
+from . import copy_utils as copy_utils
+def assume_stride_divisibility(tensor):
+    """Assume all strides are divisible by 32 bits (except static strides).
+    Used for broadcast vectors and similar tensors where stride alignment is guaranteed.
+    Returns a new tensor with the assumed strides.
+    """
+    if tensor is None:
+        return None
+    new_stride = tuple(
+        cute.assume(s, divby=32 // tensor.element_type.width) if not cute.is_static(s) else s
+        for s in tensor.stride
+    )
+    return cute.make_tensor(tensor.iterator, cute.make_layout(tensor.shape, stride=new_stride))
+def assume_broadcast_strides(*tensors):
+    """Apply stride divisibility assumptions to multiple broadcast vectors.
+    Returns a list with None preserved for None inputs.
+    """
+    return [assume_stride_divisibility(t) for t in tensors]
+def setup_epi_tensor(gemm, tensor, epi_tile=None, op_type="store"):
+    """Create TMA atom + smem layout for a supplemental epilogue tensor.
+    Args:
+        gemm: The GEMM object (provides arch, epi_stage, _make_tma_epi_atoms_and_tensors).
+        tensor: The global memory tensor to set up TMA for.
+        epi_tile: Epilogue tile shape. Defaults to gemm.epi_tile.
+        op_type: "store" or "load".
+    Returns:
+        (tma_atom, tma_tensor, smem_layout_staged, epi_tile)
+    """
+    if epi_tile is None:
+        epi_tile = gemm.epi_tile
+    dtype = tensor.element_type
+    layout = cutlass.utils.LayoutEnum.from_tensor(tensor)
+    utils_cls = sm100_utils if gemm.arch >= 100 else sm90_utils
+    smem_layout_staged = utils_cls.make_smem_layout_epi(dtype, layout, epi_tile, gemm.epi_stage)
+    tma_input = (
+        copy_utils.create_ragged_tensor_for_tma(tensor, ragged_dim=0, ptr_shift=True)
+        if cute.rank(tensor) == 2
+        else tensor
+    )
+    tma_atom, tma_tensor = gemm._make_tma_epi_atoms_and_tensors(
+        tma_input,
+        smem_layout_staged,
+        epi_tile,
+        op_type=op_type,
+    )
+    return tma_atom, tma_tensor, smem_layout_staged, epi_tile

build/torch-cuda/quack/fast_math.py CHANGED Viewed

@@ -1,80 +1,33 @@
 # Copyright (c) 2025, Tri Dao.
-from typing import Tuple
-from dataclasses import dataclass
 import cutlass
 import cutlass.cute as cute
-from cutlass import Int32, Uint32
-from cutlass.cutlass_dsl import T, dsl_user_op
-from cutlass._mlir.dialects import llvm
-from .cute_dsl_utils import ParamsBase
-@cute.jit
-def clz(x: Int32) -> Int32:
-    # for i in cutlass.range_constexpr(32):
-    #     if (1 << (31 - i)) & x:
-    #         return Int32(i)
-    # return Int32(32)
-    # Early exit is not supported yet
-    res = Int32(32)
-    done = False
-    for i in cutlass.range(32):
-        if ((1 << (31 - i)) & x) and not done:
-            res = Int32(i)
-            done = True
-    return res
-def find_log2(x: Int32) -> Int32:
-    a: Int32 = Int32(31 - clz(x))
-    return a + ((x & (x - 1)) != 0)  # Round up, add 1 if not a power of 2.
-@dsl_user_op
-def umulhi(a: Int32, b: Int32, *, loc=None, ip=None) -> Uint32:
-    return Uint32(
-        llvm.inline_asm(
-            T.i32(),
-            [Int32(a).ir_value(loc=loc, ip=ip), Int32(b).ir_value(loc=loc, ip=ip)],
-            "mul.hi.u32 $0, $1, $2;",
-            "=r,r,r",
-            has_side_effects=False,
-            is_align_stack=False,
-            asm_dialect=llvm.AsmDialect.AD_ATT,
-        )
-    )
-@dataclass
-class FastDivmod(ParamsBase):
-    divisor: Int32
-    multiplier: Uint32
-    shift_right: Uint32
-    # called by host
-    @staticmethod
-    def create(divisor: Int32) -> "FastDivmod":
-        """Construct the FastDivmod object, in host code.
-        This precomputes some values based on the divisor and is computationally expensive.
-        """
-        p = Uint32(31 + find_log2(divisor))
-        divisor_u32 = Uint32(divisor)
-        multiplier = Uint32(((cutlass.Uint64(1) << p) + divisor_u32 - 1) // divisor_u32)
-        shift_right = Uint32(p - 32)
-        return FastDivmod(divisor, multiplier, shift_right)
-    @cute.jit
-    def div(self, dividend: Int32) -> Int32:
-        return (
-            Int32(umulhi(dividend, self.multiplier) >> self.shift_right)
-            if self.divisor != 1
-            else dividend
-        )
-    def divmod(self, dividend: Int32) -> Tuple[Int32, Int32]:
-        quotient = self.div(dividend)
-        remainder = dividend - quotient * self.divisor
-        return quotient, remainder

 # Copyright (c) 2025, Tri Dao.
 import cutlass
 import cutlass.cute as cute
+from cutlass.base_dsl.typing import Integer
+from cutlass.cutlass_dsl import dsl_user_op
+class FastDivmod(cute.FastDivmodDivisor):
+    """We store the divisor along with the FastDivmodDivisor."""
+    @dsl_user_op
+    def __init__(
+        self,
+        divisor: Integer,
+        is_power_of_2: bool = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        # All arguments are forwarded to the base class; the only addition is
+        # keeping the original divisor on the instance.
+        super().__init__(divisor, is_power_of_2=is_power_of_2, loc=loc, ip=ip)
+        self.divisor = divisor
+    def __extract_mlir_values__(self):
+        """Extract MLIR values for Host->Device transfer."""
+        # Layout of the returned list: [self._divisor, *values of self.divisor].
+        # `_divisor` is presumably set by FastDivmodDivisor.__init__ — TODO confirm.
+        return [self._divisor] + cutlass.extract_mlir_values(self.divisor)
+    def __new_from_mlir_values__(self, values):
+        """Reconstruct FastDivmodDivisor from MLIR values."""
+        # Inverse of __extract_mlir_values__: values[0] is the base-class state and
+        # values[1:] rebuild the stored divisor. object.__new__ skips __init__.
+        new_obj = object.__new__(FastDivmod)
+        new_obj._divisor = values[0]
+        new_obj.divisor = cutlass.new_from_mlir_values(self.divisor, values[1:])
+        return new_obj

build/torch-cuda/quack/gemm.py CHANGED Viewed

@@ -1,16 +1,141 @@
 from typing import Optional
-from functools import partial
 from torch import Tensor
 import cutlass.cute as cute
-import cutlass.torch as cutlass_torch
-from cutlass import Float32
-from cutlass.cute.runtime import from_dlpack, make_ptr
-from .cute_dsl_utils import get_device_capacity, get_max_active_clusters
-from .gemm_wrapper_utils import GemmWrapperBase
-from .gemm_default_epi import GemmDefaultSm90, GemmDefaultSm100
 def gemm(
@@ -26,6 +151,7 @@ def gemm(
     cluster_N: int,
     pingpong: bool = False,
     persistent: bool = True,
     max_swizzle_size: int = 8,
     rowvec_bias: Optional[Tensor] = None,  # (l, n)
     colvec_bias: Optional[Tensor] = None,  # (l, m), or (total_m,) if varlen_m
@@ -36,159 +162,121 @@ def gemm(
     A_idx: Optional[Tensor] = None,  # (total_m,) or (total_k,) indices for gather_A when varlen
     batch_idx_permute: Optional[Tensor] = None,  # (l,) permutation of batch indices for scheduler
     add_to_output: bool = False,
 ) -> None:
-    varlen = cu_seqlens_m is not None or cu_seqlens_k is not None
-    assert not (cu_seqlens_m is not None and cu_seqlens_k is not None), (
-        "Only one of cu_seqlens_m and cu_seqlens_k can be specified"
-    )
     gather_A = A_idx is not None
     if gather_A:
-        assert varlen, "gather_A requires varlen (cu_seqlens_m or cu_seqlens_k must be specified)"
         assert cluster_N == 1, "gather_A requires cluster_N=1"
     if varlen:
         assert persistent, "varlen requires persistent=True"
     if add_to_output:
-        assert cu_seqlens_m is None, "Add to output not supported with varlen_m"
-    if cu_seqlens_m is not None:
         assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
         assert D.stride(-1) == 1, "varlen_m requires D to be n-major"
-    if cu_seqlens_k is not None:
         assert A.stride(-2) == 1, "varlen_k requires A to be m-major"
         assert B.stride(-2) == 1, "varlen_k requires B to be n-major"
-    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
-        A, B, D, C, cu_seqlens_m=cu_seqlens_m, cu_seqlens_k=cu_seqlens_k, A_idx=A_idx
     )
-    GemmWrapperBase.permute_tensors(
-        tensor_infos, varlen_m=cu_seqlens_m is not None, varlen_k=cu_seqlens_k is not None
     )
-    GemmWrapperBase.extract_dtypes(tensor_infos)
-    major_configs = {
-        "A": ("m", "k", "l"),
-        "B": ("n", "k", "l"),
-        "D": ("m", "n", "l"),
-        "C": ("m", "n", "l"),
-    }
-    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
-    device_capacity = get_device_capacity(A.device)
-    assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
-    GemmCls = GemmDefaultSm100 if device_capacity[0] > 9 else GemmDefaultSm90
-    acc_dtype = Float32
-    tile_shape_mn = (tile_M, tile_N)
-    cluster_shape_mnk = (cluster_M, cluster_N, 1)
-    if not GemmCls.is_valid_dtypes(
-        tensor_infos["A"].dtype,
-        tensor_infos["B"].dtype,
-        acc_dtype,
-        tensor_infos["D"].dtype,
-        tensor_infos["A"].major,
-        tensor_infos["B"].major,
-    ):
-        raise TypeError("Skipping due to unsupported combination of types and majors")
-    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
-    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
-    def scalar_arg(scalar: float | Tensor):
-        if isinstance(scalar, float):
-            return Float32(scalar) if scalar != 1.0 else None
         else:
-            assert isinstance(scalar, Tensor)
-            return make_ptr(Float32, scalar.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)
-    epi_args = GemmCls.EpilogueArguments(
-        scalar_arg(alpha),
-        scalar_arg(beta),
-        mRowVecBroadcast=from_dlpack(rowvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
-            leading_dim=1
-        )
-        if rowvec_bias is not None
-        else None,
-        mColVecBroadcast=from_dlpack(colvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
-            leading_dim=1 if cu_seqlens_m is None else 0
-        )
-        if colvec_bias is not None
-        else None,
-        add_to_output=add_to_output,
     )
-    scheduler_args = GemmWrapperBase.create_scheduler_args(
         max_active_clusters,
         tile_count_semaphore,
         batch_idx_permute,
-        max_swizzle_size,
-    )
-    # Create varlen arguments if needed (assumes persistent=True when varlen)
-    varlen_args = GemmWrapperBase.create_varlen_args(
-        cu_seqlens_m,
-        cu_seqlens_k,
-        A_idx,
-        max_active_clusters,
-        cluster_shape_mnk,
-        tensor_infos,
-        GemmCls.num_epi_tensormaps,
-        pingpong,
     )
-    current_stream = cutlass_torch.current_stream()
-    compile_key = GemmWrapperBase.get_compile_key(
-        tensor_infos,
-        None,  # activation
-        tile_shape_mn,
-        cluster_shape_mnk,
-        pingpong,
-        persistent,
-        tile_count_semaphore is not None,
-        device_capacity,
-        # Technically we don't need to recompile for different max_swizzle_size, but currently
-        # not recompiling will skew the autotuning results due to power throttling.
-        # Effectively we're recompiling as a way to pause between benchmarks during autotuning.
-        max_swizzle_size,
-        rowvec_bias.dtype if rowvec_bias is not None else None,
-        colvec_bias.dtype if colvec_bias is not None else None,
-        2 if isinstance(alpha, Tensor) else (1 if alpha == 1.0 else 0),
-        2 if isinstance(beta, Tensor) else (1 if beta == 1.0 else 0),
-        add_to_output,
-        cu_seqlens_m is not None,
-        cu_seqlens_k is not None,
-        gather_A,
-        batch_idx_permute is not None,
-        key_tensor_names=("A", "B", "D", "C"),
-    )
-    cache = gemm.compile_cache
-    if compile_key not in cache:
-        if device_capacity[0] == 9:
-            GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
-        gemm_obj = GemmCls(
-            acc_dtype,
-            tensor_infos["A"].dtype,
-            tile_shape_mn,
-            cluster_shape_mnk,
-            gather_A=gather_A,
-        )
-        cache[compile_key] = cute.compile(
-            gemm_obj,
-            tensor_infos["A"].cute_tensor,
-            tensor_infos["B"].cute_tensor,
-            tensor_infos["D"].cute_tensor,
-            tensor_infos["C"].cute_tensor,
-            epi_args,
-            scheduler_args,
-            varlen_args,
-            current_stream,
         )
-    cache[compile_key](
-        tensor_infos["A"].cute_tensor,
-        tensor_infos["B"].cute_tensor,
-        tensor_infos["D"].cute_tensor,
-        tensor_infos["C"].cute_tensor,
-        epi_args,
-        scheduler_args,
-        varlen_args,
-        current_stream,
-    )
-gemm.compile_cache = {}

+# Copyright (c) 2025-2026, Tri Dao.
+# GEMM compilation via TVM-FFI with fake tensors and NamedTuple args.
 from typing import Optional
 from torch import Tensor
 import cutlass.cute as cute
+from cutlass import Int32, Float32
+from cutlass.cute.runtime import make_ptr
+from .cache_utils import jit_cache
+from .compile_utils import make_fake_tensor as fake_tensor
+from .cute_dsl_utils import get_device_capacity, get_max_active_clusters, torch2cute_dtype_map
+from .gemm_default_epi import (
+    GemmDefaultEpiMixin,
+    GemmDefaultSm90,
+    GemmDefaultSm100,
+    GemmDefaultSm120,
+)
+from .rounding import RoundingMode
+from .gemm_tvm_ffi_utils import (
+    get_majors,
+    get_dtypes,
+    perm3d,
+    make_scheduler_args,
+    make_varlen_args,
+    make_fake_scheduler_args,
+    make_fake_varlen_args,
+    make_fake_gemm_tensors,
+    compile_gemm_kernel,
+)
+@jit_cache
+def _compile_gemm(
+    a_dtype,
+    b_dtype,
+    d_dtype,
+    c_dtype,
+    a_major,
+    b_major,
+    d_major,
+    c_major,
+    tile_shape_mn,
+    cluster_shape_mnk,
+    pingpong,
+    persistent,
+    is_dynamic_persistent,
+    rowvec_dtype,
+    colvec_dtype,
+    colvec_ndim,
+    alpha_mode,
+    beta_mode,
+    add_to_output,
+    concat_layout,
+    varlen_m,
+    varlen_k,
+    gather_A,
+    use_tma_gather,
+    has_batch_idx_permute,
+    device_capacity,
+    rounding_mode,
+    sr_seed_mode,
+    has_trace_ptr,
+):
+    sm_to_cls = {
+        9: GemmDefaultSm90,
+        10: GemmDefaultSm100,
+        11: GemmDefaultSm100,
+        12: GemmDefaultSm120,
+    }
+    GemmCls = sm_to_cls[device_capacity[0]]
+    mA, mB, mD, mC, m, n, k, l = make_fake_gemm_tensors(
+        a_dtype,
+        b_dtype,
+        d_dtype,
+        c_dtype,
+        a_major,
+        b_major,
+        d_major,
+        c_major,
+        varlen_m=varlen_m,
+        varlen_k=varlen_k,
+        gather_A=gather_A,
+    )
+    def fake_scalar(mode, dtype=Float32):
+        if mode == 0:
+            return None
+        elif mode == 1:
+            return dtype(1.0 if dtype == Float32 else 0)
+        else:
+            return make_ptr(dtype, 0, cute.AddressSpace.gmem, assumed_align=4)
+    mRowVec = fake_tensor(rowvec_dtype, (l, n), leading_dim=1, divisibility=4)
+    if colvec_ndim == 2:
+        mColVec = fake_tensor(colvec_dtype, (l, m), leading_dim=1, divisibility=4)
+    elif colvec_ndim == 1:  # m is total_m in this case
+        mColVec = fake_tensor(colvec_dtype, (m,), leading_dim=0, divisibility=4)
+    else:
+        mColVec = None
+    epi_args = GemmCls.EpilogueArguments(
+        alpha=fake_scalar(alpha_mode),
+        beta=fake_scalar(beta_mode),
+        mRowVecBroadcast=mRowVec,
+        mColVecBroadcast=mColVec,
+        add_to_output=add_to_output,
+        rounding_mode=rounding_mode,
+        sr_seed=fake_scalar(sr_seed_mode, dtype=Int32),
+    )
+    scheduler_args = make_fake_scheduler_args(
+        (is_dynamic_persistent and device_capacity[0] == 9), has_batch_idx_permute, l
+    )
+    aidx_len = m if varlen_m else (k if varlen_k else None)
+    varlen_args = make_fake_varlen_args(varlen_m, varlen_k, gather_A, aidx_len)
+    return compile_gemm_kernel(
+        GemmCls,
+        a_dtype,
+        tile_shape_mn,
+        cluster_shape_mnk,
+        pingpong,
+        persistent,
+        gather_A,
+        is_dynamic_persistent,
+        device_capacity,
+        mA,
+        mB,
+        mD,
+        mC,
+        epi_args,
+        scheduler_args,
+        varlen_args,
+        has_trace_ptr=has_trace_ptr,
+        use_tma_gather=use_tma_gather,
+        concat_layout=concat_layout or None,
+    )
 def gemm(  # Validate the call, obtain the compiled kernel from _compile_gemm, then launch it with the real tensors.
     cluster_N: int,
     pingpong: bool = False,
     persistent: bool = True,
+    is_dynamic_persistent: bool = False,  # on SM90 this additionally requires tile_count_semaphore (asserted below)
     max_swizzle_size: int = 8,
     rowvec_bias: Optional[Tensor] = None,  # (l, n)
     colvec_bias: Optional[Tensor] = None,  # (l, m), or (total_m,) if varlen_m
     A_idx: Optional[Tensor] = None,  # (total_m,) or (total_k,) indices for gather_A when varlen
     batch_idx_permute: Optional[Tensor] = None,  # (l,) permutation of batch indices for scheduler
     add_to_output: bool = False,
+    rounding_mode: int = RoundingMode.RN,  # RoundingMode.RS is only accepted on SM100 (asserted below)
+    sr_seed: int | Tensor = 0,  # a Tensor is passed by data_ptr; a plain int is only forwarded when rounding_mode is RS
+    use_tma_gather: bool = False,  # only accepted on SM100/SM110 (asserted below)
+    concat_layout: dict | None = None,
+    trace_ptr=None,  # Optional Int64 from TraceSession.ptr
 ) -> None:
+    varlen_m = cu_seqlens_m is not None
+    varlen_k = cu_seqlens_k is not None
+    varlen = varlen_m or varlen_k
     gather_A = A_idx is not None
+    assert not (varlen_m and varlen_k), "Only one of cu_seqlens_m and cu_seqlens_k"
     if gather_A:
+        assert varlen, "gather_A requires varlen"
         assert cluster_N == 1, "gather_A requires cluster_N=1"
     if varlen:
         assert persistent, "varlen requires persistent=True"
     if add_to_output:
+        assert not varlen_m, "Add to output not supported with varlen_m"
+    if varlen_m:
         assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
         assert D.stride(-1) == 1, "varlen_m requires D to be n-major"
+    if varlen_k:
         assert A.stride(-2) == 1, "varlen_k requires A to be m-major"
         assert B.stride(-2) == 1, "varlen_k requires B to be n-major"
+    device_capacity = get_device_capacity(A.device)  # indexed with [0] below as the major SM version
+    assert device_capacity[0] in [9, 10, 11, 12], "Only SM90, SM100, SM110, and SM120 are supported"
+    if use_tma_gather:
+        assert device_capacity[0] in [10, 11], "TMA gather currently requires SM100/SM110"
+    if rounding_mode == RoundingMode.RS:
+        assert device_capacity[0] == 10, "Stochastic rounding (RoundingMode.RS) requires SM100"
+    if is_dynamic_persistent and device_capacity[0] == 9:
+        assert tile_count_semaphore is not None, (
+            "Dynamic persistent tile scheduler in SM90 requires a semaphore in GMEM"
+        )
+    A_p, B_p, D_p, C_p = perm3d(A, B, D, C, varlen_m=varlen_m, varlen_k=varlen_k)  # the *_p tensors are what the kernel is launched with
+    a_major, b_major, d_major, c_major = get_majors(A_p, B_p, D_p, C_p)  # majors are read from the permuted tensors
+    a_dtype, b_dtype, d_dtype, c_dtype = get_dtypes(A, B, D, C)
+    alpha_mode = 2 if isinstance(alpha, Tensor) else (1 if alpha != 1.0 else 0)  # 0: alpha == 1.0 (omitted), 1: scalar by value, 2: Tensor by data_ptr
+    beta_mode = 2 if isinstance(beta, Tensor) else (1 if beta != 1.0 else 0)  # same encoding as alpha_mode
+    colvec_ndim = colvec_bias.ndim if colvec_bias is not None else 0  # 0 means no column-vector bias
+    concat_layout = tuple(sorted(concat_layout)) if concat_layout else ()  # NOTE(review): sorted() over a dict keeps only its keys - confirm the values are not needed
+    sr_seed_mode = (  # 0: no seed, 1: seed by value (RS rounding), 2: Tensor by data_ptr
+        2 if isinstance(sr_seed, Tensor) else (1 if rounding_mode == RoundingMode.RS else 0)
     )
+    compiled_fn = _compile_gemm(  # arguments are positional and must follow _compile_gemm's parameter order
+        a_dtype,
+        b_dtype,
+        d_dtype,
+        c_dtype,
+        a_major,
+        b_major,
+        d_major,
+        c_major,
+        (tile_M, tile_N),
+        (cluster_M, cluster_N, 1),
+        pingpong,
+        persistent,
+        is_dynamic_persistent,
+        torch2cute_dtype_map[rowvec_bias.dtype] if rowvec_bias is not None else None,
+        torch2cute_dtype_map[colvec_bias.dtype] if colvec_bias is not None else None,
+        colvec_ndim,
+        alpha_mode,
+        beta_mode,
+        add_to_output,
+        concat_layout,
+        varlen_m,
+        varlen_k,
+        gather_A,
+        use_tma_gather,
+        batch_idx_permute is not None,
+        device_capacity,
+        rounding_mode,
+        sr_seed_mode,
+        trace_ptr is not None,
     )
+    from .cache_utils import COMPILE_ONLY  # imported inside the function, so the flag's current value is read on every call
+    if COMPILE_ONLY:
+        return  # the kernel has been compiled above; skip the launch
+    def scalar_arg(scalar, mode, dtype=Float32):  # turn a runtime scalar into the form selected by its *_mode value
+        if mode == 0:
+            return None
+        elif mode == 1:
+            return dtype(scalar)
         else:
+            return scalar.data_ptr()  # mode 2 is only produced above for Tensor inputs
+    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+    epi_args = GemmDefaultEpiMixin.EpilogueArguments(
+        alpha=scalar_arg(alpha, alpha_mode),
+        beta=scalar_arg(beta, beta_mode),
+        mRowVecBroadcast=rowvec_bias,
+        mColVecBroadcast=colvec_bias,
+        add_to_output=None,  # passed as None at launch; the real value was given to _compile_gemm
+        rounding_mode=None,  # passed as None at launch; the real value was given to _compile_gemm
+        sr_seed=scalar_arg(sr_seed, sr_seed_mode, dtype=Int32),
     )
+    scheduler_args = make_scheduler_args(
         max_active_clusters,
+        max_swizzle_size,
         tile_count_semaphore,
         batch_idx_permute,
     )
+    varlen_args = make_varlen_args(cu_seqlens_m, cu_seqlens_k, A_idx)
+    if device_capacity[0] in [10, 11]:  # on this path two extra None arguments precede trace_ptr
+        compiled_fn(
+            A_p, B_p, D_p, C_p, epi_args, scheduler_args, varlen_args, None, None, trace_ptr
         )
+    else:
+        compiled_fn(A_p, B_p, D_p, C_p, epi_args, scheduler_args, varlen_args, trace_ptr)

build/torch-cuda/quack/gemm_act.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # Copyright (c) 2025, Wentao Guo, Tri Dao.
-from typing import Tuple, Optional, Callable
 from functools import partial
-from dataclasses import dataclass
 from torch import Tensor
@@ -9,183 +9,85 @@ import cutlass
 import cutlass.cute as cute
 import cutlass.utils.hopper_helpers as sm90_utils_og
 import cutlass.utils.blackwell_helpers as sm100_utils
-from cutlass import Int32, Float32, Boolean, const_expr
-from cutlass.cutlass_dsl import if_generate
-import cutlass.torch as cutlass_torch
-from cutlass.cute.runtime import from_dlpack
-from .cute_dsl_utils import ArgumentsBase, ParamsBase
-from .varlen_utils import VarlenManager
 from .gemm_sm90 import GemmSm90
 from .gemm_sm100 import GemmSm100
 from .gemm_default_epi import GemmDefaultEpiMixin
-from .cute_dsl_utils import get_device_capacity, get_max_active_clusters
-from .gemm_wrapper_utils import GemmWrapperBase
-from . import sm90_utils as sm90_utils
-from . import copy_utils as copy_utils
-from . import activation
 class GemmActMixin(GemmDefaultEpiMixin):
-    num_epi_tensormaps: int = 1
-    @dataclass
-    class EpilogueArguments(ArgumentsBase):
         mPostAct: cute.Tensor
         act_fn: cutlass.Constexpr[Optional[Callable]] = None
         alpha: Optional[Float32 | cute.Tensor] = None
         beta: Optional[Float32 | cute.Tensor] = None
         mRowVecBroadcast: Optional[cute.Tensor] = None
         mColVecBroadcast: Optional[cute.Tensor] = None
-    @dataclass
-    class EpilogueParams(ParamsBase):
-        tma_atom_postact: cute.CopyAtom
-        mPostAct_mnl: cute.Tensor
-        epi_postact_smem_layout_staged: cute.ComposedLayout
-        epi_tile_postact: cute.Tile
-        act_fn: cutlass.Constexpr[Optional[Callable]] = None
-        alpha: Optional[Float32 | cute.Tensor] = None
-        beta: Optional[Float32 | cute.Tensor] = None
-        mRowVecBroadcast: Optional[cute.Tensor] = None
-        mColVecBroadcast: Optional[cute.Tensor] = None
-    def epi_to_underlying_arguments(
-        self, args: EpilogueArguments, *, loc=None, ip=None
-    ) -> EpilogueParams:
         self.postact_dtype = args.mPostAct.element_type
         self.postact_layout = cutlass.utils.LayoutEnum.from_tensor(args.mPostAct)
         self.cta_tile_shape_postact_mn = self.cta_tile_shape_mnk[:2]
-        epi_tile_postact = self.epi_tile
-        utils_cls = sm100_utils if self.arch == 100 else sm90_utils
-        epi_postact_smem_layout_staged = utils_cls.make_smem_layout_epi(
-            self.postact_dtype, self.postact_layout, epi_tile_postact, self.epi_stage
-        )
-        tma_atom_postact, tma_tensor_postact = self._make_tma_epi_atoms_and_tensors(
-            args.mPostAct,
-            epi_postact_smem_layout_staged,
-            epi_tile_postact,
-            op_type="store",
-        )
-        # Assume all strides are divisible by 32 bits except the last stride
-        new_stride = lambda t: tuple(
-            cute.assume(s, divby=32 // t.element_type.width) if not cute.is_static(s) else s
-            for s in t.stride
-        )
-        mRowVecBroadcast, mColVecBroadcast = [
-            cute.make_tensor(t.iterator, cute.make_layout(t.shape, stride=new_stride(t)))
-            if t is not None
-            else None
-            for t in (args.mRowVecBroadcast, args.mColVecBroadcast)
-        ]
-        return self.EpilogueParams(
-            tma_atom_postact,
-            tma_tensor_postact,
-            epi_postact_smem_layout_staged,
-            epi_tile_postact,
-            args.act_fn,
-            alpha=args.alpha,
-            beta=args.beta,
-            mRowVecBroadcast=mRowVecBroadcast,
-            mColVecBroadcast=mColVecBroadcast,
-        )
-    def epi_get_tma_atoms(
-        self, params: EpilogueParams, *, loc=None, ip=None
-    ) -> list[cute.CopyAtom]:
-        return [params.tma_atom_postact]
-    def epi_get_tensormap_update_shapes_orders(
         self,
-        params: EpilogueParams,
-        cu_seqlens_m: Optional[cute.Tensor],
-        batch_idx: Int32,
-        *,
-        loc=None,
-        ip=None,
-    ) -> tuple[list[Int32], list[int]]:
-        shapes = [cu_seqlens_m[batch_idx + 1] if cu_seqlens_m is not None else None]
-        orders = [0 if const_expr(self.postact_layout.is_m_major_c()) else 1]
-        return shapes, orders
-    @staticmethod
-    def epi_smem_bytes_per_stage(
-        args: EpilogueArguments, cta_tile_shape_mnk: Tuple[int, int, int], epi_tile: cute.Tile
-    ) -> int:
-        postact_dtype = args.mPostAct.element_type
-        postact_bytes_per_stage = cute.size(cute.shape(epi_tile)) * (postact_dtype.width // 8)
-        rowvec_colvec_bytes = GemmDefaultEpiMixin.epi_smem_bytes_per_stage(
-            args, cta_tile_shape_mnk, epi_tile
-        )
-        return postact_bytes_per_stage + rowvec_colvec_bytes
-    def epi_get_smem_struct(self, params: EpilogueParams):
-        row_vec_smem_size = 0 if params.mRowVecBroadcast is None else self.cta_tile_shape_mnk[1]
-        col_vec_smem_size = 0 if params.mColVecBroadcast is None else self.cta_tile_shape_mnk[0]
-        row_vec_dtype = (
-            params.mRowVecBroadcast.element_type if params.mRowVecBroadcast is not None else Float32
-        )
-        col_vec_dtype = (
-            params.mColVecBroadcast.element_type if params.mColVecBroadcast is not None else Float32
-        )
-        @cute.struct
-        class EpiSharedStorage:
-            sRowVec: cute.struct.Align[cute.struct.MemRange[row_vec_dtype, row_vec_smem_size], 16]
-            sColVec: cute.struct.Align[cute.struct.MemRange[col_vec_dtype, col_vec_smem_size], 16]
-            sPostAct: cute.struct.Align[
-                cute.struct.MemRange[
-                    self.postact_dtype, cute.cosize(params.epi_postact_smem_layout_staged)
-                ],
-                self.buffer_align_bytes,
-            ]
-        return EpiSharedStorage
-    def epi_get_smem_tensors(self, params: EpilogueParams, storage) -> Tuple[cute.Tensor, ...]:
-        sRowVec, sColVec = super().epi_get_smem_tensors(params, storage)
-        sPostAct = storage.epi.sPostAct.get_tensor(
-            params.epi_postact_smem_layout_staged.outer,
-            swizzle=params.epi_postact_smem_layout_staged.inner,
-        )
-        return (sRowVec, sColVec, sPostAct)
-    @cute.jit
-    def epilogue(
-        self,
-        params: EpilogueParams,
-        epi_smem_tensors: Tuple[cute.Tensor, ...],
-        tma_desc_epi_ptrs: list[Optional[cute.Pointer]],
-        epi_pipeline: cutlass.pipeline.PipelineAsync,
-        epi_store_pipeline: cutlass.pipeline.PipelineAsync,
-        epi_read_state: cutlass.pipeline.PipelineState,
-        epi_producer_state: cutlass.pipeline.PipelineState,
-        epi_tile: cute.Tile,
-        load_acc_subtile: Callable,
-        tRS_rD: cute.Tensor,
-        tRS_rC: Optional[cute.Tensor],
-        tiled_copy_t2r: Optional[cute.TiledCopy],  # Only for Sm100
-        tiled_copy_r2s: cute.TiledCopy,
-        tRS_sD: cute.Tensor,
-        tiled_copy_s2r: Optional[cute.TiledCopy],
-        tSR_rC: Optional[cute.Tensor],
-        tSR_sC: Optional[cute.Tensor],
-        copy_D: Optional[Callable],
-        copy_C: Optional[Callable],
-        tile_coord_mnkl: cute.Coord,
-        varlen_manager: VarlenManager,
-        epilogue_barrier: cutlass.pipeline.NamedBarrier,
-        tile_scheduler,
-        tidx: Int32,
-        is_tma_warp: Boolean,
-    ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
-        has_C = const_expr(tRS_rC is not None)
-        has_D = const_expr(copy_D is not None)
-        tma_atom_postact = params.tma_atom_postact
-        mPostAct_mnl = params.mPostAct_mnl
-        sRowVec, sColVec, sPostAct = epi_smem_tensors
         get_smem_store_op = (
             partial(sm100_utils.get_smem_store_op, tiled_tmem_load=tiled_copy_t2r)
             if self.arch == 100
@@ -194,131 +96,56 @@ class GemmActMixin(GemmDefaultEpiMixin):
         copy_atom_postact_r2s = get_smem_store_op(
             self.postact_layout, self.postact_dtype, self.acc_dtype
         )
-        # tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
-        # tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_C_atom)
         tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_r2s)
         tRS_sPostAct = tiled_copy_postact_r2s.get_slice(tidx).partition_D(sPostAct)
-        (tma_desc_postact_ptr,) = tma_desc_epi_ptrs
         batch_idx = tile_coord_mnkl[3]
         copy_postact, _, _ = self.epilog_gmem_copy_and_partition(
-            tma_atom_postact,
-            varlen_manager.offset_batch_epi(mPostAct_mnl, batch_idx),
             self.cta_tile_shape_postact_mn,
-            params.epi_tile_postact,
             sPostAct,
             tile_coord_mnkl,
-            tma_desc_ptr=tma_desc_postact_ptr,
-        )
-        # We iterate over epi tiles in the N dimension first before the M dimension
-        epi_tile_shape = cute.zipped_divide(
-            cute.make_layout(self.cta_tile_shape_mnk[:2]), epi_tile
-        ).shape[1]
-        epi_tile_layout = cute.make_layout(epi_tile_shape, stride=(epi_tile_shape[1], 1))
-        epi_tile_num = cute.size(epi_tile_shape)
-        num_prev_subtiles = tile_scheduler.num_tiles_executed * epi_tile_num
-        epi_tensors = self.epi_begin(
-            params,
-            epi_smem_tensors,
-            epi_tile,
-            tiled_copy_t2r,
-            tiled_copy_r2s,
-            tile_coord_mnkl,
-            varlen_manager,
-            epilogue_barrier,
-            tidx,
         )
-        if const_expr(copy_C is not None):
-            for epi_idx in cutlass.range(min(epi_tile_num, self.epi_c_stage), unroll=1):
-                gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx)
-                if is_tma_warp:
-                    epi_pipeline.producer_acquire(epi_producer_state)
-                    copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
-                    epi_pipeline.producer_commit(epi_producer_state)
-                epi_producer_state.advance()
-        def tma_store_fn(src_idx, dst_idx):
-            # Fence and barrier to make sure shared memory store is visible to TMA store
-            cute.arch.fence_proxy(
-                cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-            )
-            epilogue_barrier.arrive_and_wait()
-            # Copy from shared memory to global memory
-            if is_tma_warp:
-                if const_expr(has_D):
-                    copy_D(src_idx=src_idx, dst_idx=dst_idx)
-                copy_postact(src_idx=src_idx, dst_idx=dst_idx)
-            # Can't use if statement here, epi_store_pipeline object isn't captured somehow
-            if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_commit())
-            if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_acquire())
-            epilogue_barrier.arrive_and_wait()
-        delay_tma_store = True
-        src_idx_prev, dst_idx_prev = None, None
-        for epi_idx in cutlass.range_constexpr(epi_tile_num):
-            # The global memory coordinate for the current epi tile
-            gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
-            # Copy from acc to D registers
-            load_acc_subtile(tRS_rD, epi_idx)
-            epi_loop_tensors = self.epi_begin_loop(params, epi_tensors, gmem_coord)
-            if const_expr(has_C):
-                epi_pipeline.consumer_wait(epi_read_state)
-                cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
-                # Fence to make sure shared memory read is visible to TMA load
-                cute.arch.fence_proxy(
-                    cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
                 )
-                cute.arch.sync_warp()
-                with cute.arch.elect_one():
-                    epi_pipeline.consumer_release(epi_read_state)
-                epi_read_state.advance()
-            if const_expr(copy_C is not None and epi_idx + self.epi_c_stage < epi_tile_num):
-                gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx + self.epi_c_stage)
-                if is_tma_warp:
-                    epi_pipeline.producer_acquire(epi_producer_state)
-                    copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
-                    epi_pipeline.producer_commit(epi_producer_state)
-                epi_producer_state.advance()
-            tRS_rPostAct = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
-            epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
-            if const_expr(delay_tma_store):
-                if const_expr(epi_idx > 0):
-                    tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)
-                src_idx_prev, dst_idx_prev = epi_buffer, gmem_coord
-            # Copy from D registers to shared memory
-            if const_expr(has_D):
-                copy_utils.cvt_copy(tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer])
-            cute.copy(
-                tiled_copy_postact_r2s,
-                tiled_copy_postact_r2s.retile(tRS_rPostAct),
-                tRS_sPostAct[None, None, None, epi_buffer],
             )
-            if const_expr(not delay_tma_store):
-                tma_store_fn(src_idx=epi_buffer, dst_idx=gmem_coord)
-        if const_expr(delay_tma_store):
-            tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)
-        self.epi_end(
-            params,
-            epi_tensors,
-            epi_tile,
-            tiled_copy_t2r,
-            tiled_copy_r2s,
-            tile_coord_mnkl,
-            varlen_manager,
-            tidx,
-        )
-        return epi_read_state, epi_producer_state
     @cute.jit
     def epi_visit_subtile(
         self,
-        params: EpilogueParams,
         epi_loop_tensors: Tuple[cute.Tensor, ...],
         tRS_rD: cute.Tensor,
         tRS_rC: Optional[cute.Tensor] = None,
@@ -327,7 +154,7 @@ class GemmActMixin(GemmDefaultEpiMixin):
         # Apply activation function if provided
         # If we don't have .shape here, the compiler generates local stores and loads
         if const_expr(params.act_fn is not None):
-            tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
             if const_expr(self.arch < 100):
                 for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
                     tRS_rPostAct[i] = params.act_fn(tRS_rD[i])
@@ -338,10 +165,7 @@ class GemmActMixin(GemmDefaultEpiMixin):
                     )
         else:
             tRS_rPostAct = tRS_rD
-        # Type conversion
-        tRS_rPostAct_out = cute.make_fragment_like(tRS_rPostAct, self.postact_dtype)
-        tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
-        return tRS_rPostAct_out
 class GemmActSm90(GemmActMixin, GemmSm90):
@@ -352,12 +176,202 @@ class GemmActSm100(GemmActMixin, GemmSm100):
     pass
-act_fn_map = {
-    None: None,
-    "relu": activation.relu,
-    "relu_sq": activation.relu_sq,
-    "gelu_tanh_approx": activation.gelu_tanh_approx,
-}
 def gemm_act(
@@ -365,7 +379,7 @@ def gemm_act(
     B: Tensor,  # (l, n, k)
     D: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
     C: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
-    PostAct: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
     tile_count_semaphore: Optional[Tensor],  # (1,)
     activation: Optional[str],
     tile_M: int,
@@ -374,137 +388,132 @@ def gemm_act(
     cluster_N: int,
     pingpong: bool = False,
     persistent: bool = True,
     max_swizzle_size: int = 8,
     rowvec_bias: Optional[Tensor] = None,  # (l, n)
     colvec_bias: Optional[Tensor] = None,  # (l, m), or (total_m,) if varlen_m
     cu_seqlens_m: Optional[Tensor] = None,  # (l+1,) cumulative sum of m values for variable length
     A_idx: Optional[Tensor] = None,  # (total_m,) if gather_A with varlen_m
 ) -> None:
-    if cu_seqlens_m is not None:
         assert persistent, "varlen_m requires persistent=True"
         assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
         if D is not None:
             assert D.stride(-1) == 1, "varlen_m requires D to be n-major"
         assert PostAct.stride(-1) == 1, "varlen_m requires PostAct to be n-major"
-    gather_A = A_idx is not None
     if gather_A:
-        assert cu_seqlens_m is not None, "gather_A requires varlen (cu_seqlens_m must be specified)"
         assert cluster_N == 1, "gather_A requires cluster_N=1"
-    assert activation in act_fn_map, f"Unsupported activation {activation}"
-    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
-        A, B, D, C, additional_tensors={"PostAct": PostAct}, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx
-    )
-    GemmWrapperBase.permute_tensors(tensor_infos, varlen_m=cu_seqlens_m is not None)
-    GemmWrapperBase.extract_dtypes(tensor_infos)
-    major_configs = {
-        "A": ("m", "k", "l"),
-        "B": ("n", "k", "l"),
-        "D": ("m", "n", "l"),
-        "C": ("m", "n", "l"),
-        "PostAct": ("m", "n", "l"),
-    }
-    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
     device_capacity = get_device_capacity(A.device)
-    assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
-    GemmCls = GemmActSm100 if device_capacity[0] > 9 else GemmActSm90
-    acc_dtype = Float32
-    tile_shape_mn = (tile_M, tile_N)
-    cluster_shape_mnk = (cluster_M, cluster_N, 1)
-    if not GemmCls.is_valid_dtypes(
-        tensor_infos["A"].dtype,
-        tensor_infos["B"].dtype,
-        acc_dtype,
-        tensor_infos["D"].dtype,
-        tensor_infos["A"].major,
-        tensor_infos["B"].major,
-    ):
-        raise TypeError("Skipping due to unsupported combination of types and majors")
-    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
-    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
-    act_fn = act_fn_map[activation]
-    epi_args = GemmCls.EpilogueArguments(
-        tensor_infos["PostAct"].cute_tensor,
-        act_fn,
-        mRowVecBroadcast=from_dlpack(rowvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
-            leading_dim=1
-        )
-        if rowvec_bias is not None
-        else None,
-        mColVecBroadcast=from_dlpack(colvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
-            leading_dim=1 if cu_seqlens_m is None else 0
         )
-        if colvec_bias is not None
-        else None,
-    )
-    scheduler_args = GemmWrapperBase.create_scheduler_args(
-        max_active_clusters, tile_count_semaphore, max_swizzle_size=max_swizzle_size
-    )
-    # Create varlen arguments if needed (assumes persistent=True when varlen_m)
-    varlen_args = GemmWrapperBase.create_varlen_args(
-        cu_seqlens_m,
-        None,  # cu_seqlens_k
-        A_idx,
-        max_active_clusters,
-        cluster_shape_mnk,
-        tensor_infos,
-        GemmCls.num_epi_tensormaps,
-        pingpong,
     )
-    current_stream = cutlass_torch.current_stream()
-    compile_key = GemmWrapperBase.get_compile_key(
-        tensor_infos,
-        activation,
-        tile_shape_mn,
-        cluster_shape_mnk,
         pingpong,
         persistent,
-        tile_count_semaphore is not None,
         device_capacity,
-        max_swizzle_size,
-        rowvec_bias.dtype if rowvec_bias is not None else None,
-        colvec_bias.dtype if colvec_bias is not None else None,
-        cu_seqlens_m is not None,
-        A_idx is not None,
-        key_tensor_names=("A", "B", "D", "PostAct", "C"),
     )
-    cache = gemm_act.compile_cache
-    if compile_key not in cache:
-        if device_capacity[0] == 9:
-            GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
-        gemm_obj = GemmCls(
-            acc_dtype,
-            tensor_infos["A"].dtype,
-            tile_shape_mn,
-            cluster_shape_mnk,
-            gather_A=gather_A,
-        )
-        cache[compile_key] = cute.compile(
-            gemm_obj,
-            tensor_infos["A"].cute_tensor,
-            tensor_infos["B"].cute_tensor,
-            tensor_infos["D"].cute_tensor,
-            tensor_infos["C"].cute_tensor,
-            epi_args,
-            scheduler_args,
-            varlen_args,
-            current_stream,
-        )
-    cache[compile_key](
-        tensor_infos["A"].cute_tensor,
-        tensor_infos["B"].cute_tensor,
-        tensor_infos["D"].cute_tensor,
-        tensor_infos["C"].cute_tensor,
-        epi_args,
-        scheduler_args,
-        varlen_args,
-        current_stream,
     )
-gemm_act.compile_cache = {}

 # Copyright (c) 2025, Wentao Guo, Tri Dao.
+from __future__ import annotations
+from typing import NamedTuple, Tuple, Optional, Callable
 from functools import partial
 from torch import Tensor
 import cutlass.cute as cute
 import cutlass.utils.hopper_helpers as sm90_utils_og
 import cutlass.utils.blackwell_helpers as sm100_utils
+from cutlass import Int32, Float32, const_expr
+from cutlass.cute.runtime import make_ptr
+from .compile_utils import make_fake_tensor as fake_tensor
+from .cute_dsl_utils import (
+    ParamsBase,
+    mlir_namedtuple,
+    get_device_capacity,
+    get_max_active_clusters,
+    torch2cute_dtype_map,
+)
+from .epi_ops import TileStore
 from .gemm_sm90 import GemmSm90
 from .gemm_sm100 import GemmSm100
+from .gemm_sm120 import GemmSm120
 from .gemm_default_epi import GemmDefaultEpiMixin
+from .gemm_tvm_ffi_utils import (
+    get_major,
+    perm3d_single,
+    make_scheduler_args,
+    make_varlen_args,
+    make_fake_scheduler_args,
+    make_fake_varlen_args,
+    div_for_dtype,
+    make_fake_gemm_tensors,
+    compile_gemm_kernel,
+)
+from .cache_utils import jit_cache
+from . import layout_utils as layout_utils
+from .layout_utils import permute_gated_Cregs_b16
+from .activation import act_fn_map, gate_fn_map
+from .rounding import RoundingMode
 class GemmActMixin(GemmDefaultEpiMixin):
+    _epi_ops = (*GemmDefaultEpiMixin._epi_ops, TileStore("mPostAct"))
+    _extra_param_fields = (("act_fn", cutlass.Constexpr, None),)
+    _epi_param_bases = (ParamsBase,)
+    @mlir_namedtuple
+    class EpilogueArguments(NamedTuple):
         mPostAct: cute.Tensor
         act_fn: cutlass.Constexpr[Optional[Callable]] = None
         alpha: Optional[Float32 | cute.Tensor] = None
         beta: Optional[Float32 | cute.Tensor] = None
         mRowVecBroadcast: Optional[cute.Tensor] = None
         mColVecBroadcast: Optional[cute.Tensor] = None
+        rounding_mode: cutlass.Constexpr[int] = RoundingMode.RN
+        sr_seed: Optional[Int32 | cute.Tensor] = None
+    # EpilogueParams auto-generated from _epi_ops + _extra_param_fields
+    def epi_to_underlying_arguments(self, args: EpilogueArguments, *, loc=None, ip=None):
+        self.rounding_mode = args.rounding_mode
         self.postact_dtype = args.mPostAct.element_type
         self.postact_layout = cutlass.utils.LayoutEnum.from_tensor(args.mPostAct)
         self.cta_tile_shape_postact_mn = self.cta_tile_shape_mnk[:2]
+        d = self._epi_ops_to_params_dict(args)
+        d["act_fn"] = args.act_fn
+        for key in ("mRowVecBroadcast", "mColVecBroadcast"):
+            if key in self.concat_layout and key in d and d[key] is not None:
+                d[key] = layout_utils.concat_to_interleave(d[key], 1)
+        return self.EpilogueParams(**d)
+    # epi_get_tma_atoms, epi_smem_bytes_per_stage, epi_get_smem_struct,
+    # epi_get_smem_tensors are all inherited from ComposableEpiMixin via _epi_ops.
+    def epi_setup_postact(
         self,
+        params,
+        epi_smem_tensors,
+        tiled_copy_r2s,
+        tiled_copy_t2r,
+        tile_coord_mnkl,
+        varlen_manager,
+        tidx,
+    ):
+        """Setup postact TMA copies and partitions before the epilogue loop."""
+        sPostAct = epi_smem_tensors[self._epi_smem_map["mPostAct"]]
         get_smem_store_op = (
             partial(sm100_utils.get_smem_store_op, tiled_tmem_load=tiled_copy_t2r)
             if self.arch == 100
         copy_atom_postact_r2s = get_smem_store_op(
             self.postact_layout, self.postact_dtype, self.acc_dtype
         )
         tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_r2s)
         tRS_sPostAct = tiled_copy_postact_r2s.get_slice(tidx).partition_D(sPostAct)
         batch_idx = tile_coord_mnkl[3]
         copy_postact, _, _ = self.epilog_gmem_copy_and_partition(
+            params.tma_atom_mPostAct,
+            varlen_manager.offset_batch_epi(params.mPostAct, batch_idx),
             self.cta_tile_shape_postact_mn,
+            params.epi_tile_mPostAct,
             sPostAct,
             tile_coord_mnkl,
         )
+        return tiled_copy_postact_r2s, tRS_sPostAct, copy_postact
+    @cute.jit
+    def epi_convert_postact(
+        self, tRS_rPostAct, sr_seed, tidx, tile_coord_mnkl, num_prev_subtiles, epi_idx
+    ):
+        """Convert postact from acc_dtype to postact_dtype. Override for custom postprocessing."""
+        if const_expr(
+            self.rounding_mode == RoundingMode.RS
+            and tRS_rPostAct.element_type == cutlass.Float32
+            and self.postact_dtype == cutlass.BFloat16
+        ):
+            from .rounding import convert_f32_to_bf16_sr
+            from cutlass.cute.tensor import TensorSSA
+            # Salt with 0x9E3779B1 to avoid sharing entropy with the D output seed
+            seed = (
+                sr_seed
+                + 0x9E3779B1
+                + (
+                    tile_coord_mnkl[0] * 65537
+                    + tile_coord_mnkl[1] * 257
+                    + tile_coord_mnkl[3] * 17
+                    + (num_prev_subtiles + epi_idx) * 7
                 )
             )
+            tRS_rPostAct_out = cute.make_rmem_tensor_like(tRS_rPostAct, self.postact_dtype)
+            src_vec = tRS_rPostAct.load()
+            raw_vec = convert_f32_to_bf16_sr(src_vec, seed, tidx)
+            tRS_rPostAct_out.store(TensorSSA(raw_vec, src_vec.shape, self.postact_dtype))
+        else:
+            tRS_rPostAct_out = cute.make_rmem_tensor_like(tRS_rPostAct, self.postact_dtype)
+            tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
+        return tRS_rPostAct_out
     @cute.jit
     def epi_visit_subtile(
         self,
+        params,
         epi_loop_tensors: Tuple[cute.Tensor, ...],
         tRS_rD: cute.Tensor,
         tRS_rC: Optional[cute.Tensor] = None,
         # Apply activation function if provided
         # If we don't have .shape here, the compiler generates local stores and loads
         if const_expr(params.act_fn is not None):
+            tRS_rPostAct = cute.make_rmem_tensor(tRS_rD.layout.shape, self.acc_dtype)
             if const_expr(self.arch < 100):
                 for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
                     tRS_rPostAct[i] = params.act_fn(tRS_rD[i])
                     )
         else:
             tRS_rPostAct = tRS_rD
+        return tRS_rPostAct
 class GemmActSm90(GemmActMixin, GemmSm90):
     pass
class GemmActSm120(GemmActMixin, GemmSm120):
    """GEMM class combining the ``GemmActMixin`` epilogue with ``GemmSm120``."""
def _gated_epi_tile_fn(gemm, epi_tile):
    """Return the epilogue tile for the gated post-activation output.

    The M entry of ``epi_tile`` is passed through unchanged.  The N entry is
    halved: with ``cute.recast_layout(2, 1, ...)`` when it is a
    ``cute.Layout``, with integer floor division otherwise.  ``gemm`` is
    accepted but not read here.
    """
    tile_m = epi_tile[0]
    tile_n = epi_tile[1]
    if not isinstance(tile_n, cute.Layout):
        return (tile_m, tile_n // 2)
    return (tile_m, cute.recast_layout(2, 1, tile_n))
class GemmGatedMixin(GemmActMixin):
    """Epilogue mixin for gated activations.

    Pairs of accumulator values are combined by ``params.act_fn`` into a single
    post-activation value, so the PostAct tile is half as wide along N as the
    CTA tile.
    """

    # Default epilogue ops plus a PostAct store whose epilogue tile is halved
    # along N by _gated_epi_tile_fn.
    _epi_ops = (
        *GemmDefaultEpiMixin._epi_ops,
        TileStore("mPostAct", epi_tile_fn=_gated_epi_tile_fn),
    )

    def epi_to_underlying_arguments(
        self, args: GemmActMixin.EpilogueArguments, *, loc=None, ip=None
    ) -> GemmActMixin.EpilogueParams:
        """Validate the gated configuration and build the epilogue params.

        Requires a 16-bit PostAct element type, an n-major PostAct (and D, when
        a D layout is set), and on arch 90 a tile N divisible by 32.  Records
        rounding mode, PostAct dtype/layout and the halved PostAct tile shape
        on ``self`` before assembling ``EpilogueParams``.
        """
        post_act = args.mPostAct
        assert post_act.element_type.width == 16, (
            "GemmGated only supports 16bit postact for now"
        )
        assert self.d_layout is None or self.d_layout.is_n_major_c()
        postact_layout = cutlass.utils.LayoutEnum.from_tensor(post_act)
        assert postact_layout.is_n_major_c()
        tile_m, tile_n = self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[1]
        if self.arch == 90:
            assert tile_n % 32 == 0, (
                "GemmGatedSm90 requires tileN to be divisible by 32"
            )

        self.rounding_mode = args.rounding_mode
        self.postact_dtype = post_act.element_type
        self.postact_layout = postact_layout
        # PostAct covers only half of the CTA tile along N.
        self.cta_tile_shape_postact_mn = (tile_m, tile_n // 2)

        fields = self._epi_ops_to_params_dict(args)
        fields["act_fn"] = args.act_fn
        for name in ("mRowVecBroadcast", "mColVecBroadcast"):
            if name not in self.concat_layout or name not in fields:
                continue
            if fields[name] is not None:
                fields[name] = layout_utils.concat_to_interleave(fields[name], 1)
        return self.EpilogueParams(**fields)

    @cute.jit
    def epi_visit_subtile(
        self,
        params: GemmActMixin.EpilogueParams,
        epi_loop_tensors: Tuple[cute.Tensor, ...],
        tRS_rD: cute.Tensor,
        tRS_rC: Optional[cute.Tensor] = None,
    ) -> Optional[cute.Tensor]:
        """Run the default epilogue visit on ``tRS_rD``, then apply the gate function.

        Returns a register tensor in ``self.acc_dtype`` holding half as many
        elements as ``tRS_rD``.
        """
        GemmDefaultEpiMixin.epi_visit_subtile(self, params, epi_loop_tensors, tRS_rD, tRS_rC)
        gated_layout = cute.recast_layout(2, 1, tRS_rD.layout)
        # If we don't have .shape here, the compiler generates local stores and loads
        tRS_rGated = cute.make_rmem_tensor(gated_layout.shape, self.acc_dtype)
        if const_expr(self.arch < 100):
            # One output per adjacent input pair (2j, 2j + 1).
            for j in cutlass.range(cute.size(tRS_rGated), unroll_full=True):
                tRS_rGated[j] = params.act_fn(tRS_rD[2 * j], tRS_rD[2 * j + 1])
        else:
            # Two outputs per step: act_fn receives two 2-tuples built from four
            # consecutive inputs (even offsets first, then odd offsets).
            for j in cutlass.range(cute.size(tRS_rGated) // 2, unroll_full=True):
                tRS_rGated[2 * j], tRS_rGated[2 * j + 1] = params.act_fn(
                    (tRS_rD[4 * j], tRS_rD[4 * j + 2]), (tRS_rD[4 * j + 1], tRS_rD[4 * j + 3])
                )
        return tRS_rGated

    @cute.jit
    def epi_convert_postact(
        self, tRS_rPostAct, sr_seed, tidx, tile_coord_mnkl, num_prev_subtiles, epi_idx
    ):
        """Convert via ``GemmActMixin.epi_convert_postact``; on arch 90 also permute registers."""
        converted = GemmActMixin.epi_convert_postact(
            self, tRS_rPostAct, sr_seed, tidx, tile_coord_mnkl, num_prev_subtiles, epi_idx
        )
        if const_expr(self.arch == 90):
            # Only need this if we're using STSM
            permute_gated_Cregs_b16(converted)
        return converted
class GemmGatedSm90(GemmGatedMixin, GemmSm90):
    """GEMM class combining the ``GemmGatedMixin`` epilogue with ``GemmSm90``."""
class GemmGatedSm100(GemmGatedMixin, GemmSm100):
    """GEMM class combining the ``GemmGatedMixin`` epilogue with ``GemmSm100``."""
class GemmGatedSm120(GemmGatedMixin, GemmSm120):
    """GEMM class combining the ``GemmGatedMixin`` epilogue with ``GemmSm120``."""
@jit_cache
def _compile_gemm_act(
    a_dtype,
    b_dtype,
    d_dtype,
    c_dtype,
    postact_dtype,
    a_major,
    b_major,
    d_major,
    c_major,
    postact_major,
    tile_shape_mn,
    cluster_shape_mnk,
    pingpong,
    persistent,
    is_dynamic_persistent,
    activation,
    rowvec_dtype,
    colvec_dtype,
    colvec_ndim,
    varlen_m,
    gather_A,
    concat_layout,
    device_capacity,
    gemm_cls_name,
    rounding_mode=RoundingMode.RN,
    sr_seed_mode=0,
    use_tma_gather=False,
):
    """Compile the activation ("act") or gated GEMM kernel for one configuration.

    Only placeholder tensors and scalars are built here; every argument is a
    static description (dtypes, majors, tile/cluster shapes, flags) of the
    kernel to compile.  ``gemm_cls_name`` selects the "act" or "gated" class
    family and ``device_capacity[0]`` selects the architecture; majors 10 and
    11 map to the same SM100 class.

    ``sr_seed_mode`` chooses the placeholder for ``sr_seed``: 0 -> ``None``,
    1 -> an ``Int32`` scalar, anything else -> a gmem ``Int32`` pointer.

    Raises NotImplementedError for the "act" family on capability major 12,
    and KeyError for an unknown family, architecture or activation name.
    """
    sm_major = device_capacity[0]
    kernels_by_sm = {
        "act": {9: GemmActSm90, 10: GemmActSm100, 11: GemmActSm100, 12: GemmActSm120},
        "gated": {9: GemmGatedSm90, 10: GemmGatedSm100, 11: GemmGatedSm100, 12: GemmGatedSm120},
    }
    if sm_major == 12 and gemm_cls_name == "act":
        raise NotImplementedError("SM120 non-gated activation GEMM epilogue is not yet supported")
    GemmCls = kernels_by_sm[gemm_cls_name][sm_major]

    mA, mB, mD, mC, m, n, k, l = make_fake_gemm_tensors(
        a_dtype,
        b_dtype,
        d_dtype,
        c_dtype,
        a_major,
        b_major,
        d_major,
        c_major,
        varlen_m=varlen_m,
        gather_A=gather_A,
    )

    # PostAct placeholder: the gated variant gets its own symbolic N extent and
    # always uses leading dim 1; otherwise it shares N with D and follows
    # postact_major.
    if gemm_cls_name == "gated":
        postact_n = cute.sym_int()
        postact_leading_dim = 1
    else:
        postact_n = n
        postact_leading_dim = 1 if postact_major == "n" else 0
    postact_shape = (m, postact_n) if varlen_m else (m, postact_n, l)
    mPostAct = fake_tensor(
        postact_dtype,
        postact_shape,
        leading_dim=postact_leading_dim,
        divisibility=div_for_dtype(postact_dtype),
    )

    # Bias placeholders.  The column vector is (l, m) when 2-D, (m,) when 1-D,
    # and absent for any other colvec_ndim.
    mRowVec = fake_tensor(rowvec_dtype, (l, n), leading_dim=1, divisibility=4)
    colvec_layouts = {2: ((l, m), 1), 1: ((m,), 0)}
    if colvec_ndim in colvec_layouts:
        colvec_shape, colvec_leading_dim = colvec_layouts[colvec_ndim]
        mColVec = fake_tensor(
            colvec_dtype, colvec_shape, leading_dim=colvec_leading_dim, divisibility=4
        )
    else:
        mColVec = None

    fn_table = act_fn_map if gemm_cls_name == "act" else gate_fn_map
    act_fn = fn_table[activation]

    if sr_seed_mode == 0:
        fake_sr_seed = None
    elif sr_seed_mode == 1:
        fake_sr_seed = Int32(0)
    else:
        fake_sr_seed = make_ptr(Int32, 0, cute.AddressSpace.gmem, assumed_align=4)

    epi_args = GemmCls.EpilogueArguments(
        mPostAct,
        act_fn,
        mRowVecBroadcast=mRowVec,
        mColVecBroadcast=mColVec,
        rounding_mode=rounding_mode,
        sr_seed=fake_sr_seed,
    )
    # The first scheduler argument is only truthy for dynamic persistence on SM90.
    scheduler_args = make_fake_scheduler_args(is_dynamic_persistent and sm_major == 9, False, l)
    varlen_args = make_fake_varlen_args(varlen_m, False, gather_A, m if varlen_m else None)
    return compile_gemm_kernel(
        GemmCls,
        a_dtype,
        tile_shape_mn,
        cluster_shape_mnk,
        pingpong,
        persistent,
        gather_A,
        is_dynamic_persistent,
        device_capacity,
        mA,
        mB,
        mD,
        mC,
        epi_args,
        scheduler_args,
        varlen_args,
        use_tma_gather=use_tma_gather,
        concat_layout=concat_layout or None,
    )
 def gemm_act(
     B: Tensor,  # (l, n, k)
     D: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
     C: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
+    PostAct: Tensor,  # (l, m, n) or (total_m, n//2) if gated
     tile_count_semaphore: Optional[Tensor],  # (1,)
     activation: Optional[str],
     tile_M: int,
     cluster_N: int,
     pingpong: bool = False,
     persistent: bool = True,
+    is_dynamic_persistent: bool = False,
     max_swizzle_size: int = 8,
     rowvec_bias: Optional[Tensor] = None,  # (l, n)
     colvec_bias: Optional[Tensor] = None,  # (l, m), or (total_m,) if varlen_m
     cu_seqlens_m: Optional[Tensor] = None,  # (l+1,) cumulative sum of m values for variable length
     A_idx: Optional[Tensor] = None,  # (total_m,) if gather_A with varlen_m
+    rounding_mode: int = RoundingMode.RN,
+    sr_seed: int | Tensor = 0,
+    use_tma_gather: bool = False,
+    concat_layout: tuple | None = None,
 ) -> None:
+    if activation in gate_fn_map:
+        gemm_cls_name = "gated"
+    else:
+        assert activation in act_fn_map, f"Unsupported activation {activation}"
+        gemm_cls_name = "act"
+    varlen_m = cu_seqlens_m is not None
+    gather_A = A_idx is not None
+    if varlen_m:
         assert persistent, "varlen_m requires persistent=True"
         assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
         if D is not None:
             assert D.stride(-1) == 1, "varlen_m requires D to be n-major"
         assert PostAct.stride(-1) == 1, "varlen_m requires PostAct to be n-major"
     if gather_A:
+        assert cu_seqlens_m is not None, "gather_A requires varlen"
         assert cluster_N == 1, "gather_A requires cluster_N=1"
+    A_p = perm3d_single(A, varlen_m)
+    B_p = perm3d_single(B)
+    D_p = perm3d_single(D, varlen_m)
+    C_p = perm3d_single(C, varlen_m)
+    PostAct_p = perm3d_single(PostAct, varlen_m)
+    a_major = get_major(A_p, "m", "k")
+    b_major = get_major(B_p, "n", "k")
+    d_major = get_major(D_p, "m", "n") if D_p is not None else None
+    c_major = get_major(C_p, "m", "n") if C_p is not None else None
+    postact_major = get_major(PostAct_p, "m", "n")
+    a_dtype = torch2cute_dtype_map[A.dtype]
+    b_dtype = torch2cute_dtype_map[B.dtype]
+    d_dtype = torch2cute_dtype_map[D.dtype] if D is not None else None
+    c_dtype = torch2cute_dtype_map[C.dtype] if C is not None else None
+    postact_dtype = torch2cute_dtype_map[PostAct.dtype]
+    colvec_ndim = colvec_bias.ndim if colvec_bias is not None else 0
     device_capacity = get_device_capacity(A.device)
+    assert device_capacity[0] in [9, 10, 11, 12], "Only SM90, SM100, SM110, and SM120 are supported"
+    if rounding_mode == RoundingMode.RS:
+        assert device_capacity[0] == 10, "Stochastic rounding (RoundingMode.RS) requires SM100"
+    if is_dynamic_persistent and device_capacity[0] == 9:
+        assert tile_count_semaphore is not None, (
+            "Dynamic persistent tile scheduler in SM90 requires a semaphore in GMEM"
         )
+    sr_seed_mode = (
+        2 if isinstance(sr_seed, Tensor) else (1 if rounding_mode == RoundingMode.RS else 0)
     )
+    concat_layout = tuple(sorted(concat_layout)) if concat_layout else ()
+    compiled_fn = _compile_gemm_act(
+        a_dtype,
+        b_dtype,
+        d_dtype,
+        c_dtype,
+        postact_dtype,
+        a_major,
+        b_major,
+        d_major,
+        c_major,
+        postact_major,
+        (tile_M, tile_N),
+        (cluster_M, cluster_N, 1),
         pingpong,
         persistent,
+        is_dynamic_persistent,
+        activation,
+        torch2cute_dtype_map[rowvec_bias.dtype] if rowvec_bias is not None else None,
+        torch2cute_dtype_map[colvec_bias.dtype] if colvec_bias is not None else None,
+        colvec_ndim,
+        varlen_m,
+        gather_A,
+        concat_layout,
         device_capacity,
+        gemm_cls_name,
+        rounding_mode=rounding_mode,
+        sr_seed_mode=sr_seed_mode,
+        use_tma_gather=use_tma_gather,
     )
+    from .cache_utils import COMPILE_ONLY
+    if COMPILE_ONLY:
+        return
+    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+    def scalar_arg(scalar, mode, dtype=Int32):
+        if mode == 0:
+            return None
+        elif mode == 1:
+            return dtype(scalar)
+        else:
+            return scalar.data_ptr()
+    epi_args = GemmActMixin.EpilogueArguments(
+        PostAct_p,
+        None,  # act_fn is Constexpr, pass None at call time
+        mRowVecBroadcast=rowvec_bias,
+        mColVecBroadcast=colvec_bias,
+        rounding_mode=None,  # Constexpr, pass None at call time
+        sr_seed=scalar_arg(sr_seed, sr_seed_mode),
     )
+    scheduler_args = make_scheduler_args(
+        max_active_clusters,
+        max_swizzle_size,
+        tile_count_semaphore,
+    )
+    varlen_args = make_varlen_args(cu_seqlens_m, None, A_idx)
+    if device_capacity[0] in [10, 11]:
+        compiled_fn(A_p, B_p, D_p, C_p, epi_args, scheduler_args, varlen_args, None, None, None)
+    else:
+        compiled_fn(A_p, B_p, D_p, C_p, epi_args, scheduler_args, varlen_args, None)
+gemm_gated = gemm_act

build/torch-cuda/quack/gemm_blockscaled_interface.py ADDED Viewed

	@@ -0,0 +1,326 @@

+# Copyright (c) 2026, Tri Dao.
+"""PyTorch-friendly interface for the SM100 MXFP8 blockscaled GEMM.
+Shape / layout conventions (matches torch.matmul, torch._scaled_mm, cuBLAS):
+  A:       (M, K)     or (L, M, K)       dtype float8_e4m3fn, K-contiguous (row-major)
+  B:       (K, N)     or (L, K, N)       dtype float8_e4m3fn, K-contiguous (col-major)
+  A_scale: (M, K/32)  or (L, M, K/32)    dtype float8_e8m0fnu, K-contiguous
+  B_scale: (K/32, N)  or (L, K/32, N)    dtype float8_e8m0fnu, K-contiguous
+  out:     (M, N)     or (L, M, N)       dtype bfloat16/float16, contiguous
+"K-contiguous" means stride 1 on the K axis. This matches how torchao/cuBLAS
+use `torch._scaled_mm(a, b.t(), ...)`:
+  - you store a weight as nn.Linear-style `W` of shape `(N, K)` row-major
+  - you pass `W.mT` (a zero-copy view of shape (K, N) with K-contig) as B
+The interface applies `.mT` internally to reach the `(N, K) K-major` layout
+the quack kernel consumes. No data is copied.
+"""
+from functools import lru_cache
+from typing import Optional, Tuple
+import torch
+from torch import Tensor
+import cutlass
+from .blockscaled_gemm_utils import (
+    ceil_div,
+    compile_blockscaled_gemm_tvm_ffi,
+    pack_scale_2d_to_blocked_contig,
+    scale_blocked_for_cublas,
+    scale_view_for_kernel,
+)
+from .gemm_default_epi import GemmDefaultSm100
+from .mx_utils import to_mx
+_SF_VEC_SIZE = 32
+_TORCH_TO_CUTLASS_D = {
+    torch.bfloat16: cutlass.BFloat16,
+    torch.float16: cutlass.Float16,
+    torch.float32: cutlass.Float32,
+}
def _default_tiler_cluster(m: int, n: int) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """Return the default ``(mma_tiler_mn, cluster_shape_mn)`` for an (m, n) problem.

    Problems with ``m >= 512`` and ``n >= 128`` get a (256, 128) tiler with a
    (2, 1) cluster; everything else gets a (128, 128) tiler with a (1, 1) cluster.
    """
    is_large = m >= 512 and n >= 128
    tiler = (256, 128) if is_large else (128, 128)
    cluster = (2, 1) if is_large else (1, 1)
    return tiler, cluster
@lru_cache(maxsize=64)
def _compile_cached(
    m: int,
    n: int,
    k: int,
    l: int,
    mma_tiler_mn: Tuple[int, int],
    cluster_shape_mn: Tuple[int, int],
    out_torch_dtype,
    ab_dtype_cutlass,
    sf_dtype_cutlass,
):
    """Compile the blockscaled GEMM for one (shape, dtype, tiler, cluster) key.

    Results are memoized (up to 64 entries) on the exact argument tuple.
    Uninitialized placeholder tensors with the kernel's layouts are allocated
    on the "cuda" device and handed to the compiler.
    """
    device = torch.device("cuda")
    sf_k = k // _SF_VEC_SIZE

    def batch_last(rows, cols, dtype):
        # (l, rows, cols) contiguous storage viewed as (rows, cols, l),
        # i.e. strides (cols, 1, rows * cols).
        return torch.empty(l, rows, cols, dtype=dtype, device=device).permute(1, 2, 0)

    def blocked_scales(rows):
        # Scale storage of shape (l, ceil(rows / 128), ceil(sf_k / 4), 512).
        return torch.empty(
            l,
            ceil_div(rows, 128),
            ceil_div(sf_k, 4),
            512,
            dtype=torch.float8_e8m0fnu,
            device=device,
        )

    fake_mA = batch_last(m, k, torch.float8_e4m3fn)
    fake_mB = batch_last(n, k, torch.float8_e4m3fn)
    fake_mD = batch_last(m, n, out_torch_dtype)
    fake_mSFA = scale_view_for_kernel(blocked_scales(m), m, sf_k, l)
    fake_mSFB = scale_view_for_kernel(blocked_scales(n), n, sf_k, l)
    return compile_blockscaled_gemm_tvm_ffi(
        ab_dtype_cutlass,
        sf_dtype_cutlass,
        _SF_VEC_SIZE,
        _TORCH_TO_CUTLASS_D[out_torch_dtype],
        mma_tiler_mn,
        cluster_shape_mn,
        fake_mA,
        fake_mB,
        fake_mD,
        fake_mSFA,
        fake_mSFB,
    )
def _as_3d(x: Tensor, ndim_in: int) -> Tensor:
    """Return ``x`` with a leading batch dim of size 1 when ``ndim_in`` is 2.

    For any other ``ndim_in`` the tensor is returned unchanged.  The 2D case
    yields a view from ``unsqueeze(0)``, not a copy.
    """
    return x.unsqueeze(0) if ndim_in == 2 else x
+def _to_kernel_layout(
+    A: Tensor,
+    B: Tensor,
+    A_scale: Tensor,
+    B_scale: Tensor,
+) -> Tuple[int, int, int, int, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, bool]:
+    """Normalize shapes/strides, validate, and repack scales. Returns
+    (m, n, k, l, mA_mkl, mB_nkl, sc_contig_A, sc_contig_B, sfa_view, sfb_view, was_2d).
+    A: (M,K) or (L,M,K) K-contig.  B: (K,N) or (L,K,N) K-contig.
+    A_scale: (M,K/32) or (L,M,K/32) K-contig.  B_scale: (K/32,N) or (L,K/32,N) K-contig.
+    """
+    assert A.dtype == torch.float8_e4m3fn, f"A dtype must be float8_e4m3fn, got {A.dtype}"
+    assert B.dtype == torch.float8_e4m3fn, f"B dtype must be float8_e4m3fn, got {B.dtype}"
+    assert A_scale.dtype == torch.float8_e8m0fnu
+    assert B_scale.dtype == torch.float8_e8m0fnu
+    was_2d = A.dim() == 2
+    # Flip B from (K,N) to (N,K) via .mT (zero-copy). User's B K-contig → .mT K-contig.
+    A3 = _as_3d(A, A.dim())  # (l, m, k) K-contig row-major expected
+    B3 = _as_3d(B, B.dim()).mT  # (l, n, k) K-contig (view) from (l, k, n)
+    l, m, k = A3.shape
+    l2, n, k2 = B3.shape
+    assert l == l2, f"batch mismatch: A={l}, B={l2}"
+    assert k == k2, f"K mismatch: A K={k}, B K={k2}"
+    assert k % _SF_VEC_SIZE == 0, f"K ({k}) must be divisible by {_SF_VEC_SIZE}"
+    assert A3.stride(-1) == 1, "A must be K-contiguous (stride 1 on K)"
+    assert B3.stride(-1) == 1, (
+        "B must be K-contiguous on its K axis (pass .mT of an (N,K) row-major tensor)"
+    )
+    sf_k = k // _SF_VEC_SIZE
+    as3 = _as_3d(A_scale, A_scale.dim())  # expected (l, m, sf_k) K-contig row-major
+    bs3 = _as_3d(B_scale, B_scale.dim()).mT  # (l, n, sf_k) K-contig (view) from (l, sf_k, n)
+    assert as3.stride(-1) == 1, "A_scale must be K-contiguous"
+    assert bs3.stride(-1) == 1, (
+        "B_scale must be K-contiguous on its K axis (pass .mT of an (N, K/32) row-major tensor)"
+    )
+    assert as3.shape == (l, m, sf_k), (
+        f"A_scale shape: expected (l={l},m={m},sf_k={sf_k}) K-contig, got {tuple(as3.shape)}"
+    )
+    assert bs3.shape == (l, n, sf_k), (
+        f"B_scale shape: expected .mT of (l={l},sf_k={sf_k},n={n}) -> ({l},{n},{sf_k}), got {tuple(bs3.shape)}"
+    )
+    # Force row-major contiguous for packer/kernel consumption.
+    # A3 / B3 are views — .contiguous() materializes (l,m,k) / (l,n,k) row-major.
+    A3_c = A3.contiguous()
+    B3_c = B3.contiguous()
+    # (l, m, k) -> (m, k, l) K-major view (no copy; strides (k, 1, m*k))
+    mA_mkl = A3_c.permute(1, 2, 0)
+    mB_nkl = B3_c.permute(1, 2, 0)
+    sc_contig_A = pack_scale_2d_to_blocked_contig(as3.contiguous())
+    sc_contig_B = pack_scale_2d_to_blocked_contig(bs3.contiguous())
+    sfa_view = scale_view_for_kernel(sc_contig_A, m, sf_k, l)
+    sfb_view = scale_view_for_kernel(sc_contig_B, n, sf_k, l)
+    return m, n, k, l, mA_mkl, mB_nkl, sc_contig_A, sc_contig_B, sfa_view, sfb_view, was_2d
def mxfp8_gemm_out(
    A: Tensor,
    B: Tensor,
    A_scale: Tensor,
    B_scale: Tensor,
    out: Tensor,
    *,
    mma_tiler_mn: Optional[Tuple[int, int]] = None,
    cluster_shape_mn: Optional[Tuple[int, int]] = None,
) -> None:
    """Run the MXFP8 blockscaled GEMM and write the result into ``out``.

    ``out`` must be contiguous, have a supported dtype, and have shape (M, N)
    when ``A`` is 2D or (L, M, N) otherwise.  When either ``mma_tiler_mn`` or
    ``cluster_shape_mn`` is omitted, the missing one is taken from
    ``_default_tiler_cluster``.

    Raises:
        AssertionError: on an invalid operand or ``out`` tensor.
        ValueError: when the kernel reports the configuration as unsupported.
    """
    m, n, k, l, mA, mB, _scA, _scB, sfa, sfb, was_2d = _to_kernel_layout(A, B, A_scale, B_scale)

    out_dtype = out.dtype
    assert out_dtype in _TORCH_TO_CUTLASS_D, f"unsupported out dtype: {out_dtype}"
    expected_out_shape = (m, n) if was_2d else (l, m, n)
    assert tuple(out.shape) == expected_out_shape, (
        f"out shape {tuple(out.shape)} != expected {expected_out_shape}"
    )
    assert out.is_contiguous(), "out must be contiguous"
    # View the caller's (M, N) / (L, M, N) buffer as (m, n, l) with strides
    # (n, 1, m*n); no copy is made, so the kernel writes straight into `out`.
    mD = (out.unsqueeze(0) if was_2d else out).permute(1, 2, 0)

    if mma_tiler_mn is None or cluster_shape_mn is None:
        default_tiler, default_cluster = _default_tiler_cluster(m, n)
        mma_tiler_mn = mma_tiler_mn or default_tiler
        cluster_shape_mn = cluster_shape_mn or default_cluster

    ab_dtype = cutlass.Float8E4M3FN
    sf_dtype = cutlass.Float8E8M0FNU
    # Trailing "k", "k", "n": presumably the A, B and D majors -- TODO confirm
    # against GemmDefaultSm100.can_implement_blockscaled.
    is_supported = GemmDefaultSm100.can_implement_blockscaled(
        ab_dtype,
        sf_dtype,
        _SF_VEC_SIZE,
        _TORCH_TO_CUTLASS_D[out_dtype],
        mma_tiler_mn,
        cluster_shape_mn,
        m,
        n,
        k,
        l,
        "k",
        "k",
        "n",
    )
    if not is_supported:
        raise ValueError(
            f"unsupported config: m={m}, n={n}, k={k}, l={l}, "
            f"tiler={mma_tiler_mn}, cluster={cluster_shape_mn}"
        )

    runner = _compile_cached(
        m,
        n,
        k,
        l,
        mma_tiler_mn,
        cluster_shape_mn,
        out_dtype,
        ab_dtype,
        sf_dtype,
    )
    runner(mA, mB, mD, sfa, sfb)
+def mxfp8_gemm(
+    A: Tensor,
+    B: Tensor,
+    A_scale: Tensor,
+    B_scale: Tensor,
+    out: Optional[Tensor] = None,
+    out_dtype: torch.dtype = torch.bfloat16,
+    *,
+    mma_tiler_mn: Optional[Tuple[int, int]] = None,
+    cluster_shape_mn: Optional[Tuple[int, int]] = None,
+) -> Tensor:
+    """Run the MXFP8 blockscaled GEMM and return the output tensor.
+    If ``out`` is None, a new ``out_dtype`` tensor is allocated on ``A.device``;
+    otherwise the given ``out`` is filled and returned, and ``out_dtype`` is unused."""
+    if out is None:
+        # A: (M,K) or (L,M,K); B: (K,N) or (L,K,N); out: (M,N) or (L,M,N)
+        # Leading dims are taken from A, the trailing N from B.
+        if A.dim() == 2:
+            lead_dims, n_axis = (A.shape[0],), 1
+        else:
+            lead_dims, n_axis = (A.shape[0], A.shape[1]), 2
+        out = torch.empty((*lead_dims, B.shape[n_axis]), dtype=out_dtype, device=A.device)
+    launch_opts = {"mma_tiler_mn": mma_tiler_mn, "cluster_shape_mn": cluster_shape_mn}
+    mxfp8_gemm_out(A, B, A_scale, B_scale, out, **launch_opts)
+    return out
+def mxfp8_quantize(x: Tensor) -> Tuple[Tensor, Tensor]:
+    """MXFP8-quantize a (..., K) bf16/fp32 tensor along its last dim.
+    K must be divisible by 32. The (qdata, scale_2d) pair is returned in
+    torchao-convention layout."""
+    k_len = x.shape[-1]
+    assert k_len % _SF_VEC_SIZE == 0, (
+        f"last dim ({k_len}) must be divisible by {_SF_VEC_SIZE}"
+    )
+    # to_mx is given a contiguous tensor, as in the unquantized-input contract above.
+    x_contig = x.contiguous()
+    return to_mx(x_contig, _SF_VEC_SIZE)
+def mxfp8_gemm_quantize(
+    A: Tensor,
+    B: Tensor,
+    out: Optional[Tensor] = None,
+    out_dtype: torch.dtype = torch.bfloat16,
+    *,
+    mma_tiler_mn: Optional[Tuple[int, int]] = None,
+    cluster_shape_mn: Optional[Tuple[int, int]] = None,
+) -> Tensor:
+    """Convenience wrapper: quantize both operands to MXFP8, then compute C = A @ B_as_NK.mT.
+    A is (M,K)/(L,M,K) and B is given as B_as_NK = (N,K)/(L,N,K), both bf16/fp32;
+    each is quantized along its last (K) dim. The result has shape (M,N)/(L,M,N)."""
+    a_data, a_scale = mxfp8_quantize(A)
+    b_data_nk, b_scale_nk = mxfp8_quantize(B)
+    # The quantized B and its scales come back as (..., N, K) / (..., N, K/32).
+    # mxfp8_gemm expects (..., K, N) / (..., K/32, N), so pass zero-copy K-contig
+    # transposed views instead of materializing new tensors.
+    b_data_kn = b_data_nk.mT
+    b_scale_kn = b_scale_nk.mT
+    result = mxfp8_gemm(
+        a_data,
+        b_data_kn,
+        a_scale,
+        b_scale_kn,
+        out=out,
+        out_dtype=out_dtype,
+        mma_tiler_mn=mma_tiler_mn,
+        cluster_shape_mn=cluster_shape_mn,
+    )
+    return result
+def mxfp8_gemm_cublas(
+    A: Tensor,
+    B: Tensor,
+    A_scale: Tensor,
+    B_scale: Tensor,
+    out_dtype: torch.dtype = torch.bfloat16,
+) -> Tensor:
+    """Reference path via torch._scaled_mm. Requires l=1 (or 2D inputs).
+    Returns the torch._scaled_mm result directly for 2D inputs; for 3D inputs a
+    leading unit batch dim is added back. Raises AssertionError when l != 1."""
+    # Only the dims and the two contiguous blocked scale buffers are needed here;
+    # the kernel-layout operand and scale views are discarded.
+    m, n, k, l, _mA, _mB, sc_A, sc_B, _sfa, _sfb, was_2d = _to_kernel_layout(A, B, A_scale, B_scale)
+    assert l == 1, "torch._scaled_mm MXFP8 path is 2D only; pass 2D inputs or l=1"
+    # torch._scaled_mm: A=(M,K) row-major, B=(K,N) col-major (both K-contig) -- same layout user gave us.
+    a2d = A if A.dim() == 2 else A.squeeze(0)
+    b2d = B if B.dim() == 2 else B.squeeze(0)
+    # k // _SF_VEC_SIZE is the number of scale blocks along K.
+    # NOTE(review): the trailing 0 is presumably the batch index to extract -- confirm
+    # against scale_blocked_for_cublas.
+    sca = scale_blocked_for_cublas(sc_A, m, k // _SF_VEC_SIZE, 0)
+    scb = scale_blocked_for_cublas(sc_B, n, k // _SF_VEC_SIZE, 0)
+    out = torch._scaled_mm(
+        a2d,
+        b2d,
+        scale_a=sca,
+        scale_b=scb,
+        out_dtype=out_dtype,
+    )
+    # Mirror the input rank: 2D in -> 2D out, 3D (l=1) in -> (1, M, N) out.
+    return out if was_2d else out.unsqueeze(0)
+def mxfp8_gemm_ref(
+    A: Tensor,
+    B: Tensor,
+    A_scale: Tensor,
+    B_scale: Tensor,
+    out_dtype: torch.dtype = torch.bfloat16,
+) -> Tensor:
+    """Reference result: dequantize both operands to fp32, then do a plain matmul.
+    A=(M,K), B=(K,N); inputs are first brought to 3D via ``_as_3d`` and the
+    output is squeezed back to 2D when A was 2D."""
+    def _dequant_k_last(q_lxk: Tensor, s_lxk: Tensor) -> Tensor:
+        # Each scale entry applies to _SF_VEC_SIZE consecutive elements of the last dim.
+        return q_lxk * s_lxk.repeat_interleave(_SF_VEC_SIZE, dim=-1)
+    drop_batch = A.dim() == 2
+    # A side is already (l, m, k) / (l, m, k_blocks).
+    a_lmk = _dequant_k_last(
+        _as_3d(A, A.dim()).float(),
+        _as_3d(A_scale, A_scale.dim()).float(),
+    )
+    # B side arrives as (K, N)/(L, K, N); flip to (l, n, k) so K is the last dim too.
+    b_lnk = _dequant_k_last(
+        _as_3d(B, B.dim()).mT.contiguous().float(),
+        _as_3d(B_scale, B_scale.dim()).mT.contiguous().float(),
+    )
+    prod_lmn = torch.einsum("lmk,lnk->lmn", a_lmk, b_lnk).to(out_dtype)
+    if drop_batch:
+        return prod_lmn.squeeze(0)
+    return prod_lmn

build/torch-cuda/quack/gemm_config.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # Copyright (C) 2025, Fri Dao.
 import itertools
-from typing import Optional, List, Literal
 from functools import partial
 from dataclasses import dataclass
@@ -10,86 +10,145 @@ class GemmConfig:
     tile_m: int = 128
     tile_n: int = 192
     pingpong: bool = True
     cluster_m: int = 2
     cluster_n: int = 1
     swap_ab: bool = False
     # raster_order: int = 1
     max_swizzle_size: int = 8
-def get_all_configs(
-    device_capacity: Literal[9, 10] = 9,
     epilogue: Optional[str] = None,
     tune_coop: bool = True,
-    # tune_raster_order=True,
 ) -> List[GemmConfig]:
-    assert device_capacity in [9, 10]
-    if device_capacity == 9:
-        tile_n_vals = [128, 144, 160, 176, 192, 208]
-        tile_mn_coop_vals = [(256, tile_n) for tile_n in tile_n_vals] + [
-            (128, 224),
-            (128, 256),
-            # (192, 256),  # Getting IOT instruction (core dumped) in the bwd
-        ]
-        tile_mn_pingpong_vals = [(128, tile_n) for tile_n in tile_n_vals] + [(192, 128)]
-        if epilogue in ["gated"]:
-            tile_mn_coop_vals = [(m, n) for m, n in tile_mn_coop_vals if n % 32 == 0 and m != 192]
-            tile_mn_pingpong_vals = [(m, n) for m, n in tile_mn_pingpong_vals if n % 32 == 0]
-        elif epilogue in ["lse"]:
-            tile_mn_coop_vals = [(m, n) for m, n in tile_mn_coop_vals if m != 192]
-        tile_mn_vals = []
-        if tune_coop:
-            tile_mn_vals += [(m, n, False) for m, n in tile_mn_coop_vals]
-        tile_mn_vals += [(m, n, True) for m, n in tile_mn_pingpong_vals]
         cluster = [(1, 2), (2, 1)]
-        # cluster = [(1, 1), (1, 2), (2, 1)]
-        if epilogue in ["lse"]:
-            cluster = [(1, 2), (2, 1)]
-        swap_ab_vals = [False, True]
-        if epilogue in ["lse", "gated"]:
-            swap_ab_vals = [False]
-        # raster_swizzle = (
-        #     [(0, 1)]
-        #     if not tune_raster_order
-        #     else [(1, 1), (1, 2), (1, 4), (1, 8), (2, 1), (2, 2), (2, 4), (2, 8)]
-        # )
-        return [
-            GemmConfig(
-                tile_m=tile_m,
-                tile_n=tile_n,
-                pingpong=pingpong,
-                cluster_m=cluster_m,
-                cluster_n=cluster_n,
-                swap_ab=swap_ab,
-                # raster_order=raster_order,
-                # max_swizzle_size=max_swizzle_size,
-            )
-            for (tile_m, tile_n, pingpong), (cluster_m, cluster_n), swap_ab in itertools.product(
-                tile_mn_vals,
-                cluster,
-                swap_ab_vals,
-                # raster_swizzle,
-            )
-        ]
-    elif device_capacity == 10:
-        tile_n_vals = [128, 160, 192, 224, 256]
-        tile_n_64_vals = [128, 192, 256]
-        tile_mn_cluster_vals = (
-            [(128, tile_n, (1, 2)) for tile_n in tile_n_vals]
-            # + [(128, tile_n, (2, 1)) for tile_n in tile_n_64_vals]
-            + [(128, tile_n, (2, 1)) for tile_n in tile_n_vals]
-            + [(256, tile_n, (2, 1)) for tile_n in tile_n_vals]
         )
-        swap_ab_vals = [False, True]
-        if epilogue in ["lse", "gated"]:
-            swap_ab_vals = [False]
-        max_swizzle_size_vals = [4, 8, 16]
-        GemmConfigCls = partial(GemmConfig, pingpong=False)  # There's no pingpong on Sm100
-        return [
-            GemmConfigCls(
-                tile_m=m, tile_n=n, cluster_m=cm, cluster_n=cn, swap_ab=sab, max_swizzle_size=ms
-            )
-            for (m, n, (cm, cn)), sab, ms in itertools.product(
-                tile_mn_cluster_vals, swap_ab_vals, max_swizzle_size_vals
-            )
-        ]

 # Copyright (C) 2025, Tri Dao.
 import itertools
+from typing import Optional, List
 from functools import partial
 from dataclasses import dataclass
     tile_m: int = 128
     tile_n: int = 192
     pingpong: bool = True
+    # by default, we use dynamic persistent tile scheduler on SM100 but not on SM90
+    is_dynamic_persistent: bool = True
     cluster_m: int = 2
     cluster_n: int = 1
     swap_ab: bool = False
     # raster_order: int = 1
     max_swizzle_size: int = 8
+    device_capacity: int = 9
+    # whether to use TMA gather (vs normal cp.async) for gather_A on SM100
+    use_tma_gather: bool = False
+def _get_sm90_configs(
     epilogue: Optional[str] = None,
     tune_coop: bool = True,
 ) -> List[GemmConfig]:
+    """Build the autotuning candidates tagged device_capacity=9 (SM90).
+    ``epilogue`` narrows the search: "gated" keeps only tiles whose tile_n is a
+    multiple of 32, and both "lse" and "gated" disable swap_ab. With
+    ``tune_coop=False`` the coop tiles (pingpong=False) are left out and only the
+    pingpong tiles remain. Returns one GemmConfig per (tile, cluster, swap_ab)
+    combination, all with is_dynamic_persistent=False and use_tma_gather=False."""
+    tile_n_vals = [128, 160, 192, 208]
+    tile_mn_vals_coop = [(256, tile_n) for tile_n in tile_n_vals] + [
+        (128, 224),
+        (128, 256),
+        # (192, 256),  # Getting IOT instruction (core dumped) in the bwd
+    ]
+    tile_mn_vals_pingpong = [(128, tile_n) for tile_n in tile_n_vals] + [(192, 128)]
+    # NOTE(review): no coop tile above has m == 192 (the (192, 256) entry is commented
+    # out), so the `m != 192` filters below currently remove nothing.
+    if epilogue in ["gated"]:
+        tile_mn_vals_coop = [(m, n) for m, n in tile_mn_vals_coop if n % 32 == 0 and m != 192]
+        tile_mn_vals_pingpong = [(m, n) for m, n in tile_mn_vals_pingpong if n % 32 == 0]
+    elif epilogue in ["lse"]:
+        tile_mn_vals_coop = [(m, n) for m, n in tile_mn_vals_coop if m != 192]
+    # Entries are (tile_m, tile_n, pingpong): coop tiles carry False, pingpong tiles True.
+    tile_mn_vals = []
+    if tune_coop:
+        tile_mn_vals += [(m, n, False) for m, n in tile_mn_vals_coop]
+    tile_mn_vals += [(m, n, True) for m, n in tile_mn_vals_pingpong]
+    cluster = [(1, 2), (2, 1)]
+    # cluster = [(1, 1), (1, 2), (2, 1)]
+    # NOTE(review): this branch reassigns the same list as the default above, so it is
+    # currently a no-op.
+    if epilogue in ["lse"]:
         cluster = [(1, 2), (2, 1)]
+    swap_ab_vals = [False, True]
+    if epilogue in ["lse", "gated"]:
+        swap_ab_vals = [False]
+    # Cartesian product of tiles x cluster shapes x swap_ab choices.
+    return [
+        GemmConfig(
+            tile_m=tile_m,
+            tile_n=tile_n,
+            pingpong=pingpong,
+            cluster_m=cluster_m,
+            cluster_n=cluster_n,
+            swap_ab=swap_ab,
+            device_capacity=9,
+            is_dynamic_persistent=False,  # default to not use dynamic persistent on SM90
+            use_tma_gather=False,  # TMA gather not supported on SM90
+        )
+        for (tile_m, tile_n, pingpong), (cluster_m, cluster_n), swap_ab in itertools.product(
+            tile_mn_vals,
+            cluster,
+            swap_ab_vals,
+        )
+    ]
+def _get_sm100_configs(
+    epilogue: Optional[str] = None,
+) -> List[GemmConfig]:
+    """Build the autotuning candidates tagged device_capacity=10 (SM100).
+    Every (tile, cluster) shape is crossed with the swap_ab choices and with both
+    settings of is_dynamic_persistent and use_tma_gather. The "lse" and "gated"
+    epilogues restrict swap_ab to False. max_swizzle_size is fixed at 8."""
+    tile_n_choices = (64, 128, 160, 192, 224, 256)
+    # (tile_m, cluster) pairs, each expanded over every tile_n choice in this order.
+    tile_m_cluster_pairs = (
+        (128, (1, 1)),
+        (128, (1, 2)),
+        (128, (2, 1)),
+        (128, (2, 2)),
+        (256, (2, 1)),
+        (256, (2, 2)),
+    )
+    shapes = [
+        (tile_m, tile_n, cluster_mn)
+        for tile_m, cluster_mn in tile_m_cluster_pairs
+        for tile_n in tile_n_choices
+    ]
+    # One extra wide tile that is only tried with a (2, 1) cluster.
+    shapes.append((256, 512, (2, 1)))
+    swap_choices = [False] if epilogue in ("lse", "gated") else [False, True]
+    on_off = (True, False)
+    configs = []
+    for (tile_m, tile_n, (cl_m, cl_n)), swap, dyn_persistent, tma_gather in itertools.product(
+        shapes, swap_choices, on_off, on_off
+    ):
+        configs.append(
+            GemmConfig(
+                tile_m=tile_m,
+                tile_n=tile_n,
+                pingpong=False,  # There's no pingpong on Sm100
+                cluster_m=cl_m,
+                cluster_n=cl_n,
+                swap_ab=swap,
+                max_swizzle_size=8,
+                device_capacity=10,
+                is_dynamic_persistent=dyn_persistent,
+                use_tma_gather=tma_gather,
+            )
+        )
+    return configs
+def _get_sm120_configs(
+    epilogue: Optional[str] = None,
+    tune_coop: bool = True,
+) -> List[GemmConfig]:
+    """Build the autotuning candidates tagged device_capacity=12 (SM120).
+    All configs use a 1x1 cluster, is_dynamic_persistent=True and
+    use_tma_gather=False. ``tune_coop=False`` leaves out the coop tiles
+    (pingpong=False); the "lse" and "gated" epilogues restrict swap_ab to False."""
+    tile_mn_vals_coop = [(128, 128), (128, 64), (64, 128), (128, 160), (128, 192)]
+    tile_mn_vals_pingpong = [(128, 128), (128, 64), (64, 128), (128, 160)]
+    # Entries are (tile_m, tile_n, pingpong): coop tiles carry False, pingpong tiles True.
+    tile_mn_vals = []
+    if tune_coop:
+        tile_mn_vals += [(m, n, False) for m, n in tile_mn_vals_coop]
+    tile_mn_vals += [(m, n, True) for m, n in tile_mn_vals_pingpong]
+    swap_ab_vals = [False, True]
+    if epilogue in ["lse", "gated"]:
+        swap_ab_vals = [False]
+    # Cartesian product of tiles x swap_ab choices; the cluster shape is not tuned here.
+    return [
+        GemmConfig(
+            tile_m=tile_m,
+            tile_n=tile_n,
+            pingpong=pingpong,
+            cluster_m=1,
+            cluster_n=1,
+            swap_ab=swap_ab,
+            device_capacity=12,
+            is_dynamic_persistent=True,
+            use_tma_gather=False,  # TMA gather not supported on SM120
         )
+        for (tile_m, tile_n, pingpong), swap_ab in itertools.product(tile_mn_vals, swap_ab_vals)
+    ]
+def get_all_configs(
+    epilogue: Optional[str] = None,
+    tune_coop: bool = True,
+) -> List[GemmConfig]:
+    """Collect the autotuning configs of every supported architecture.
+    The result is the sm90 list, then the sm100 list, then the sm120 list. Since each
+    GemmConfig carries its target device_capacity, callers can filter by the actual
+    device at runtime, so no device query (and no CUDA context initialization) is
+    needed at import time.
+    """
+    all_configs: List[GemmConfig] = []
+    all_configs.extend(_get_sm90_configs(epilogue, tune_coop))
+    # The sm100 generator has no coop/pingpong split, so it takes only the epilogue.
+    all_configs.extend(_get_sm100_configs(epilogue))
+    all_configs.extend(_get_sm120_configs(epilogue, tune_coop))
+    return all_configs

build/torch-cuda/quack/gemm_dact.py CHANGED Viewed

@@ -1,33 +1,53 @@
-# Copyright (c) 2025, Tri Dao.
-from typing import Optional, Tuple
-from functools import partial
 from torch import Tensor
 import cutlass
 import cutlass.cute as cute
-from cutlass import Float32, const_expr
-import cutlass.torch as cutlass_torch
 from .gemm_sm90 import GemmSm90
 from .gemm_sm100 import GemmSm100
 from .gemm_default_epi import GemmDefaultEpiMixin
 from .gemm_act import GemmActMixin
-from .cute_dsl_utils import get_device_capacity, get_max_active_clusters
-from .gemm_wrapper_utils import GemmWrapperBase
-from . import activation
 class GemmDActMixin(GemmActMixin):
     # Different from GemmActSm90, here act_bwd_fn must take in 2 arguments (x, dout)
     # and return 2 arguments (dx, out)
     EpilogueArguments = GemmActMixin.EpilogueArguments
-    EpilogueParams = GemmActMixin.EpilogueParams
     @cute.jit
     def epi_visit_subtile(
         self,
-        params: EpilogueParams,
         epi_loop_tensors: Tuple[cute.Tensor, ...],
         tRS_rD: cute.Tensor,
         tRS_rC: Optional[cute.Tensor] = None,
@@ -35,11 +55,11 @@ class GemmDActMixin(GemmActMixin):
         assert tRS_rC is not None
         # We don't add C to the accumulator
         GemmDefaultEpiMixin.epi_visit_subtile(self, params, epi_loop_tensors, tRS_rD, tRS_rC=None)
-        tRS_rC_acc = cute.make_fragment_like(tRS_rC, self.acc_dtype)
         tRS_rC_acc.store(tRS_rC.load().to(self.acc_dtype))
         # If we don't have .shape here, the compiler generates local stores and loads
         if const_expr(params.act_fn is not None):
-            tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
             if const_expr(self.arch < 100):
                 for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
                     tRS_rD[i], tRS_rPostAct[i] = params.act_fn(tRS_rC_acc[i], tRS_rD[i])
@@ -54,10 +74,7 @@ class GemmDActMixin(GemmActMixin):
                     )
         else:
             tRS_rPostAct = tRS_rC_acc
-        # Type conversion
-        tRS_rPostAct_out = cute.make_fragment_like(tRS_rPostAct, self.postact_dtype)
-        tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
-        return tRS_rPostAct_out
 class GemmDActSm90(GemmDActMixin, GemmSm90):
@@ -68,19 +85,283 @@ class GemmDActSm100(GemmDActMixin, GemmSm100):
     pass
-dact_fn_map = {
-    None: None,
-    "relu": activation.drelu,
-    "relu_sq": activation.drelu_sq,
-    "gelu_tanh_approx": activation.dgelu_tanh_approx,
-}
 def gemm_dact(
     A: Tensor,  # (l, m, k) or (total_m, k) if varlen_m or (whatever, k) if gather_A with varlen_m
     B: Tensor,  # (l, n, k)
-    Out: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
-    PreAct: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
     PostAct: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
     tile_count_semaphore: Optional[Tensor],  # (1,)
     activation: Optional[str],
@@ -90,126 +371,138 @@ def gemm_dact(
     cluster_N: int,
     pingpong: bool = True,
     persistent: bool = True,
     max_swizzle_size: int = 8,
     cu_seqlens_m: Optional[Tensor] = None,  # (l+1,) cumulative sum of m values for variable length
     A_idx: Optional[Tensor] = None,  # (total_m,) if gather_A with varlen_m
 ) -> None:
-    if cu_seqlens_m is not None:
         assert persistent, "varlen_m requires persistent=True"
         assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
         assert Out.stride(-1) == 1, "varlen_m requires Out to be n-major"
         assert PreAct.stride(-1) == 1, "varlen_m requires PreAct to be n-major"
         assert PostAct.stride(-1) == 1, "varlen_m requires PostAct to be n-major"
-    gather_A = A_idx is not None
     if gather_A:
-        assert cu_seqlens_m is not None, "gather_A requires varlen (cu_seqlens_m must be specified)"
         assert cluster_N == 1, "gather_A requires cluster_N=1"
-    assert activation in dact_fn_map, f"Unsupported activation {activation}"
-    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
-        A,
-        B,
-        Out,
-        PreAct,
-        additional_tensors={"PostAct": PostAct},
-        cu_seqlens_m=cu_seqlens_m,
-        A_idx=A_idx,
-    )
-    GemmWrapperBase.permute_tensors(tensor_infos, varlen_m=cu_seqlens_m is not None)
-    GemmWrapperBase.extract_dtypes(tensor_infos)
-    major_configs = {
-        "A": ("m", "k", "l"),
-        "B": ("n", "k", "l"),
-        "D": ("m", "n", "l"),
-        "C": ("m", "n", "l"),
-        "PostAct": ("m", "n", "l"),
-    }
-    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
-    device_capacity = get_device_capacity(A.device)
-    assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
-    GemmCls = GemmDActSm100 if device_capacity[0] > 9 else GemmDActSm90
-    acc_dtype = Float32
-    tile_shape_mn = (tile_M, tile_N)
-    cluster_shape_mnk = (cluster_M, cluster_N, 1)
-    if not GemmCls.is_valid_dtypes(
-        tensor_infos["A"].dtype,
-        tensor_infos["B"].dtype,
-        acc_dtype,
-        tensor_infos["D"].dtype,
-        tensor_infos["A"].major,
-        tensor_infos["B"].major,
-    ):
-        raise TypeError("Skipping due to unsupported combination of types and majors")
-    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
-    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
-    act_fn = dact_fn_map[activation]
-    epi_args = GemmCls.EpilogueArguments(tensor_infos["PostAct"].cute_tensor, act_fn)
-    scheduler_args = GemmWrapperBase.create_scheduler_args(
-        max_active_clusters, tile_count_semaphore, max_swizzle_size=max_swizzle_size
-    )
-    # Create varlen arguments if needed (assumes persistent=True when varlen_m)
-    varlen_args = GemmWrapperBase.create_varlen_args(
-        cu_seqlens_m,
-        None,  # cu_seqlens_k
-        A_idx,
-        max_active_clusters,
-        cluster_shape_mnk,
-        tensor_infos,
-        GemmCls.num_epi_tensormaps,
-        pingpong,
-    )
-    current_stream = cutlass_torch.current_stream()
-    compile_key = GemmWrapperBase.get_compile_key(
-        tensor_infos,
-        activation,
-        tile_shape_mn,
-        cluster_shape_mnk,
         pingpong,
         persistent,
-        tile_count_semaphore is not None,
         device_capacity,
-        max_swizzle_size,
-        cu_seqlens_m is not None,
-        A_idx is not None,
-        key_tensor_names=("A", "B", "D", "PostAct", "C"),
     )
-    cache = gemm_dact.compile_cache
-    if compile_key not in cache:
-        if device_capacity[0] == 9:
-            GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
-        gemm = GemmCls(
-            acc_dtype,
-            tensor_infos["A"].dtype,
-            tile_shape_mn,
-            cluster_shape_mnk,
-            gather_A=gather_A,
         )
-        cache[compile_key] = cute.compile(
-            gemm,
-            tensor_infos["A"].cute_tensor,
-            tensor_infos["B"].cute_tensor,
-            tensor_infos["D"].cute_tensor,
-            tensor_infos["C"].cute_tensor,
-            epi_args,
-            scheduler_args,
-            varlen_args,
-            current_stream,
         )
-    cache[compile_key](
-        tensor_infos["A"].cute_tensor,
-        tensor_infos["B"].cute_tensor,
-        tensor_infos["D"].cute_tensor,
-        tensor_infos["C"].cute_tensor,
-        epi_args,
-        scheduler_args,
-        varlen_args,
-        current_stream,
     )
-gemm_dact.compile_cache = {}

+# Copyright (c) 2025-2026, Tri Dao.
+from __future__ import annotations
+from typing import NamedTuple, Optional, Tuple, Callable
+import torch
 from torch import Tensor
 import cutlass
 import cutlass.cute as cute
+from cutlass import Int32, Float32, const_expr
 from .gemm_sm90 import GemmSm90
 from .gemm_sm100 import GemmSm100
+from .gemm_sm120 import GemmSm120
 from .gemm_default_epi import GemmDefaultEpiMixin
 from .gemm_act import GemmActMixin
+from .epi_ops import ColVecReduce, colvec_reduce_accumulate
+from .compile_utils import make_fake_tensor as fake_tensor
+from .cute_dsl_utils import (
+    ParamsBase,
+    mlir_namedtuple,
+    torch2cute_dtype_map,
+    get_device_capacity,
+    get_max_active_clusters,
+)
+from .gemm_tvm_ffi_utils import (
+    get_major,
+    perm3d_single,
+    make_scheduler_args,
+    make_varlen_args,
+    make_fake_scheduler_args,
+    make_fake_varlen_args,
+    div_for_dtype,
+    make_fake_gemm_tensors,
+    compile_gemm_kernel,
+)
+from .cache_utils import jit_cache
+from .rounding import RoundingMode
+from . import layout_utils as layout_utils
+from .activation import dact_fn_map, dgate_fn_map
 class GemmDActMixin(GemmActMixin):
     # Different from GemmActSm90, here act_bwd_fn must take in 2 arguments (x, dout)
     # and return 2 arguments (dx, out)
     EpilogueArguments = GemmActMixin.EpilogueArguments
     @cute.jit
     def epi_visit_subtile(
         self,
+        params,
         epi_loop_tensors: Tuple[cute.Tensor, ...],
         tRS_rD: cute.Tensor,
         tRS_rC: Optional[cute.Tensor] = None,
         assert tRS_rC is not None
         # We don't add C to the accumulator
         GemmDefaultEpiMixin.epi_visit_subtile(self, params, epi_loop_tensors, tRS_rD, tRS_rC=None)
+        tRS_rC_acc = cute.make_rmem_tensor_like(tRS_rC, self.acc_dtype)
         tRS_rC_acc.store(tRS_rC.load().to(self.acc_dtype))
         # If we don't have .shape here, the compiler generates local stores and loads
         if const_expr(params.act_fn is not None):
+            tRS_rPostAct = cute.make_rmem_tensor(tRS_rD.layout.shape, self.acc_dtype)
             if const_expr(self.arch < 100):
                 for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
                     tRS_rD[i], tRS_rPostAct[i] = params.act_fn(tRS_rC_acc[i], tRS_rD[i])
                     )
         else:
             tRS_rPostAct = tRS_rC_acc
+        return tRS_rPostAct
 # Combines the GemmDActMixin epilogue with the GemmSm90 base; no extra behavior is added.
 class GemmDActSm90(GemmDActMixin, GemmSm90):
     pass
+# Combines the GemmDActMixin epilogue with the GemmSm120 base; no extra behavior is added.
+# NOTE: _compile_gemm_dact currently raises NotImplementedError for the "dact" variant
+# when the device capability major is 12, before this class would be selected.
+class GemmDActSm120(GemmDActMixin, GemmSm120):
+    pass
+class GemmDGatedMixin(GemmActMixin):
+    # Different from GemmActMixin, here act_bwd_fn must take in 3 arguments (x, y, dout)
+    # and return 3 arguments (dx, dy, out)
+    # The epilogue ops are GemmActMixin's ops plus a ColVecReduce bound to the
+    # "mColVecReduce" argument.
+    _epi_ops = (*GemmActMixin._epi_ops, ColVecReduce("mColVecReduce"))
+    # act_bwd_fn is carried in the params as a compile-time constant (default None).
+    _extra_param_fields = (("act_bwd_fn", cutlass.Constexpr, None),)
+    _epi_param_bases = (ParamsBase,)
+    # Host-side epilogue arguments. Only mPostAct is required; the rest default to
+    # None / round-to-nearest.
+    @mlir_namedtuple
+    class EpilogueArguments(NamedTuple):
+        mPostAct: cute.Tensor
+        act_bwd_fn: cutlass.Constexpr[Callable] = None
+        alpha: Optional[Float32 | cute.Tensor] = None
+        beta: Optional[Float32 | cute.Tensor] = None
+        mRowVecBroadcast: Optional[cute.Tensor] = None
+        mColVecBroadcast: Optional[cute.Tensor] = None
+        mColVecReduce: Optional[cute.Tensor] = None
+        rounding_mode: cutlass.Constexpr[int] = RoundingMode.RN
+        sr_seed: Optional[Int32 | cute.Tensor] = None
+    # EpilogueParams auto-generated from _epi_ops + _extra_param_fields
+    # Validates the dtype widths this epilogue relies on, records the rounding mode and
+    # the post-activation dtype/layout/tile shape on self, and returns EpilogueParams
+    # built from the per-op params dict plus act_bwd_fn.
+    def epi_to_underlying_arguments(self, args: EpilogueArguments, *, loc=None, ip=None):
+        # C and D are implicitly 2 16-bit elements packed into 32 bits, simply for the purpose
+        # for reusing the existing load/store code.
+        assert self.implicit_dtype.width == 16, "GemmDGated only supports 16bit for now"
+        assert self.d_dtype.width == 32, "D storage type must be 32 bit"
+        assert self.c_dtype.width == 32, "C storage type must be 32 bit"
+        self.rounding_mode = args.rounding_mode
+        self.postact_dtype = args.mPostAct.element_type
+        self.postact_layout = cutlass.utils.LayoutEnum.from_tensor(args.mPostAct)
+        # The post-activation tile is the (M, N) part of the CTA tile.
+        self.cta_tile_shape_postact_mn = self.cta_tile_shape_mnk[:2]
+        d = self._epi_ops_to_params_dict(args)
+        d["act_bwd_fn"] = args.act_bwd_fn
+        return self.EpilogueParams(**d)
+    # epi_begin, epi_begin_loop, epi_end are inherited from ComposableEpiMixin via _epi_ops.
+    # Per-subtile epilogue: reads the packed 16-bit (x, y) pairs from tRS_rC, runs
+    # act_bwd_fn on them together with the (optionally colvec-scaled) accumulator
+    # tRS_rD, overwrites tRS_rD with the packed 16-bit (dx, dy) pairs, and returns the
+    # Float32 `out` fragment.
+    # NOTE(review): epi_loop_tensors is annotated as a Tuple but is indexed by string
+    # keys below, so it is presumably a mapping -- confirm and fix the annotation.
+    @cute.jit
+    def epi_visit_subtile(
+        self,
+        params,
+        epi_loop_tensors: Tuple[cute.Tensor, ...],
+        tRS_rD: cute.Tensor,
+        tRS_rC: Optional[cute.Tensor] = None,
+    ) -> Optional[cute.Tensor]:
+        alpha = epi_loop_tensors["alpha"]
+        beta = epi_loop_tensors["beta"]
+        tDrRowVec = epi_loop_tensors["mRowVecBroadcast"]
+        tDrColVec = epi_loop_tensors["mColVecBroadcast"]
+        tDrColVecReduce = epi_loop_tensors["mColVecReduce"]
+        assert alpha is None and beta is None and tDrRowVec is None  # We don't use these for now
+        assert tRS_rC is not None
+        implicit_dtype = self.implicit_dtype
+        assert implicit_dtype.width == 16, "GemmDGatedMixin only supports 16bit for now"
+        # Reinterpret each 32-bit C element as two 16-bit values and upconvert to Float32,
+        # so tRS_rXY_f32x2 holds twice as many elements as tRS_rC.
+        tRS_rXY_f16x2 = cute.recast_tensor(tRS_rC, implicit_dtype)
+        tRS_rXY_f32x2 = cute.make_rmem_tensor(tRS_rXY_f16x2.layout, Float32)
+        tRS_rXY_f32x2.store(tRS_rXY_f16x2.load().to(Float32))
+        # Destinations for the gradient pairs and for the returned `out` values.
+        tRS_rdXY_f32x2 = cute.make_rmem_tensor_like(tRS_rXY_f32x2, Float32)
+        tRS_rOut = cute.make_rmem_tensor_like(tRS_rD, Float32)
+        # Scaled copy of D; tRS_rD itself stays unscaled until the final store.
+        tRS_rD_scaled = cute.make_rmem_tensor_like(tRS_rD)
+        if const_expr(tDrColVec is not None):  # Scale D by colvec
+            if const_expr(self.arch < 100):
+                tRS_rD_scaled.store(tRS_rD.load() * tDrColVec.load().to(tRS_rD.element_type))
+            else:
+                # arch >= 100: multiply two adjacent n-elements at a time with the packed
+                # f32x2 multiply, using the same colvec value (index [m, 0]) for both.
+                tDrColVec_mn = layout_utils.convert_layout_zero_stride(tDrColVec, tDrColVec.layout)
+                tRS_rD_mn = layout_utils.convert_layout_zero_stride(tRS_rD, tDrColVec.layout)
+                tRS_rD_scaled_mn = layout_utils.convert_layout_zero_stride(
+                    tRS_rD_scaled, tDrColVec.layout
+                )
+                for m in cutlass.range(cute.size(tDrColVec_mn, mode=[0]), unroll_full=True):
+                    for n in cutlass.range(
+                        cute.size(tDrColVec_mn, mode=[1]) // 2, unroll_full=True
+                    ):
+                        (
+                            tRS_rD_scaled_mn[m, 2 * n],
+                            tRS_rD_scaled_mn[m, 2 * n + 1],
+                        ) = cute.arch.mul_packed_f32x2(
+                            (tRS_rD_mn[m, 2 * n], tRS_rD_mn[m, 2 * n + 1]),
+                            (tDrColVec_mn[m, 0], tDrColVec_mn[m, 0]),
+                        )
+        else:
+            tRS_rD_scaled.store(tRS_rD.load())
+        if const_expr(self.arch < 100):
+            # One output element per call: inputs/outputs 2*i and 2*i + 1 are the two
+            # halves of the i-th packed pair.
+            for i in cutlass.range(cute.size(tRS_rD)):
+                (
+                    tRS_rdXY_f32x2[2 * i],
+                    tRS_rdXY_f32x2[2 * i + 1],
+                    tRS_rOut[i],
+                ) = params.act_bwd_fn(
+                    tRS_rXY_f32x2[2 * i], tRS_rXY_f32x2[2 * i + 1], tRS_rD_scaled[i]
+                )
+        else:
+            # arch >= 100: two output elements per call. Each argument/result is a 2-tuple;
+            # the first halves of two adjacent pairs sit at 4*i and 4*i + 2, the second
+            # halves at 4*i + 1 and 4*i + 3.
+            for i in cutlass.range(cute.size(tRS_rD) // 2):
+                (
+                    (tRS_rdXY_f32x2[4 * i], tRS_rdXY_f32x2[4 * i + 2]),
+                    (tRS_rdXY_f32x2[4 * i + 1], tRS_rdXY_f32x2[4 * i + 3]),
+                    (tRS_rOut[2 * i], tRS_rOut[2 * i + 1]),
+                ) = params.act_bwd_fn(
+                    (tRS_rXY_f32x2[4 * i], tRS_rXY_f32x2[4 * i + 2]),
+                    (tRS_rXY_f32x2[4 * i + 1], tRS_rXY_f32x2[4 * i + 3]),
+                    (tRS_rD_scaled[2 * i], tRS_rD_scaled[2 * i + 1]),
+                )
+        if const_expr(tDrColVecReduce is not None):
+            # Accumulate postact * dout before D is scaled by colvec_scale
+            # (rScale is the unscaled tRS_rD, and tRS_rOut has not been colvec-scaled yet).
+            colvec_reduce_accumulate(self, tDrColVecReduce, tRS_rOut, rScale=tRS_rD)
+        if const_expr(tDrColVec is not None):  # Scale Out by colvec
+            if const_expr(self.arch < 100):
+                tRS_rOut.store(tRS_rOut.load() * tDrColVec.load().to(tRS_rD.element_type))
+            else:
+                # Same packed f32x2 scaling as for D above, applied in place to tRS_rOut.
+                tDrColVec_mn = layout_utils.convert_layout_zero_stride(tDrColVec, tDrColVec.layout)
+                tRS_rOut_mn = layout_utils.convert_layout_zero_stride(tRS_rOut, tDrColVec.layout)
+                for m in cutlass.range(cute.size(tDrColVec_mn, mode=[0]), unroll_full=True):
+                    for n in cutlass.range(
+                        cute.size(tDrColVec_mn, mode=[1]) // 2, unroll_full=True
+                    ):
+                        tRS_rOut_mn[m, 2 * n], tRS_rOut_mn[m, 2 * n + 1] = (
+                            cute.arch.mul_packed_f32x2(
+                                (tRS_rOut_mn[m, 2 * n], tRS_rOut_mn[m, 2 * n + 1]),
+                                (tDrColVec_mn[m, 0], tDrColVec_mn[m, 0]),
+                            )
+                        )
+        # Type conversion
+        # Convert the gradient pairs back to the 16-bit implicit dtype, then view each
+        # pair as one 32-bit element and store it into tRS_rD.
+        tRS_rdXY_f16x2 = cute.make_rmem_tensor(tRS_rdXY_f32x2.layout, implicit_dtype)
+        tRS_rdXY_f16x2.store(tRS_rdXY_f32x2.load().to(implicit_dtype))
+        tRS_rD.store(cute.recast_tensor(tRS_rdXY_f16x2, Float32).load())
+        return tRS_rOut
+    # epi_end is inherited from ComposableEpiMixin → delegates to ColVecReduce.end()
+# Combines the GemmDGatedMixin epilogue with the GemmSm90 base; _compile_gemm_dact
+# selects it for the "dgated" variant on device capability major 9.
+class GemmDGatedSm90(GemmDGatedMixin, GemmSm90):
+    pass
+# Combines the GemmDGatedMixin epilogue with the GemmSm100 base; _compile_gemm_dact
+# selects it for the "dgated" variant on device capability majors 10 and 11.
+class GemmDGatedSm100(GemmDGatedMixin, GemmSm100):
+    pass
+# Combines the GemmDGatedMixin epilogue with the GemmSm120 base; _compile_gemm_dact
+# selects it for the "dgated" variant on device capability major 12.
+class GemmDGatedSm120(GemmDGatedMixin, GemmSm120):
+    pass
+@jit_cache
+def _compile_gemm_dact(
+    a_dtype,
+    b_dtype,
+    d_dtype,
+    c_dtype,
+    postact_dtype,
+    implicit_dtype,
+    a_major,
+    b_major,
+    d_major,
+    c_major,
+    postact_major,
+    tile_shape_mn,
+    cluster_shape_mnk,
+    pingpong,
+    persistent,
+    is_dynamic_persistent,
+    activation,
+    colvec_scale_dtype,
+    colvec_scale_ndim,
+    colvec_reduce_dtype,
+    colvec_reduce_ndim,
+    varlen_m,
+    gather_A,
+    device_capacity,
+    gemm_cls_name,
+    use_tma_gather=False,
+):
+    # Build the d-activation GEMM kernel for one configuration and return whatever
+    # compile_gemm_kernel returns (gemm_dact calls that object with the real tensors).
+    #
+    # Only dtypes, majors, ranks and flags are taken here -- no real tensors. Placeholder
+    # ("fake") operands with symbolic extents are created from them below.
+    # NOTE(review): @jit_cache presumably memoizes on these arguments, which would be
+    # why everything is passed as plain hashable values -- confirm in cache_utils.
+    #
+    # Parameters (as used by the visible code):
+    #   *_dtype / *_major      element types and major-ness of A, B, D (Out), C (PreAct)
+    #                          and PostAct; forwarded to the fake-tensor builders.
+    #   implicit_dtype         stored on the kernel object for the "dgated" variant only.
+    #   activation             key into dgate_fn_map ("dgated") or dact_fn_map ("dact").
+    #   colvec_scale_ndim      2 or 1 selects the fake shape; any other value (gemm_dact
+    #                          passes 0 when there is no tensor) leaves it as None.
+    #   colvec_reduce_ndim     3 or 2 selects the fake shape; any other value -> None.
+    #   device_capacity        indexable; only element 0 (the major) is inspected here.
+    #   gemm_cls_name          "dact" or "dgated"; any other key fails the dict lookup.
+    #
+    # Raises NotImplementedError for the non-gated variant on capability major 12.
+    is_dgated = gemm_cls_name == "dgated"
+    # Kernel class per variant and capability major; majors 10 and 11 share the Sm100 class.
+    sm_to_cls = {
+        "dact": {9: GemmDActSm90, 10: GemmDActSm100, 11: GemmDActSm100, 12: GemmDActSm120},
+        "dgated": {
+            9: GemmDGatedSm90,
+            10: GemmDGatedSm100,
+            11: GemmDGatedSm100,
+            12: GemmDGatedSm120,
+        },
+    }
+    # The table has a "dact" entry for major 12, but it is rejected explicitly here
+    # before the lookup.
+    if device_capacity[0] == 12 and gemm_cls_name == "dact":
+        raise NotImplementedError("SM120 non-gated dactivation GEMM epilogue is not yet supported")
+    GemmCls = sm_to_cls[gemm_cls_name][device_capacity[0]]
+    # Fake A/B/D/C operands plus the m, n, k, l extents reused for the epilogue tensors.
+    mA, mB, mD, mC, m, n, k, l = make_fake_gemm_tensors(
+        a_dtype,
+        b_dtype,
+        d_dtype,
+        c_dtype,
+        a_major,
+        b_major,
+        d_major,
+        c_major,
+        varlen_m=varlen_m,
+        gather_A=gather_A,
+    )
+    # Fake PostAct: (m, n) when varlen_m, otherwise (m, n, l); the leading dimension is
+    # index 1 for an "n"-major tensor and index 0 otherwise.
+    div_pa = div_for_dtype(postact_dtype)
+    pa_leading = 1 if postact_major == "n" else 0
+    pa_shape = (m, n) if varlen_m else (m, n, l)
+    mPostAct = fake_tensor(postact_dtype, pa_shape, leading_dim=pa_leading, divisibility=div_pa)
+    if is_dgated:
+        act_fn = dgate_fn_map[activation]
+        # Optional column-vector scale: (l, m) for a 2-D input, (m,) for a 1-D input.
+        mColVec = None
+        if colvec_scale_ndim == 2:
+            mColVec = fake_tensor(colvec_scale_dtype, (l, m), leading_dim=1, divisibility=4)
+        elif colvec_scale_ndim == 1:
+            mColVec = fake_tensor(colvec_scale_dtype, (m,), leading_dim=0, divisibility=4)
+        # Optional column-vector reduction output; its last extent comes from
+        # cute.sym_int() rather than from m/n/l (presumably a symbolic size -- confirm).
+        mColVecReduce = None
+        n_tiles = cute.sym_int()
+        if colvec_reduce_ndim == 3:
+            mColVecReduce = fake_tensor(
+                colvec_reduce_dtype,
+                (l, m, n_tiles),
+                leading_dim=2,
+                divisibility=1,
+            )
+        elif colvec_reduce_ndim == 2:
+            mColVecReduce = fake_tensor(
+                colvec_reduce_dtype,
+                (m, n_tiles),
+                leading_dim=1,
+                divisibility=1,
+            )
+        epi_args = GemmCls.EpilogueArguments(
+            mPostAct,
+            act_fn,
+            mColVecBroadcast=mColVec,
+            mColVecReduce=mColVecReduce,
+        )
+        # Hook handed to compile_gemm_kernel as post_init; it sets implicit_dtype on
+        # the kernel object it is given.
+        def _set_implicit_dtype(gemm_obj):
+            gemm_obj.implicit_dtype = implicit_dtype
+        post_init = _set_implicit_dtype
+    else:
+        # Non-gated variant: no column-vector operands and no post-init hook.
+        act_fn = dact_fn_map[activation]
+        epi_args = GemmCls.EpilogueArguments(mPostAct, act_fn)
+        post_init = None
+    # The first scheduler flag is set only for the dynamic persistent scheduler on
+    # capability major 9 (gemm_dact requires a tile-count semaphore in that same case).
+    scheduler_args = make_fake_scheduler_args(
+        (is_dynamic_persistent and device_capacity[0] == 9), False, l
+    )
+    varlen_args = make_fake_varlen_args(varlen_m, False, gather_A, m if varlen_m else None)
+    return compile_gemm_kernel(
+        GemmCls,
+        a_dtype,
+        tile_shape_mn,
+        cluster_shape_mnk,
+        pingpong,
+        persistent,
+        gather_A,
+        is_dynamic_persistent,
+        device_capacity,
+        mA,
+        mB,
+        mD,
+        mC,
+        epi_args,
+        scheduler_args,
+        varlen_args,
+        post_init=post_init,
+        use_tma_gather=use_tma_gather,
+    )
 def gemm_dact(
     A: Tensor,  # (l, m, k) or (total_m, k) if varlen_m or (whatever, k) if gather_A with varlen_m
     B: Tensor,  # (l, n, k)
+    Out: Tensor,  # (l, m, n) or (total_m, n) if varlen_m; or (l, m, 2*n)/(total_m, 2*n) if dgated
+    PreAct: Tensor,  # same shape as Out
     PostAct: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
     tile_count_semaphore: Optional[Tensor],  # (1,)
     activation: Optional[str],
     cluster_N: int,
     pingpong: bool = True,
     persistent: bool = True,
+    is_dynamic_persistent: bool = False,
     max_swizzle_size: int = 8,
+    colvec_scale: Optional[Tensor] = None,  # (l, m), or (total_m,) if varlen_m (dgated only)
+    # (l, m, ceildiv(n, tile_n)), or (total_m, ceildiv(n, tile_n)) if varlen_m (dgated only)
+    colvec_reduce: Optional[Tensor] = None,
     cu_seqlens_m: Optional[Tensor] = None,  # (l+1,) cumulative sum of m values for variable length
     A_idx: Optional[Tensor] = None,  # (total_m,) if gather_A with varlen_m
+    use_tma_gather: bool = False,
 ) -> None:
+    is_dgated = activation in dgate_fn_map
+    if not is_dgated:
+        assert activation in dact_fn_map, f"Unsupported activation {activation}"
+        assert colvec_scale is None, "colvec_scale is only supported for gated activations"
+        assert colvec_reduce is None, "colvec_reduce is only supported for gated activations"
+    gemm_cls_name = "dgated" if is_dgated else "dact"
+    varlen_m = cu_seqlens_m is not None
+    gather_A = A_idx is not None
+    if varlen_m:
         assert persistent, "varlen_m requires persistent=True"
         assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
         assert Out.stride(-1) == 1, "varlen_m requires Out to be n-major"
         assert PreAct.stride(-1) == 1, "varlen_m requires PreAct to be n-major"
         assert PostAct.stride(-1) == 1, "varlen_m requires PostAct to be n-major"
     if gather_A:
+        assert cu_seqlens_m is not None, "gather_A requires varlen"
         assert cluster_N == 1, "gather_A requires cluster_N=1"
+    # For dgated, capture implicit_dtype before viewing Out/PreAct as f32
+    implicit_dtype = None
+    if is_dgated:
+        AB_swapped = Out.stride(-1) != 1
+        implicit_dtype = torch2cute_dtype_map[Out.dtype]
+        assert Out.element_size() == 2, "Out dtype must be fp16 or bf16"
+        assert PreAct.element_size() == 2, "Preact dtype must be fp16 or bf16"
+        if varlen_m or not AB_swapped:
+            Out = Out.view(torch.float32)
+            PreAct = PreAct.view(torch.float32)
+        else:
+            Out = Out.mT.view(torch.float32).mT
+            PreAct = PreAct.mT.view(torch.float32).mT
+    A_p = perm3d_single(A, varlen_m)
+    B_p = perm3d_single(B)
+    Out_p = perm3d_single(Out, varlen_m)
+    PreAct_p = perm3d_single(PreAct, varlen_m)
+    PostAct_p = perm3d_single(PostAct, varlen_m)
+    a_major = get_major(A_p, "m", "k")
+    b_major = get_major(B_p, "n", "k")
+    d_major = get_major(Out_p, "m", "n")
+    c_major = get_major(PreAct_p, "m", "n")
+    postact_major = get_major(PostAct_p, "m", "n")
+    a_dtype = torch2cute_dtype_map[A.dtype]
+    b_dtype = torch2cute_dtype_map[B.dtype]
+    d_dtype = torch2cute_dtype_map[Out.dtype]
+    c_dtype = torch2cute_dtype_map[PreAct.dtype]
+    postact_dtype = torch2cute_dtype_map[PostAct.dtype]
+    device_capacity = get_device_capacity(A.device)
+    assert device_capacity[0] in [9, 10, 11, 12], "Only SM90, SM100, SM110, and SM120 are supported"
+    if is_dynamic_persistent and device_capacity[0] == 9:
+        assert tile_count_semaphore is not None, (
+            "Dynamic persistent tile scheduler in SM90 requires a semaphore in GMEM"
+        )
+    compiled_fn = _compile_gemm_dact(
+        a_dtype,
+        b_dtype,
+        d_dtype,
+        c_dtype,
+        postact_dtype,
+        implicit_dtype,
+        a_major,
+        b_major,
+        d_major,
+        c_major,
+        postact_major,
+        (tile_M, tile_N),
+        (cluster_M, cluster_N, 1),
         pingpong,
         persistent,
+        is_dynamic_persistent,
+        activation,
+        torch2cute_dtype_map[colvec_scale.dtype] if colvec_scale is not None else None,
+        colvec_scale.ndim if colvec_scale is not None else 0,
+        torch2cute_dtype_map[colvec_reduce.dtype] if colvec_reduce is not None else None,
+        colvec_reduce.ndim if colvec_reduce is not None else 0,
+        varlen_m,
+        gather_A,
         device_capacity,
+        gemm_cls_name,
+        use_tma_gather=use_tma_gather,
     )
+    from .cache_utils import COMPILE_ONLY
+    if COMPILE_ONLY:
+        return
+    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+    if is_dgated:
+        epi_args = GemmDGatedMixin.EpilogueArguments(
+            PostAct_p,
+            None,  # act_bwd_fn is Constexpr
+            mColVecBroadcast=colvec_scale,
+            mColVecReduce=colvec_reduce,
+            rounding_mode=None,
+            sr_seed=None,
         )
+    else:
+        epi_args = GemmDActMixin.EpilogueArguments(
+            PostAct_p,
+            None,
+            rounding_mode=None,
+            sr_seed=None,
         )
+    scheduler_args = make_scheduler_args(
+        max_active_clusters,
+        max_swizzle_size,
+        tile_count_semaphore,
     )
+    varlen_args = make_varlen_args(cu_seqlens_m, None, A_idx)
+    if device_capacity[0] in [10, 11]:
+        compiled_fn(
+            A_p, B_p, Out_p, PreAct_p, epi_args, scheduler_args, varlen_args, None, None, None
+        )
+    else:
+        compiled_fn(A_p, B_p, Out_p, PreAct_p, epi_args, scheduler_args, varlen_args, None)
+gemm_dgated = gemm_dact

build/torch-cuda/quack/gemm_default_epi.py CHANGED Viewed

@@ -1,189 +1,62 @@
 # Copyright (c) 2025, Wentao Guo, Tri Dao.
-from typing import Optional, Tuple
-from functools import partial
-from dataclasses import dataclass
 import cutlass
 import cutlass.cute as cute
-from cutlass import Int32, Float32, Boolean, const_expr
-from .cute_dsl_utils import ArgumentsBase, ParamsBase
 from .gemm_sm90 import GemmSm90
 from .gemm_sm100 import GemmSm100
-from .sm90_utils import partition_for_epilogue
 from . import utils as utils
-from . import copy_utils as copy_utils
-from .varlen_utils import VarlenManager
-class GemmDefaultEpiMixin:
-    num_epi_tensormaps: int = 0
-    @dataclass
-    class EpilogueArguments(ArgumentsBase):
         alpha: Optional[Float32 | cute.Tensor] = None
         beta: Optional[Float32 | cute.Tensor] = None
         mRowVecBroadcast: Optional[cute.Tensor] = None
         mColVecBroadcast: Optional[cute.Tensor] = None
-        add_to_output: bool = False
-    @dataclass
-    class EpilogueParams(ParamsBase):
-        alpha: Optional[Float32 | cute.Tensor] = None
-        beta: Optional[Float32 | cute.Tensor] = None
-        mRowVecBroadcast: Optional[cute.Tensor] = None
-        mColVecBroadcast: Optional[cute.Tensor] = None
-    def epi_to_underlying_arguments(
-        self, args: EpilogueArguments, *, loc=None, ip=None
-    ) -> EpilogueParams:
-        # Assume all strides are divisible by 32 bits except the last stride
-        new_stride = lambda t: tuple(
-            cute.assume(s, divby=32 // t.element_type.width) if not cute.is_static(s) else s
-            for s in t.stride
-        )
-        mRowVecBroadcast, mColVecBroadcast = [
-            cute.make_tensor(t.iterator, cute.make_layout(t.shape, stride=new_stride(t)))
-            if t is not None
-            else None
-            for t in (args.mRowVecBroadcast, args.mColVecBroadcast)
-        ]
-        return self.EpilogueParams(
-            alpha=args.alpha,
-            beta=args.beta,
-            mRowVecBroadcast=mRowVecBroadcast,
-            mColVecBroadcast=mColVecBroadcast,
-        )
-    @cute.jit
-    def epi_begin(
-        self,
-        params: EpilogueParams,
-        epi_smem_tensors: Tuple[cute.Tensor, ...],
-        epi_tile: cute.Tile,
-        tiled_copy_t2r: Optional[cute.TiledCopy],
-        tiled_copy_r2s: cute.TiledCopy,
-        tile_coord_mnkl: cute.Coord,
-        varlen_manager: VarlenManager,
-        epilogue_barrier: cutlass.pipeline.NamedBarrier,
-        tidx: Int32,
-    ):
-        alpha, beta = None, None
-        if const_expr(hasattr(params, "alpha") and params.alpha is not None):
-            alpha = utils.load_scalar_or_pointer(params.alpha)
-        if const_expr(hasattr(params, "beta") and params.beta is not None):
-            beta = utils.load_scalar_or_pointer(params.beta)
-        sRowVec, sColVec, *rest = epi_smem_tensors
-        tile_M, tile_N = self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[1]
-        batch_idx = tile_coord_mnkl[3]
-        num_epi_threads = self.num_epi_warps * cute.arch.WARP_SIZE
-        # Don't need sync as we assume the previous epilogue has finished
-        partition_for_epilogue_fn = partial(
-            partition_for_epilogue,
-            epi_tile=epi_tile,
-            tiled_copy=tiled_copy_t2r if tiled_copy_t2r is not None else tiled_copy_r2s,
-            tidx=tidx,
-            reference_src=tiled_copy_t2r is None,
-        )
-        tDsRowVec = None
-        if const_expr(params.mRowVecBroadcast is not None):
-            rowvec_dtype = params.mRowVecBroadcast.element_type
-            num_copy_elems = const_expr(max(32, rowvec_dtype.width)) // rowvec_dtype.width
-            thr_copy_RV = copy_utils.tiled_copy_1d(
-                params.mRowVecBroadcast.element_type, num_epi_threads, num_copy_elems, is_async=True
-            ).get_slice(tidx)
-            mRowVec = params.mRowVecBroadcast[batch_idx, None]
-            gRowVec = cute.local_tile(mRowVec, (tile_N,), (tile_coord_mnkl[1],))
-            tRVgRV = thr_copy_RV.partition_S(gRowVec)
-            tRVsRV = thr_copy_RV.partition_D(sRowVec)
-            tRVcRV = thr_copy_RV.partition_S(cute.make_identity_tensor(tile_N))
-            limit_n = min(mRowVec.shape[0] - tile_coord_mnkl[1] * tile_N, tile_N)
-            tRVpRV = cute.make_fragment((1, cute.size(tRVsRV.shape[1])), Boolean)
-            for m in cutlass.range(cute.size(tRVsRV.shape[1]), unroll_full=True):
-                tRVpRV[0, m] = tRVcRV[0, m] < limit_n
-            cute.copy(thr_copy_RV, tRVgRV, tRVsRV, pred=tRVpRV)
-            # (CPY, CPY_M, CPY_N, EPI_M, EPI_N)
-            tDsRowVec = partition_for_epilogue_fn(
-                cute.make_tensor(
-                    sRowVec.iterator, cute.make_layout((tile_M, tile_N), stride=(0, 1))
-                )
-            )
-            if const_expr(tiled_copy_t2r is not None):
-                tDsRowVec = tiled_copy_r2s.retile(tDsRowVec)
-        tDsColVec = None
-        if const_expr(params.mColVecBroadcast is not None):
-            colvec_dtype = params.mColVecBroadcast.element_type
-            num_copy_elems = const_expr(max(32, colvec_dtype.width)) // colvec_dtype.width
-            thr_copy_CV = copy_utils.tiled_copy_1d(
-                params.mColVecBroadcast.element_type, num_epi_threads, num_copy_elems, is_async=True
-            ).get_slice(tidx)
-            if const_expr(not varlen_manager.varlen_m):
-                mColVec = params.mColVecBroadcast[batch_idx, None]
-            else:
-                mColVec = cute.domain_offset(
-                    (varlen_manager.params.cu_seqlens_m[batch_idx],), params.mColVecBroadcast
-                )
-            gColVec = cute.local_tile(mColVec, (tile_M,), (tile_coord_mnkl[0],))
-            tCVgCV = thr_copy_CV.partition_S(gColVec)
-            tCVsCV = thr_copy_CV.partition_D(sColVec)
-            tCVcCV = thr_copy_CV.partition_S(cute.make_identity_tensor(tile_M))
-            limit_m = min(varlen_manager.len_m(batch_idx) - tile_coord_mnkl[0] * tile_M, tile_M)
-            tCVpCV = cute.make_fragment((1, cute.size(tCVsCV.shape[1])), Boolean)
-            for m in cutlass.range(cute.size(tCVsCV.shape[1]), unroll_full=True):
-                tCVpCV[0, m] = tCVcCV[0, m] < limit_m
-            cute.copy(thr_copy_CV, tCVgCV, tCVsCV, pred=tCVpCV)
-            tDsColVec = partition_for_epilogue_fn(
-                cute.make_tensor(
-                    sColVec.iterator, cute.make_layout((tile_M, tile_N), stride=(1, 0))
-                )
-            )
-            if const_expr(tiled_copy_t2r is not None):
-                tDsColVec = tiled_copy_r2s.retile(tDsColVec)
-        if const_expr(params.mRowVecBroadcast is not None or params.mColVecBroadcast is not None):
-            cute.arch.cp_async_commit_group()
-            cute.arch.cp_async_wait_group(0)
-            epilogue_barrier.arrive_and_wait()
-        return alpha, beta, tDsRowVec, tDsColVec
-    def epi_begin_loop(self, params: EpilogueParams, epi_tensors, epi_coord: cute.Coord):
-        alpha, beta, tDsRowVec, tDsColVec = epi_tensors
-        tDrRowVec_cvt = None
-        if const_expr(tDsRowVec is not None):
-            tDsRowVec_cur = cute.group_modes(tDsRowVec, 3, cute.rank(tDsRowVec))[
-                None, None, None, epi_coord
-            ]
-            # tDrRowVec = cute.make_fragment_like(tDsRowVec_cur)
-            tDrRowVec = cute.make_fragment(tDsRowVec_cur.layout, tDsRowVec_cur.element_type)
-            cute.autovec_copy(cute.filter_zeros(tDsRowVec_cur), cute.filter_zeros(tDrRowVec))
-            tDrRowVec_cvt = cute.make_fragment_like(tDrRowVec, self.acc_dtype)
-            tDrRowVec_cvt.store(tDrRowVec.load().to(self.acc_dtype))
-        tDrColVec_cvt = None
-        if const_expr(tDsColVec is not None):
-            tDsColVec_cur = cute.group_modes(tDsColVec, 3, cute.rank(tDsColVec))[
-                None, None, None, epi_coord
-            ]
-            # This somehow doesn't work, some dim with stride 0 turns to non-zero stride
-            # tDrRowVec = cute.make_fragment_like(tDsRowVec_cur)
-            tDrColVec = cute.make_fragment(tDsColVec_cur.layout, tDsColVec_cur.element_type)
-            cute.autovec_copy(cute.filter_zeros(tDsColVec_cur), cute.filter_zeros(tDrColVec))
-            tDrColVec_cvt = cute.make_fragment_like(tDrColVec, self.acc_dtype)
-            tDrColVec_cvt.store(tDrColVec.load().to(self.acc_dtype))
-        return alpha, beta, tDrRowVec_cvt, tDrColVec_cvt
     @cute.jit
     def epi_visit_subtile(
         self,
-        params: EpilogueParams,
-        epi_loop_tensors: Tuple[cute.Tensor, ...],
         tRS_rD: cute.Tensor,
         tRS_rC: Optional[cute.Tensor] = None,
     ) -> Optional[cute.Tensor]:
-        alpha, beta, tDrRowVec, tDrColVec = epi_loop_tensors
         rD = tRS_rD.load()
         # Apply alpha scaling to accumulator if alpha is provided (not None)
         if const_expr(hasattr(params, "alpha") and params.alpha is not None):
@@ -206,49 +79,25 @@ class GemmDefaultEpiMixin:
                 tRS_rD[i] += tDrColVec[i]
         return None
-    @staticmethod
-    def epi_smem_bytes_per_stage(
-        args: Optional[EpilogueArguments],
-        cta_tile_shape_mnk: Tuple[int, int, int],
-        epi_tile: cute.Tile,
-    ) -> int:
-        row_vec_smem_size = 0 if args.mRowVecBroadcast is None else cta_tile_shape_mnk[1]
-        col_vec_smem_size = 0 if args.mColVecBroadcast is None else cta_tile_shape_mnk[0]
-        row_vec_dtype = (
-            args.mRowVecBroadcast.element_type if args.mRowVecBroadcast is not None else Float32
-        )
-        col_vec_dtype = (
-            args.mColVecBroadcast.element_type if args.mColVecBroadcast is not None else Float32
-        )
-        return (
-            row_vec_smem_size * row_vec_dtype.width + col_vec_smem_size * col_vec_dtype.width
-        ) // 8
-    def epi_get_smem_struct(self, params: EpilogueParams):
-        row_vec_smem_size = 0 if params.mRowVecBroadcast is None else self.cta_tile_shape_mnk[1]
-        col_vec_smem_size = 0 if params.mColVecBroadcast is None else self.cta_tile_shape_mnk[0]
-        row_vec_dtype = (
-            params.mRowVecBroadcast.element_type if params.mRowVecBroadcast is not None else Float32
-        )
-        col_vec_dtype = (
-            params.mColVecBroadcast.element_type if params.mColVecBroadcast is not None else Float32
-        )
-        @cute.struct
-        class EpiSharedStorage:
-            sRowVec: cute.struct.Align[cute.struct.MemRange[row_vec_dtype, row_vec_smem_size], 16]
-            sColVec: cute.struct.Align[cute.struct.MemRange[col_vec_dtype, col_vec_smem_size], 16]
-        return EpiSharedStorage
-    def epi_get_smem_tensors(self, params: EpilogueParams, storage) -> Tuple[cute.Tensor, ...]:
-        sRowVec = None
-        if const_expr(params.mRowVecBroadcast is not None):
-            sRowVec = storage.epi.sRowVec.get_tensor(cute.make_layout(self.cta_tile_shape_mnk[1]))
-        sColVec = None
-        if const_expr(params.mColVecBroadcast is not None):
-            sColVec = storage.epi.sColVec.get_tensor(cute.make_layout(self.cta_tile_shape_mnk[0]))
-        return (sRowVec, sColVec)
 class GemmDefaultSm90(GemmDefaultEpiMixin, GemmSm90):
@@ -257,3 +106,7 @@ class GemmDefaultSm90(GemmDefaultEpiMixin, GemmSm90):
 class GemmDefaultSm100(GemmDefaultEpiMixin, GemmSm100):
     pass

 # Copyright (c) 2025, Wentao Guo, Tri Dao.
+from typing import NamedTuple, Optional
 import cutlass
 import cutlass.cute as cute
+from cutlass import Int32, Float32, const_expr
+from .cute_dsl_utils import mlir_namedtuple
+from .epi_composable import ComposableEpiMixin
+from .epi_ops import Scalar, RowVecLoad, ColVecLoad
 from .gemm_sm90 import GemmSm90
 from .gemm_sm100 import GemmSm100
+from .gemm_sm120 import GemmSm120
+from .rounding import RoundingMode
+from . import layout_utils as layout_utils
 from . import utils as utils
+class GemmDefaultEpiMixin(ComposableEpiMixin):
+    _epi_ops = (
+        Scalar("alpha"),
+        Scalar("beta"),
+        Scalar("sr_seed", dtype=Int32),
+        RowVecLoad("mRowVecBroadcast"),
+        ColVecLoad("mColVecBroadcast"),
+    )
+    @mlir_namedtuple
+    class EpilogueArguments(NamedTuple):
         alpha: Optional[Float32 | cute.Tensor] = None
         beta: Optional[Float32 | cute.Tensor] = None
         mRowVecBroadcast: Optional[cute.Tensor] = None
         mColVecBroadcast: Optional[cute.Tensor] = None
+        add_to_output: cutlass.Constexpr[bool] = False
+        rounding_mode: cutlass.Constexpr[int] = RoundingMode.RN
+        sr_seed: Optional[Int32 | cute.Tensor] = None
+    # EpilogueParams auto-generated from _epi_ops
+    def epi_to_underlying_arguments(self, args, *, loc=None, ip=None):
+        # Turn the user-facing epilogue arguments into an EpilogueParams instance.
+        # Side effect: args.rounding_mode is recorded on self. `loc` and `ip` are
+        # accepted but not used in this body.
+        self.rounding_mode = args.rounding_mode
+        d = self._epi_ops_to_params_dict(args)
+        # A broadcast vector is replaced by layout_utils.concat_to_interleave(vec, 1)
+        # only when its name is in self.concat_layout and a non-None value was supplied.
+        for key in ("mRowVecBroadcast", "mColVecBroadcast"):
+            if key in self.concat_layout and key in d and d[key] is not None:
+                d[key] = layout_utils.concat_to_interleave(d[key], 1)
+        return self.EpilogueParams(**d)
     @cute.jit
     def epi_visit_subtile(
         self,
+        params,
+        epi_loop_tensors,
         tRS_rD: cute.Tensor,
         tRS_rC: Optional[cute.Tensor] = None,
     ) -> Optional[cute.Tensor]:
+        alpha = epi_loop_tensors["alpha"]
+        beta = epi_loop_tensors["beta"]
+        tDrRowVec = epi_loop_tensors["mRowVecBroadcast"]
+        tDrColVec = epi_loop_tensors["mColVecBroadcast"]
         rD = tRS_rD.load()
         # Apply alpha scaling to accumulator if alpha is provided (not None)
         if const_expr(hasattr(params, "alpha") and params.alpha is not None):
                 tRS_rD[i] += tDrColVec[i]
         return None
+    def epi_setup_postact(
+        self,
+        params,
+        epi_smem_tensors,
+        tiled_copy_r2s,
+        tiled_copy_t2r,
+        tile_coord_mnkl,
+        varlen_manager,
+        tidx,
+    ):
+        """Returns None — default epilogue has no postact output."""
+        # None of the parameters are read here; the signature presumably mirrors a hook
+        # that epilogues with a postact output override -- confirm against the callers.
+        return None
+    @cute.jit
+    def epi_convert_postact(
+        self, tRS_rPostAct, sr_seed, tidx, tile_coord_mnkl, num_prev_subtiles, epi_idx
+    ):
+        """Convert postact from acc_dtype to output dtype. Override for custom postprocessing."""
+        # This default is a pass-through: tRS_rPostAct is returned unchanged and the
+        # remaining arguments (sr_seed, tidx, tile_coord_mnkl, num_prev_subtiles,
+        # epi_idx) are not used in this body.
+        return tRS_rPostAct
 class GemmDefaultSm90(GemmDefaultEpiMixin, GemmSm90):
 class GemmDefaultSm100(GemmDefaultEpiMixin, GemmSm100):
     pass
+# Default-epilogue GEMM for the SM120 base: combines GemmDefaultEpiMixin with GemmSm120
+# and adds no members of its own.
+class GemmDefaultSm120(GemmDefaultEpiMixin, GemmSm120):
+    pass

build/torch-cuda/quack/gemm_interface.py CHANGED Viewed

@@ -3,18 +3,22 @@ from typing import Optional, Tuple, Literal
 from functools import partial
 import torch
 import torch.nn.functional as F
 from torch import Tensor
-from ._ops_compat import add_quack_op_namespace_prefix
 from .gemm_config import GemmConfig, get_all_configs
 from .autotuner import autotune, AutotuneConfig
 from .cute_dsl_utils import get_device_capacity
-from .gemm import gemm as gemm_sm90_sm100
-from .gemm_act import gemm_act as gemm_act_sm90_sm100
-from .gemm_dact import gemm_dact as gemm_dact_sm90_sm100
-from .gemm_symmetric import gemm_symmetric as gemm_symmetric_sm90_sm100
 # Dictionary mapping activation names to PyTorch functions
@@ -37,54 +41,100 @@ gated_to_pytorch_fn_map = {
 }
-def _get_default_device_capacity():
-    if not torch.cuda.is_available():
-        return (9, 0)
-    cap = get_device_capacity(torch.device("cuda"))
-    if cap[0] not in (9, 10):
-        return (9, 0)
-    return cap
-class _LazyDeviceCapacity:
-    """Defer torch.cuda.get_device_capability until first access so the
-    module can be imported in environments without a GPU (e.g. nix build)."""
-    _value = None
-    def __getitem__(self, idx):
-        if self._value is None:
-            self._value = _get_default_device_capacity()
-        return self._value[idx]
-default_device_capacity = _LazyDeviceCapacity()
 def default_config(device):
-    if get_device_capacity(device)[0] != 10:
-        return GemmConfig(tile_m=128, tile_n=192, cluster_m=2, cluster_n=1, pingpong=True)
     else:
-        return GemmConfig(tile_m=256, tile_n=256, cluster_m=2, cluster_n=1, pingpong=False)
 def prune_invalid_gemm_configs(configs, named_args: dict, **kwargs):
     kwargs = named_args | kwargs
     gather_A = kwargs.get("A_idx", None) is not None
     varlen_m = kwargs.get("cu_seqlens_m", None) is not None
     if varlen_m or gather_A:  # Doesn't support swap_ab
         configs = [conf for conf in configs if not conf.kwargs["config"].swap_ab]
     if gather_A:
-        if get_device_capacity(kwargs["A"].device)[0] == 9:
-            # tile_n == 208 causes register spills, as gather_A requires more registers for the producer
-            configs = [
-                conf
-                for conf in configs
-                if conf.kwargs["config"].cluster_n == 1 and conf.kwargs["config"].tile_n != 208
-            ]
     return configs
 @autotune(
-    configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
     key=["dynamic_scheduler"],
     prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
 )
@@ -104,9 +154,25 @@ def gemm_tuned(
     add_to_output: bool = False,
     dynamic_scheduler: bool = False,
     config: Optional[GemmConfig] = None,
 ) -> None:
     if config is None:
-        config = default_config(A.device)
     varlen_m = cu_seqlens_m is not None
     varlen_k = cu_seqlens_k is not None
     varlen = varlen_m or varlen_k
@@ -135,10 +201,31 @@ def gemm_tuned(
     else:
         out_shape = (batch_size, A.shape[-2], B.shape[-2])
     assert out.shape == out_shape, f"out shape mismatch: {out.shape} vs {out_shape}"
     tile_count_semaphore = (
-        torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
     )
-    gemm_sm90_sm100(
         A if not config.swap_ab else B,
         B if not config.swap_ab else A,
         out if not config.swap_ab else out.mT,
@@ -150,6 +237,7 @@ def gemm_tuned(
         config.cluster_n,
         config.pingpong,
         persistent=True,
         max_swizzle_size=config.max_swizzle_size,
         rowvec_bias=bias if not config.swap_ab else None,
         colvec_bias=bias if config.swap_ab else None,
@@ -160,11 +248,15 @@ def gemm_tuned(
         A_idx=A_idx,
         batch_idx_permute=batch_idx_permute,
         add_to_output=add_to_output,
     )
 @autotune(
-    configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
     key=["activation", "dynamic_scheduler"],
     prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
 )
@@ -177,7 +269,7 @@ def gemm_act_tuned(
     postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     bias: Optional[Tensor] = None,  # (N,) or (L, N)
-    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     dynamic_scheduler: bool = False,
@@ -205,10 +297,13 @@ def gemm_act_tuned(
         PostAct = postact_out
     if bias is not None and bias.ndim == 1:
         bias = bias.unsqueeze(0)  # (L, N)
     tile_count_semaphore = (
-        torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
     )
-    gemm_act_sm90_sm100(
         A if not config.swap_ab else B,
         B if not config.swap_ab else A,
         (D if not config.swap_ab else D.mT) if D is not None else None,
@@ -222,16 +317,18 @@ def gemm_act_tuned(
         config.cluster_n,
         config.pingpong,
         persistent=True,
         max_swizzle_size=config.max_swizzle_size,
         rowvec_bias=bias if not config.swap_ab else None,
         colvec_bias=bias if config.swap_ab else None,
         cu_seqlens_m=cu_seqlens_m,
         A_idx=A_idx,
     )
 @autotune(
-    configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
     key=["activation", "dynamic_scheduler"],
     prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
 )
@@ -242,7 +339,7 @@ def gemm_dact_tuned(
     PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     dx_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
-    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     dynamic_scheduler: bool = True,
@@ -268,10 +365,13 @@ def gemm_dact_tuned(
         PostAct = postact_out.unsqueeze(0)
     else:
         PostAct = postact_out
     tile_count_semaphore = (
-        torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
     )
-    gemm_dact_sm90_sm100(
         A if not config.swap_ab else B,
         B if not config.swap_ab else A,
         D if not config.swap_ab else D.mT,
@@ -285,9 +385,11 @@ def gemm_dact_tuned(
         config.cluster_n,
         config.pingpong,
         persistent=True,
         max_swizzle_size=config.max_swizzle_size,
         cu_seqlens_m=cu_seqlens_m,
         A_idx=A_idx,
     )
@@ -305,6 +407,9 @@ def gemm(
     batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
     dynamic_scheduler: bool = False,
     tuned: bool = True,
 ) -> Tensor:
     """GEMM with optional output tensor and tuning control."""
     if out is None:
@@ -325,6 +430,9 @@ def gemm(
         out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
     alpha_tensor = alpha if not isinstance(alpha, float) else None
     alpha = alpha if isinstance(alpha, float) else 1.0
     gemm_out(
         A,
         B,
@@ -338,6 +446,10 @@ def gemm(
         batch_idx_permute=batch_idx_permute,
         dynamic_scheduler=dynamic_scheduler,
         tuned=tuned,
     )
     return out
@@ -364,10 +476,15 @@ def gemm_out(
     batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
     dynamic_scheduler: bool = False,
     tuned: bool = True,
 ) -> None:
     """GEMM with pre-allocated output tensor."""
     fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
     alpha = alpha_tensor if alpha_tensor is not None else alpha
     fn(
         A,
         B,
@@ -380,6 +497,9 @@ def gemm_out(
         A_idx=A_idx,
         batch_idx_permute=batch_idx_permute,
         dynamic_scheduler=dynamic_scheduler,
     )
@@ -394,10 +514,18 @@ def gemm_ref(
     cu_seqlens_k: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
     out_dtype: Optional[torch.dtype] = None,
 ) -> Tensor:
     """Reference implementation for GEMM with pre-allocated output."""
     # The out_dtype argument requires torch >= 2.8
     out_dtype = A.dtype if out_dtype is None else out_dtype
     if cu_seqlens_m is None and cu_seqlens_k is None:
         fn = torch.bmm if A.ndim == 3 else torch.mm
         out = fn(A, B, out_dtype=out_dtype, out=out)
@@ -438,6 +566,9 @@ def gemm_ref(
             out *= alpha
         if bias is not None:
             out += bias
     return out
@@ -456,6 +587,7 @@ def gemm_add(
     batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
     dynamic_scheduler: bool = False,
     tuned: bool = True,
 ) -> Tensor:
     """GEMM with addition and optional output tensor."""
     if out is None:
@@ -480,23 +612,43 @@ def gemm_add(
     alpha = alpha if isinstance(alpha, float) else 1.0
     beta_tensor = beta if not isinstance(beta, float) else None
     beta = beta if isinstance(beta, float) else 1.0
-    gemm_add_out(
-        A,
-        B,
-        C if not add_to_output else None,
-        out,
-        alpha,
-        beta,
-        alpha_tensor,
-        beta_tensor,
-        cu_seqlens_m=cu_seqlens_m,
-        cu_seqlens_k=cu_seqlens_k,
-        A_idx=A_idx,
-        batch_idx_permute=batch_idx_permute,
-        add_to_output=add_to_output,
-        dynamic_scheduler=dynamic_scheduler,
-        tuned=tuned,
-    )
     return out
@@ -525,6 +677,7 @@ def gemm_add_out(
     add_to_output: bool = False,
     dynamic_scheduler: bool = False,
     tuned: bool = True,
 ) -> None:
     """GEMM with addition and pre-allocated output tensor."""
     fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
@@ -543,6 +696,7 @@ def gemm_add_out(
         batch_idx_permute=batch_idx_permute,
         add_to_output=add_to_output,
         dynamic_scheduler=dynamic_scheduler,
     )
@@ -559,8 +713,18 @@ def gemm_add_ref(
     cu_seqlens_k: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
     out_dtype: Optional[torch.dtype] = None,
 ) -> Tensor:
     """Reference implementation for GEMM with addition and pre-allocated output."""
     if cu_seqlens_m is None and cu_seqlens_k is None:
         if isinstance(alpha, float) and isinstance(beta, float):
             out = torch.addmm(C, A, B, out_dtype=out_dtype, alpha=alpha, beta=beta, out=out)
@@ -571,6 +735,8 @@ def gemm_add_ref(
             result = (alpha * (A @ B) + beta * C).to(out_dtype)
             if out is not None:
                 out.copy_(result)
         if bias is not None:
             bias = bias if A.ndim == 2 else bias.unsqueeze(1)
             out += bias
@@ -610,6 +776,8 @@ def gemm_add_ref(
             out[i].copy_(result)
         if bias is not None:
             out += bias
     return out
@@ -626,6 +794,7 @@ def gemm_add_inplace(
     batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
     dynamic_scheduler: bool = False,
     tuned: bool = True,
 ) -> None:
     """In-place GEMM with addition: out = alpha * A @ B + beta * out.
     Args:
@@ -657,6 +826,9 @@ def gemm_add_inplace(
         batch_idx_permute=batch_idx_permute,
         dynamic_scheduler=dynamic_scheduler,
         tuned=tuned,
     )
@@ -683,6 +855,7 @@ def gemm_add_inplace_op(
     batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
     dynamic_scheduler: bool = False,
     tuned: bool = True,
 ) -> None:
     fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
     alpha = alpha_tensor if alpha_tensor is not None else alpha
@@ -702,6 +875,7 @@ def gemm_add_inplace_op(
         batch_idx_permute=batch_idx_permute,
         add_to_output=add_to_output,
         dynamic_scheduler=dynamic_scheduler,
     )
@@ -710,7 +884,7 @@ def gemm_act(
     B: Tensor,  # (K, N) or (L, K, N)
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     bias: Optional[Tensor] = None,  # (N,) or (L, N)
-    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     preact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     postact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     out_dtype: Optional[torch.dtype] = None,
@@ -720,8 +894,10 @@ def gemm_act(
     store_preact: bool = True,
     dynamic_scheduler: bool = False,
     tuned: bool = True,
 ) -> Tuple[Optional[Tensor], Tensor]:
-    """GEMM with activation and optional output tensors."""
     out_dtype = A.dtype if out_dtype is None else out_dtype
     postact_dtype = A.dtype if postact_dtype is None else postact_dtype
     varlen_m = cu_seqlens_m is not None
@@ -733,26 +909,47 @@ def gemm_act(
         out_shape = (A.shape[0], B.shape[-1])
     else:
         out_shape = (A.shape[0], A.shape[-2], B.shape[-1])
     if preact_out is None and store_preact:
         preact_out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
     if postact_out is None:
-        postact_out = torch.empty(out_shape, dtype=postact_dtype, device=A.device)
-    gemm_act_out(
-        A,
-        B,
-        preact_out,
-        postact_out,
-        C,
-        bias,
-        activation,
-        cu_seqlens_m,
-        A_idx,
-        dynamic_scheduler,
-        tuned,
-    )
     return preact_out, postact_out
 @torch.library.custom_op(
     add_quack_op_namespace_prefix("gemm_act_out"),
     mutates_args=("preact_out", "postact_out"),
@@ -766,7 +963,7 @@ def gemm_act_out(
     postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     bias: Optional[Tensor] = None,  # (N,) or (L, N)
-    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     dynamic_scheduler: bool = False,
@@ -782,57 +979,111 @@ def gemm_act_ref(
     B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     bias: Optional[Tensor] = None,  # (N,) or (L, N)
-    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     out_dtype: Optional[torch.dtype] = None,
     postact_dtype: Optional[torch.dtype] = None,
     store_preact: bool = True,
 ) -> Tuple[Optional[Tensor], Tensor]:
     out_dtype = A.dtype if out_dtype is None else out_dtype
     postact_dtype = A.dtype if postact_dtype is None else postact_dtype
     if C is None:
-        out = gemm_ref(A, B, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
     else:
-        out = gemm_add_ref(A, B, C, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
-    postact = act_to_pytorch_fn_map[activation](out).to(postact_dtype)
-    return out.to(out_dtype) if store_preact else None, postact
 def gemm_dact(
     A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
     B: Tensor,  # (K, N) or (L, K, N)
-    PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
-    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
-    dx_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     postact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     out_dtype: Optional[torch.dtype] = None,
     postact_dtype: Optional[torch.dtype] = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     dynamic_scheduler: bool = True,
     tuned: bool = True,
-) -> Tuple[Tensor, Tensor]:
-    """GEMM with activation gradient and optional output tensors."""
     out_dtype = A.dtype if out_dtype is None else out_dtype
     postact_dtype = PreAct.dtype if postact_dtype is None else postact_dtype
     varlen_m = cu_seqlens_m is not None
-    # Determine output shape based on gather_A
     if varlen_m:
         total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
-        out_shape = (total_m, B.shape[-1])
     elif A.ndim == 2:
-        out_shape = (A.shape[0], B.shape[-1])
     else:
-        out_shape = (A.shape[0], A.shape[-2], B.shape[-1])
     if dx_out is None:
         dx_out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
     if postact_out is None:
-        postact_out = torch.empty(out_shape, dtype=postact_dtype, device=A.device)
-    gemm_dact_out(
-        A, B, PreAct, dx_out, postact_out, activation, cu_seqlens_m, A_idx, dynamic_scheduler, tuned
-    )
-    return dx_out, postact_out
 @torch.library.custom_op(
@@ -847,7 +1098,7 @@ def gemm_dact_out(
     PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     dx_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
-    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     dynamic_scheduler: bool = True,
@@ -859,115 +1110,46 @@ def gemm_dact_out(
 def gemm_dact_ref(
-    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
-    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
-    PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
-    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     out_dtype: Optional[torch.dtype] = None,
     postact_dtype: Optional[torch.dtype] = None,
 ) -> Tuple[Tensor, Tensor]:
-    """Reference implementation for GEMM with activation gradient."""
     out_dtype = A.dtype if out_dtype is None else out_dtype
     postact_dtype = PreAct.dtype if postact_dtype is None else postact_dtype
     dout = gemm_ref(A, B, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx).to(out_dtype)
-    postact = act_to_pytorch_fn_map[activation](PreAct)
-    # Compute gradient using autograd
-    if activation is None:
-        dx = dout
-    else:
-        PreAct_requires_grad = PreAct.requires_grad
-        PreAct.requires_grad_(True)
-        postact_for_grad = act_to_pytorch_fn_map[activation](PreAct)
-        dx = torch.autograd.grad(postact_for_grad, PreAct, dout, create_graph=False)[0]
-        PreAct.requires_grad_(PreAct_requires_grad)
-    return dx.to(out_dtype), postact.to(postact_dtype)
-def gemm_gated_ref(
-    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
-    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
-    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
-    bias: Optional[Tensor] = None,  # (N,) or (L, N)
-    activation: Literal["glu", "swiglu", "swiglu_oai", "reglu", "geglu"] = "swiglu",
-    cu_seqlens_m: Optional[Tensor] = None,
-    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
-    out_dtype: Optional[torch.dtype] = None,
-    postact_dtype: Optional[torch.dtype] = None,
-    store_preact: bool = True,
-) -> Tuple[Optional[Tensor], Tensor]:
-    """Reference implementation for GEMM with gated activation forward.
-    Args:
-        A: (M, K) - input tensor
-        B: (K, N) - weight tensor with gate and up projections
-        C: (M, N) - optional bias tensor
-        activation: Type of gated activation
-        out_dtype: Output dtype for preact
-        postact_dtype: Output dtype for postact
-        store_preact: Whether to return the pre-activation
-    Returns:
-        (preact, postact) where:
-        - preact: (M, N) pre-activation (if store_preact=True, else None)
-        - postact: (M, N // 2) post-activation output
-    """
-    out_dtype = A.dtype if out_dtype is None else out_dtype
-    postact_dtype = A.dtype if postact_dtype is None else postact_dtype
-    if C is None:
-        preact = gemm_ref(A, B, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
     else:
-        preact = gemm_add_ref(A, B, C, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
-    # Split preact into gate and up projections
-    gate = preact[..., ::2]  # (M, N//2)
-    up = preact[..., 1::2]  # (M, N//2)
-    postact = gated_to_pytorch_fn_map[activation](gate, up)
-    return preact.to(out_dtype) if store_preact else None, postact.to(postact_dtype)
-def gemm_dgated_ref(
-    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
-    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
-    PreAct: Tensor,  # (M, 2*N) or (L, M, 2*N) or (total_M, 2*N) if varlen_m
-    activation: Literal["glu", "swiglu", "swiglu_oai", "reglu", "geglu"],
-    cu_seqlens_m: Optional[Tensor] = None,
-    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
-    out_dtype: Optional[torch.dtype] = None,
-    postact_dtype: Optional[torch.dtype] = None,
-) -> Tuple[Tensor, Tensor]:
-    """Reference implementation for GEMM with gated activation gradient.
-    Args:
-        A: (M, K) - dout input tensor
-        B: (K, N) - weight tensor
-        PreAct: (M, 2*N) - pre-activation tensor with gate and up projections interleaved
-        activation: Type of gated activation
-        out_dtype: Output dtype for dx
-        postact_dtype: Output dtype for postact
-    Returns:
-        (dx, postact) where:
-        - dx: (M, 2*N) gradient w.r.t. PreAct
-        - postact: (M, N) post-activation output
-    """
-    out_dtype = A.dtype if out_dtype is None else out_dtype
-    postact_dtype = PreAct.dtype if postact_dtype is None else postact_dtype
-    dout = gemm_ref(A, B, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx).to(out_dtype)
-    # Split PreAct into gate and up projections
-    gate = PreAct[..., ::2]  # (M, N)
-    up = PreAct[..., 1::2]  # (M, N)
-    # Use autograd to compute gradients w.r.t. gate and up
-    gate_requires_grad, up_requires_grad = gate.requires_grad, up.requires_grad
-    gate.requires_grad_(True)
-    up.requires_grad_(True)
-    postact = gated_to_pytorch_fn_map[activation](gate, up)
-    dgate, dup = torch.autograd.grad(postact, [gate, up], dout, create_graph=False)
-    gate.requires_grad_(gate_requires_grad)
-    up.requires_grad_(up_requires_grad)
-    # Interleave gradients back
-    dx = torch.stack([dgate, dup], dim=-1).reshape(PreAct.shape)
-    return dx.to(out_dtype), postact.to(postact_dtype)
 @torch.library.custom_op(
@@ -1000,18 +1182,27 @@ def gemm_symmetric_out(
     tile_count_semaphore = (
         torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
     )
-    gemm_symmetric_sm90_sm100(
         A,
         B,
         out if out is not None else None,
         C if C is not None else None,
         tile_count_semaphore,
-        tile_M=128,
-        tile_N=256,
-        cluster_M=2,
         cluster_N=1,
-        pingpong=False,
         persistent=True,
         max_swizzle_size=8,
         alpha=alpha,
         beta=beta,
@@ -1047,6 +1238,933 @@ def gemm_symmetric(
     return out
 # TODO: this is not quite right, do we need to register gemm_add not gemm_add_out?
 # try:
 #     from torch._inductor.fx_passes.reinplace import InplaceableOp

 from functools import partial
 import torch
+from ._ops_compat import add_quack_op_namespace_prefix
 import torch.nn.functional as F
 from torch import Tensor
 from .gemm_config import GemmConfig, get_all_configs
 from .autotuner import autotune, AutotuneConfig
 from .cute_dsl_utils import get_device_capacity
+from .gemm import gemm as gemm_dispatch
+from .gemm_act import gemm_act as gemm_act_dispatch
+from .gemm_dact import gemm_dact as gemm_dact_dispatch
+from .gemm_symmetric import gemm_symmetric as gemm_symmetric_dispatch
+from .gemm_sq_reduce import gemm_sq_reduce as gemm_sq_reduce_dispatch
+from .gemm_norm_act import gemm_norm_act_fn as gemm_norm_act_dispatch
+from .rms_final_reduce import rms_final_reduce
+from .rounding import RoundingMode
 # Dictionary mapping activation names to PyTorch functions
 }
+# Activation names used to annotate `activation` parameters typed as
+# ActActivation elsewhere in this file.
+ActActivation = Literal[None, "relu", "relu_sq", "gelu_tanh_approx"]
+# Activation names for the gated variants.
+GatedActivation = Literal["swiglu", "swiglu_oai", "reglu", "geglu", "glu"]
+# Every member of ActActivation followed by every member of GatedActivation,
+# spelled out explicitly in that order.
+Activation = Literal[
+    None,
+    "relu",
+    "relu_sq",
+    "gelu_tanh_approx",
+    "swiglu",
+    "swiglu_oai",
+    "reglu",
+    "geglu",
+    "glu",
+]
+def _concat_interleave(t):
+    """Rewrite a ``[first; second]`` concatenation as ``[f0, s0, f1, s1, ...]``.
+
+    The axis that is rewritten is the one that is not unit-stride: ``-2`` when the
+    last axis has stride 1, otherwise ``-1``.
+    """
+    if t.stride(-1) == 1:
+        axis = -2
+    else:
+        axis = -1
+    half = t.shape[axis] // 2
+    # Expose the two halves as a size-2 axis sitting just before `axis`.
+    halves = t.unflatten(axis, (2, half))
+    # Make the half index vary fastest, then merge the pair back into one axis.
+    paired = halves.transpose(axis - 1, axis)
+    return paired.flatten(axis - 1, axis)
+def _concat_interleave_bias(t):
+    """Reorder a bias whose last dim is ``[gate; up]`` into ``[g0, u0, g1, u1, ...]``."""
+    n_pairs = t.shape[-1] // 2
+    # View the last dim as (2, n_pairs): index 0 is the first half, index 1 the second.
+    split = t.unflatten(-1, (2, n_pairs))
+    # Move the half index innermost and collapse back to a single dim.
+    return split.transpose(-2, -1).flatten(-2, -1)
 def default_config(device):
+    """Return the fallback GemmConfig for `device`.
+
+    The choice depends only on the first element of ``get_device_capacity(device)``
+    (presumably the major compute capability -- confirm against cute_dsl_utils).
+    """
+    cap = get_device_capacity(device)[0]
+    # Capabilities 10 and 11 share one config, which is tagged device_capacity=10.
+    if cap in [10, 11]:
+        return GemmConfig(
+            tile_m=256,
+            tile_n=256,
+            cluster_m=2,
+            cluster_n=1,
+            pingpong=False,
+            is_dynamic_persistent=True,
+            device_capacity=10,
+        )
+    elif cap == 12:
+        return GemmConfig(
+            tile_m=128,
+            tile_n=128,
+            cluster_m=1,
+            cluster_n=1,
+            pingpong=True,
+            is_dynamic_persistent=True,
+            device_capacity=12,
+        )
     else:
+        # Any other capability: device_capacity is not passed, so whatever default
+        # GemmConfig declares applies (its definition is not visible in this chunk).
+        return GemmConfig(
+            tile_m=128,
+            tile_n=192,
+            cluster_m=2,
+            cluster_n=1,
+            pingpong=True,
+            is_dynamic_persistent=False,
+        )
+def nvmmh_config(A, B, device_capacity):
+    """Ask nvMatmulHeuristics for a config for a pure GEMM (no varlen/gather/epilogue).
+
+    Any failure, including the heuristic module not being importable, yields None;
+    the caller is expected to fall back to default_config in that case.
+    """
+    picked = None
+    try:
+        from .nvmmh_heuristic import nvmmh_default_config as _pick
+
+        picked = _pick(A, B, device_capacity)
+    except Exception:
+        # Best effort only: swallow the error and report "no suggestion".
+        pass
+    return picked
 def prune_invalid_gemm_configs(configs, named_args: dict, **kwargs):
+    """Drop autotune candidates that are not valid for the current call.
+
+    Args:
+        configs: candidates; each must expose ``.kwargs["config"]`` with the
+            attributes read below (device_capacity, swap_ab, cluster_n, tile_n,
+            is_dynamic_persistent, use_tma_gather).
+        named_args: call arguments by name; merged with ``kwargs`` (``kwargs`` wins
+            on duplicate keys). The merged mapping must contain "A".
+        **kwargs: additional call arguments.
+
+    Returns:
+        A list of the surviving candidates, in their original order.
+    """
     kwargs = named_args | kwargs
+    device_capacity = get_device_capacity(kwargs["A"].device)[0]
+    # Keep only configs tagged for the capability of the device that A lives on.
+    configs = [conf for conf in configs if conf.kwargs["config"].device_capacity == device_capacity]
     gather_A = kwargs.get("A_idx", None) is not None
     varlen_m = kwargs.get("cu_seqlens_m", None) is not None
     if varlen_m or gather_A:  # Doesn't support swap_ab
         configs = [conf for conf in configs if not conf.kwargs["config"].swap_ab]
     if gather_A:
+        # With gather_A only cluster_n == 1 configs are kept.
+        configs = [conf for conf in configs if conf.kwargs["config"].cluster_n == 1]
+        if device_capacity == 9:
+            # On capability 9, gather_A additionally excludes tile_n == 208 and
+            # dynamic-persistent configs.
+            configs = [conf for conf in configs if conf.kwargs["config"].tile_n != 208]
+            configs = [conf for conf in configs if not conf.kwargs["config"].is_dynamic_persistent]
+    # use_tma_gather only valid when gather_A is active on SM100/SM110
+    if not gather_A or device_capacity not in [10, 11]:
+        configs = [conf for conf in configs if not conf.kwargs["config"].use_tma_gather]
     return configs
 @autotune(
+    configs=[AutotuneConfig(config=c) for c in get_all_configs()],
     key=["dynamic_scheduler"],
     prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
 )
     add_to_output: bool = False,
     dynamic_scheduler: bool = False,
     config: Optional[GemmConfig] = None,
+    rounding_mode: int = RoundingMode.RN,
+    sr_seed: int | Tensor = 0,
+    concat_layout: tuple | None = None,  # tensors whose non-contiguous dim is concat [gate; up]
 ) -> None:
     if config is None:
+        # Use nvMMH heuristic for pure GEMM (no varlen, no gather, no epilogue)
+        is_pure_gemm = (
+            cu_seqlens_m is None
+            and cu_seqlens_k is None
+            and A_idx is None
+            and C is None
+            and bias is None
+            and not add_to_output
+        )
+        if is_pure_gemm:
+            device_capacity = get_device_capacity(A.device)[0]
+            config = nvmmh_config(A, B, device_capacity)
+        if config is None:
+            config = default_config(A.device)
     varlen_m = cu_seqlens_m is not None
     varlen_k = cu_seqlens_k is not None
     varlen = varlen_m or varlen_k
     else:
         out_shape = (batch_size, A.shape[-2], B.shape[-2])
     assert out.shape == out_shape, f"out shape mismatch: {out.shape} vs {out_shape}"
+    dynamic_scheduler = dynamic_scheduler or config.is_dynamic_persistent
     tile_count_semaphore = (
+        torch.zeros(1, dtype=torch.int32, device=A.device)
+        if dynamic_scheduler and get_device_capacity(A.device)[0] == 9
+        else None
+    )
+    # Handle bias concat layout: transform "bias" key to kernel-level key or permute data.
+    if concat_layout and "bias" in concat_layout:
+        if bias is not None and bias.dtype.itemsize >= 4:
+            # fp32: kernel permutes via layout; replace "bias" with the kernel-level key
+            concat_layout = tuple("mRowVecBroadcast" if k == "bias" else k for k in concat_layout)
+        else:
+            # No bias or sub-fp32: strip "bias" from concat_layout; permute data if needed
+            concat_layout = tuple(k for k in concat_layout if k != "bias")
+            if bias is not None:
+                bias = _concat_interleave_bias(bias)
+    # When swap_ab, A↔B (out/C stay, but .mT flips their strides so the kernel
+    # auto-detects the correct non-contiguous dim).
+    _swap_map = {"A": "B", "B": "A", "out": "out", "C": "C", "mRowVecBroadcast": "mColVecBroadcast"}
+    swapped_concat = (
+        tuple(_swap_map.get(k, k) for k in concat_layout)
+        if config.swap_ab and concat_layout
+        else concat_layout
     )
+    gemm_dispatch(
         A if not config.swap_ab else B,
         B if not config.swap_ab else A,
         out if not config.swap_ab else out.mT,
         config.cluster_n,
         config.pingpong,
         persistent=True,
+        is_dynamic_persistent=dynamic_scheduler,
         max_swizzle_size=config.max_swizzle_size,
         rowvec_bias=bias if not config.swap_ab else None,
         colvec_bias=bias if config.swap_ab else None,
         A_idx=A_idx,
         batch_idx_permute=batch_idx_permute,
         add_to_output=add_to_output,
+        rounding_mode=rounding_mode,
+        sr_seed=sr_seed,
+        use_tma_gather=config.use_tma_gather,
+        concat_layout=swapped_concat,
     )
 @autotune(
+    configs=[AutotuneConfig(config=c) for c in get_all_configs()],
     key=["activation", "dynamic_scheduler"],
     prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
 )
     postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     bias: Optional[Tensor] = None,  # (N,) or (L, N)
+    activation: ActActivation = None,
     cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     dynamic_scheduler: bool = False,
         PostAct = postact_out
     if bias is not None and bias.ndim == 1:
         bias = bias.unsqueeze(0)  # (L, N)
+    dynamic_scheduler = dynamic_scheduler or config.is_dynamic_persistent
     tile_count_semaphore = (
+        torch.zeros(1, dtype=torch.int32, device=A.device)
+        if dynamic_scheduler and get_device_capacity(A.device)[0] == 9
+        else None
     )
+    gemm_act_dispatch(
         A if not config.swap_ab else B,
         B if not config.swap_ab else A,
         (D if not config.swap_ab else D.mT) if D is not None else None,
         config.cluster_n,
         config.pingpong,
         persistent=True,
+        is_dynamic_persistent=dynamic_scheduler,
         max_swizzle_size=config.max_swizzle_size,
         rowvec_bias=bias if not config.swap_ab else None,
         colvec_bias=bias if config.swap_ab else None,
         cu_seqlens_m=cu_seqlens_m,
         A_idx=A_idx,
+        use_tma_gather=config.use_tma_gather,
     )
 @autotune(
+    configs=[AutotuneConfig(config=c) for c in get_all_configs()],
     key=["activation", "dynamic_scheduler"],
     prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
 )
     PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     dx_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    activation: ActActivation = None,
     cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     dynamic_scheduler: bool = True,
         PostAct = postact_out.unsqueeze(0)
     else:
         PostAct = postact_out
+    dynamic_scheduler = dynamic_scheduler or config.is_dynamic_persistent
     tile_count_semaphore = (
+        torch.zeros(1, dtype=torch.int32, device=A.device)
+        if dynamic_scheduler and get_device_capacity(A.device)[0] == 9
+        else None
     )
+    gemm_dact_dispatch(
         A if not config.swap_ab else B,
         B if not config.swap_ab else A,
         D if not config.swap_ab else D.mT,
         config.cluster_n,
         config.pingpong,
         persistent=True,
+        is_dynamic_persistent=dynamic_scheduler,
         max_swizzle_size=config.max_swizzle_size,
         cu_seqlens_m=cu_seqlens_m,
         A_idx=A_idx,
+        use_tma_gather=config.use_tma_gather,
     )
     batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
     dynamic_scheduler: bool = False,
     tuned: bool = True,
+    rounding_mode: int = RoundingMode.RN,
+    sr_seed: int | Tensor = 0,
+    concat_layout: tuple | None = None,  # tensors whose non-contiguous dim is concat [gate; up]
 ) -> Tensor:
     """GEMM with optional output tensor and tuning control."""
     if out is None:
         out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
     alpha_tensor = alpha if not isinstance(alpha, float) else None
     alpha = alpha if isinstance(alpha, float) else 1.0
+    sr_seed_tensor = sr_seed if isinstance(sr_seed, Tensor) else None
+    sr_seed_int = sr_seed if isinstance(sr_seed, int) else 0
+    concat_str = ",".join(concat_layout) if concat_layout else None
     gemm_out(
         A,
         B,
         batch_idx_permute=batch_idx_permute,
         dynamic_scheduler=dynamic_scheduler,
         tuned=tuned,
+        rounding_mode=rounding_mode,
+        sr_seed=sr_seed_int,
+        sr_seed_tensor=sr_seed_tensor,
+        concat_layout=concat_str,
     )
     return out
     batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
     dynamic_scheduler: bool = False,
     tuned: bool = True,
+    rounding_mode: int = RoundingMode.RN,
+    sr_seed: int = 0,
+    sr_seed_tensor: Optional[Tensor] = None,
+    concat_layout: Optional[str] = None,
 ) -> None:
     """GEMM with pre-allocated output tensor."""
     fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
     alpha = alpha_tensor if alpha_tensor is not None else alpha
+    sr_seed_arg = sr_seed_tensor if sr_seed_tensor is not None else sr_seed
     fn(
         A,
         B,
         A_idx=A_idx,
         batch_idx_permute=batch_idx_permute,
         dynamic_scheduler=dynamic_scheduler,
+        rounding_mode=rounding_mode,
+        sr_seed=sr_seed_arg,
+        concat_layout=tuple(concat_layout.split(",")) if concat_layout else None,
     )
     cu_seqlens_k: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
     out_dtype: Optional[torch.dtype] = None,
+    concat_layout: tuple | None = None,  # tensors whose non-contiguous dim is concat [gate; up]
 ) -> Tensor:
     """Reference implementation for GEMM with pre-allocated output."""
     # The out_dtype argument requires torch >= 2.8
     out_dtype = A.dtype if out_dtype is None else out_dtype
+    if concat_layout:
+        if "A" in concat_layout:
+            A = _concat_interleave(A)
+        if "B" in concat_layout:
+            B = _concat_interleave(B)
+        if "bias" in concat_layout and bias is not None:
+            bias = _concat_interleave_bias(bias)
     if cu_seqlens_m is None and cu_seqlens_k is None:
         fn = torch.bmm if A.ndim == 3 else torch.mm
         out = fn(A, B, out_dtype=out_dtype, out=out)
             out *= alpha
         if bias is not None:
             out += bias
+    if concat_layout and "out" in concat_layout:
+        # out is n-major (ref allocates contiguous). Split rows (non-contiguous dim).
+        out = torch.cat([out[..., ::2, :], out[..., 1::2, :]], dim=-2)
     return out
     batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
     dynamic_scheduler: bool = False,
     tuned: bool = True,
+    concat_layout: tuple | None = None,  # tensors whose non-contiguous dim is concat [gate; up]
 ) -> Tensor:
     """GEMM with addition and optional output tensor."""
     if out is None:
     alpha = alpha if isinstance(alpha, float) else 1.0
     beta_tensor = beta if not isinstance(beta, float) else None
     beta = beta if isinstance(beta, float) else 1.0
+    alpha_arg = alpha_tensor if alpha_tensor is not None else alpha
+    beta_arg = beta_tensor if beta_tensor is not None else beta
+    concat_str = ",".join(concat_layout) if concat_layout else None
+    if add_to_output:
+        gemm_add_inplace(
+            A,
+            B,
+            out,
+            alpha=alpha_arg,
+            beta=beta_arg,
+            cu_seqlens_m=cu_seqlens_m,
+            cu_seqlens_k=cu_seqlens_k,
+            A_idx=A_idx,
+            batch_idx_permute=batch_idx_permute,
+            dynamic_scheduler=dynamic_scheduler,
+            tuned=tuned,
+            concat_layout=concat_str,
+        )
+    else:
+        gemm_add_out(
+            A,
+            B,
+            C,
+            out,
+            alpha,
+            beta,
+            alpha_tensor,
+            beta_tensor,
+            cu_seqlens_m=cu_seqlens_m,
+            cu_seqlens_k=cu_seqlens_k,
+            A_idx=A_idx,
+            batch_idx_permute=batch_idx_permute,
+            add_to_output=add_to_output,
+            dynamic_scheduler=dynamic_scheduler,
+            tuned=tuned,
+            concat_layout=concat_str,
+        )
     return out
     add_to_output: bool = False,
     dynamic_scheduler: bool = False,
     tuned: bool = True,
+    concat_layout: Optional[str] = None,
 ) -> None:
     """GEMM with addition and pre-allocated output tensor."""
     fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
         batch_idx_permute=batch_idx_permute,
         add_to_output=add_to_output,
         dynamic_scheduler=dynamic_scheduler,
+        concat_layout=tuple(concat_layout.split(",")) if concat_layout else None,
     )
     cu_seqlens_k: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
     out_dtype: Optional[torch.dtype] = None,
+    concat_layout: tuple | None = None,  # tensors whose non-contiguous dim is concat [gate; up]
 ) -> Tensor:
     """Reference implementation for GEMM with addition and pre-allocated output."""
+    if concat_layout:
+        if "A" in concat_layout:
+            A = _concat_interleave(A)
+        if "B" in concat_layout:
+            B = _concat_interleave(B)
+        if "bias" in concat_layout and bias is not None:
+            bias = _concat_interleave_bias(bias)
+        if "C" in concat_layout:
+            C = _concat_interleave(C)
     if cu_seqlens_m is None and cu_seqlens_k is None:
         if isinstance(alpha, float) and isinstance(beta, float):
             out = torch.addmm(C, A, B, out_dtype=out_dtype, alpha=alpha, beta=beta, out=out)
             result = (alpha * (A @ B) + beta * C).to(out_dtype)
             if out is not None:
                 out.copy_(result)
+            else:
+                out = result
         if bias is not None:
             bias = bias if A.ndim == 2 else bias.unsqueeze(1)
             out += bias
             out[i].copy_(result)
         if bias is not None:
             out += bias
+    if concat_layout and "out" in concat_layout:
+        out = torch.cat([out[..., ::2, :], out[..., 1::2, :]], dim=-2)
     return out
     batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
     dynamic_scheduler: bool = False,
     tuned: bool = True,
+    concat_layout: tuple | None = None,  # tensors whose non-contiguous dim is concat [gate; up]
 ) -> None:
     """In-place GEMM with addition: out = alpha * A @ B + beta * out.
     Args:
         batch_idx_permute=batch_idx_permute,
         dynamic_scheduler=dynamic_scheduler,
         tuned=tuned,
+        concat_layout=",".join(concat_layout)
+        if isinstance(concat_layout, tuple)
+        else concat_layout,
     )
     batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
     dynamic_scheduler: bool = False,
     tuned: bool = True,
+    concat_layout: Optional[str] = None,
 ) -> None:
     fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
     alpha = alpha_tensor if alpha_tensor is not None else alpha
         batch_idx_permute=batch_idx_permute,
         add_to_output=add_to_output,
         dynamic_scheduler=dynamic_scheduler,
+        concat_layout=tuple(concat_layout.split(",")) if concat_layout else None,
     )
     B: Tensor,  # (K, N) or (L, K, N)
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     bias: Optional[Tensor] = None,  # (N,) or (L, N)
+    activation: Activation = None,
     preact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     postact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     out_dtype: Optional[torch.dtype] = None,
     store_preact: bool = True,
     dynamic_scheduler: bool = False,
     tuned: bool = True,
+    concat_layout: tuple | None = None,  # tensors whose non-contiguous dim is concat [gate; up]
 ) -> Tuple[Optional[Tensor], Tensor]:
+    """GEMM with activation (or gated activation) and optional output tensors."""
+    is_gated = activation in gated_to_pytorch_fn_map
     out_dtype = A.dtype if out_dtype is None else out_dtype
     postact_dtype = A.dtype if postact_dtype is None else postact_dtype
     varlen_m = cu_seqlens_m is not None
         out_shape = (A.shape[0], B.shape[-1])
     else:
         out_shape = (A.shape[0], A.shape[-2], B.shape[-1])
+    postact_shape = (*out_shape[:-1], out_shape[-1] // 2) if is_gated else out_shape
     if preact_out is None and store_preact:
         preact_out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
     if postact_out is None:
+        postact_out = torch.empty(postact_shape, dtype=postact_dtype, device=A.device)
+    concat_str = ",".join(concat_layout) if concat_layout else None
+    if is_gated:
+        gemm_gated_out(
+            A,
+            B,
+            preact_out,
+            postact_out,
+            C,
+            bias,
+            activation,
+            cu_seqlens_m,
+            A_idx,
+            dynamic_scheduler,
+            tuned,
+            concat_layout=concat_str,
+        )
+    else:
+        gemm_act_out(
+            A,
+            B,
+            preact_out,
+            postact_out,
+            C,
+            bias,
+            activation,
+            cu_seqlens_m,
+            A_idx,
+            dynamic_scheduler,
+            tuned,
+        )
     return preact_out, postact_out
+gemm_gated = gemm_act
 @torch.library.custom_op(
     add_quack_op_namespace_prefix("gemm_act_out"),
     mutates_args=("preact_out", "postact_out"),
     postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     bias: Optional[Tensor] = None,  # (N,) or (L, N)
+    activation: ActActivation = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     dynamic_scheduler: bool = False,
     B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
     C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     bias: Optional[Tensor] = None,  # (N,) or (L, N)
+    activation: Activation = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     out_dtype: Optional[torch.dtype] = None,
     postact_dtype: Optional[torch.dtype] = None,
     store_preact: bool = True,
+    concat_layout: tuple | None = None,  # tensors whose non-contiguous dim is concat [gate; up]
 ) -> Tuple[Optional[Tensor], Tensor]:
+    is_gated = activation in gated_to_pytorch_fn_map
     out_dtype = A.dtype if out_dtype is None else out_dtype
     postact_dtype = A.dtype if postact_dtype is None else postact_dtype
     if C is None:
+        preact = gemm_ref(
+            A, B, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx, concat_layout=concat_layout
+        )
     else:
+        preact = gemm_add_ref(
+            A, B, C, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx, concat_layout=concat_layout
+        )
+    if is_gated:
+        # With concat=("B",), gemm_ref already interleaves the output columns,
+        # so we always use the interleaved gate/up split.
+        gate = preact[..., ::2]
+        up = preact[..., 1::2]
+        postact = gated_to_pytorch_fn_map[activation](gate, up).to(postact_dtype)
+    else:
+        postact = act_to_pytorch_fn_map[activation](preact).to(postact_dtype)
+    return preact.to(out_dtype) if store_preact else None, postact
+gemm_gated_ref = gemm_act_ref
 def gemm_dact(  # Allocate dx/postact outputs when missing, then run the activation-gradient GEMM (plain or gated)
     A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
     B: Tensor,  # (K, N) or (L, K, N)
+    PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m; or (M, 2*N) for dgated
+    activation: Activation = None,
+    dx_out: Optional[
+        Tensor
+    ] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m; double for gated
     postact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     out_dtype: Optional[torch.dtype] = None,
     postact_dtype: Optional[torch.dtype] = None,
+    colvec_scale: Optional[Tensor] = None,  # (M,) or (L, M) or (total_M,) if varlen_m (dgated only)
+    colvec_reduce: bool = False,  # dgated only
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     dynamic_scheduler: bool = True,
     tuned: bool = True,
+):
+    """GEMM with activation (or gated activation) gradient and optional output tensors."""
+    is_dgated = activation in gated_to_pytorch_fn_map  # gated activation names select the dgated path below
     out_dtype = A.dtype if out_dtype is None else out_dtype
     postact_dtype = PreAct.dtype if postact_dtype is None else postact_dtype
     varlen_m = cu_seqlens_m is not None
     if varlen_m:
         total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]  # with A_idx the row count comes from the index tensor
+        out_shape = (total_m, B.shape[-1] * 2) if is_dgated else (total_m, B.shape[-1])  # gated dx is twice as wide as B's last dim
     elif A.ndim == 2:
+        out_shape = (A.shape[0], B.shape[-1] * 2) if is_dgated else (A.shape[0], B.shape[-1])
     else:
+        n = B.shape[-1] * 2 if is_dgated else B.shape[-1]
+        out_shape = (A.shape[0], A.shape[-2], n)
+    postact_shape = (*out_shape[:-1], out_shape[-1] // 2) if is_dgated else out_shape  # gated postact has half the last dim of dx
     if dx_out is None:
         dx_out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
     if postact_out is None:
+        postact_out = torch.empty(postact_shape, dtype=postact_dtype, device=A.device)
+    if is_dgated:
+        colvec_reduce_final = gemm_dgated_out(  # fills dx_out / postact_out in place (they are the op's mutated args)
+            A,
+            B,
+            PreAct,
+            dx_out,
+            postact_out,
+            colvec_scale,
+            activation,
+            colvec_reduce,
+            cu_seqlens_m,
+            A_idx,
+            dynamic_scheduler,
+            tuned,
+        )
+        if not colvec_reduce:  # return arity depends on colvec_reduce: 2-tuple without it, 3-tuple with it
+            return dx_out, postact_out
+        else:
+            return dx_out, postact_out, colvec_reduce_final
+    else:
+        gemm_dact_out(  # non-gated path: colvec_scale / colvec_reduce are not forwarded
+            A,
+            B,
+            PreAct,
+            dx_out,
+            postact_out,
+            activation,
+            cu_seqlens_m,
+            A_idx,
+            dynamic_scheduler,
+            tuned,
+        )
+        return dx_out, postact_out
+gemm_dgated = gemm_dact  # alias: same function object; gated activations are dispatched inside gemm_dact
 @torch.library.custom_op(
     PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     dx_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
     postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    activation: ActActivation = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     dynamic_scheduler: bool = True,
 def gemm_dact_ref(  # Reference path: dout comes from gemm_ref, then autograd backpropagates it through the activation
+    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A
+    B: Tensor,  # (K, N) or (L, K, N)
+    PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N); or (M, 2*N) for dgated
+    activation: Activation = None,
     cu_seqlens_m: Optional[Tensor] = None,
     A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
     out_dtype: Optional[torch.dtype] = None,
     postact_dtype: Optional[torch.dtype] = None,
 ) -> Tuple[Tensor, Tensor]:
+    """Reference implementation for GEMM with activation (or gated activation) gradient."""
+    is_dgated = activation in gated_to_pytorch_fn_map
     out_dtype = A.dtype if out_dtype is None else out_dtype
     postact_dtype = PreAct.dtype if postact_dtype is None else postact_dtype
     dout = gemm_ref(A, B, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx).to(out_dtype)
+    if is_dgated:
+        gate = PreAct[..., ::2]  # interleaved layout: even last-dim entries are gate
+        up = PreAct[..., 1::2]  # ... and odd last-dim entries are up
+        gate_requires_grad, up_requires_grad = gate.requires_grad, up.requires_grad  # saved so the flags can be restored below
+        gate.requires_grad_(True)
+        up.requires_grad_(True)
+        postact = gated_to_pytorch_fn_map[activation](gate, up)
+        dgate, dup = torch.autograd.grad(postact, [gate, up], dout, create_graph=False)  # dout is used as grad_outputs
+        gate.requires_grad_(gate_requires_grad)
+        up.requires_grad_(up_requires_grad)
+        dx = torch.stack([dgate, dup], dim=-1).reshape(PreAct.shape)  # re-interleave dgate/dup into PreAct's layout
+        return dx.to(out_dtype), postact.to(postact_dtype)
     else:
+        postact = act_to_pytorch_fn_map[activation](PreAct)
+        if activation is None:
+            dx = dout  # no activation: dout is returned as dx without running autograd
+        else:
+            PreAct_requires_grad = PreAct.requires_grad  # temporarily enable grad on the caller's tensor, restored below
+            PreAct.requires_grad_(True)
+            postact_for_grad = act_to_pytorch_fn_map[activation](PreAct)  # recomputed so the graph includes PreAct with grad enabled
+            dx = torch.autograd.grad(postact_for_grad, PreAct, dout, create_graph=False)[0]
+            PreAct.requires_grad_(PreAct_requires_grad)
+        return dx.to(out_dtype), postact.to(postact_dtype)
+gemm_dgated_ref = gemm_dact_ref  # alias: same function object; gated activations are dispatched inside gemm_dact_ref
 @torch.library.custom_op(
     tile_count_semaphore = (
         torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
     )
+    sm = get_device_capacity(A.device)[0]
+    # We want square tile per cluster
+    tile_m, tile_n, cluster_m, pingpong = {
+        9: (128, 256, 2, False),
+        10: (256, 256, 2, False),
+        11: (256, 256, 2, False),
+        12: (128, 128, 1, True),
+    }[sm]
+    gemm_symmetric_dispatch(
         A,
         B,
         out if out is not None else None,
         C if C is not None else None,
         tile_count_semaphore,
+        tile_M=tile_m,
+        tile_N=tile_n,
+        cluster_M=cluster_m,
         cluster_N=1,
+        pingpong=pingpong,
         persistent=True,
+        is_dynamic_persistent=sm >= 10,
         max_swizzle_size=8,
         alpha=alpha,
         beta=beta,
     return out
+@autotune(  # Autotuned launcher for the gated-activation GEMM: normalizes operand ranks, then calls gemm_act_dispatch
+    configs=[AutotuneConfig(config=c) for c in get_all_configs("gated")],
+    key=["activation", "dynamic_scheduler"],
+    prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
+)
+def gemm_gated_tuned(
+    # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
+    A: Tensor,
+    B: Tensor,  # (K, N) or (L, K, N)
+    # (M, N) or (L, M, N) or (total_M, N) if varlen_m - None if not storing preact
+    preact_out: Optional[Tensor],
+    postact_out: Tensor,  # (M, N//2) or (L, M, N//2) or (total_M, N//2) if varlen_m
+    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
+    activation: GatedActivation = "swiglu",
+    cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
+    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
+    dynamic_scheduler: bool = False,
+    config: Optional[GemmConfig] = None,
+    concat_layout: tuple | None = None,  # tensors whose non-contiguous dim is concat [gate; up]
+) -> None:
+    if config is None:  # config=None (e.g. the untuned path) falls back to the device default
+        config = default_config(A.device)
+    varlen_m = cu_seqlens_m is not None
+    if varlen_m:
+        assert not config.swap_ab, "Variable-length sequences not supported with swap_ab"
+    if A.ndim == 2 and not varlen_m:  # varlen operands stay 2-D; only dense 2-D inputs get a leading batch dim
+        A = A.unsqueeze(0)  # (1, M, K)
+    B = B.mT  # (N, K) or (L, N, K)
+    if B.ndim == 2:
+        B = B.unsqueeze(0)  # (1, N, K)
+    if C is not None and C.ndim == 2 and not varlen_m:
+        C = C.unsqueeze(0)  # (1, M, N)
+    if preact_out is not None and preact_out.ndim == 2 and not varlen_m:
+        D = preact_out.unsqueeze(0)  # view of the caller's tensor, so writes land in preact_out
+    else:
+        D = preact_out
+    if postact_out.ndim == 2 and not varlen_m:
+        PostAct = postact_out.unsqueeze(0)
+    else:
+        PostAct = postact_out
+    if bias is not None and bias.ndim == 1:
+        bias = bias.unsqueeze(0)  # (1, N)
+    if concat_layout and "bias" in concat_layout:
+        if bias is not None and bias.dtype.itemsize >= 4:  # bias dtypes of 4+ bytes: keep the entry but rename it for the dispatcher
+            bias_key = "mColVecBroadcast" if config.swap_ab else "mRowVecBroadcast"  # key follows where the bias is passed below (colvec under swap_ab)
+            concat_layout = tuple(bias_key if k == "bias" else k for k in concat_layout)
+        else:
+            concat_layout = tuple(k for k in concat_layout if k != "bias")  # otherwise drop the entry ...
+            if bias is not None:
+                bias = _concat_interleave_bias(bias)  # ... and convert the bias tensor itself up front
+    dynamic_scheduler = dynamic_scheduler or config.is_dynamic_persistent  # the config can force the dynamic scheduler on
+    tile_count_semaphore = (  # only allocated when get_device_capacity(...)[0] == 9 (presumably the SM major version — confirm)
+        torch.zeros(1, dtype=torch.int32, device=A.device)
+        if dynamic_scheduler and get_device_capacity(A.device)[0] == 9
+        else None
+    )
+    gemm_act_dispatch(
+        A if not config.swap_ab else B,  # swap_ab: A and B trade places and the outputs / C are passed transposed
+        B if not config.swap_ab else A,
+        (D if not config.swap_ab else D.mT) if D is not None else None,
+        (C if not config.swap_ab else C.mT) if C is not None else None,
+        PostAct if not config.swap_ab else PostAct.mT,
+        tile_count_semaphore,
+        activation,
+        config.tile_m,
+        config.tile_n,
+        config.cluster_m,
+        config.cluster_n,
+        config.pingpong,
+        persistent=True,
+        is_dynamic_persistent=dynamic_scheduler,
+        max_swizzle_size=config.max_swizzle_size,
+        rowvec_bias=bias if not config.swap_ab else None,  # exactly one of rowvec_bias / colvec_bias receives the bias
+        colvec_bias=bias if config.swap_ab else None,
+        cu_seqlens_m=cu_seqlens_m,
+        A_idx=A_idx,
+        use_tma_gather=config.use_tma_gather,
+        concat_layout=concat_layout,
+    )
+def prune_invalid_gemm_dgated_configs(configs, named_args: dict, **kwargs):  # early_config_prune hook used by gemm_dgated_tuned's autotuner
+    kwargs = named_args | kwargs  # merged view of all call arguments; on key clashes the explicit kwargs win
+    # if there's colvec_scale or colvec_reduce, don't swap_AB
+    if kwargs.get("colvec_scale", None) is not None or kwargs.get("colvec_reduce", False):
+        configs = [conf for conf in configs if not conf.kwargs["config"].swap_ab]
+    return prune_invalid_gemm_configs(configs, named_args, **kwargs)  # then apply the generic GEMM pruning to what is left
+@autotune(  # Autotuned launcher for the gated-activation gradient GEMM; returns the reduced column vector or None
+    configs=[AutotuneConfig(config=c) for c in get_all_configs("dgated")],
+    key=["activation", "colvec_reduce", "dynamic_scheduler"],
+    prune_configs_by={"early_config_prune": prune_invalid_gemm_dgated_configs},
+)
+def gemm_dgated_tuned(
+    # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
+    A: Tensor,
+    B: Tensor,  # (K, N) or (L, K, N)
+    PreAct: Tensor,  # (M, 2*N) or (L, M, 2*N) or (total_M, 2*N) if varlen_m
+    dx_out: Tensor,  # (M, 2*N) or (L, M, 2*N) or (total_M, 2*N) if varlen_m
+    postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    colvec_scale: Optional[Tensor] = None,  # (M,) or (L, M) or (total_M,) if varlen_m
+    activation: GatedActivation = "swiglu",
+    # whether to do colvec reduction, returning (M,) or (L, M) or (total_M) if varlen_m
+    colvec_reduce: bool = False,
+    cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
+    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
+    dynamic_scheduler: bool = True,
+    config: Optional[GemmConfig] = None,
+) -> Optional[Tensor]:
+    if config is None:  # config=None (e.g. the untuned path) falls back to the device default
+        config = default_config(A.device)
+    varlen_m = cu_seqlens_m is not None
+    if varlen_m:
+        assert not config.swap_ab, "Variable-length sequences not supported with swap_ab"
+    og_ndim_2 = A.ndim == 2 and not varlen_m  # remembered so the reduced vector can drop the added batch dim at the end
+    if A.ndim == 2 and not varlen_m:
+        A = A.unsqueeze(0)  # (1, M, K)
+    B = B.mT  # (N, K) or (L, N, K)
+    if B.ndim == 2:
+        B = B.unsqueeze(0)  # (1, N, K)
+    if PreAct.ndim == 2 and not varlen_m:
+        PreAct = PreAct.unsqueeze(0)  # (1, M, 2*N)
+    if dx_out.ndim == 2 and not varlen_m:
+        D = dx_out.unsqueeze(0)  # view of the caller's tensor, so writes land in dx_out
+    else:
+        D = dx_out
+    if postact_out.ndim == 2 and not varlen_m:
+        PostAct = postact_out.unsqueeze(0)
+    else:
+        PostAct = postact_out
+    if colvec_scale is not None and colvec_scale.ndim == 1 and not varlen_m:
+        colvec_scale = colvec_scale.unsqueeze(0)  # (1, M)
+    if colvec_scale is not None:
+        assert not config.swap_ab, "colvec_scale not supported with swap_ab"
+    if colvec_reduce:
+        tile_n = config.tile_n
+        shape_n = (B.shape[-2] + tile_n - 1) // tile_n  # ceil-div: B was transposed above, so B.shape[-2] is N
+        if varlen_m:
+            total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
+            colvec_shape = (total_m, shape_n)
+        else:
+            colvec_shape = (A.shape[0], A.shape[-2], shape_n)
+        colvec_reduce_partial = torch.empty(colvec_shape, dtype=torch.float32, device=A.device)  # one slot per N tile, summed after the dispatch
+    else:
+        colvec_reduce_partial = None
+    dynamic_scheduler = dynamic_scheduler or config.is_dynamic_persistent  # the config can force the dynamic scheduler on
+    tile_count_semaphore = (  # only allocated when get_device_capacity(...)[0] == 9 (presumably the SM major version — confirm)
+        torch.zeros(1, dtype=torch.int32, device=A.device)
+        if dynamic_scheduler and get_device_capacity(A.device)[0] == 9
+        else None
+    )
+    gemm_dact_dispatch(
+        A if not config.swap_ab else B,  # swap_ab: A and B trade places and D / PreAct / PostAct are passed transposed
+        B if not config.swap_ab else A,
+        D if not config.swap_ab else D.mT,
+        PreAct if not config.swap_ab else PreAct.mT,
+        PostAct if not config.swap_ab else PostAct.mT,
+        tile_count_semaphore,
+        activation,
+        config.tile_m,
+        config.tile_n,
+        config.cluster_m,
+        config.cluster_n,
+        config.pingpong,
+        persistent=True,
+        is_dynamic_persistent=dynamic_scheduler,
+        max_swizzle_size=config.max_swizzle_size,
+        colvec_scale=colvec_scale,
+        colvec_reduce=colvec_reduce_partial,
+        cu_seqlens_m=cu_seqlens_m,
+        A_idx=A_idx,
+        use_tma_gather=config.use_tma_gather,
+    )
+    if colvec_reduce:
+        colvec_reduce_final = colvec_reduce_partial.sum(dim=-1)  # collapse the per-tile partials into one value per row
+        if og_ndim_2:
+            colvec_reduce_final = colvec_reduce_final.squeeze(0)  # undo the batch dim added for 2-D inputs
+    else:
+        colvec_reduce_final = None
+    return colvec_reduce_final
+@torch.library.custom_op(  # Custom-op wrapper around gemm_gated_tuned; writes into preact_out / postact_out and returns nothing
+    add_quack_op_namespace_prefix("gemm_gated_out"),
+    mutates_args=("preact_out", "postact_out"),
+    device_types="cuda",
+    schema="(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? bias=None, str activation='swiglu', Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=False, bool tuned=True, str? concat_layout=None) -> ()",
+)
+def gemm_gated_out(
+    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
+    B: Tensor,  # (K, N) or (L, K, N)
+    preact_out: Optional[Tensor],  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    postact_out: Tensor,  # (M, N//2) or (L, M, N//2) or (total_M, N//2) if varlen_m
+    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    bias: Optional[Tensor] = None,  # (N,) or (L, N)
+    activation: GatedActivation = "swiglu",
+    cu_seqlens_m: Optional[Tensor] = None,
+    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
+    dynamic_scheduler: bool = False,
+    tuned: bool = True,
+    concat_layout: Optional[str] = None,  # comma-separated names; the op schema declares this as str?, not a tuple
+) -> None:
+    """GEMM with gated activation and pre-allocated output tensors."""
+    fn = gemm_gated_tuned if tuned else partial(gemm_gated_tuned.fn, config=None)  # tuned=False calls the undecorated fn, which uses the default config
+    fn(
+        A,
+        B,
+        preact_out,
+        postact_out,
+        C,
+        bias,
+        activation,
+        cu_seqlens_m,
+        A_idx,
+        dynamic_scheduler,
+        concat_layout=tuple(concat_layout.split(",")) if concat_layout else None,  # back to the tuple form gemm_gated_tuned expects
+    )
+@torch.library.custom_op(  # Custom-op wrapper around gemm_dgated_tuned; writes into dx_out / postact_out and always returns a Tensor
+    add_quack_op_namespace_prefix("gemm_dgated_out"),
+    mutates_args=("dx_out", "postact_out"),
+    device_types="cuda",
+    schema="(Tensor A, Tensor B, Tensor PreAct, Tensor(a!) dx_out, Tensor(b!) postact_out, Tensor? colvec_scale=None, str activation='swiglu', bool colvec_reduce=False, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=True, bool tuned=True) -> Tensor",
+)
+def gemm_dgated_out(
+    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
+    B: Tensor,  # (K, N) or (L, K, N)
+    PreAct: Tensor,  # (M, 2*N) or (L, M, 2*N) or (total_M, 2*N) if varlen_m
+    dx_out: Tensor,  # (M, 2*N) or (L, M, 2*N) or (total_M, 2*N) if varlen_m
+    postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
+    colvec_scale: Optional[Tensor] = None,  # (M,) or (L, M) or (total_M,) if varlen_m
+    activation: GatedActivation = "swiglu",
+    colvec_reduce: bool = False,
+    cu_seqlens_m: Optional[Tensor] = None,
+    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
+    dynamic_scheduler: bool = True,
+    tuned: bool = True,
+) -> Tensor:
+    """GEMM with gated activation gradient and pre-allocated output tensors."""
+    fn = gemm_dgated_tuned if tuned else partial(gemm_dgated_tuned.fn, config=None)  # tuned=False calls the undecorated fn, which uses the default config
+    result = fn(  # the reduced column vector when colvec_reduce is set, otherwise None
+        A,
+        B,
+        PreAct,
+        dx_out,
+        postact_out,
+        colvec_scale,
+        activation,
+        colvec_reduce,
+        cu_seqlens_m,
+        A_idx,
+        dynamic_scheduler,
+    )
+    if result is None:  # Have to return a tensor, not None, to make torch compile happy
+        return torch.empty(0, device=A.device, dtype=torch.float32)  # zero-element placeholder; callers ignore it when colvec_reduce is False
+    return result
+@torch.library.register_fake(add_quack_op_namespace_prefix("gemm_dgated_out"))  # Fake impl: optional precompile, then an empty tensor with the op's output shape
+def gemm_dgated_out_fake(
+    A: Tensor,
+    B: Tensor,
+    PreAct: Tensor,
+    dx_out: Tensor,
+    postact_out: Tensor,
+    colvec_scale: Optional[Tensor] = None,
+    activation: str = "swiglu",
+    colvec_reduce: bool = False,
+    cu_seqlens_m: Optional[Tensor] = None,
+    A_idx: Optional[Tensor] = None,
+    dynamic_scheduler: bool = True,
+    tuned: bool = True,
+) -> Tensor:
+    _precompile_default_config(  # 'tuned' is deliberately not forwarded: gemm_dgated_tuned has no such parameter
+        gemm_dgated_tuned,
+        A,
+        B,
+        PreAct,
+        dx_out,
+        postact_out,
+        colvec_scale=colvec_scale,
+        activation=activation,
+        colvec_reduce=colvec_reduce,
+        cu_seqlens_m=cu_seqlens_m,
+        A_idx=A_idx,
+        dynamic_scheduler=dynamic_scheduler,
+    )
+    if not colvec_reduce:
+        return torch.empty(0, dtype=torch.float32, device=A.device)  # same zero-element placeholder the real op returns
+    else:
+        if cu_seqlens_m is not None:
+            total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
+            out_shape = (total_m,)
+        elif A.ndim == 2:
+            out_shape = (A.shape[0],)
+        else:
+            out_shape = (A.shape[0], A.shape[-2])
+        return torch.empty(out_shape, dtype=torch.float32, device=A.device)  # one float32 value per row of A
+def _precompile_default_config(autotuned_fn, *args, **kwargs):  # args/kwargs are forwarded unchanged to autotuned_fn.fn
+    """Compile the default config in COMPILE_ONLY mode.
+    Checks COMPILE_ONLY flag and SymInt guard, then calls the unwrapped function with
+    config=None (which selects the default config), triggering compilation (exports .o)
+    without benchmarking or kernel launch.
+    Tests use tuned=False which also selects the default config, so this is sufficient.
+    """
+    from .cache_utils import COMPILE_ONLY
+    A = args[0] if args else kwargs.get("A")  # A may arrive positionally or by keyword, depending on the caller
+    if not COMPILE_ONLY or A is None or isinstance(A.shape[0], torch.SymInt):  # no-op outside COMPILE_ONLY, without A, or with a symbolic leading dim
+        return
+    try:
+        autotuned_fn.fn(*args, config=None, **kwargs)
+    except Exception:  # NOTE(review): every failure is swallowed without logging — confirm this best-effort behavior is intended
+        pass
+@gemm_add_inplace_op.register_fake  # Fake impl for the in-place add GEMM op: only precompiles, returns None
+def gemm_add_inplace_fake(
+    A: Tensor,
+    B: Tensor,
+    out: Tensor,
+    alpha: float = 1.0,
+    beta: float = 1.0,
+    alpha_tensor: Optional[Tensor] = None,
+    beta_tensor: Optional[Tensor] = None,
+    cu_seqlens_m: Optional[Tensor] = None,
+    cu_seqlens_k: Optional[Tensor] = None,
+    A_idx: Optional[Tensor] = None,
+    batch_idx_permute: Optional[Tensor] = None,
+    dynamic_scheduler: bool = False,
+    tuned: bool = True,
+) -> None:
+    alpha_val = alpha_tensor if alpha_tensor is not None else alpha  # a tensor scale, when given, takes precedence over the float
+    beta_val = beta_tensor if beta_tensor is not None else beta
+    add_to_output = isinstance(beta_val, float) and beta_val == 1.0 and cu_seqlens_m is None  # only for a float beta of exactly 1.0 without varlen_m
+    _precompile_default_config(
+        gemm_tuned,
+        A,
+        B,
+        out,
+        out if not add_to_output else None,  # 4th positional arg of gemm_tuned (presumably C — confirm); None when add_to_output is set
+        alpha=alpha_val,
+        beta=beta_val,
+        cu_seqlens_m=cu_seqlens_m,
+        cu_seqlens_k=cu_seqlens_k,
+        A_idx=A_idx,
+        batch_idx_permute=batch_idx_permute,
+        add_to_output=add_to_output,
+        dynamic_scheduler=dynamic_scheduler,
+    )
+def _register_precompile_fake(custom_op, autotuned_fn, rewrite=None):  # rewrite, if given, mutates the bound-argument dict in place
+    """Register a fake that precompiles the default config in COMPILE_ONLY mode.
+    For custom_ops that forward args to their autotuned fn. Binds all args by name,
+    strips 'tuned', applies optional rewrite(kw), then calls _precompile_default_config.
+    PyTorch normalizes all custom_op args to positional, so we use inspect.signature
+    to recover keyword names.
+    """
+    import inspect
+    sig = inspect.signature(custom_op._init_fn)  # _init_fn is a private attribute (presumably the undecorated function — confirm across torch versions)
+    @custom_op.register_fake
+    def _fake(*args, **kwargs):  # has no return statement, so this only fits ops that return nothing
+        bound = sig.bind(*args, **kwargs)
+        bound.apply_defaults()  # fill in omitted parameters so every name is present in kw
+        kw = dict(bound.arguments)
+        kw.pop("tuned", None)
+        if rewrite is not None:
+            rewrite(kw)
+        _precompile_default_config(autotuned_fn, **kw)  # everything goes by keyword, so A is looked up via kwargs.get("A")
+def _rewrite_merge_alpha(kwargs):  # mutates kwargs in place and returns None
+    """Merge alpha_tensor into alpha for gemm_tuned; add C=None."""
+    at = kwargs.pop("alpha_tensor", None)  # always removed, even when it is None
+    if at is not None:
+        kwargs["alpha"] = at  # the tensor overrides the float alpha
+    kwargs.setdefault("C", None)  # only added when the caller's arguments have no C
+def _rewrite_merge_alpha_beta(kwargs):  # mutates kwargs in place and returns None
+    """Merge alpha_tensor/beta_tensor into alpha/beta for gemm_tuned."""
+    at = kwargs.pop("alpha_tensor", None)  # always removed, even when it is None
+    if at is not None:
+        kwargs["alpha"] = at  # the tensor overrides the float alpha
+    bt = kwargs.pop("beta_tensor", None)
+    if bt is not None:
+        kwargs["beta"] = bt  # the tensor overrides the float beta
+_register_precompile_fake(gemm_out, gemm_tuned, rewrite=_rewrite_merge_alpha)  # import-time registration of precompile-only fakes for these ops
+_register_precompile_fake(gemm_add_out, gemm_tuned, rewrite=_rewrite_merge_alpha_beta)  # the rewrite hooks fold alpha_tensor / beta_tensor into alpha / beta
+_register_precompile_fake(gemm_act_out, gemm_act_tuned)  # no rewrite: bound arguments are forwarded as-is (minus 'tuned')
+_register_precompile_fake(gemm_dact_out, gemm_dact_tuned)
+_register_precompile_fake(gemm_gated_out, gemm_gated_tuned)  # NOTE(review): concat_layout reaches gemm_gated_tuned as the raw str here, not the split tuple — confirm
+@gemm_symmetric_out.register_fake  # Fake impl for gemm_symmetric_out: in COMPILE_ONLY mode, calls gemm_symmetric_dispatch with one fixed config
+def gemm_symmetric_out_fake(
+    A: Tensor,
+    B: Tensor,
+    out: Tensor,
+    C: Optional[Tensor] = None,
+    dynamic_scheduler: bool = False,
+    alpha: float = 1.0,
+    beta: float = 1.0,
+) -> None:
+    from .cache_utils import COMPILE_ONLY
+    if not COMPILE_ONLY or isinstance(A.shape[0], torch.SymInt):  # no-op outside COMPILE_ONLY or with a symbolic leading dim
+        return
+    # gemm_symmetric is not autotuned, compile the single fixed config directly
+    sm = get_device_capacity(A.device)[0]
+    tile_m = 256 if sm == 10 else 128  # NOTE(review): the per-SM table elsewhere in this file uses tile_m=256 for sm 11 as well — confirm
+    tile_n = 128 if sm == 12 else 256
+    cluster_m = 1 if sm == 12 else 2
+    try:
+        gemm_symmetric_dispatch(
+            A.unsqueeze(0) if A.ndim == 2 else A,  # 2-D operands get a leading batch dim of 1
+            (B.mT.unsqueeze(0) if B.ndim == 2 else B.mT),
+            out.unsqueeze(0) if out.ndim == 2 else out,
+            (C.unsqueeze(0) if C.ndim == 2 else C) if C is not None else None,
+            torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None,
+            tile_M=tile_m,
+            tile_N=tile_n,
+            cluster_M=cluster_m,
+            cluster_N=1,
+            pingpong=False,  # NOTE(review): that table sets pingpong=True for sm 12 and also passes is_dynamic_persistent=sm >= 10 — confirm
+            persistent=True,
+            max_swizzle_size=8,
+            alpha=alpha,
+            beta=beta,
+        )
+    except Exception:  # NOTE(review): every failure is swallowed without logging — confirm this best-effort behavior is intended
+        pass
+## ── gemm_rms ────────────────────────────────────────────────────────────────
+def _prune_gemm_rms_configs(configs, named_args: dict, **kwargs):  # early_config_prune hook used by _gemm_rms_tuned's autotuner
+    """ColVecReduce requires no swap_ab."""
+    configs = [conf for conf in configs if not conf.kwargs["config"].swap_ab]  # unconditional: swap_ab configs are always dropped
+    return prune_invalid_gemm_configs(configs, named_args | kwargs)  # NOTE(review): passes one merged dict, unlike prune_invalid_gemm_dgated_configs (named_args, **kwargs) — confirm
+@autotune(  # Autotuned launcher for GEMM + sum-of-squares reduction; writes the GEMM result into out and returns rstd
+    configs=[AutotuneConfig(config=c) for c in get_all_configs()],
+    key=["dynamic_scheduler"],
+    prune_configs_by={"early_config_prune": _prune_gemm_rms_configs},
+)
+def _gemm_rms_tuned(
+    A: Tensor,  # (M, K) or (L, M, K)
+    B: Tensor,  # (K, N) or (L, K, N)
+    out: Tensor,  # (M, N) or (L, M, N)
+    C: Optional[Tensor] = None,  # (M, N) or (L, M, N)
+    norm_weight: Optional[Tensor] = None,  # (N,) or (L, N)
+    eps: float = 1e-6,
+    dynamic_scheduler: bool = False,
+    config: Optional[GemmConfig] = None,
+) -> Tensor:
+    if config is None:  # config=None (e.g. the untuned path) falls back to the device default
+        config = default_config(A.device)
+    og_ndim_2 = A.ndim == 2  # remembered so rstd can drop the added batch dim at the end
+    N = B.shape[-1]  # read before B is transposed below
+    if A.ndim == 2:
+        A = A.unsqueeze(0)
+    B = B.mT
+    if B.ndim == 2:
+        B = B.unsqueeze(0)
+    if out.ndim == 2:
+        out = out.unsqueeze(0)  # view of the caller's tensor, so writes land in the original out
+    if C is not None and C.ndim == 2:
+        C = C.unsqueeze(0)
+    if norm_weight is not None and norm_weight.ndim == 1:
+        norm_weight = norm_weight.unsqueeze(0)  # (1, N)
+    # Allocate partial reduction buffer
+    tile_n = config.tile_n
+    n_tiles = (N + tile_n - 1) // tile_n  # ceil-div: number of N tiles
+    colvec_reduce = torch.empty(  # one float32 slot per (batch, row, N tile)
+        (A.shape[0], A.shape[1], n_tiles), dtype=torch.float32, device=A.device
+    )
+    dynamic_scheduler = dynamic_scheduler or config.is_dynamic_persistent  # the config can force the dynamic scheduler on
+    tile_count_semaphore = (  # only allocated when get_device_capacity(...)[0] == 9 (presumably the SM major version — confirm)
+        torch.zeros(1, dtype=torch.int32, device=A.device)
+        if dynamic_scheduler and get_device_capacity(A.device)[0] == 9
+        else None
+    )
+    gemm_sq_reduce_dispatch(
+        A,
+        B,
+        out,
+        C,
+        colvec_reduce,
+        tile_count_semaphore,
+        config.tile_m,
+        config.tile_n,
+        config.cluster_m,
+        config.cluster_n,
+        config.pingpong,
+        persistent=True,
+        is_dynamic_persistent=dynamic_scheduler,
+        max_swizzle_size=config.max_swizzle_size,
+        rowvec=norm_weight,
+    )
+    # Final reduction: rstd = rsqrt(sum(partials) / N + eps)
+    scale = 1.0 / N
+    flat_reduce = colvec_reduce.reshape(-1, n_tiles)  # flatten batch and row dims: one row of partials per output row
+    rstd_flat = rms_final_reduce(flat_reduce, scale=scale, eps=eps)
+    rstd = rstd_flat.reshape(A.shape[:-1])  # A is 3-D here, so this is (L, M)
+    if og_ndim_2:
+        rstd = rstd.squeeze(0)  # back to (M,) for inputs that were 2-D
+    return rstd
+@torch.library.custom_op(
+    add_quack_op_namespace_prefix("gemm_rms_out"),
+    mutates_args=("out",),
+    device_types="cuda",
+    schema="(Tensor A, Tensor B, Tensor(a!) out, Tensor? C=None, Tensor? norm_weight=None, float eps=1e-6, bool dynamic_scheduler=False, bool tuned=True) -> Tensor",
+)
+def _gemm_rms_out(
+    A: Tensor,
+    B: Tensor,
+    out: Tensor,
+    C: Optional[Tensor] = None,
+    norm_weight: Optional[Tensor] = None,
+    eps: float = 1e-6,
+    dynamic_scheduler: bool = False,
+    tuned: bool = True,
+) -> Tensor:
+    """GEMM + RMS + optional rowvec scaling.
+    D_raw = A @ B (+ C), rstd = rsqrt(mean(D_raw^2) + eps), D_out = D_raw * norm_weight.
+    """
+    fn = _gemm_rms_tuned if tuned else partial(_gemm_rms_tuned.fn, config=None)
+    return fn(
+        A,
+        B,
+        out,
+        C=C,
+        norm_weight=norm_weight,
+        eps=eps,
+        dynamic_scheduler=dynamic_scheduler,
+    )
+@torch.library.register_fake(add_quack_op_namespace_prefix("gemm_rms_out"))
+def _gemm_rms_out_fake(
+    A: Tensor,
+    B: Tensor,
+    out: Tensor,
+    C: Optional[Tensor] = None,
+    norm_weight: Optional[Tensor] = None,
+    eps: float = 1e-6,
+    dynamic_scheduler: bool = False,
+    tuned: bool = True,
+) -> Tensor:
+    _precompile_default_config(
+        _gemm_rms_tuned,
+        A,
+        B,
+        out,
+        C=C,
+        norm_weight=norm_weight,
+        eps=eps,
+        dynamic_scheduler=dynamic_scheduler,
+    )
+    rstd_shape = A.shape[:-1]
+    return torch.empty(rstd_shape, dtype=torch.float32, device=A.device)
+def gemm_rms_ref(
+    A: Tensor,
+    B: Tensor,
+    C: Optional[Tensor] = None,
+    norm_weight: Optional[Tensor] = None,
+    eps: float = 1e-6,
+) -> Tuple[Tensor, Tensor]:
+    """Reference: D_raw = A @ B (+ C), rstd = rsqrt(mean(D_raw^2) + eps), D = D_raw * norm_weight."""
+    fn = torch.bmm if A.ndim == 3 else torch.mm
+    D = fn(A, B)
+    if C is not None:
+        D = D + C
+    rstd = torch.rsqrt(D.float().square().mean(dim=-1) + eps)
+    if norm_weight is not None:
+        D = D * norm_weight
+    return D, rstd
+def gemm_rms(
+    A: Tensor,  # (M, K) or (L, M, K)
+    B: Tensor,  # (K, N) or (L, K, N)
+    C: Optional[Tensor] = None,  # (M, N) or (L, M, N)
+    norm_weight: Optional[Tensor] = None,  # (N,) or (L, N)
+    out: Optional[Tensor] = None,  # (M, N) or (L, M, N)
+    out_dtype: Optional[torch.dtype] = None,
+    eps: float = 1e-6,
+    dynamic_scheduler: bool = False,
+    tuned: bool = True,
+) -> Tuple[Tensor, Tensor]:
+    """GEMM + RMS statistics + optional rowvec scaling.
+    D_raw = A @ B (+ C), rstd = rsqrt(mean(D_raw^2) + eps), D_out = D_raw * norm_weight.
+    Returns (D_out, rstd).
+    """
+    out_dtype = A.dtype if out_dtype is None else out_dtype
+    N = B.shape[-1]
+    if out is None:
+        out_shape = (*A.shape[:-1], N)
+        out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
+    rstd = _gemm_rms_out(
+        A,
+        B,
+        out,
+        C=C,
+        norm_weight=norm_weight,
+        eps=eps,
+        dynamic_scheduler=dynamic_scheduler,
+        tuned=tuned,
+    )
+    return out, rstd
+## ── gemm_norm_act ─────────────────────────────────────────────────────────────
+@autotune(
+    configs=[AutotuneConfig(config=c) for c in get_all_configs()],
+    key=["activation", "dynamic_scheduler"],
+    prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
+)
+def gemm_norm_act_tuned(
+    A: Tensor,  # (M, K) or (L, M, K)
+    B: Tensor,  # (K, N) or (L, K, N)
+    preact_out: Optional[Tensor],  # (M, N) or (L, M, N) — None if not storing preact
+    postact_out: Tensor,  # (M, N) or (L, M, N)
+    C: Optional[Tensor] = None,  # (M, N) or (L, M, N)
+    rstd: Optional[Tensor] = None,  # (M,) or (L, M)
+    activation: ActActivation = None,
+    dynamic_scheduler: bool = False,
+    config: Optional[GemmConfig] = None,
+) -> None:
+    if config is None:
+        config = default_config(A.device)
+    if A.ndim == 2:
+        A = A.unsqueeze(0)
+    B = B.mT
+    if B.ndim == 2:
+        B = B.unsqueeze(0)
+    if C is not None and C.ndim == 2:
+        C = C.unsqueeze(0)
+    if preact_out is not None and preact_out.ndim == 2:
+        D = preact_out.unsqueeze(0)
+    else:
+        D = preact_out
+    if postact_out.ndim == 2:
+        PostAct = postact_out.unsqueeze(0)
+    else:
+        PostAct = postact_out
+    if rstd is not None and rstd.ndim == 1:
+        rstd = rstd.unsqueeze(0)  # (L, M)
+    dynamic_scheduler = dynamic_scheduler or config.is_dynamic_persistent
+    tile_count_semaphore = (
+        torch.zeros(1, dtype=torch.int32, device=A.device)
+        if dynamic_scheduler and get_device_capacity(A.device)[0] == 9
+        else None
+    )
+    gemm_norm_act_dispatch(
+        A if not config.swap_ab else B,
+        B if not config.swap_ab else A,
+        (D if not config.swap_ab else D.mT) if D is not None else None,
+        (C if not config.swap_ab else C.mT) if C is not None else None,
+        PostAct if not config.swap_ab else PostAct.mT,
+        tile_count_semaphore,
+        activation,
+        config.tile_m,
+        config.tile_n,
+        config.cluster_m,
+        config.cluster_n,
+        config.pingpong,
+        persistent=True,
+        is_dynamic_persistent=dynamic_scheduler,
+        max_swizzle_size=config.max_swizzle_size,
+        colvec=rstd if not config.swap_ab else None,
+        rowvec=rstd if config.swap_ab else None,
+    )
+@autotune(
+    configs=[AutotuneConfig(config=c) for c in get_all_configs("gated")],
+    key=["activation", "dynamic_scheduler"],
+    prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
+)
+def gemm_norm_gated_tuned(
+    A: Tensor,  # (M, K) or (L, M, K)
+    B: Tensor,  # (K, N) or (L, K, N)
+    preact_out: Optional[Tensor],  # (M, N) or (L, M, N)
+    postact_out: Tensor,  # (M, N//2) or (L, M, N//2)
+    C: Optional[Tensor] = None,  # (M, N) or (L, M, N)
+    rstd: Optional[Tensor] = None,  # (M,) or (L, M)
+    activation: GatedActivation = "swiglu",
+    dynamic_scheduler: bool = False,
+    config: Optional[GemmConfig] = None,
+) -> None:
+    if config is None:
+        config = default_config(A.device)
+    if A.ndim == 2:
+        A = A.unsqueeze(0)
+    B = B.mT
+    if B.ndim == 2:
+        B = B.unsqueeze(0)
+    if C is not None and C.ndim == 2:
+        C = C.unsqueeze(0)
+    if preact_out is not None and preact_out.ndim == 2:
+        D = preact_out.unsqueeze(0)
+    else:
+        D = preact_out
+    if postact_out.ndim == 2:
+        PostAct = postact_out.unsqueeze(0)
+    else:
+        PostAct = postact_out
+    if rstd is not None and rstd.ndim == 1:
+        rstd = rstd.unsqueeze(0)  # (L, M)
+    dynamic_scheduler = dynamic_scheduler or config.is_dynamic_persistent
+    tile_count_semaphore = (
+        torch.zeros(1, dtype=torch.int32, device=A.device)
+        if dynamic_scheduler and get_device_capacity(A.device)[0] == 9
+        else None
+    )
+    gemm_norm_act_dispatch(
+        A if not config.swap_ab else B,
+        B if not config.swap_ab else A,
+        (D if not config.swap_ab else D.mT) if D is not None else None,
+        (C if not config.swap_ab else C.mT) if C is not None else None,
+        PostAct if not config.swap_ab else PostAct.mT,
+        tile_count_semaphore,
+        activation,
+        config.tile_m,
+        config.tile_n,
+        config.cluster_m,
+        config.cluster_n,
+        config.pingpong,
+        persistent=True,
+        is_dynamic_persistent=dynamic_scheduler,
+        max_swizzle_size=config.max_swizzle_size,
+        colvec=rstd if not config.swap_ab else None,
+        rowvec=rstd if config.swap_ab else None,
+    )
+@torch.library.custom_op(
+    add_quack_op_namespace_prefix("gemm_norm_act_out"),
+    mutates_args=("preact_out", "postact_out"),
+    device_types="cuda",
+    schema="(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? rstd=None, str? activation=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
+)
+def gemm_norm_act_out(
+    A: Tensor,
+    B: Tensor,
+    preact_out: Optional[Tensor],
+    postact_out: Tensor,
+    C: Optional[Tensor] = None,
+    rstd: Optional[Tensor] = None,
+    activation: ActActivation = None,
+    dynamic_scheduler: bool = False,
+    tuned: bool = True,
+) -> None:
+    fn = gemm_norm_act_tuned if tuned else partial(gemm_norm_act_tuned.fn, config=None)
+    fn(A, B, preact_out, postact_out, C, rstd, activation, dynamic_scheduler)
+@torch.library.register_fake(add_quack_op_namespace_prefix("gemm_norm_act_out"))
+def _gemm_norm_act_out_fake(
+    A,
+    B,
+    preact_out,
+    postact_out,
+    C=None,
+    rstd=None,
+    activation=None,
+    dynamic_scheduler=False,
+    tuned=True,
+) -> None:
+    pass
+@torch.library.custom_op(
+    add_quack_op_namespace_prefix("gemm_norm_gated_out"),
+    mutates_args=("preact_out", "postact_out"),
+    device_types="cuda",
+    schema="(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? rstd=None, str activation='swiglu', bool dynamic_scheduler=False, bool tuned=True) -> ()",
+)
+def gemm_norm_gated_out(
+    A: Tensor,
+    B: Tensor,
+    preact_out: Optional[Tensor],
+    postact_out: Tensor,
+    C: Optional[Tensor] = None,
+    rstd: Optional[Tensor] = None,
+    activation: GatedActivation = "swiglu",
+    dynamic_scheduler: bool = False,
+    tuned: bool = True,
+) -> None:
+    fn = gemm_norm_gated_tuned if tuned else partial(gemm_norm_gated_tuned.fn, config=None)
+    fn(A, B, preact_out, postact_out, C, rstd, activation, dynamic_scheduler)
+@torch.library.register_fake(add_quack_op_namespace_prefix("gemm_norm_gated_out"))
+def _gemm_norm_gated_out_fake(
+    A,
+    B,
+    preact_out,
+    postact_out,
+    C=None,
+    rstd=None,
+    activation="swiglu",
+    dynamic_scheduler=False,
+    tuned=True,
+) -> None:
+    pass
+def gemm_norm_act(
+    A: Tensor,  # (M, K) or (L, M, K)
+    B: Tensor,  # (K, N) or (L, K, N)
+    rstd: Optional[Tensor] = None,  # (M,) or (L, M)
+    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) — residual
+    activation: Activation = None,
+    preact_out: Optional[Tensor] = None,
+    postact_out: Optional[Tensor] = None,
+    out_dtype: Optional[torch.dtype] = None,
+    postact_dtype: Optional[torch.dtype] = None,
+    store_preact: bool = False,
+    dynamic_scheduler: bool = False,
+    tuned: bool = True,
+) -> Tuple[Optional[Tensor], Tensor]:
+    """GEMM + normalize + activation: PostAct = act((A @ B + C) * rstd).
+    rstd is a column vector (M,).
+    Returns (preact, postact) where preact is the normalized value before activation.
+    """
+    is_gated = activation in gated_to_pytorch_fn_map
+    out_dtype = A.dtype if out_dtype is None else out_dtype
+    postact_dtype = A.dtype if postact_dtype is None else postact_dtype
+    if A.ndim == 2:
+        out_shape = (A.shape[0], B.shape[-1])
+    else:
+        out_shape = (A.shape[0], A.shape[-2], B.shape[-1])
+    postact_shape = (*out_shape[:-1], out_shape[-1] // 2) if is_gated else out_shape
+    if preact_out is None and store_preact:
+        preact_out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
+    if postact_out is None:
+        postact_out = torch.empty(postact_shape, dtype=postact_dtype, device=A.device)
+    if is_gated:
+        gemm_norm_gated_out(
+            A,
+            B,
+            preact_out,
+            postact_out,
+            C,
+            rstd,
+            activation,
+            dynamic_scheduler,
+            tuned,
+        )
+    else:
+        gemm_norm_act_out(
+            A,
+            B,
+            preact_out,
+            postact_out,
+            C,
+            rstd,
+            activation,
+            dynamic_scheduler,
+            tuned,
+        )
+    return preact_out, postact_out
+gemm_norm_gated = gemm_norm_act
+def gemm_norm_act_ref(
+    A: Tensor,
+    B: Tensor,
+    rstd: Optional[Tensor] = None,  # (M,) or (L, M)
+    C: Optional[Tensor] = None,
+    activation: Activation = None,
+    store_preact: bool = False,
+    out_dtype: Optional[torch.dtype] = None,
+    postact_dtype: Optional[torch.dtype] = None,
+) -> Tuple[Optional[Tensor], Tensor]:
+    """Reference: preact = (A @ B + C) * rstd, postact = act(preact)."""
+    is_gated = activation in gated_to_pytorch_fn_map
+    out_dtype = A.dtype if out_dtype is None else out_dtype
+    postact_dtype = A.dtype if postact_dtype is None else postact_dtype
+    fn = torch.bmm if A.ndim == 3 else torch.mm
+    D = fn(A, B)
+    if C is not None:
+        D = D + C
+    if rstd is not None:
+        D = D * rstd.unsqueeze(-1)
+    preact = D.to(out_dtype) if store_preact else None
+    _act_map = {**act_to_pytorch_fn_map, "silu": F.silu}
+    if is_gated:
+        gate = D[..., ::2]
+        up = D[..., 1::2]
+        postact = gated_to_pytorch_fn_map[activation](gate, up).to(postact_dtype)
+    else:
+        postact = _act_map[activation](D).to(postact_dtype)
+    return preact, postact
+gemm_norm_gated_ref = gemm_norm_act_ref
 # TODO: this is not quite right, do we need to register gemm_add not gemm_add_out?
 # try:
 #     from torch._inductor.fx_passes.reinplace import InplaceableOp

build/torch-cuda/quack/gemm_norm_act.py ADDED Viewed

	@@ -0,0 +1,400 @@

+# Copyright (c) 2025-2026, Tri Dao.
+# GEMM + normalize (multiply by colvec and rowvec) + activation:
+# PostAct = act((A @ B + C) * colvec * rowvec)
+# colvec is typically rstd (M,), rowvec is typically norm_weight (N,).
+from typing import Optional, Tuple
+from torch import Tensor
+import cutlass
+import cutlass.cute as cute
+from cutlass import Int32, const_expr
+from cutlass.cute.runtime import make_ptr
+from .compile_utils import make_fake_tensor as fake_tensor
+from .cute_dsl_utils import (
+    torch2cute_dtype_map,
+    get_device_capacity,
+    get_max_active_clusters,
+)
+from .gemm_sm90 import GemmSm90
+from .gemm_sm100 import GemmSm100
+from .gemm_sm120 import GemmSm120
+from .gemm_act import GemmActMixin, GemmGatedMixin
+from .epi_ops import vec_multiply
+from .activation import act_fn_map, gate_fn_map
+from .cache_utils import jit_cache
+from .rounding import RoundingMode
+from .gemm_tvm_ffi_utils import (
+    get_major,
+    perm3d_single,
+    make_scheduler_args,
+    make_varlen_args,
+    make_fake_scheduler_args,
+    make_fake_varlen_args,
+    div_for_dtype,
+    make_fake_gemm_tensors,
+    compile_gemm_kernel,
+)
+from . import utils as utils
+class GemmNormActMixin(GemmActMixin):
+    """GEMM + normalize + activation: PostAct = act((A @ B + C) * colvec * rowvec).
+    colvec is typically rstd (M,), rowvec is typically norm_weight (N,).
+    D stores the normalized (pre-activation) value, PostAct stores act(D).
+    """
+    # Per-subtile epilogue hook. Operates on the register fragment tRS_rD:
+    #   1. scale by alpha (if params carries one) and add C (scaled by beta if present),
+    #   2. multiply by the colvec / rowvec broadcast fragments via vec_multiply,
+    #   3. apply params.act_fn, if any, into a fresh register tensor of acc_dtype.
+    # Returns the post-activation fragment; when act_fn is None this is tRS_rD itself.
+    # NOTE(review): epi_loop_tensors is annotated as a tuple but is indexed by name
+    # below — confirm the actual container type against the base class.
+    @cute.jit
+    def epi_visit_subtile(
+        self,
+        params: GemmActMixin.EpilogueParams,
+        epi_loop_tensors: Tuple[cute.Tensor, ...],
+        tRS_rD: cute.Tensor,
+        tRS_rC: Optional[cute.Tensor] = None,
+    ) -> Optional[cute.Tensor]:
+        tDrRowVec = epi_loop_tensors["mRowVecBroadcast"]
+        tDrColVec = epi_loop_tensors["mColVecBroadcast"]
+        # Load accumulator and apply alpha/beta/C
+        rD = tRS_rD.load()
+        # alpha / beta are optional attributes of params; the checks are const_expr,
+        # so they are resolved when the kernel is traced rather than at run time.
+        if const_expr(hasattr(params, "alpha") and params.alpha is not None):
+            alpha = utils.load_scalar_or_pointer(params.alpha)
+            rD *= alpha
+        if const_expr(tRS_rC is not None):
+            if const_expr(not hasattr(params, "beta") or params.beta is None):
+                # No beta: plain residual add, with C converted to D's element type
+                rD += tRS_rC.load().to(tRS_rD.element_type)
+            else:
+                beta = utils.load_scalar_or_pointer(params.beta)
+                rD += beta * tRS_rC.load().to(tRS_rD.element_type)
+        # Write the updated values back so the steps below see them in tRS_rD
+        tRS_rD.store(rD)
+        # Multiply by colvec (rstd) and rowvec (norm_weight)
+        # The return value is unused and tRS_rD is read afterwards, so the multiply
+        # is expected to update tRS_rD in place.
+        vec_multiply(self, tRS_rD, tDrColVec, tDrRowVec)
+        # Apply activation
+        if const_expr(params.act_fn is not None):
+            tRS_rPostAct = cute.make_rmem_tensor(tRS_rD.layout.shape, self.acc_dtype)
+            if const_expr(self.arch < 100):
+                # arch < 100: act_fn is applied one element at a time
+                for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
+                    tRS_rPostAct[i] = params.act_fn(tRS_rD[i])
+            else:
+                # arch >= 100: act_fn takes a pair of elements and returns a pair
+                # (presumably to use 2-wide math — confirm against the act_fn definitions)
+                for i in cutlass.range(cute.size(tRS_rPostAct) // 2, unroll_full=True):
+                    tRS_rPostAct[2 * i], tRS_rPostAct[2 * i + 1] = params.act_fn(
+                        (tRS_rD[2 * i], tRS_rD[2 * i + 1])
+                    )
+        else:
+            # No activation: PostAct aliases the normalized D fragment
+            tRS_rPostAct = tRS_rD
+        return tRS_rPostAct
+# GemmSm90 base combined with the GemmNormActMixin epilogue.
+class GemmNormActSm90(GemmNormActMixin, GemmSm90):
+    pass
+# GemmSm100 base combined with the GemmNormActMixin epilogue.
+class GemmNormActSm100(GemmNormActMixin, GemmSm100):
+    pass
+# GemmSm120 base combined with the GemmNormActMixin epilogue.
+class GemmNormActSm120(GemmNormActMixin, GemmSm120):
+    pass
+class GemmNormGatedMixin(GemmGatedMixin):
+    """GEMM + normalize + gated activation: PostAct = gated_act((A @ B + C) * colvec * rowvec)."""
+    # Per-subtile epilogue hook. Operates on the register fragment tRS_rD:
+    #   1. scale by alpha (if params carries one) and add C (scaled by beta if present),
+    #   2. multiply by the colvec / rowvec broadcast fragments via vec_multiply,
+    #   3. combine D values pairwise with params.act_fn into a new register tensor.
+    # Returns the post-activation fragment; the loops below consume two D elements
+    # per PostAct element.
+    # NOTE(review): epi_loop_tensors is annotated as a tuple but is indexed by name
+    # below — confirm the actual container type against the base class.
+    @cute.jit
+    def epi_visit_subtile(
+        self,
+        params: GemmActMixin.EpilogueParams,
+        epi_loop_tensors: Tuple[cute.Tensor, ...],
+        tRS_rD: cute.Tensor,
+        tRS_rC: Optional[cute.Tensor] = None,
+    ) -> Optional[cute.Tensor]:
+        tDrRowVec = epi_loop_tensors["mRowVecBroadcast"]
+        tDrColVec = epi_loop_tensors["mColVecBroadcast"]
+        # Load accumulator and apply alpha/beta/C
+        rD = tRS_rD.load()
+        # alpha / beta are optional attributes of params; the checks are const_expr,
+        # so they are resolved when the kernel is traced rather than at run time.
+        if const_expr(hasattr(params, "alpha") and params.alpha is not None):
+            alpha = utils.load_scalar_or_pointer(params.alpha)
+            rD *= alpha
+        if const_expr(tRS_rC is not None):
+            if const_expr(not hasattr(params, "beta") or params.beta is None):
+                # No beta: plain residual add, with C converted to D's element type
+                rD += tRS_rC.load().to(tRS_rD.element_type)
+            else:
+                beta = utils.load_scalar_or_pointer(params.beta)
+                rD += beta * tRS_rC.load().to(tRS_rD.element_type)
+        # Write the updated values back so the steps below see them in tRS_rD
+        tRS_rD.store(rD)
+        # Multiply by colvec (rstd) and rowvec (norm_weight)
+        # The return value is unused and tRS_rD is read afterwards, so the multiply
+        # is expected to update tRS_rD in place.
+        vec_multiply(self, tRS_rD, tDrColVec, tDrRowVec)
+        # Gated activation on normalized D
+        # PostAct layout is derived from D's layout with recast_layout(2, 1, ...).
+        tRS_rPostAct_layout = cute.recast_layout(2, 1, tRS_rD.layout)
+        tRS_rPostAct = cute.make_rmem_tensor(tRS_rPostAct_layout.shape, self.acc_dtype)
+        if const_expr(self.arch < 100):
+            # arch < 100: one output per call, from D elements (2i, 2i+1)
+            for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
+                tRS_rPostAct[i] = params.act_fn(tRS_rD[2 * i], tRS_rD[2 * i + 1])
+        else:
+            # arch >= 100: two outputs per call. The first argument packs the
+            # even-indexed D elements (4i, 4i+2), the second the odd-indexed ones
+            # (4i+1, 4i+3), matching the argument order of the branch above.
+            for i in cutlass.range(cute.size(tRS_rPostAct) // 2, unroll_full=True):
+                tRS_rPostAct[2 * i], tRS_rPostAct[2 * i + 1] = params.act_fn(
+                    (tRS_rD[4 * i], tRS_rD[4 * i + 2]),
+                    (tRS_rD[4 * i + 1], tRS_rD[4 * i + 3]),
+                )
+        return tRS_rPostAct
+# GemmSm90 base combined with the GemmNormGatedMixin epilogue.
+class GemmNormGatedSm90(GemmNormGatedMixin, GemmSm90):
+    pass
+# GemmSm100 base combined with the GemmNormGatedMixin epilogue.
+class GemmNormGatedSm100(GemmNormGatedMixin, GemmSm100):
+    pass
+# GemmSm120 base combined with the GemmNormGatedMixin epilogue.
+class GemmNormGatedSm120(GemmNormGatedMixin, GemmSm120):
+    pass
+@jit_cache
+def _compile_gemm_norm_act(
+    a_dtype,
+    b_dtype,
+    d_dtype,
+    c_dtype,
+    postact_dtype,
+    a_major,
+    b_major,
+    d_major,
+    c_major,
+    postact_major,
+    tile_shape_mn,
+    cluster_shape_mnk,
+    pingpong,
+    persistent,
+    is_dynamic_persistent,
+    activation,
+    rowvec_dtype,
+    colvec_dtype,
+    colvec_ndim,
+    varlen_m,
+    gather_A,
+    device_capacity,
+    gemm_cls_name,
+    rounding_mode=RoundingMode.RN,
+    sr_seed_mode=0,
+):
+    """Compile the norm-act / norm-gated GEMM kernel for one argument configuration.
+    Builds fake tensors for A/B/D/C, PostAct and the row/col broadcast vectors,
+    assembles epilogue, scheduler and varlen arguments, and returns the result of
+    ``compile_gemm_kernel``. ``gemm_cls_name`` is "norm_act" or "norm_gated" and,
+    together with ``device_capacity[0]``, selects the kernel class.
+    ``colvec_ndim`` is 2 for an (l, m) colvec, 1 for an (m,) colvec; any other value
+    means no colvec. ``sr_seed_mode`` is 0 (no seed), 1 (scalar) or anything else
+    (pointer in global memory).
+    The function is wrapped by ``jit_cache`` (presumably memoizing on these
+    arguments — confirm in cache_utils).
+    """
+    # Kernel class per (variant, capability major); major 11 reuses the SM100 classes.
+    sm_to_cls = {
+        "norm_act": {
+            9: GemmNormActSm90,
+            10: GemmNormActSm100,
+            11: GemmNormActSm100,
+            12: GemmNormActSm120,
+        },
+        "norm_gated": {
+            9: GemmNormGatedSm90,
+            10: GemmNormGatedSm100,
+            11: GemmNormGatedSm100,
+            12: GemmNormGatedSm120,
+        },
+    }
+    GemmCls = sm_to_cls[gemm_cls_name][device_capacity[0]]
+    pa_leading = 1 if postact_major == "n" else 0
+    mA, mB, mD, mC, m, n, k, l = make_fake_gemm_tensors(
+        a_dtype,
+        b_dtype,
+        d_dtype,
+        c_dtype,
+        a_major,
+        b_major,
+        d_major,
+        c_major,
+        varlen_m=varlen_m,
+        gather_A=gather_A,
+    )
+    div_pa = div_for_dtype(postact_dtype)
+    # Gated PostAct gets its own symbolic N extent (it is not the GEMM's n) and
+    # always uses leading_dim=1; the non-gated variant shares n and follows postact_major.
+    pa_n = cute.sym_int() if gemm_cls_name == "norm_gated" else n
+    pa_leading_dim = 1 if gemm_cls_name == "norm_gated" else pa_leading
+    pa_shape = (m, pa_n) if varlen_m else (m, pa_n, l)
+    mPostAct = fake_tensor(postact_dtype, pa_shape, leading_dim=pa_leading_dim, divisibility=div_pa)
+    mRowVec = fake_tensor(rowvec_dtype, (l, n), leading_dim=1, divisibility=4)
+    if colvec_ndim == 2:
+        mColVec = fake_tensor(colvec_dtype, (l, m), leading_dim=1, divisibility=4)
+    elif colvec_ndim == 1:
+        mColVec = fake_tensor(colvec_dtype, (m,), leading_dim=0, divisibility=4)
+    else:
+        mColVec = None
+    act_fn = act_fn_map[activation] if gemm_cls_name == "norm_act" else gate_fn_map[activation]
+    # Placeholder for an optional scalar argument:
+    # mode 0 -> absent, mode 1 -> by-value scalar, otherwise -> global-memory pointer.
+    def fake_scalar(mode, dtype=Int32):
+        if mode == 0:
+            return None
+        elif mode == 1:
+            return dtype(0)
+        else:
+            return make_ptr(dtype, 0, cute.AddressSpace.gmem, assumed_align=4)
+    epi_args = GemmCls.EpilogueArguments(
+        mPostAct,
+        act_fn,
+        mRowVecBroadcast=mRowVec,
+        mColVecBroadcast=mColVec,
+        rounding_mode=rounding_mode,
+        sr_seed=fake_scalar(sr_seed_mode),
+    )
+    # First argument is true only for the dynamic persistent scheduler on capability major 9
+    scheduler_args = make_fake_scheduler_args(
+        (is_dynamic_persistent and device_capacity[0] == 9), False, l
+    )
+    varlen_args = make_fake_varlen_args(varlen_m, False, gather_A, m if varlen_m else None)
+    return compile_gemm_kernel(
+        GemmCls,
+        a_dtype,
+        tile_shape_mn,
+        cluster_shape_mnk,
+        pingpong,
+        persistent,
+        gather_A,
+        is_dynamic_persistent,
+        device_capacity,
+        mA,
+        mB,
+        mD,
+        mC,
+        epi_args,
+        scheduler_args,
+        varlen_args,
+    )
+def gemm_norm_act_fn(
+    A: Tensor,  # (l, m, k) or (total_m, k) if varlen_m
+    B: Tensor,  # (l, n, k)
+    D: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
+    C: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
+    PostAct: Tensor,  # (l, m, n) or (total_m, n) if varlen_m; last dim is n//2 if gated
+    tile_count_semaphore: Optional[Tensor],
+    activation: Optional[str],
+    tile_M: int,
+    tile_N: int,
+    cluster_M: int,
+    cluster_N: int,
+    pingpong: bool = False,
+    persistent: bool = True,
+    is_dynamic_persistent: bool = False,
+    max_swizzle_size: int = 8,
+    rowvec: Optional[Tensor] = None,  # (l, n) — norm_weight
+    colvec: Optional[Tensor] = None,  # (l, m) or (total_m,) — rstd
+    cu_seqlens_m: Optional[Tensor] = None,
+    A_idx: Optional[Tensor] = None,
+    rounding_mode: int = RoundingMode.RN,
+    sr_seed: int | Tensor = 0,
+) -> None:
+    """Compile (via ``_compile_gemm_norm_act``) and launch the norm-act GEMM kernel.
+    The gated kernel is used when ``activation`` is a key of ``gate_fn_map``; otherwise
+    ``activation`` must be a key of ``act_fn_map``. ``cu_seqlens_m`` enables the varlen-M
+    path and ``A_idx`` enables gathering of A (which requires varlen-M and cluster_N == 1).
+    Outputs are written into ``D`` (if given) and ``PostAct``; nothing is returned.
+    When ``cache_utils.COMPILE_ONLY`` is set, the function returns after compilation
+    without launching.
+    Preconditions are checked with ``assert`` and raise AssertionError when violated.
+    """
+    if activation in gate_fn_map:
+        gemm_cls_name = "norm_gated"
+    else:
+        assert activation in act_fn_map, f"Unsupported activation {activation}"
+        gemm_cls_name = "norm_act"
+    varlen_m = cu_seqlens_m is not None
+    gather_A = A_idx is not None
+    if varlen_m:
+        assert persistent, "varlen_m requires persistent=True"
+        assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
+        if D is not None:
+            assert D.stride(-1) == 1, "varlen_m requires D to be n-major"
+        assert PostAct.stride(-1) == 1, "varlen_m requires PostAct to be n-major"
+    if gather_A:
+        assert cu_seqlens_m is not None, "gather_A requires varlen"
+        assert cluster_N == 1, "gather_A requires cluster_N=1"
+    # Permute operands into the layout the kernel expects (see perm3d_single);
+    # D and C may be None here — the *_major lookups below guard for that.
+    A_p = perm3d_single(A, varlen_m)
+    B_p = perm3d_single(B)
+    D_p = perm3d_single(D, varlen_m)
+    C_p = perm3d_single(C, varlen_m)
+    PostAct_p = perm3d_single(PostAct, varlen_m)
+    a_major = get_major(A_p, "m", "k")
+    b_major = get_major(B_p, "n", "k")
+    d_major = get_major(D_p, "m", "n") if D_p is not None else None
+    c_major = get_major(C_p, "m", "n") if C_p is not None else None
+    postact_major = get_major(PostAct_p, "m", "n")
+    a_dtype = torch2cute_dtype_map[A.dtype]
+    b_dtype = torch2cute_dtype_map[B.dtype]
+    d_dtype = torch2cute_dtype_map[D.dtype] if D is not None else None
+    c_dtype = torch2cute_dtype_map[C.dtype] if C is not None else None
+    postact_dtype = torch2cute_dtype_map[PostAct.dtype]
+    # 0 encodes "no colvec" for the compile key
+    colvec_ndim = colvec.ndim if colvec is not None else 0
+    device_capacity = get_device_capacity(A.device)
+    assert device_capacity[0] in [9, 10, 11, 12], "Only SM90, SM100, SM110, and SM120 are supported"
+    if rounding_mode == RoundingMode.RS:
+        assert device_capacity[0] == 10, "Stochastic rounding requires SM100"
+    if is_dynamic_persistent and device_capacity[0] == 9:
+        assert tile_count_semaphore is not None, (
+            "Dynamic persistent tile scheduler in SM90 requires a semaphore in GMEM"
+        )
+    # How the seed is passed: 2 = tensor (by pointer), 1 = scalar (only with RS rounding),
+    # 0 = not passed at all.
+    sr_seed_mode = (
+        2 if isinstance(sr_seed, Tensor) else (1 if rounding_mode == RoundingMode.RS else 0)
+    )
+    compiled_fn = _compile_gemm_norm_act(
+        a_dtype,
+        b_dtype,
+        d_dtype,
+        c_dtype,
+        postact_dtype,
+        a_major,
+        b_major,
+        d_major,
+        c_major,
+        postact_major,
+        (tile_M, tile_N),
+        (cluster_M, cluster_N, 1),
+        pingpong,
+        persistent,
+        is_dynamic_persistent,
+        activation,
+        torch2cute_dtype_map[rowvec.dtype] if rowvec is not None else None,
+        torch2cute_dtype_map[colvec.dtype] if colvec is not None else None,
+        colvec_ndim,
+        varlen_m,
+        gather_A,
+        device_capacity,
+        gemm_cls_name,
+        rounding_mode=rounding_mode,
+        sr_seed_mode=sr_seed_mode,
+    )
+    # Imported at call time, so the flag's current value is read on every call
+    from .cache_utils import COMPILE_ONLY
+    if COMPILE_ONLY:
+        return
+    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+    # Runtime counterpart of the compile-time scalar placeholder:
+    # mode 0 -> absent, mode 1 -> by-value scalar, otherwise -> the tensor's data pointer.
+    def scalar_arg(scalar, mode, dtype=Int32):
+        if mode == 0:
+            return None
+        elif mode == 1:
+            return dtype(scalar)
+        else:
+            return scalar.data_ptr()
+    epi_args = GemmActMixin.EpilogueArguments(
+        PostAct_p,
+        None,  # act_fn is Constexpr, pass None at call time
+        mRowVecBroadcast=rowvec,
+        mColVecBroadcast=colvec,
+        rounding_mode=None,
+        sr_seed=scalar_arg(sr_seed, sr_seed_mode),
+    )
+    scheduler_args = make_scheduler_args(
+        max_active_clusters, max_swizzle_size, tile_count_semaphore
+    )
+    varlen_args = make_varlen_args(cu_seqlens_m, None, A_idx)
+    # The compiled kernel takes two more trailing arguments on capability major 10/11
+    # than on the other supported architectures; all of them are passed as None.
+    if device_capacity[0] in [10, 11]:
+        compiled_fn(A_p, B_p, D_p, C_p, epi_args, scheduler_args, varlen_args, None, None, None)
+    else:
+        compiled_fn(A_p, B_p, D_p, C_p, epi_args, scheduler_args, varlen_args, None)

build/torch-cuda/quack/gemm_sm100.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

build/torch-cuda/quack/gemm_sm120.py ADDED Viewed

	@@ -0,0 +1,626 @@

+# Copyright (c) 2025-2026, Tri Dao.
+# Based on the cute-dsl example:
+# https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/blackwell_geforce/dense_gemm.py
+# SM120-style GEMM using warp-level MMA (MmaF16BF16Op) + ldmatrix.
+# Unlike SM90 WGMMA (which reads A/B from SMEM directly), warp-level MMA
+# requires explicit SMEM→RMEM copies via ldmatrix before each MMA instruction.
+# This is a work in progress and not very optimized.
+import math
+from typing import Tuple, Type, Callable, Optional
+from functools import partial
+import cutlass
+import cutlass.cute as cute
+import cutlass.pipeline as pipeline
+from cutlass.pipeline import pipeline_init_arrive, pipeline_init_wait
+from cutlass.cute.nvgpu import cpasync, warp
+from cutlass import Int32, Boolean, const_expr
+from .varlen_utils import VarlenManager
+from .pipeline import make_pipeline_state
+from . import copy_utils
+from .gemm_sm90 import GemmSm90, NamedBarrierGemm
+from . import sm80_utils
+class GemmSm120(GemmSm90):
+    """SM120-style GEMM using warp-level MMA instead of WGMMA.
+    Key differences from SM90:
+    - Uses MmaF16BF16Op (warp-level, 32 threads) instead of WGMMA (warp-group, 128 threads)
+    - Requires explicit SMEM→RMEM copy via ldmatrix before MMA
+    - Thread config: num_mma_warps regular warps + 1 DMA warp
+    - Pingpong: 2 warp groups of (2,2,1), each processing alternating tiles
+    - No fp8 support (warp-level MMA only supports fp16/bf16)
+    """
+    arch = 120
+    def __init__(
+        self,
+        acc_dtype: Type[cutlass.Numeric],
+        a_dtype: Type[cutlass.Numeric],
+        tile_shape_mn: Tuple[int, int],
+        cluster_shape_mnk: Tuple[int, int, int],
+        pingpong: bool = False,
+        is_persistent: bool = True,
+        gather_A: bool = False,
+        use_pdl: bool = True,
+    ):
+        """Configure warp counts, thread counts and register budgets for the SM120 GEMM.
+        Both modes end up with 8 MMA warps (2 warp groups) plus one extra warp group,
+        i.e. 384 threads per CTA. Pingpong requires ``is_persistent``; ``gather_A``
+        requires ``cluster_shape_mnk[1] == 1``. Staging and layout attributes are
+        initialised to None here.
+        NOTE(review): ``a_dtype`` is accepted but not stored in this method, while
+        ``_setup_tiled_mma`` reads ``self.a_dtype`` — confirm where it is assigned.
+        """
+        # Don't call super().__init__ — we set up our own config
+        self.acc_dtype = acc_dtype
+        self.pingpong = pingpong
+        self.is_persistent = is_persistent
+        self.use_clc_persistence = False
+        self.use_pdl = use_pdl
+        self.fp8_slow_accum = False
+        self.gather_A = gather_A
+        if self.pingpong:
+            assert self.is_persistent, "Pingpong gemm requires persistent scheduler"
+        if gather_A:
+            assert cluster_shape_mnk[1] == 1
+        self.cluster_shape_mnk = cluster_shape_mnk
+        tile_M, tile_N = tile_shape_mn
+        # K extent is a placeholder (1) here; _setup_tiled_mma replaces it
+        self.cta_tile_shape_mnk = (tile_M, tile_N, 1)
+        # Pingpong: 2 warp groups each with (2,2,1) atom layout
+        # Non-pingpong: 1 group of 8 warps with (4,2,1) atom layout
+        self.mma_inst_mnk = (16, 8, 16)
+        if not self.pingpong:
+            self.atom_layout_mnk = (4, 2, 1)
+        else:
+            self.atom_layout_mnk = (2, 2, 1)
+        # num_mma_warps = total warps doing MMA (both warp groups in pingpong)
+        self.num_mma_warps = math.prod(self.atom_layout_mnk) * (1 if not self.pingpong else 2)
+        # For compatibility with SM90 code that uses warp groups
+        self.num_threads_per_warp_group = 128
+        assert self.num_mma_warps % 4 == 0
+        self.mma_warp_groups = self.num_mma_warps // 4
+        if self.pingpong:
+            assert self.mma_warp_groups == 2
+        # threads_per_cta must be a multiple of 128 (warp group size) so that
+        # the DMA warp's setmaxnreg.dec.sync has a complete warp group to sync with.
+        self.threads_per_cta = (self.mma_warp_groups + 1) * self.num_threads_per_warp_group
+        # A is multicast across the cluster's N extent, B across its M extent
+        self.num_mcast_ctas_a = cluster_shape_mnk[1]
+        if gather_A:
+            assert self.num_mcast_ctas_a == 1
+        self.num_mcast_ctas_b = cluster_shape_mnk[0]
+        self.is_a_mcast = self.num_mcast_ctas_a > 1
+        self.is_b_mcast = self.num_mcast_ctas_b > 1
+        self.occupancy = 1
+        self.smem_capacity = cutlass.utils.get_smem_capacity_in_bytes(f"sm_{self.arch}")
+        # In pingpong, only 1 warp group (4 warps) participates in epilogue at a time
+        self.num_epi_warps = (self.mma_warp_groups if not self.pingpong else 1) * 4
+        self.epilogue_barrier = pipeline.NamedBarrier(
+            barrier_id=int(NamedBarrierGemm.Epilogue),
+            num_threads=self.num_epi_warps * cute.arch.WARP_SIZE,
+        )
+        # The A/B load warps start right after the MMA warps; gather_A uses 4 of them
+        self.num_ab_load_warps = 1 if not self.gather_A else 4
+        self.ab_load_warp_id = self.num_mma_warps
+        # Register budgets for load vs. MMA warps (presumably per-thread counts applied
+        # with setmaxnreg — confirm in the kernel); gather_A gives the load side more.
+        if not self.gather_A:
+            self.num_regs_load = 40
+            self.num_regs_mma = 232
+        else:
+            self.num_regs_load = 56
+            self.num_regs_mma = 224
+        # Left unset here; expected to be filled in by later setup — TODO confirm where
+        self.ab_stage = None
+        self.epi_stage = None
+        self.a_smem_layout_staged = None
+        self.b_smem_layout_staged = None
+        self.epi_smem_layout_staged = None
+        self.epi_tile = None
+        self.shared_storage = None
+        self.buffer_align_bytes = 1024
+    def _setup_tiled_mma(self):
+        """Set up warp-level MMA (MmaF16BF16Op) and tile K dimension."""
+        op = warp.MmaF16BF16Op(self.a_dtype, self.acc_dtype, self.mma_inst_mnk)
+        tC = cute.make_layout(self.atom_layout_mnk)
+        permutation_mnk = (
+            self.atom_layout_mnk[0] * self.mma_inst_mnk[0],
+            self.atom_layout_mnk[1] * self.mma_inst_mnk[1] * 2,
+            self.atom_layout_mnk[2] * self.mma_inst_mnk[2],
+        )
+        self.tiled_mma = cute.make_tiled_mma(op, tC, permutation_mnk=permutation_mnk)
+        tile_k = self.mma_inst_mnk[2] * 4
+        self.cta_tile_shape_mnk = (
+            self.cta_tile_shape_mnk[0],
+            self.cta_tile_shape_mnk[1],
+            tile_k,
+        )
+    # __call__, _setup_attributes, make_ab_pipeline, make_epi_store_pipeline,
+    # make_sched_pipeline, epilogue are all inherited from GemmSm90.
+    @cute.kernel
+    def kernel(
+        self,
+        tiled_mma: cute.TiledMma,
+        tma_atom_a: Optional[cute.CopyAtom],
+        mA_mkl: cute.Tensor,
+        tma_atom_b: cute.CopyAtom,
+        mB_nkl: cute.Tensor,
+        tma_atom_d: Optional[cute.CopyAtom],
+        mD_mnl: Optional[cute.Tensor],
+        tma_atom_c: Optional[cute.CopyAtom],
+        mC_mnl: Optional[cute.Tensor],
+        epilogue_params,
+        varlen_params: VarlenManager.Params,
+        cluster_layout_mnk: cute.Layout,
+        a_smem_layout: cute.ComposedLayout,
+        b_smem_layout: cute.ComposedLayout,
+        epi_smem_layout: cute.ComposedLayout,
+        epi_c_smem_layout: cute.ComposedLayout,
+        tile_sched_params,
+        TileSchedulerCls: cutlass.Constexpr[Callable],
+        trace_ptr: Optional[cutlass.Int64] = None,
+    ):
+        """Device kernel: load warps feed A/B tiles to MMA warps, which also run the epilogue.
+
+        The kernel is split by warp index into two roles:
+
+        * warps with ``warp_idx >= self.ab_load_warp_id`` lower their register
+          budget; the first ``self.num_ab_load_warps`` of them loop over work
+          tiles and issue the A/B loads into shared memory;
+        * warps with ``warp_idx < self.num_mma_warps`` raise their register
+          budget and, per work tile, call ``self.mma`` followed by
+          ``self.epilogue``.
+
+        When ``self.pingpong`` is set, the two MMA warp groups take turns on the
+        "mma" and "epi" stages through the pingpong barriers and each one
+        processes every other work tile.
+
+        ``tma_atom_a`` is only used when ``self.gather_A`` is false; ``mD_mnl``
+        and ``mC_mnl`` are optional and switch the D-store / C-load parts of the
+        epilogue on or off at compile time. ``trace_ptr`` is forwarded to
+        ``TraceContext.create``.
+        """
+        from .trace import TraceContext
+        # Trace context used below to bracket the "tma_load", "mma" and "epilogue" phases.
+        tctx = TraceContext.create(trace_ptr)
+        # Compile-time flags: a varlen mode is active when the matching cu_seqlens is provided.
+        varlen_m = const_expr(varlen_params.cu_seqlens_m is not None)
+        varlen_k = const_expr(varlen_params.cu_seqlens_k is not None)
+        # gather_A is only accepted together with one of the varlen modes.
+        if const_expr(self.gather_A):
+            assert varlen_m or varlen_k
+        has_D = const_expr(mD_mnl is not None)
+        has_C = const_expr(mC_mnl is not None)
+        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
+        # Prefetch TMA descriptors
+        if warp_idx == self.ab_load_warp_id:
+            for tma_atom in (tma_atom_a, tma_atom_b, tma_atom_d, tma_atom_c):
+                if const_expr(tma_atom is not None):
+                    cpasync.prefetch_descriptor(tma_atom)
+        # Allocate shared memory
+        smem = cutlass.utils.SmemAllocator()
+        storage = smem.allocate(self.shared_storage)
+        ab_pipeline = self.make_ab_pipeline(
+            tiled_mma=tiled_mma,
+            cluster_layout_vmnk=cute.make_layout((1, *cluster_layout_mnk.shape)),
+            ab_pipeline_mbar_ptr=storage.ab_pipeline_array_ptr.data_ptr(),
+        )
+        # The C-load pipeline only exists when a C tensor was passed in.
+        epi_pipeline = None
+        if const_expr(has_C):
+            epi_pipeline = self.make_epi_pipeline(
+                c_smem_layout=cute.slice_(epi_c_smem_layout, (None, None, 0)),
+                epi_pipeline_mbar_ptr=storage.epi_pipeline_array_ptr.data_ptr(),
+            )
+        # Persistent mode: build the scheduler pipeline and view the scheduler
+        # data buffer as a (4, sched_stage) tensor.
+        sched_pipeline = None
+        sched_data = None
+        if const_expr(self.is_persistent):
+            sched_pipeline = self.make_sched_pipeline(
+                cluster_layout_mnk,
+                sched_pipeline_mbar_ptr=storage.sched_pipeline_array_ptr.data_ptr(),
+                varlen_k=varlen_k,
+            )
+            sched_data = storage.sched_data.get_tensor((4, self.sched_stage))
+        # Cluster sync
+        pipeline_init_arrive(cluster_shape_mn=self.cluster_shape_mnk[:-1], is_relaxed=True)
+        # SMEM tensors
+        sA = storage.sA.get_tensor(a_smem_layout.outer, swizzle=a_smem_layout.inner)
+        sB = storage.sB.get_tensor(b_smem_layout.outer, swizzle=b_smem_layout.inner)
+        sD = None
+        if const_expr(has_D):
+            sD = storage.sD.get_tensor(epi_smem_layout.outer, swizzle=epi_smem_layout.inner)
+        sC = None
+        if const_expr(has_C):
+            sC = storage.sC.get_tensor(epi_c_smem_layout.outer, swizzle=epi_c_smem_layout.inner)
+        epi_smem_tensors = self.epi_get_smem_tensors(epilogue_params, storage)
+        # Static M extent comes from mA, except when A is gathered through mAIdx
+        # without varlen_k, where mAIdx's first dimension is used instead.
+        varlen_manager = VarlenManager.create(
+            varlen_params,
+            len_m_static=Int32(
+                cute.size(mA_mkl, mode=[0])
+                if varlen_k or varlen_params.mAIdx is None
+                else varlen_params.mAIdx.shape[0]
+            ),
+            len_k_static=Int32(cute.size(mA_mkl, mode=[1])),
+        )
+        # Bind the scheduler arguments so each warp role can build its own
+        # scheduler instance with a plain TileSchedulerCls() call.
+        TileSchedulerCls = partial(
+            TileSchedulerCls.create, tile_sched_params, sched_data, sched_pipeline
+        )
+        # Cluster wait
+        pipeline_init_wait(cluster_shape_mn=self.cluster_shape_mnk[:-1])
+        # =====================================================================
+        # Load warps: every warp at or above ab_load_warp_id lowers its register
+        # budget; only the first num_ab_load_warps of them run the load loop.
+        # =====================================================================
+        if warp_idx >= self.ab_load_warp_id:
+            cute.arch.setmaxregister_decrease(self.num_regs_load)
+            if (
+                warp_idx >= self.ab_load_warp_id
+                and warp_idx < self.ab_load_warp_id + self.num_ab_load_warps
+            ):
+                # Get mcast mask
+                cta_rank_in_cluster = cute.arch.make_warp_uniform(cute.arch.block_idx_in_cluster())
+                block_in_cluster_coord_mnk = cluster_layout_mnk.get_flat_coord(cta_rank_in_cluster)
+                a_mcast_mask = cute.make_layout_image_mask(
+                    cluster_layout_mnk, block_in_cluster_coord_mnk, mode=1
+                )
+                b_mcast_mask = cute.make_layout_image_mask(
+                    cluster_layout_mnk, block_in_cluster_coord_mnk, mode=0
+                )
+                # A mask of 0 is used for an operand that is not multicast.
+                a_mcast_mask = a_mcast_mask if self.is_a_mcast else 0
+                b_mcast_mask = b_mcast_mask if self.is_b_mcast else 0
+                # Persistent tile scheduling loop
+                # With several load warps only the first one is the scheduler warp;
+                # in a multi-CTA cluster it must additionally be in CTA 0 of the cluster.
+                is_scheduler_warp = self.num_ab_load_warps == 1 or warp_idx == self.ab_load_warp_id
+                if const_expr(cute.size(cluster_layout_mnk) > 1):
+                    is_scheduler_warp = is_scheduler_warp and cute.arch.block_idx_in_cluster() == 0
+                tile_scheduler = TileSchedulerCls()
+                work_tile = tile_scheduler.initial_work_tile_info()
+                ab_producer_state = make_pipeline_state(
+                    pipeline.PipelineUserType.Producer, self.ab_stage
+                )
+                while work_tile.is_valid_tile:
+                    tctx.b("tma_load")
+                    tile_coord_mnkl = work_tile.tile_idx
+                    batch_idx = tile_coord_mnkl[3]
+                    # Local_tile partition global tensors
+                    copy_A, prefetch_A = None, None
+                    if const_expr(not self.gather_A):
+                        mA_mk = varlen_manager.offset_batch_A(mA_mkl, batch_idx)
+                        # (bM, bK, RestK)
+                        gA_mk = cute.local_tile(
+                            mA_mk,
+                            cute.select(self.cta_tile_shape_mnk, [0, 2]),
+                            (tile_coord_mnkl[0], None),
+                        )
+                        #  TMA load A partition_S/D
+                        copy_A, _, _ = copy_utils.tma_get_copy_fn(
+                            tma_atom_a,
+                            cta_coord=block_in_cluster_coord_mnk[1],
+                            cta_layout=cute.make_layout(
+                                cute.slice_(cluster_layout_mnk, (0, None, 0)).shape
+                            ),
+                            src_tensor=gA_mk,
+                            dst_tensor=sA,
+                            mcast_mask=a_mcast_mask,
+                        )
+                    else:
+                        # Gathered A: the copy (and prefetch) functions come from the gather helper.
+                        copy_A, prefetch_A = self._make_gather_A_copy(
+                            mA_mkl, sA, varlen_manager, tile_coord_mnkl, batch_idx
+                        )
+                    # (bN, bK, RestK)
+                    gB_nk = cute.local_tile(
+                        varlen_manager.offset_batch_B(mB_nkl, batch_idx),
+                        cute.select(self.cta_tile_shape_mnk, [1, 2]),
+                        (tile_coord_mnkl[1], None),
+                    )
+                    # TMA load B partition_S/D
+                    copy_B, _, _ = copy_utils.tma_get_copy_fn(
+                        tma_atom_b,
+                        cta_coord=block_in_cluster_coord_mnk[0],
+                        cta_layout=cute.make_layout(
+                            cute.slice_(cluster_layout_mnk, (None, 0, 0)).shape
+                        ),
+                        src_tensor=gB_nk,
+                        dst_tensor=sB,
+                        mcast_mask=b_mcast_mask,
+                    )
+                    # Number of K tiles for this batch entry (rounded up).
+                    len_k = varlen_manager.len_k(batch_idx)
+                    k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
+                    if const_expr(not self.gather_A):
+                        ab_producer_state = self.load_AB(
+                            ab_pipeline, ab_producer_state, copy_A, copy_B, k_tile_cnt
+                        )
+                    else:
+                        ab_producer_state = self.load_AB_gather_A(
+                            ab_pipeline,
+                            ab_producer_state,
+                            copy_A,
+                            prefetch_A,
+                            copy_B,
+                            k_tile_cnt,
+                            varlen_m=varlen_m,
+                        )
+                    tctx.e("tma_load")
+                    tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
+                    work_tile = tile_scheduler.get_current_work()
+                    # End of persistent scheduler loop
+                if const_expr(self.pingpong and not varlen_k):
+                    # Need to write the tile_idx to smem for the next WG in the pingpong mode
+                    if is_scheduler_warp:
+                        tile_scheduler.write_work_tile_to_smem(work_tile)
+                    work_tile = tile_scheduler.get_current_work()
+                # Producer-side tail calls on the A/B pipeline and (scheduler warp only)
+                # on the tile scheduler, issued after the last work tile.
+                ab_pipeline.producer_tail(ab_producer_state)
+                if is_scheduler_warp:
+                    tile_scheduler.producer_tail()
+        # =====================================================================
+        # MMA warps
+        # =====================================================================
+        if warp_idx < self.num_mma_warps:
+            cute.arch.setmaxregister_increase(self.num_regs_mma)
+            # Warp 0 (and warp 4 in pingpong mode) is flagged as the TMA warp; the
+            # flag is handed to the epilogue and gates epi_store_pipeline.producer_tail().
+            is_tma_warp = Boolean(
+                (not self.pingpong and warp_idx == 0)
+                or (self.pingpong and (warp_idx == 0 or warp_idx == 4))
+            )
+            tidx, _, _ = cute.arch.thread_idx()
+            # For pingpong, adjust tidx to within-warp-group index
+            warp_group_idx = cute.arch.make_warp_uniform(tidx // self.num_threads_per_warp_group)
+            if const_expr(self.pingpong):
+                tidx = tidx % self.num_threads_per_warp_group
+            # ldmatrix copy atoms for SMEM → RMEM
+            atom_copy_ldmatrix_A = cute.make_copy_atom(
+                warp.LdMatrix8x8x16bOp(self.a_layout.is_m_major_a(), 4),
+                self.a_dtype,
+            )
+            atom_copy_ldmatrix_B = cute.make_copy_atom(
+                warp.LdMatrix8x8x16bOp(self.b_layout.is_n_major_b(), 4),
+                self.b_dtype,
+            )
+            smem_tiled_copy_A = cute.make_tiled_copy_A(atom_copy_ldmatrix_A, tiled_mma)
+            smem_tiled_copy_B = cute.make_tiled_copy_B(atom_copy_ldmatrix_B, tiled_mma)
+            thr_copy_ldmatrix_A = smem_tiled_copy_A.get_slice(tidx)
+            thr_copy_ldmatrix_B = smem_tiled_copy_B.get_slice(tidx)
+            tCsA_copy_view = thr_copy_ldmatrix_A.partition_S(sA)
+            tCsB_copy_view = thr_copy_ldmatrix_B.partition_S(sB)
+            # Make fragments
+            thr_mma = tiled_mma.get_slice(tidx)
+            acc, tCsA, tCsB, tCrA, tCrB = sm80_utils.partition_fragment_ABC(
+                thr_mma, self.cta_tile_shape_mnk, sA, sB
+            )
+            if const_expr(self.pingpong):
+                if warp_group_idx == 0:
+                    # WG0 needs a start signal at the very beginning
+                    self.pingpong_barrier_arrive(warp_group_idx=0, stage="mma")
+                    self.pingpong_barrier_arrive(warp_group_idx=0, stage="epi")
+            # K-tile count from the static K extent of mA; used to advance
+            # ab_read_state past the other warp group's tile when not varlen_k.
+            k_tile_cnt_static = cute.ceil_div(
+                cute.size(mA_mkl, mode=[1]), self.cta_tile_shape_mnk[2]
+            )
+            # Number of epilogue subtiles in one CTA tile.
+            c_tile_cnt = cute.size(cute.ceil_div(self.cta_tile_shape_mnk[:2], self.epi_tile))
+            ab_read_state = make_pipeline_state(pipeline.PipelineUserType.Consumer, self.ab_stage)
+            epi_store_pipeline = self.make_epi_store_pipeline()
+            epi_read_state = make_pipeline_state(
+                pipeline.PipelineUserType.Consumer, self.epi_c_stage
+            )
+            epi_producer_state = make_pipeline_state(
+                pipeline.PipelineUserType.Producer, self.epi_c_stage
+            )
+            tile_scheduler = TileSchedulerCls()
+            work_tile = tile_scheduler.initial_work_tile_info()
+            if const_expr(self.pingpong):
+                if warp_idx >= 4:
+                    # Advance 2nd Math WG pipeline states to the end of 1st Math WG
+                    epi_read_state.advance_iters(c_tile_cnt)
+                    epi_producer_state.advance_iters(c_tile_cnt)
+                    if const_expr(not varlen_k):
+                        ab_read_state.advance_iters(k_tile_cnt_static)
+                    else:
+                        # varlen_k: the K-tile count depends on the first tile's batch index.
+                        len_k = varlen_manager.len_k(batch_idx=work_tile.tile_idx[3])
+                        k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
+                        ab_read_state.advance_iters(k_tile_cnt)
+                    tile_scheduler.advance_to_next_work()
+                    work_tile = tile_scheduler.get_current_work()
+            while work_tile.is_valid_tile:
+                tile_coord_mnkl = work_tile.tile_idx
+                batch_idx = tile_coord_mnkl[3]
+                len_k = varlen_manager.len_k(batch_idx)
+                k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
+                # The accumulator is reset for every work tile.
+                acc.fill(0.0)
+                if const_expr(self.pingpong):
+                    # Wait for this warp group's turn on the "mma" stage.
+                    self.pingpong_barrier_sync(warp_group_idx, stage="mma")
+                tctx.b("mma")
+                ab_read_state = self.mma(
+                    ab_pipeline,
+                    ab_read_state,
+                    tiled_mma,
+                    acc,
+                    k_tile_cnt,
+                    smem_tiled_copy_A,
+                    smem_tiled_copy_B,
+                    tCsA_copy_view,
+                    tCsB_copy_view,
+                    tCrA,
+                    tCrB,
+                )
+                if const_expr(self.pingpong):
+                    # Cue for next WG's MMA to start
+                    self.pingpong_barrier_arrive(1 - warp_group_idx, stage="mma")
+                tctx.e("mma")
+                # ============================================================
+                # EPILOGUE — reuse SM90's epilogue flow
+                # ============================================================
+                if const_expr(self.pingpong):
+                    self.pingpong_barrier_sync(warp_group_idx, "epi")
+                tctx.b("epilogue")
+                # D store copy function, only when an output tensor D is present.
+                copy_D = None
+                if const_expr(has_D):
+                    copy_D, _, _ = self.epilog_gmem_copy_and_partition(
+                        tma_atom_d,
+                        varlen_manager.offset_batch_epi(mD_mnl, tile_coord_mnkl[3]),
+                        self.cta_tile_shape_mnk[:2],
+                        self.epi_tile,
+                        sD,
+                        tile_coord_mnkl,
+                    )
+                # C load copy function, wrapped as a producer on epi_pipeline when C is present.
+                copy_C = None
+                if const_expr(has_C):
+                    copy_C_fn, _, _ = self.epilog_gmem_copy_and_partition(
+                        tma_atom_c,
+                        varlen_manager.offset_batch_epi(mC_mnl, tile_coord_mnkl[3]),
+                        self.cta_tile_shape_mnk[:2],
+                        self.epi_tile,
+                        sC,
+                        tile_coord_mnkl,
+                    )
+                    copy_C = copy_utils.tma_producer_copy_fn(copy_C_fn, epi_pipeline)
+                # Falls back to BFloat16 for the register->smem partitioning when no D dtype is set.
+                d_dtype_for_layout = self.d_dtype if self.d_dtype is not None else cutlass.BFloat16
+                tiled_copy_r2s, tRS_rD, tRS_sD = self.epilog_smem_store_and_partition(
+                    tiled_mma, self.d_layout, d_dtype_for_layout, sD, tidx
+                )
+                tRS_rAcc = self.epi_retile_acc(acc, tRS_rD, tiled_copy_r2s, tidx)
+                load_acc_subtile = partial(self.epi_load_acc_subtile, tRS_rAcc)
+                if const_expr(has_C):
+                    tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC = self.epilog_smem_load_and_partition(
+                        tiled_mma, self.c_layout, self.c_dtype, sC, tRS_rD.layout, tidx
+                    )
+                else:
+                    tiled_copy_s2r, tSR_sC, tRS_rC, tSR_rC = None, None, None, None
+                self.epi_visit_acc(epilogue_params, acc, tiled_mma, tile_coord_mnkl, tidx)
+                epi_read_state, epi_producer_state = self.epilogue(
+                    epilogue_params,
+                    epi_smem_tensors,
+                    epi_pipeline,
+                    epi_store_pipeline,
+                    epi_read_state,
+                    epi_producer_state,
+                    self.epi_tile,
+                    load_acc_subtile,
+                    tRS_rD,
+                    tRS_rC,
+                    None,  # tiled_copy_t2r, for Sm100 only
+                    tiled_copy_r2s,
+                    tRS_sD,
+                    tiled_copy_s2r,
+                    tSR_rC,
+                    tSR_sC,
+                    copy_D,
+                    copy_C,
+                    tile_coord_mnkl,
+                    varlen_manager,
+                    self.epilogue_barrier,
+                    tile_scheduler,
+                    tidx,
+                    is_tma_warp,
+                )
+                if const_expr(self.pingpong):
+                    # With pingpong, 2 WGs write two different output tiles to the same smem,
+                    # so we have to make sure the smem content is done reading before signaling
+                    # the next WG's epilogue.
+                    if is_tma_warp:
+                        epi_store_pipeline.producer_tail()
+                    self.pingpong_barrier_arrive(1 - warp_group_idx, stage="epi")
+                tctx.e("epilogue")
+                if const_expr(not self.pingpong):
+                    tile_scheduler.advance_to_next_work()
+                    work_tile = tile_scheduler.get_current_work()
+                else:  # Skip a tile for pingpong
+                    # Update starting load/store pipeline states for the next tile
+                    epi_read_state.advance_iters(c_tile_cnt)
+                    epi_producer_state.advance_iters(c_tile_cnt)
+                    # Update starting mainloop pipeline state for the next tile
+                    if const_expr(not varlen_k):
+                        ab_read_state.advance_iters(k_tile_cnt_static)
+                        tile_scheduler.advance_to_next_work(advance_count=self.mma_warp_groups)
+                        work_tile = tile_scheduler.get_current_work()
+                    else:
+                        # varlen_k: step to the other warp group's tile first, since its
+                        # K-tile count is needed to advance ab_read_state past it.
+                        tile_scheduler.advance_to_next_work()
+                        work_tile = tile_scheduler.get_current_work()
+                        if work_tile.is_valid_tile:
+                            len_k = varlen_manager.len_k(batch_idx=work_tile.tile_idx[3])
+                            k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
+                            ab_read_state.advance_iters(k_tile_cnt)
+                            tile_scheduler.advance_to_next_work()
+                            work_tile = tile_scheduler.get_current_work()
+            # Wait for D store complete
+            if const_expr(not self.pingpong):
+                if is_tma_warp:
+                    epi_store_pipeline.producer_tail()
+        tctx.flush()
+    @cute.jit
+    def mma(
+        self,
+        ab_pipeline: cutlass.pipeline.PipelineAsync,
+        ab_read_state: cutlass.pipeline.PipelineState,
+        tiled_mma: cute.TiledMma,
+        acc: cute.Tensor,
+        k_tile_cnt: Int32,
+        smem_tiled_copy_A: cute.TiledCopy,
+        smem_tiled_copy_B: cute.TiledCopy,
+        tCsA_copy_view: cute.Tensor,
+        tCsB_copy_view: cute.Tensor,
+        tCrA: cute.Tensor,
+        tCrB: cute.Tensor,
+    ) -> cutlass.pipeline.PipelineState:
+        """Warp-level MMA mainloop: ldmatrix SMEM→RMEM + warp MMA.
+
+        Consumes ``k_tile_cnt`` stages of ``ab_pipeline`` starting at
+        ``ab_read_state``. Within each k-tile the loads for k-block ``k + 1``
+        are issued before the ``cute.gemm`` on k-block ``k``, so the fragment
+        for the next block is requested ahead of the multiply that uses the
+        current one. The last k-tile is peeled out of the loop because it has
+        no following tile to load from.
+
+        Args:
+            ab_pipeline: pipeline whose stages hold the A/B tiles.
+            ab_read_state: consumer state pointing at the first stage to read.
+            tiled_mma: tiled MMA passed to ``cute.gemm``.
+            acc: accumulator fragment, updated in place.
+            k_tile_cnt: number of k-tiles to consume.
+            smem_tiled_copy_A / smem_tiled_copy_B: tiled copies used for the
+                shared-memory to register loads of A and B.
+            tCsA_copy_view / tCsB_copy_view: source views of A and B; their
+                last mode is indexed by the pipeline stage.
+            tCrA / tCrB: register fragments of A and B; mode 2 is the k-block.
+
+        Returns:
+            ``ab_read_state`` advanced once per consumed k-tile.
+        """
+        tCrA_copy_view = smem_tiled_copy_A.retile(tCrA)
+        tCrB_copy_view = smem_tiled_copy_B.retile(tCrB)
+        load_sA = partial(cute.copy, smem_tiled_copy_A)
+        load_sB = partial(cute.copy, smem_tiled_copy_B)
+        num_k_blocks = cute.size(tCrA, mode=[2])
+        # Only poll the pipeline when there is at least one k-tile; otherwise the
+        # status stays True. NOTE(review): presumably consumer_wait does not block
+        # when handed a True peek status — confirm against the pipeline class.
+        peek_ab_full_status = Boolean(True)
+        if 0 < k_tile_cnt:
+            peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_read_state)
+        ab_pipeline.consumer_wait(ab_read_state, peek_ab_full_status)
+        # Load first k-block
+        tCsA_p = tCsA_copy_view[None, None, None, ab_read_state.index]
+        tCsB_p = tCsB_copy_view[None, None, None, ab_read_state.index]
+        load_sA(tCsA_p[None, None, 0], tCrA_copy_view[None, None, 0])
+        load_sB(tCsB_p[None, None, 0], tCrB_copy_view[None, None, 0])
+        # All k-tiles except the last one.
+        for k_tile in cutlass.range(k_tile_cnt - 1, unroll=1):
+            for k in cutlass.range_constexpr(num_k_blocks):
+                # Index of the k-block to load next; wraps to 0 at the end of the tile.
+                k_next = 0 if k + 1 == num_k_blocks else k + 1
+                if const_expr(k == num_k_blocks - 1):
+                    # Last k-block of this tile: release the current stage and switch the
+                    # source views to the next stage, so the k_next == 0 loads below read
+                    # from the next k-tile.
+                    # Don't need to sync_warp: the previous instruction was mma.sync from cute.gemm
+                    ab_pipeline.consumer_release(ab_read_state)
+                    ab_read_state.advance()
+                    peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_read_state)
+                    tCsA_p = tCsA_copy_view[None, None, None, ab_read_state.index]
+                    tCsB_p = tCsB_copy_view[None, None, None, ab_read_state.index]
+                    ab_pipeline.consumer_wait(ab_read_state, peek_ab_full_status)
+                load_sA(tCsA_p[None, None, k_next], tCrA_copy_view[None, None, k_next])
+                load_sB(tCsB_p[None, None, k_next], tCrB_copy_view[None, None, k_next])
+                cute.gemm(tiled_mma, acc, tCrA[None, None, k], tCrB[None, None, k], acc)
+        # Last k-tile (hoisted)
+        if 0 < k_tile_cnt:
+            for k in cutlass.range_constexpr(num_k_blocks):
+                k_next = 0 if k + 1 == num_k_blocks else k + 1
+                if const_expr(k == num_k_blocks - 1):
+                    # Release the final stage; no wait follows since no further tile is read.
+                    ab_pipeline.consumer_release(ab_read_state)
+                    ab_read_state.advance()
+                # Skip the wrap-around load (k_next == 0): there is no next k-tile.
+                if const_expr(k_next > 0):
+                    load_sA(tCsA_p[None, None, k_next], tCrA_copy_view[None, None, k_next])
+                    load_sB(tCsB_p[None, None, k_next], tCrB_copy_view[None, None, k_next])
+                cute.gemm(tiled_mma, acc, tCrA[None, None, k], tCrB[None, None, k], acc)
+        return ab_read_state
+    def epi_retile_acc(self, acc, tRS_rD, tiled_copy_r2s, tidx=None):
+        """Retile accumulator for epilogue. Warp-level MMA uses tiled_copy_r2s.retile."""
+        if tidx is None:
+            tidx = cute.arch.thread_idx()[0]
+        thr_copy_r2s = tiled_copy_r2s.get_slice(tidx)
+        self._epi_size_tRS_rD = cute.size(tRS_rD)
+        return thr_copy_r2s.retile(acc)
+    @cute.jit
+    def epi_load_acc_subtile(self, tRS_rAcc, tRS_rD, epi_idx):
+        """Load acc subtile using retile-based flat indexing (warp-level MMA layout)."""
+        size_rD = self._epi_size_tRS_rD
+        for i in cutlass.range_constexpr(size_rD):
+            tRS_rD[i] = tRS_rAcc[epi_idx * size_rD + i]

build/torch-cuda/quack/gemm_sm90.py CHANGED Viewed

@@ -1,3 +1,4 @@
 # Based on the cute-dsl example:
 # https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/hopper/dense_gemm.py
@@ -12,20 +13,24 @@ import cuda.bindings.driver as cuda
 import cutlass
 import cutlass.cute as cute
 import cutlass.pipeline as pipeline
 from cutlass.cute.nvgpu import cpasync, warp, warpgroup
 import cutlass.utils.hopper_helpers as sm90_utils
 from cutlass import Int32, Float32, Float16, Boolean, const_expr
-from cutlass.cutlass_dsl import if_generate
 from cutlass.utils import LayoutEnum
-from .cute_dsl_utils import ParamsBase, ArgumentsBase
 from .tile_scheduler import (
     TileSchedulerOptions,
     TileSchedulerArguments,
     TileScheduler,
     VarlenMTileSchedulerArguments,
     VarlenMTileScheduler,
 )
 from .varlen_utils import VarlenArguments, VarlenManager
@@ -33,6 +38,7 @@ from .varlen_utils import VarlenArguments, VarlenManager
 from .pipeline import make_pipeline_state, PipelineTmaCpAsync
 from . import copy_utils as copy_utils
 from . import sm90_utils as quack_sm90_utils
 """
 A high-performance batched dense GEMM (C = A * B) example for the NVIDIA Hopper architecture
@@ -122,9 +128,11 @@ class GemmSm90:
     """
     arch = 90
-    num_epi_tensormaps: int = 0
-    EpilogueArguments = ArgumentsBase
     EpilogueParams = ParamsBase
     def __init__(
@@ -137,6 +145,9 @@ class GemmSm90:
         is_persistent: bool = True,
         fp8_fast_accum: bool = False,
         gather_A: bool = False,
     ):
         """
         Initializes the configuration for a Hopper dense GEMM kernel.
@@ -155,10 +166,15 @@ class GemmSm90:
         self.acc_dtype = acc_dtype
         self.pingpong = pingpong
         self.is_persistent = is_persistent
         if self.pingpong:
             assert self.is_persistent, "Pingpong gemm requires persistent scheduler"
         self.fp8_slow_accum = not fp8_fast_accum and a_dtype.width == 8
         self.gather_A = gather_A
         if gather_A:
             assert cluster_shape_mnk[1] == 1, "Cluster shape N must be 1 for gather A "
@@ -224,10 +240,12 @@ class GemmSm90:
         self.threads_per_cta = (self.mma_warp_groups + 1) * self.num_threads_per_warp_group
         self.smem_capacity = cutlass.utils.get_smem_capacity_in_bytes("sm_90")
         self.num_epi_warps = (self.mma_warp_groups if not self.pingpong else 1) * 4
         self.num_ab_load_warps = 1 if not self.gather_A else 4
         self.ab_load_warp_id = self.mma_warp_groups * 4
-        # self.num_epi_load_threads = cute.arch.WARP_SIZE * 1
-        # self.epi_load_warp_id = self.ab_load_warp_id + self.num_ab_load_warps
         regs_per_thread = math.prod(self.cta_tile_shape_mnk[:2]) // (
             math.prod(self.atom_layout_mnk) * self.num_threads_per_warp_group
@@ -259,20 +277,8 @@ class GemmSm90:
         self.shared_storage = None
         self.buffer_align_bytes = 1024
-    def _setup_attributes(self, epilogue_args: EpilogueArguments):
-        """Set up configurations that are dependent on GEMM inputs
-        This method configures various attributes based on the input tensor properties
-        (data types, leading dimensions) and kernel settings:
-        - Configuring tiled MMA
-        - Computing MMA/cluster/tile shapes
-        - Computing cluster layout
-        - Computing multicast CTAs for A/B
-        - Computing epilogue subtile
-        - Setting up A/B/C stage counts in shared memory
-        - Computing A/B/C shared memory layout
-        """
         self.tiled_mma = sm90_utils.make_trivial_tiled_mma(
             self.a_dtype,
             self.b_dtype,
@@ -305,6 +311,21 @@ class GemmSm90:
             mma_inst_shape_k * mma_inst_tile_k,
         )
         self.cluster_layout_mnk = cute.make_layout(self.cluster_shape_mnk)
         self.epi_tile = self._sm90_compute_tile_shape_or_override(
@@ -324,8 +345,6 @@ class GemmSm90:
             epilogue_args,
             cutlass.utils.get_smem_capacity_in_bytes(f"sm_{self.arch}"),  # smem_capacity
             self.occupancy,
-            # epi_smem will reuse smem ab if not persistent.
-            overlap_sD_sA=not self.is_persistent,
         )
         self.sched_stage = 2 if self.pingpong else 1
@@ -357,10 +376,11 @@ class GemmSm90:
         mB: cute.Tensor,
         mD: Optional[cute.Tensor],
         mC: Optional[cute.Tensor],
-        epilogue_args: ArgumentsBase,
         scheduler_args: TileSchedulerOptions,
         varlen_args: Optional[VarlenArguments],
         stream: cuda.CUstream,
     ):
         """Execute the GEMM operation in steps:
         - Setup static attributes
@@ -379,6 +399,14 @@ class GemmSm90:
         :type stream: cuda.CUstream
         """
         # setup static attributes before smem/grid/tma computation
         self.a_dtype = mA.element_type
         self.b_dtype = mB.element_type
@@ -399,18 +427,8 @@ class GemmSm90:
         if const_expr(varlen_args is None):
             varlen_args = VarlenArguments()
         assert (varlen_args.mAIdx is not None) == self.gather_A
-        # Assume all strides are divisible by 128 bits except the last stride
-        new_stride = lambda t: tuple(
-            cute.assume(s, divby=128 // t.element_type.width) if not cute.is_static(s) else s
-            for s in t.stride
-        )
-        mA, mD = [
-            cute.make_tensor(t.iterator, cute.make_layout(t.shape, stride=new_stride(t)))
-            if t is not None
-            else None
-            for t in (mA, mD)
-        ]
         self._setup_attributes(epilogue_args)
@@ -419,13 +437,15 @@ class GemmSm90:
         tma_atom_a, tma_tensor_a = None, None
         if const_expr(not self.gather_A):
             tma_atom_a, tma_tensor_a = self._make_tma_atoms_and_tensors(
-                mA,
                 a_smem_layout,
                 (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]),
                 self.cluster_shape_mnk[1],
             )
         tma_atom_b, tma_tensor_b = self._make_tma_atoms_and_tensors(
-            mB,
             b_smem_layout,
             (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]),
             self.cluster_shape_mnk[0],
@@ -438,7 +458,13 @@ class GemmSm90:
         tma_atom_d, tma_tensor_d = None, None
         if const_expr(mD is not None):
             tma_atom_d, tma_tensor_d = self._make_tma_epi_atoms_and_tensors(
-                mD,
                 self.epi_smem_layout_staged,
                 self.epi_tile,
                 op_type="store"
@@ -454,16 +480,16 @@ class GemmSm90:
         epilogue_params = self.epi_to_underlying_arguments(epilogue_args)
         varlen_params = VarlenManager.to_underlying_arguments(varlen_args)
-        TileSchedulerCls = self.get_scheduler_class(varlen_m=varlen_args.mCuSeqlensM is not None)
-        tile_sched_args = self.get_scheduler_arguments(mA, mB, mD, scheduler_args, varlen_args)
         tile_sched_params = TileSchedulerCls.to_underlying_arguments(tile_sched_args)
         grid = TileSchedulerCls.get_grid_shape(
             tile_sched_params, scheduler_args.max_active_clusters
         )
-        epi_smem_size = (
-            cute.cosize(self.epi_smem_layout_staged) if self.is_persistent and mD is not None else 0
-        )
         epi_c_smem_size = cute.cosize(self.epi_c_smem_layout_staged) if mC is not None else 0
         @cute.struct
@@ -471,7 +497,7 @@ class GemmSm90:
             ab_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.ab_stage * 2]
             epi_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.epi_c_stage * 2]
             sched_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.sched_stage * 2]
-            tile_count: cute.struct.MemRange[Int32, self.sched_stage]
             sD: cute.struct.Align[
                 cute.struct.MemRange[
                     self.d_dtype if self.d_dtype is not None else Int32, epi_smem_size
@@ -516,12 +542,14 @@ class GemmSm90:
             self.epi_c_smem_layout_staged,
             tile_sched_params,
             TileSchedulerCls,
         ).launch(
             grid=grid,
             block=[self.threads_per_cta, 1, 1],
             cluster=self.cluster_shape_mnk,
             stream=stream,
             min_blocks_per_mp=1,
         )
         return
@@ -538,15 +566,16 @@ class GemmSm90:
         mD_mnl: Optional[cute.Tensor],
         tma_atom_c: Optional[cute.CopyAtom],
         mC_mnl: Optional[cute.Tensor],
-        epilogue_params: ParamsBase,
         varlen_params: VarlenManager.Params,
         cluster_layout_mnk: cute.Layout,
         a_smem_layout: cute.ComposedLayout,
         b_smem_layout: cute.ComposedLayout,
         epi_smem_layout: cute.ComposedLayout,
         epi_c_smem_layout: cute.ComposedLayout,
-        tile_sched_params: ParamsBase,
         TileSchedulerCls: cutlass.Constexpr[Callable],
     ):
         """
         GPU device kernel performing the batched GEMM computation.
@@ -575,6 +604,10 @@ class GemmSm90:
         :type epi_smem_layout: cute.ComposedLayout
         """
         varlen_m = const_expr(varlen_params.cu_seqlens_m is not None)
         varlen_k = const_expr(varlen_params.cu_seqlens_k is not None)
         assert not (varlen_m and varlen_k)
@@ -585,17 +618,13 @@ class GemmSm90:
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
-        # /////////////////////////////////////////////////////////////////////////////
-        #  Prefetch Tma desc
-        # /////////////////////////////////////////////////////////////////////////////
         if warp_idx == self.ab_load_warp_id:
             for tma_atom in (tma_atom_a, tma_atom_b, tma_atom_d, tma_atom_c):
                 if const_expr(tma_atom is not None):
                     cpasync.prefetch_descriptor(tma_atom)
-        # /////////////////////////////////////////////////////////////////////////////
-        #  Alloc and init AB full/empty + ACC full mbar (pipeline)
-        # /////////////////////////////////////////////////////////////////////////////
         smem = cutlass.utils.SmemAllocator()
         storage = smem.allocate(self.shared_storage)
@@ -611,28 +640,24 @@ class GemmSm90:
                 epi_pipeline_mbar_ptr=storage.epi_pipeline_array_ptr.data_ptr(),
             )
         sched_pipeline = None
-        tile_count = None
-        if const_expr(tile_sched_params.tile_count_semaphore is not None):
-            # Dynamic persistent scheduler
             sched_pipeline = self.make_sched_pipeline(
                 cluster_layout_mnk,
                 sched_pipeline_mbar_ptr=storage.sched_pipeline_array_ptr.data_ptr(),
                 varlen_k=varlen_k,
             )
-            tile_count = storage.tile_count.get_tensor((self.sched_stage,))
-        # ///////////////////////////////////////////////////////////////////////////////
-        #  Generate smem tensor A/B
-        # ///////////////////////////////////////////////////////////////////////////////
         sA = storage.sA.get_tensor(a_smem_layout.outer, swizzle=a_smem_layout.inner)
         sB = storage.sB.get_tensor(b_smem_layout.outer, swizzle=b_smem_layout.inner)
         sD = None
         if const_expr(has_D):
-            if const_expr(not self.is_persistent):
-                sD_ptr = cute.recast_ptr(sA.iterator, epi_smem_layout.inner, dtype=self.d_dtype)
-                sD = cute.make_tensor(sD_ptr, epi_smem_layout.outer)
-            else:
-                sD = storage.sD.get_tensor(epi_smem_layout.outer, swizzle=epi_smem_layout.inner)
         sC = None
         if const_expr(has_C):
             sC = storage.sC.get_tensor(epi_c_smem_layout.outer, swizzle=epi_c_smem_layout.inner)
@@ -640,37 +665,32 @@ class GemmSm90:
         varlen_manager = VarlenManager.create(
             varlen_params,
-            has_D,
-            self.num_epi_tensormaps,
             # Only used if not varlen_m
             len_m_static=Int32(
-                mA_mkl.shape[0]
                 if varlen_k or varlen_params.mAIdx is None
                 else varlen_params.mAIdx.shape[0]
             ),
-            len_k_static=Int32(mA_mkl.shape[1]),
-            pingpong=self.pingpong,
-            warp_idx=warp_idx,
         )
         TileSchedulerCls = partial(
-            TileSchedulerCls.create, tile_sched_params, tile_count, sched_pipeline
         )
         if warp_idx >= self.ab_load_warp_id:
-            cute.arch.warpgroup_reg_dealloc(self.num_regs_load)
             if (
                 warp_idx >= self.ab_load_warp_id
                 and warp_idx < self.ab_load_warp_id + self.num_ab_load_warps
             ):
-                is_tma_warp = self.num_ab_load_warps == 1 or warp_idx == self.ab_load_warp_id
-                # initialize tensormap for A & B
-                varlen_manager.init_tensormap_AB(tma_atom_a, tma_atom_b, is_tma_warp)
-                tma_desc_a_ptr = varlen_manager.get_tma_desc_a_ptr()
-                tma_desc_b_ptr = varlen_manager.get_tma_desc_b_ptr()
-                # ///////////////////////////////////////////////////////////////////////////////
                 # Get mcast mask
-                # ///////////////////////////////////////////////////////////////////////////////
                 cta_rank_in_cluster = cute.arch.make_warp_uniform(cute.arch.block_idx_in_cluster())
                 block_in_cluster_coord_mnk = cluster_layout_mnk.get_flat_coord(cta_rank_in_cluster)
                 a_mcast_mask = cute.make_layout_image_mask(
@@ -686,26 +706,17 @@ class GemmSm90:
                 is_scheduler_warp = self.num_ab_load_warps == 1 or warp_idx == self.ab_load_warp_id
                 if const_expr(cute.size(cluster_layout_mnk) > 1):
                     is_scheduler_warp = is_scheduler_warp and cute.arch.block_idx_in_cluster() == 0
-                tile_scheduler = TileSchedulerCls(is_scheduler_warp=is_scheduler_warp)
                 work_tile = tile_scheduler.initial_work_tile_info()
                 ab_producer_state = make_pipeline_state(
                     pipeline.PipelineUserType.Producer, self.ab_stage
                 )
-                if const_expr(varlen_k):
-                    # wait tensormap initialization complete before update
-                    varlen_manager.fence_tensormap_init()
                 while work_tile.is_valid_tile:
                     tile_coord_mnkl = work_tile.tile_idx
                     batch_idx = tile_coord_mnkl[3]
-                    varlen_manager.update_tensormap_AB(
-                        batch_idx,
-                        self.a_layout,
-                        self.b_layout,
-                        is_tma_warp,
-                    )
-                    # ///////////////////////////////////////////////////////////////////////////
-                    #  Local_tile partition global tensors
-                    # ///////////////////////////////////////////////////////////////////////////
                     if const_expr(not self.gather_A):
                         mA_mk = varlen_manager.offset_batch_A(mA_mkl, batch_idx)
                         # (bM, bK, RestK)
@@ -714,37 +725,7 @@ class GemmSm90:
                             cute.select(self.cta_tile_shape_mnk, [0, 2]),
                             (tile_coord_mnkl[0], None),
                         )
-                    else:
-                        mAIdx_mk = varlen_manager.offset_batch_AIdx(batch_idx)
-                        if const_expr(varlen_m):
-                            gAIdx = cute.local_tile(
-                                mAIdx_mk, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0],)
-                            )
-                            # (M, K)
-                            mA_mk = mA_mkl
-                        else:
-                            assert varlen_k
-                            # (tile_K, RestK)
-                            gAIdx = cute.flat_divide(mAIdx_mk, (self.cta_tile_shape_mnk[2],))
-                            # (tile_M, K)
-                            mA_mk = cute.local_tile(
-                                mA_mkl, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0], None)
-                            )
-                    # (bN, bK, RestK)
-                    gB_nk = cute.local_tile(
-                        varlen_manager.offset_batch_B(mB_nkl, batch_idx),
-                        cute.select(self.cta_tile_shape_mnk, [1, 2]),
-                        (tile_coord_mnkl[1], None),
-                    )
-                    # //////////////////////////////////////////////////////////////////////////
-                    #  Partition shared tensor for TMA load A/B
-                    # //////////////////////////////////////////////////////////////////////////
-                    varlen_manager.fence_tensormap_update_AB(is_tma_warp)
-                    len_m = varlen_manager.len_m(batch_idx)
-                    len_k = varlen_manager.len_k(batch_idx)
-                    #  TMA load A partition_S/D
-                    copy_A = None
-                    if const_expr(not self.gather_A):
                         copy_A, _, _ = copy_utils.tma_get_copy_fn(
                             tma_atom_a,
                             cta_coord=block_in_cluster_coord_mnk[1],
@@ -754,35 +735,17 @@ class GemmSm90:
                             src_tensor=gA_mk,
                             dst_tensor=sA,
                             mcast_mask=a_mcast_mask,
-                            tma_desc_ptr=tma_desc_a_ptr,
                         )
                     else:
-                        tiled_copy_A = self._make_gmem_tiled_copy_A(
-                            mA_mkl.element_type, self.a_layout, self.num_ab_load_warps * 32
-                        )
-                        tidx = (
-                            cute.arch.thread_idx()[0] - cute.arch.WARP_SIZE * self.ab_load_warp_id
                         )
-                        thr_copy_A = tiled_copy_A.get_slice(tidx)
-                        copy_A, prefetch_A = None, None
-                        if const_expr(varlen_m):
-                            copy_A = copy_utils.gather_m_get_copy_fn(
-                                thr_copy_A,
-                                mA_mk,
-                                sA,
-                                gAIdx,
-                                limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
-                                limit_k=len_k,
-                            )
-                        else:
-                            copy_A, prefetch_A = copy_utils.gather_k_get_copy_fn(
-                                thr_copy_A,
-                                mA_mk,
-                                sA,
-                                gAIdx,
-                                limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
-                                limit_k=len_k,
-                            )
                     # TMA load B partition_S/D
                     copy_B, _, _ = copy_utils.tma_get_copy_fn(
                         tma_atom_b,
@@ -793,8 +756,8 @@ class GemmSm90:
                         src_tensor=gB_nk,
                         dst_tensor=sB,
                         mcast_mask=b_mcast_mask,
-                        tma_desc_ptr=tma_desc_b_ptr,
                     )
                     k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
                     if const_expr(not self.gather_A):
                         ab_producer_state = self.load_AB(
@@ -810,56 +773,47 @@ class GemmSm90:
                             k_tile_cnt,
                             varlen_m=varlen_m,
                         )
-                    tile_scheduler.fetch_next_work(is_scheduler_warp=is_scheduler_warp)
                     tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
                     work_tile = tile_scheduler.get_current_work()
                     # End of persistent scheduler loop
                 if const_expr(self.pingpong and not varlen_k):
                     # Need to write the tile_idx to smem for the next WG in the pingpong mode
-                    tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
-                ab_pipeline.producer_tail(ab_producer_state)
                 if is_scheduler_warp:
                     tile_scheduler.producer_tail()
         if warp_idx < self.ab_load_warp_id:
-            cute.arch.warpgroup_reg_alloc(self.num_regs_mma)
             is_tma_warp = Boolean(
                 (not self.pingpong and warp_idx == 0)
                 or (self.pingpong and (warp_idx == 0 or warp_idx == 4))
             )
-            varlen_manager.init_tensormap_epi(
-                tma_atom_d, self.epi_get_tma_atoms(epilogue_params), is_tma_warp
-            )
-            tma_desc_d_ptr = varlen_manager.get_tma_desc_d_ptr()
-            tma_desc_epi_ptrs = varlen_manager.get_tma_desc_epi_ptrs()
-            # //////////////////////////////////////////////////////////////////////////////
-            #  Partition global tensor for TiledMMA_A/B/C
-            # //////////////////////////////////////////////////////////////////////////////
             tidx, _, _ = cute.arch.thread_idx()
             warp_group_idx = cute.arch.make_warp_uniform(tidx // self.num_threads_per_warp_group)
             if const_expr(self.pingpong):
                 tidx = tidx % self.num_threads_per_warp_group
             warp_group_thread_layout = cute.make_layout(
-                self.mma_warp_groups if not self.pingpong else 1,
                 stride=self.num_threads_per_warp_group,
             )
             thr_mma = tiled_mma.get_slice(
                 warp_group_thread_layout(warp_group_idx if not self.pingpong else 0)
             )
-            # //////////////////////////////////////////////////////////////////////////////
-            #  Make fragments
-            # //////////////////////////////////////////////////////////////////////////////
-            tCrA = tiled_mma.make_fragment_A(thr_mma.partition_A(sA))
-            tCrB = tiled_mma.make_fragment_B(thr_mma.partition_B(sB))
-            acc_shape = tiled_mma.partition_shape_C(
-                cute.select(self.cta_tile_shape_mnk, mode=[0, 1])
             )
-            acc = cute.make_fragment(acc_shape, self.acc_dtype)
             acc_slow = None
             if const_expr(self.fp8_slow_accum):
-                acc_slow = cute.make_fragment(acc_shape, self.acc_dtype)
             if const_expr(self.pingpong):
                 if warp_group_idx == 0:
@@ -867,7 +821,9 @@ class GemmSm90:
                     self.pingpong_barrier_arrive(warp_group_idx=0, stage="mma")
                     self.pingpong_barrier_arrive(warp_group_idx=0, stage="epi")
-            k_tile_cnt_static = cute.ceil_div(mA_mkl.shape[1], self.cta_tile_shape_mnk[2])
             c_tile_cnt = cute.size(cute.ceil_div(self.cta_tile_shape_mnk[:2], self.epi_tile))
             ab_read_state = make_pipeline_state(pipeline.PipelineUserType.Consumer, self.ab_stage)
@@ -879,10 +835,8 @@ class GemmSm90:
                 pipeline.PipelineUserType.Producer, self.epi_c_stage
             )
             tile_scheduler = TileSchedulerCls()
-            work_tile = None
             if const_expr(self.pingpong):
-                if const_expr(varlen_k):
-                    work_tile = tile_scheduler.initial_work_tile_info()
                 if warp_idx >= 4:
                     # Advance 2nd Math WG pipeline states to the end of 1st Math WG
                     epi_read_state.advance_iters(c_tile_cnt)
@@ -893,58 +847,29 @@ class GemmSm90:
                         len_k = varlen_manager.len_k(batch_idx=work_tile.tile_idx[3])
                         k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
                         ab_read_state.advance_iters(k_tile_cnt)
                     tile_scheduler.advance_to_next_work()
-                    if const_expr(varlen_k):
-                        work_tile = tile_scheduler.get_current_work()
-                if const_expr(not varlen_k):
-                    work_tile = tile_scheduler.initial_work_tile_info()
-            else:
-                work_tile = tile_scheduler.initial_work_tile_info()
-            if const_expr(varlen_m):
-                # wait tensormap initialization complete before update
-                varlen_manager.fence_tensormap_init()
             while work_tile.is_valid_tile:
                 tile_coord_mnkl = work_tile.tile_idx
                 batch_idx = tile_coord_mnkl[3]
-                epi_shapes, epi_orders = self.epi_get_tensormap_update_shapes_orders(
-                    epilogue_params, varlen_params.cu_seqlens_m, batch_idx
-                )
-                varlen_manager.update_tensormap_epi(
-                    batch_idx,
-                    self.d_layout,
-                    epi_shapes,
-                    epi_orders,
-                    is_tma_warp,
-                )
                 len_k = varlen_manager.len_k(batch_idx)
                 k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
-                ab_read_state, tiled_mma = self.mma(
-                    ab_pipeline,
-                    ab_read_state,
-                    tiled_mma,
-                    tCrA,
-                    tCrB,
-                    acc,
-                    acc_slow,
-                    k_tile_cnt,
-                    warp_group_idx,
                 )
                 if const_expr(varlen_k):
                     if k_tile_cnt == 0:
                         acc.fill(0.0)
-                # /////////////////////////////////////////////////////////////////////////////
-                #  EPILOGUE
-                # /////////////////////////////////////////////////////////////////////////////
                 if const_expr(self.pingpong):
                     self.pingpong_barrier_sync(warp_group_idx, "epi")
-                epilogue_barrier = pipeline.NamedBarrier(
-                    barrier_id=int(NamedBarrierGemm.Epilogue),
-                    num_threads=self.num_epi_warps * cute.arch.WARP_SIZE,
-                )
-                varlen_manager.fence_tensormap_update_epi(is_tma_warp)
                 copy_D = None
                 if const_expr(has_D):
@@ -955,7 +880,6 @@ class GemmSm90:
                         self.epi_tile,
                         sD,
                         tile_coord_mnkl,
-                        tma_desc_ptr=tma_desc_d_ptr,
                     )
                 copy_C = None
                 if const_expr(has_C):
@@ -973,8 +897,8 @@ class GemmSm90:
                 tiled_copy_r2s, tRS_rD, tRS_sD = self.epilog_smem_store_and_partition(
                     tiled_mma, self.d_layout, d_dtype_for_layout, sD, tidx
                 )
-                # (R2S, R2S_M, R2S_N)
-                tRS_rAcc = tiled_copy_r2s.retile(acc)
                 load_acc_subtile = partial(self.epi_load_acc_subtile, tRS_rAcc)
                 if const_expr(has_C):
                     tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC = self.epilog_smem_load_and_partition(
@@ -983,17 +907,11 @@ class GemmSm90:
                 else:
                     tiled_copy_s2r, tSR_sC, tRS_rC, tSR_rC = None, None, None, None
-                # Wait for all warp groups in the thread block to finish, because smem for tensor
-                # A in the mainloop is reused in the epilogue if not persistent.
-                if const_expr(not self.is_persistent):
-                    epilogue_barrier.arrive_and_wait()
                 self.epi_visit_acc(epilogue_params, acc, tiled_mma, tile_coord_mnkl, tidx)
                 epi_read_state, epi_producer_state = self.epilogue(
                     epilogue_params,
                     epi_smem_tensors,
-                    tma_desc_epi_ptrs,
                     epi_pipeline,
                     epi_store_pipeline,
                     epi_read_state,
@@ -1012,7 +930,7 @@ class GemmSm90:
                     copy_C,
                     tile_coord_mnkl,
                     varlen_manager,
-                    epilogue_barrier,
                     tile_scheduler,
                     tidx,
                     is_tma_warp,
@@ -1025,6 +943,7 @@ class GemmSm90:
                     if is_tma_warp:
                         epi_store_pipeline.producer_tail()
                     self.pingpong_barrier_arrive(1 - warp_group_idx, stage="epi")
                 if const_expr(not self.pingpong):
                     tile_scheduler.advance_to_next_work()
@@ -1049,11 +968,17 @@ class GemmSm90:
                             work_tile = tile_scheduler.get_current_work()
                 # End of persistent scheduler loop
             # Wait for D store complete
             if const_expr(not self.pingpong):
                 if is_tma_warp:
                     epi_store_pipeline.producer_tail()
     @cute.jit
     def load_AB(
         self,
@@ -1073,9 +998,7 @@ class GemmSm90:
         peek_ab_empty_status = Boolean(True)
         if 0 < k_tile_cnt:
             peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
-        # /////////////////////////////////////////////////////////////////////////
         # TMA load
-        # /////////////////////////////////////////////////////////////////////////
         for k_tile in cutlass.range(k_tile_cnt, unroll=1):
             # Wait for A/B buffers to be empty before loading into them
             # Also sets the transaction barrier for the A/B buffers
@@ -1112,9 +1035,7 @@ class GemmSm90:
         peek_ab_empty_status = Boolean(True)
         if 0 < k_tile_cnt:
             peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
-        # /////////////////////////////////////////////////////////////////////////
         # TMA load on B and cp.async on A
-        # /////////////////////////////////////////////////////////////////////////
         for k_tile in cutlass.range(k_tile_cnt - 1, unroll=1):
             prefetch_out = ()
             if const_expr(prefetch_A is not None):  # Prefetch early, even before smem is free
@@ -1122,11 +1043,7 @@ class GemmSm90:
             # Wait for A/B buffers to be empty before loading into them
             # Also sets the transaction barrier for the A/B buffers
             # A tiny bit faster to rotate the warp that does TMA
-            # However, for varlen_k, we must use the warp_idx == self.ab_load_warp_id
-            # since that's the warp that does the tensormap update.
-            is_tma_warp = warp_idx == self.ab_load_warp_id + (
-                (k_tile % self.num_ab_load_warps) if const_expr(varlen_m) else 0
-            )
             ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status, is_tma_warp)
             smem_idx = ab_producer_state.index
             # A bit faster to load B first while we calculate the indices for A
@@ -1146,9 +1063,7 @@ class GemmSm90:
             prefetch_out = ()
             if const_expr(prefetch_A is not None):  # Prefetch early, even before smem is free
                 prefetch_out = (prefetch_A(k_tile, pred=True),)
-            is_tma_warp = warp_idx == self.ab_load_warp_id + (
-                (k_tile % self.num_ab_load_warps) if const_expr(varlen_m) else 0
-            )
             ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status, is_tma_warp)
             smem_idx = ab_producer_state.index
             if is_tma_warp:
@@ -1159,41 +1074,78 @@ class GemmSm90:
             ab_producer_state.advance()
         return ab_producer_state
     @cute.jit
     def mma(
         self,
         ab_pipeline: cutlass.pipeline.PipelineAsync,
         ab_read_state: cutlass.pipeline.PipelineState,
-        tiled_mma: cute.TiledMma,
-        tCrA: cute.Tensor,
-        tCrB: cute.Tensor,
         acc: cute.Tensor,
         acc_slow: Optional[cute.Tensor],
         k_tile_cnt: Int32,
         warp_group_idx: Int32,
-    ) -> Tuple[cutlass.pipeline.PipelineState, cute.TiledMma]:
-        # /////////////////////////////////////////////////////////////////////////////
-        #  Prologue MMAs
-        # /////////////////////////////////////////////////////////////////////////////
         k_pipe_mmas = 1
         ab_release_state = ab_read_state.clone()
         num_prologue_mma = min(k_pipe_mmas, k_tile_cnt)
-        if const_expr(self.pingpong):
-            self.pingpong_barrier_sync(warp_group_idx, stage="mma")
         peek_ab_full_status = Boolean(True)
         if 0 < k_tile_cnt:
             peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_read_state)
-        tiled_mma.set(warpgroup.Field.ACCUMULATE, False)
-        num_k_blocks = cute.size(tCrA, mode=[2])
         for k_tile in cutlass.range(num_prologue_mma):
             # Wait for A/B buffer to be ready
             ab_pipeline.consumer_wait(ab_read_state, peek_ab_full_status)
-            warpgroup.fence()
-            for k_blk_idx in cutlass.range(num_k_blocks, unroll_full=True):
-                k_blk_coord = (None, None, k_blk_idx, ab_read_state.index)
-                cute.gemm(tiled_mma, acc, tCrA[k_blk_coord], tCrB[k_blk_coord], acc)
-                tiled_mma.set(warpgroup.Field.ACCUMULATE, True)
-            warpgroup.commit_group()
             ab_read_state.advance()
             peek_ab_full_status = Boolean(True)
             if k_tile + 1 < k_tile_cnt:
@@ -1204,21 +1156,14 @@ class GemmSm90:
             warpgroup.wait_group(0)
             acc_slow.store(acc.load())
-        # /////////////////////////////////////////////////////////////////////////////
-        #  MAINLOOP
-        # /////////////////////////////////////////////////////////////////////////////
         for k_tile in cutlass.range(num_prologue_mma, k_tile_cnt, unroll=1):
             # Wait for TMA copies to complete
             ab_pipeline.consumer_wait(ab_read_state, peek_ab_full_status)
-            # WGMMA
-            warpgroup.fence()
             if const_expr(self.fp8_slow_accum):
-                tiled_mma.set(warpgroup.Field.ACCUMULATE, False)
-            for k_blk_idx in cutlass.range(num_k_blocks, unroll_full=True):
-                k_blk_coord = (None, None, k_blk_idx, ab_read_state.index)
-                cute.gemm(tiled_mma, acc, tCrA[k_blk_coord], tCrB[k_blk_coord], acc)
-                tiled_mma.set(warpgroup.Field.ACCUMULATE, True)
-            warpgroup.commit_group()
             # Wait on the wgmma barrier for previous k_pipe_mmas wgmmas to complete
             if const_expr(not self.fp8_slow_accum):
                 warpgroup.wait_group(k_pipe_mmas)
@@ -1242,16 +1187,13 @@ class GemmSm90:
             ab_release_state.advance()
         if const_expr(self.fp8_slow_accum):
             acc.store(acc_slow.load())
-        # If we don't return the tiled_mma, we get compiler error
-        # "operand #0 does not dominate this use"
-        return ab_read_state, tiled_mma
     @cute.jit
     def epilogue(
         self,
         params: EpilogueParams,
         epi_smem_tensors: Tuple[cute.Tensor, ...],
-        tma_desc_epi_ptrs: list[Optional[cute.Pointer]],
         epi_pipeline: cutlass.pipeline.PipelineAsync,
         epi_store_pipeline: cutlass.pipeline.PipelineAsync,
         epi_read_state: cutlass.pipeline.PipelineState,
@@ -1277,6 +1219,18 @@ class GemmSm90:
     ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
         has_C = const_expr(tRS_rC is not None)
         has_D = const_expr(copy_D is not None)
         epi_tile_shape = cute.zipped_divide(
             cute.make_layout(self.cta_tile_shape_mnk[:2]), epi_tile
         ).shape[1]
@@ -1306,26 +1260,6 @@ class GemmSm90:
                     epi_pipeline.producer_commit(epi_producer_state)
                 epi_producer_state.advance()
-        def tma_store_fn(src_idx, dst_idx):
-            # Fence and barrier to make sure shared memory store is visible to TMA store
-            cute.arch.fence_proxy(
-                cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-            )
-            epilogue_barrier.arrive_and_wait()
-            # Copy from shared memory to global memory
-            if is_tma_warp:
-                if const_expr(has_D):
-                    copy_D(src_idx=src_idx, dst_idx=dst_idx)
-            # Can't use if statement here, epi_store_pipeline object isn't captured somehow
-            if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_commit())
-            if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_acquire())
-            epilogue_barrier.arrive_and_wait()
-        # We could delay the TMA store by 1 epi tile to better overlap the non-TMA ops
-        # with the TMA store. However, currently this doesn't seem to improve perf.
-        delay_tma_store = False
-        src_idx_prev, dst_idx_prev = None, None
         for epi_idx in cutlass.range_constexpr(epi_tile_num):
             # The global memory coordinate for the current epi tile
             gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
@@ -1336,9 +1270,7 @@ class GemmSm90:
                 epi_pipeline.consumer_wait(epi_read_state)
                 cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
                 # Fence to make sure shared memory read is visible to TMA load
-                cute.arch.fence_proxy(
-                    cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-                )
                 cute.arch.sync_warp()
                 with cute.arch.elect_one():
                     epi_pipeline.consumer_release(epi_read_state)
@@ -1350,20 +1282,63 @@ class GemmSm90:
                     copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
                     epi_pipeline.producer_commit(epi_producer_state)
                 epi_producer_state.advance()
-            tRS_rEpi = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
-            epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
-            if const_expr(delay_tma_store):
-                if const_expr(epi_idx > 0):
-                    tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)
-                src_idx_prev, dst_idx_prev = epi_buffer, gmem_coord
             # Copy from D registers to shared memory
             if const_expr(has_D):
-                copy_utils.cvt_copy(tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer])
-            if const_expr(not delay_tma_store):
-                tma_store_fn(src_idx=epi_buffer, dst_idx=gmem_coord)
-        if const_expr(delay_tma_store):
-            tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)
         self.epi_end(
             params,
@@ -1389,8 +1364,18 @@ class GemmSm90:
         mD: Optional[cute.Tensor],
         scheduler_args,
         varlen_args,
     ):
         """Create scheduler arguments. Override in subclasses for custom schedulers."""
         if const_expr(varlen_args.mCuSeqlensM is None):
             num_problems = (
                 mD.shape[2]
@@ -1402,8 +1387,8 @@ class GemmSm90:
                 )
             )
             problem_shape_ntile_mnl = (
-                cute.ceil_div(mA.shape[0], self.cta_tile_shape_mnk[0]),
-                cute.ceil_div(mB.shape[0], self.cta_tile_shape_mnk[1]),
                 num_problems,
             )
             tile_sched_args = TileSchedulerArguments(
@@ -1413,13 +1398,13 @@ class GemmSm90:
                 cluster_shape_mnk=self.cluster_shape_mnk,
                 tile_count_semaphore=scheduler_args.tile_count_semaphore,
                 batch_idx_permute=scheduler_args.batch_idx_permute,
-                is_persistent=self.is_persistent,
             )
         else:
-            assert mD is not None or not self.gather_A
             problem_shape_ntile_mnl = (
                 None,
-                cute.ceil_div(mB.shape[0], self.cta_tile_shape_mnk[1]),
                 varlen_args.mCuSeqlensM.shape[0] - 1,
             )
             tile_sched_args = VarlenMTileSchedulerArguments(
@@ -1431,14 +1416,17 @@ class GemmSm90:
                 tile_shape_mn=self.cta_tile_shape_mnk[:2],
                 cluster_shape_mnk=self.cluster_shape_mnk,
                 tile_count_semaphore=scheduler_args.tile_count_semaphore,
-                is_persistent=self.is_persistent,
             )
         return tile_sched_args
     @cute.jit
     def epi_load_acc_subtile(self, tRS_rAcc: cute.Tensor, tRS_rD: cute.Tensor, epi_idx: int):
-        for epi_v in cutlass.range_constexpr(cute.size(tRS_rD)):
-            tRS_rD[epi_v] = tRS_rAcc[epi_idx * cute.size(tRS_rD) + epi_v]
     @cute.jit
     def epi_begin(
@@ -1504,18 +1492,6 @@ class GemmSm90:
         """Subclasses can override this"""
         return []
-    def epi_get_tensormap_update_shapes_orders(
-        self,
-        params: EpilogueParams,
-        cu_seqlens_m: cute.Tensor,
-        batch_idx: Int32,
-        *,
-        loc=None,
-        ip=None,
-    ) -> tuple[list[Int32], list[int]]:
-        """Subclasses can override this"""
-        return [], []
     @staticmethod
     def epi_smem_bytes_per_stage(
         args: Optional[EpilogueArguments],
@@ -1579,7 +1555,7 @@ class GemmSm90:
         tRS_sD = thr_copy_r2s.partition_D(sD) if sD is not None else None
         sD_shape = sD.shape[:2] if sD is not None else self.epi_tile
         tRS_rD_shape = thr_copy_r2s.partition_S(cute.make_identity_tensor(sD_shape)).shape
-        tRS_rD = cute.make_fragment(tRS_rD_shape, self.acc_dtype)
         return tiled_copy_r2s, tRS_rD, tRS_sD
     def epilog_smem_load_and_partition(
@@ -1596,7 +1572,7 @@ class GemmSm90:
         tiled_copy_s2r = cute.make_tiled_copy_S(copy_atom_s2r, tiled_copy_C_atom)
         thr_copy_s2r = tiled_copy_s2r.get_slice(tidx)
         tSR_sC = thr_copy_s2r.partition_S(sC)
-        tRS_rC = cute.make_fragment(tRS_rD_layout, dtype)
         tSR_rC = thr_copy_s2r.retile(tRS_rC)
         return tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC
@@ -1608,7 +1584,6 @@ class GemmSm90:
         epi_tile: cute.Tile,
         sD: cute.Tensor,
         tile_coord_mnkl: cute.Coord,
-        tma_desc_ptr: Optional[cute.Pointer] = None,
     ) -> Tuple[cute.Tensor, cute.Tensor]:
         # (bM, bN)
         gD = cute.local_tile(mD_mn, tile_shape_mn, tile_coord_mnkl[:2])
@@ -1625,7 +1600,6 @@ class GemmSm90:
             cta_layout=cute.make_layout(1),
             src_tensor=src_tensor,
             dst_tensor=dst_tensor,
-            tma_desc_ptr=tma_desc_ptr,
         )
     def make_ab_pipeline(
@@ -1651,6 +1625,7 @@ class GemmSm90:
             consumer_group=ab_pipeline_consumer_group,
             tx_count=self.num_tma_load_bytes,
             cta_layout_vmnk=cluster_layout_vmnk,
         )
     def make_epi_pipeline(
@@ -1670,6 +1645,7 @@ class GemmSm90:
             producer_group=epi_pipeline_producer_group,
             consumer_group=epi_pipeline_consumer_group,
             tx_count=tma_copy_c_bytes,
         )
     def make_epi_store_pipeline(self):
@@ -1686,13 +1662,13 @@ class GemmSm90:
         # Threads/warps participating in this pipeline
         sched_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
         cluster_size = cute.size(cluster_layout_mnk)
-        # Each warp that is not the scheduler warp will contribute 1 to the arrive count
         # If pingpong and varlen_k, then all 8 mma warps will participate in the scheduler barrier
         # at each round. If pingpong and not varlen_k, then only 4 mma warps will participate.
         consumer_arrive_cnt = (
             (self.mma_warp_groups if not (self.pingpong and not varlen_k) else 1) * 4
             + self.num_ab_load_warps
-        ) * cluster_size - 1
         sched_pipeline_consumer_group = pipeline.CooperativeGroup(
             pipeline.Agent.Thread, consumer_arrive_cnt
         )
@@ -1703,6 +1679,7 @@ class GemmSm90:
             consumer_group=sched_pipeline_consumer_group,
             # If there's cluster, the consumers must arrive at the mbar of CTA 0 in the cluster.
             consumer_mask=None if const_expr(cluster_size == 1) else 0,
         )
     @classmethod
@@ -1717,7 +1694,6 @@ class GemmSm90:
         epilogue_args: EpilogueArguments,
         smem_capacity: int,
         occupancy: int,
-        overlap_sD_sA: bool = False,
     ) -> Tuple[int, int]:
         """Computes the number of stages for A/B/C operands based on heuristics.
@@ -1738,16 +1714,11 @@ class GemmSm90:
         """
         epi_stage = 4 if epi_tile[1] <= 16 else 2
-        if overlap_sD_sA:
-            epi_bytes = 0
-        else:
-            d_bytes_per_stage = (
-                cute.size(epi_tile) * d_dtype.width // 8 if d_dtype is not None else 0
-            )
-            epi_bytes_per_stage = d_bytes_per_stage + cls.epi_smem_bytes_per_stage(
-                epilogue_args, cta_tile_shape_mnk, epi_tile
-            )
-            epi_bytes = epi_bytes_per_stage * epi_stage
         epi_c_stage = 0 if c_dtype is None else (4 if epi_tile[1] <= 16 else 2)
         if c_dtype is not None:
             epi_bytes += cute.size(epi_tile) * c_dtype.width // 8 * epi_c_stage
@@ -1765,7 +1736,7 @@ class GemmSm90:
         # Refine epilogue stages:
         # Calculate remaining smem after allocating for A/B stages and reserved bytes
         # Add remaining unused smem to epilogue
-        if not overlap_sD_sA and epi_bytes_per_stage > 0:
             epi_stage += (remaining_bytes - ab_bytes_per_stage * ab_stage) // epi_bytes_per_stage
         return ab_stage, epi_stage, epi_c_stage
@@ -2030,20 +2001,10 @@ class GemmSm90:
         :rtype: bool
         """
         is_valid = True
-        if a_dtype not in {
-            Float16,
-            cutlass.BFloat16,
-            cutlass.Float8E4M3FN,
-            cutlass.Float8E5M2,
-        }:
             is_valid = False
         # tested b_dtype
-        if b_dtype not in {
-            Float16,
-            cutlass.BFloat16,
-            cutlass.Float8E4M3FN,
-            cutlass.Float8E5M2,
-        }:
             is_valid = False
         if acc_dtype not in {Float32, Float16}:
             is_valid = False

+# Copyright (c) 2025-2026, Tri Dao.
 # Based on the cute-dsl example:
 # https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/hopper/dense_gemm.py
 import cutlass
 import cutlass.cute as cute
 import cutlass.pipeline as pipeline
+from cutlass.pipeline import pipeline_init_arrive, pipeline_init_wait
 from cutlass.cute.nvgpu import cpasync, warp, warpgroup
 import cutlass.utils.hopper_helpers as sm90_utils
 from cutlass import Int32, Float32, Float16, Boolean, const_expr
 from cutlass.utils import LayoutEnum
+from dataclasses import dataclass
+from .cute_dsl_utils import ParamsBase
+from . import layout_utils
 from .tile_scheduler import (
     TileSchedulerOptions,
     TileSchedulerArguments,
     TileScheduler,
     VarlenMTileSchedulerArguments,
     VarlenMTileScheduler,
+    PersistenceMode,
 )
 from .varlen_utils import VarlenArguments, VarlenManager
 from .pipeline import make_pipeline_state, PipelineTmaCpAsync
 from . import copy_utils as copy_utils
 from . import sm90_utils as quack_sm90_utils
+from .rounding import RoundingMode
 """
 A high-performance batched dense GEMM (C = A * B) example for the NVIDIA Hopper architecture
     """
     arch = 90
+    @dataclass
+    class EpilogueArguments:
+        pass
     EpilogueParams = ParamsBase
     def __init__(
         is_persistent: bool = True,
         fp8_fast_accum: bool = False,
         gather_A: bool = False,
+        use_clc_persistence: bool = False,
+        concat_layout: tuple | None = None,
+        use_pdl: bool = True,
     ):
         """
         Initializes the configuration for a Hopper dense GEMM kernel.
         self.acc_dtype = acc_dtype
         self.pingpong = pingpong
         self.is_persistent = is_persistent
+        self.use_clc_persistence = use_clc_persistence
+        if self.use_clc_persistence:
+            assert self.arch == 100
+        self.use_pdl = use_pdl
         if self.pingpong:
             assert self.is_persistent, "Pingpong gemm requires persistent scheduler"
         self.fp8_slow_accum = not fp8_fast_accum and a_dtype.width == 8
         self.gather_A = gather_A
+        self.concat_layout = concat_layout or ()
         if gather_A:
             assert cluster_shape_mnk[1] == 1, "Cluster shape N must be 1 for gather A "
         self.threads_per_cta = (self.mma_warp_groups + 1) * self.num_threads_per_warp_group
         self.smem_capacity = cutlass.utils.get_smem_capacity_in_bytes("sm_90")
         self.num_epi_warps = (self.mma_warp_groups if not self.pingpong else 1) * 4
+        self.epilogue_barrier = pipeline.NamedBarrier(
+            barrier_id=int(NamedBarrierGemm.Epilogue),
+            num_threads=self.num_epi_warps * cute.arch.WARP_SIZE,
+        )
         self.num_ab_load_warps = 1 if not self.gather_A else 4
         self.ab_load_warp_id = self.mma_warp_groups * 4
         regs_per_thread = math.prod(self.cta_tile_shape_mnk[:2]) // (
             math.prod(self.atom_layout_mnk) * self.num_threads_per_warp_group
         self.shared_storage = None
         self.buffer_align_bytes = 1024
+    def _setup_tiled_mma(self):
+        """Set up tiled MMA and tile K dimension. Override for different MMA types."""
         self.tiled_mma = sm90_utils.make_trivial_tiled_mma(
             self.a_dtype,
             self.b_dtype,
             mma_inst_shape_k * mma_inst_tile_k,
         )
+    def _setup_attributes(self, epilogue_args: EpilogueArguments):
+        """Set up configurations that are dependent on GEMM inputs
+        This method configures various attributes based on the input tensor properties
+        (data types, leading dimensions) and kernel settings:
+        - Configuring tiled MMA
+        - Computing MMA/cluster/tile shapes
+        - Computing cluster layout
+        - Computing multicast CTAs for A/B
+        - Computing epilogue subtile
+        - Setting up A/B/C stage counts in shared memory
+        - Computing A/B/C shared memory layout
+        """
+        self._setup_tiled_mma()
         self.cluster_layout_mnk = cute.make_layout(self.cluster_shape_mnk)
         self.epi_tile = self._sm90_compute_tile_shape_or_override(
             epilogue_args,
             cutlass.utils.get_smem_capacity_in_bytes(f"sm_{self.arch}"),  # smem_capacity
             self.occupancy,
         )
         self.sched_stage = 2 if self.pingpong else 1
         mB: cute.Tensor,
         mD: Optional[cute.Tensor],
         mC: Optional[cute.Tensor],
+        epilogue_args: tuple,
         scheduler_args: TileSchedulerOptions,
         varlen_args: Optional[VarlenArguments],
         stream: cuda.CUstream,
+        trace_ptr: Optional[cutlass.Int64] = None,
     ):
         """Execute the GEMM operation in steps:
         - Setup static attributes
         :type stream: cuda.CUstream
         """
+        # Concat layout: interleave the non-contiguous dim (detected via leading_dim).
+        mA, mB, mD, mC = [
+            layout_utils.concat_to_interleave(mT, 1 - mT.leading_dim)
+            if const_expr(name in self.concat_layout and mT is not None)
+            else mT
+            for name, mT in [("A", mA), ("B", mB), ("out", mD), ("C", mC)]
+        ]
         # setup static attributes before smem/grid/tma computation
         self.a_dtype = mA.element_type
         self.b_dtype = mB.element_type
         if const_expr(varlen_args is None):
             varlen_args = VarlenArguments()
         assert (varlen_args.mAIdx is not None) == self.gather_A
+        varlen_m = varlen_args.mCuSeqlensM is not None
+        varlen_k = varlen_args.mCuSeqlensK is not None
         self._setup_attributes(epilogue_args)
         tma_atom_a, tma_tensor_a = None, None
         if const_expr(not self.gather_A):
             tma_atom_a, tma_tensor_a = self._make_tma_atoms_and_tensors(
+                copy_utils.create_ragged_tensor_for_tma(mA, ragged_dim=1)
+                if varlen_k and not self.gather_A
+                else mA,
                 a_smem_layout,
                 (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]),
                 self.cluster_shape_mnk[1],
             )
         tma_atom_b, tma_tensor_b = self._make_tma_atoms_and_tensors(
+            copy_utils.create_ragged_tensor_for_tma(mB, ragged_dim=1) if varlen_k else mB,
             b_smem_layout,
             (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]),
             self.cluster_shape_mnk[0],
         tma_atom_d, tma_tensor_d = None, None
         if const_expr(mD is not None):
             tma_atom_d, tma_tensor_d = self._make_tma_epi_atoms_and_tensors(
+                copy_utils.create_ragged_tensor_for_tma(
+                    mD,
+                    ragged_dim=0,
+                    ptr_shift=True,
+                )
+                if varlen_m
+                else mD,
                 self.epi_smem_layout_staged,
                 self.epi_tile,
                 op_type="store"
         epilogue_params = self.epi_to_underlying_arguments(epilogue_args)
         varlen_params = VarlenManager.to_underlying_arguments(varlen_args)
+        TileSchedulerCls = self.get_scheduler_class(varlen_m=varlen_m)
+        tile_sched_args = self.get_scheduler_arguments(
+            mA, mB, mD, scheduler_args, varlen_args, epilogue_args
+        )
         tile_sched_params = TileSchedulerCls.to_underlying_arguments(tile_sched_args)
         grid = TileSchedulerCls.get_grid_shape(
             tile_sched_params, scheduler_args.max_active_clusters
         )
+        epi_smem_size = cute.cosize(self.epi_smem_layout_staged) if mD is not None else 0
         epi_c_smem_size = cute.cosize(self.epi_c_smem_layout_staged) if mC is not None else 0
         @cute.struct
             ab_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.ab_stage * 2]
             epi_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.epi_c_stage * 2]
             sched_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.sched_stage * 2]
+            sched_data: cute.struct.MemRange[Int32, self.sched_stage * 4]
             sD: cute.struct.Align[
                 cute.struct.MemRange[
                     self.d_dtype if self.d_dtype is not None else Int32, epi_smem_size
             self.epi_c_smem_layout_staged,
             tile_sched_params,
             TileSchedulerCls,
+            trace_ptr,
         ).launch(
             grid=grid,
             block=[self.threads_per_cta, 1, 1],
             cluster=self.cluster_shape_mnk,
             stream=stream,
             min_blocks_per_mp=1,
+            use_pdl=self.use_pdl,
         )
         return
         mD_mnl: Optional[cute.Tensor],
         tma_atom_c: Optional[cute.CopyAtom],
         mC_mnl: Optional[cute.Tensor],
+        epilogue_params,
         varlen_params: VarlenManager.Params,
         cluster_layout_mnk: cute.Layout,
         a_smem_layout: cute.ComposedLayout,
         b_smem_layout: cute.ComposedLayout,
         epi_smem_layout: cute.ComposedLayout,
         epi_c_smem_layout: cute.ComposedLayout,
+        tile_sched_params,
         TileSchedulerCls: cutlass.Constexpr[Callable],
+        trace_ptr: Optional[cutlass.Int64] = None,
     ):
         """
         GPU device kernel performing the batched GEMM computation.
         :type epi_smem_layout: cute.ComposedLayout
         """
+        from .trace import TraceContext
+        tctx = TraceContext.create(trace_ptr)
         varlen_m = const_expr(varlen_params.cu_seqlens_m is not None)
         varlen_k = const_expr(varlen_params.cu_seqlens_k is not None)
         assert not (varlen_m and varlen_k)
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
+        # Prefetch Tma desc
         if warp_idx == self.ab_load_warp_id:
             for tma_atom in (tma_atom_a, tma_atom_b, tma_atom_d, tma_atom_c):
                 if const_expr(tma_atom is not None):
                     cpasync.prefetch_descriptor(tma_atom)
+        # Alloc and init AB full/empty + ACC full mbar (pipeline)
         smem = cutlass.utils.SmemAllocator()
         storage = smem.allocate(self.shared_storage)
                 epi_pipeline_mbar_ptr=storage.epi_pipeline_array_ptr.data_ptr(),
             )
         sched_pipeline = None
+        sched_data = None
+        if const_expr(self.is_persistent):
             sched_pipeline = self.make_sched_pipeline(
                 cluster_layout_mnk,
                 sched_pipeline_mbar_ptr=storage.sched_pipeline_array_ptr.data_ptr(),
                 varlen_k=varlen_k,
             )
+            sched_data = storage.sched_data.get_tensor((4, self.sched_stage))
+        # Cluster arrive after barrier init
+        pipeline_init_arrive(cluster_shape_mn=self.cluster_shape_mnk[:-1], is_relaxed=True)
+        # Generate smem tensor A/B
         sA = storage.sA.get_tensor(a_smem_layout.outer, swizzle=a_smem_layout.inner)
         sB = storage.sB.get_tensor(b_smem_layout.outer, swizzle=b_smem_layout.inner)
         sD = None
         if const_expr(has_D):
+            sD = storage.sD.get_tensor(epi_smem_layout.outer, swizzle=epi_smem_layout.inner)
         sC = None
         if const_expr(has_C):
             sC = storage.sC.get_tensor(epi_c_smem_layout.outer, swizzle=epi_c_smem_layout.inner)
         varlen_manager = VarlenManager.create(
             varlen_params,
             # Only used if not varlen_m
             len_m_static=Int32(
+                cute.size(mA_mkl, mode=[0])
                 if varlen_k or varlen_params.mAIdx is None
                 else varlen_params.mAIdx.shape[0]
             ),
+            len_k_static=Int32(cute.size(mA_mkl, mode=[1])),
         )
         TileSchedulerCls = partial(
+            TileSchedulerCls.create, tile_sched_params, sched_data, sched_pipeline
         )
+        # Cluster wait for barrier init
+        pipeline_init_wait(cluster_shape_mn=self.cluster_shape_mnk[:-1])
         if warp_idx >= self.ab_load_warp_id:
+            cute.arch.setmaxregister_decrease(self.num_regs_load)
             if (
                 warp_idx >= self.ab_load_warp_id
                 and warp_idx < self.ab_load_warp_id + self.num_ab_load_warps
             ):
+                # PDL: wait for prior kernel before any TMA loads (matches cutlass C++ sm90 mainloop producer)
+                if const_expr(self.use_pdl):
+                    cute.arch.griddepcontrol_wait()
                 # Get mcast mask
                 cta_rank_in_cluster = cute.arch.make_warp_uniform(cute.arch.block_idx_in_cluster())
                 block_in_cluster_coord_mnk = cluster_layout_mnk.get_flat_coord(cta_rank_in_cluster)
                 a_mcast_mask = cute.make_layout_image_mask(
                 is_scheduler_warp = self.num_ab_load_warps == 1 or warp_idx == self.ab_load_warp_id
                 if const_expr(cute.size(cluster_layout_mnk) > 1):
                     is_scheduler_warp = is_scheduler_warp and cute.arch.block_idx_in_cluster() == 0
+                tile_scheduler = TileSchedulerCls()
                 work_tile = tile_scheduler.initial_work_tile_info()
                 ab_producer_state = make_pipeline_state(
                     pipeline.PipelineUserType.Producer, self.ab_stage
                 )
                 while work_tile.is_valid_tile:
+                    tctx.b("tma_load")
                     tile_coord_mnkl = work_tile.tile_idx
                     batch_idx = tile_coord_mnkl[3]
+                    # Local_tile partition global tensors
+                    copy_A, prefetch_A = None, None
                     if const_expr(not self.gather_A):
                         mA_mk = varlen_manager.offset_batch_A(mA_mkl, batch_idx)
                         # (bM, bK, RestK)
                             cute.select(self.cta_tile_shape_mnk, [0, 2]),
                             (tile_coord_mnkl[0], None),
                         )
+                        # TMA load A partition_S/D
                         copy_A, _, _ = copy_utils.tma_get_copy_fn(
                             tma_atom_a,
                             cta_coord=block_in_cluster_coord_mnk[1],
                             src_tensor=gA_mk,
                             dst_tensor=sA,
                             mcast_mask=a_mcast_mask,
                         )
                     else:
+                        copy_A, prefetch_A = self._make_gather_A_copy(
+                            mA_mkl, sA, varlen_manager, tile_coord_mnkl, batch_idx
                         )
+                    # (bN, bK, RestK)
+                    gB_nk = cute.local_tile(
+                        varlen_manager.offset_batch_B(mB_nkl, batch_idx),
+                        cute.select(self.cta_tile_shape_mnk, [1, 2]),
+                        (tile_coord_mnkl[1], None),
+                    )
                     # TMA load B partition_S/D
                     copy_B, _, _ = copy_utils.tma_get_copy_fn(
                         tma_atom_b,
                         src_tensor=gB_nk,
                         dst_tensor=sB,
                         mcast_mask=b_mcast_mask,
                     )
+                    len_k = varlen_manager.len_k(batch_idx)
                     k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
                     if const_expr(not self.gather_A):
                         ab_producer_state = self.load_AB(
                             k_tile_cnt,
                             varlen_m=varlen_m,
                         )
+                    tctx.e("tma_load")
                     tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
                     work_tile = tile_scheduler.get_current_work()
                     # End of persistent scheduler loop
                 if const_expr(self.pingpong and not varlen_k):
                     # Need to write the tile_idx to smem for the next WG in the pingpong mode
+                    if is_scheduler_warp:
+                        tile_scheduler.write_work_tile_to_smem(work_tile)
+                    work_tile = tile_scheduler.get_current_work()
+                if warp_idx == self.ab_load_warp_id:
+                    ab_pipeline.producer_tail(ab_producer_state)
                 if is_scheduler_warp:
                     tile_scheduler.producer_tail()
         if warp_idx < self.ab_load_warp_id:
+            cute.arch.setmaxregister_increase(self.num_regs_mma)
             is_tma_warp = Boolean(
                 (not self.pingpong and warp_idx == 0)
                 or (self.pingpong and (warp_idx == 0 or warp_idx == 4))
             )
+            # Partition global tensor for TiledMMA_A/B/C
             tidx, _, _ = cute.arch.thread_idx()
             warp_group_idx = cute.arch.make_warp_uniform(tidx // self.num_threads_per_warp_group)
             if const_expr(self.pingpong):
                 tidx = tidx % self.num_threads_per_warp_group
             warp_group_thread_layout = cute.make_layout(
+                self.mma_warp_groups if const_expr(not self.pingpong) else 1,
                 stride=self.num_threads_per_warp_group,
             )
             thr_mma = tiled_mma.get_slice(
                 warp_group_thread_layout(warp_group_idx if not self.pingpong else 0)
             )
+            # Make fragments
+            acc, tCrA, tCrB = quack_sm90_utils.partition_fragment_ABC(
+                thr_mma, self.cta_tile_shape_mnk, sA, sB
             )
             acc_slow = None
             if const_expr(self.fp8_slow_accum):
+                acc_slow = cute.make_rmem_tensor(acc.shape, self.acc_dtype)
+            mma_fn = partial(quack_sm90_utils.gemm_w_idx, tiled_mma, acc, tCrA, tCrB)
             if const_expr(self.pingpong):
                 if warp_group_idx == 0:
                     self.pingpong_barrier_arrive(warp_group_idx=0, stage="mma")
                     self.pingpong_barrier_arrive(warp_group_idx=0, stage="epi")
+            k_tile_cnt_static = cute.ceil_div(
+                cute.size(mA_mkl, mode=[1]), self.cta_tile_shape_mnk[2]
+            )
             c_tile_cnt = cute.size(cute.ceil_div(self.cta_tile_shape_mnk[:2], self.epi_tile))
             ab_read_state = make_pipeline_state(pipeline.PipelineUserType.Consumer, self.ab_stage)
                 pipeline.PipelineUserType.Producer, self.epi_c_stage
             )
             tile_scheduler = TileSchedulerCls()
+            work_tile = tile_scheduler.initial_work_tile_info()
             if const_expr(self.pingpong):
                 if warp_idx >= 4:
                     # Advance 2nd Math WG pipeline states to the end of 1st Math WG
                     epi_read_state.advance_iters(c_tile_cnt)
                         len_k = varlen_manager.len_k(batch_idx=work_tile.tile_idx[3])
                         k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
                         ab_read_state.advance_iters(k_tile_cnt)
+                    # TODO: do we need to check if work_tile is valid?
                     tile_scheduler.advance_to_next_work()
+                    work_tile = tile_scheduler.get_current_work()
             while work_tile.is_valid_tile:
                 tile_coord_mnkl = work_tile.tile_idx
                 batch_idx = tile_coord_mnkl[3]
                 len_k = varlen_manager.len_k(batch_idx)
                 k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
+                if const_expr(self.pingpong):
+                    self.pingpong_barrier_sync(warp_group_idx, stage="mma")
+                tctx.b("mma")
+                ab_read_state = self.mma(
+                    ab_pipeline, ab_read_state, mma_fn, acc, acc_slow, k_tile_cnt, warp_group_idx
                 )
                 if const_expr(varlen_k):
                     if k_tile_cnt == 0:
                         acc.fill(0.0)
+                tctx.e("mma")
+                # EPILOGUE
                 if const_expr(self.pingpong):
                     self.pingpong_barrier_sync(warp_group_idx, "epi")
+                tctx.b("epilogue")
                 copy_D = None
                 if const_expr(has_D):
                         self.epi_tile,
                         sD,
                         tile_coord_mnkl,
                     )
                 copy_C = None
                 if const_expr(has_C):
                 tiled_copy_r2s, tRS_rD, tRS_sD = self.epilog_smem_store_and_partition(
                     tiled_mma, self.d_layout, d_dtype_for_layout, sD, tidx
                 )
+                # (R2S, R2S_M, R2S_N, num_epi)
+                tRS_rAcc = self.epi_retile_acc(acc, tRS_rD, tiled_copy_r2s)
                 load_acc_subtile = partial(self.epi_load_acc_subtile, tRS_rAcc)
                 if const_expr(has_C):
                     tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC = self.epilog_smem_load_and_partition(
                 else:
                     tiled_copy_s2r, tSR_sC, tRS_rC, tSR_rC = None, None, None, None
                 self.epi_visit_acc(epilogue_params, acc, tiled_mma, tile_coord_mnkl, tidx)
                 epi_read_state, epi_producer_state = self.epilogue(
                     epilogue_params,
                     epi_smem_tensors,
                     epi_pipeline,
                     epi_store_pipeline,
                     epi_read_state,
                     copy_C,
                     tile_coord_mnkl,
                     varlen_manager,
+                    self.epilogue_barrier,
                     tile_scheduler,
                     tidx,
                     is_tma_warp,
                     if is_tma_warp:
                         epi_store_pipeline.producer_tail()
                     self.pingpong_barrier_arrive(1 - warp_group_idx, stage="epi")
+                tctx.e("epilogue")
                 if const_expr(not self.pingpong):
                     tile_scheduler.advance_to_next_work()
                             work_tile = tile_scheduler.get_current_work()
                 # End of persistent scheduler loop
+            # PDL: hint next kernel to launch (matches cutlass C++ sm90 consumer)
+            if const_expr(self.use_pdl):
+                cute.arch.griddepcontrol_launch_dependents()
             # Wait for D store complete
             if const_expr(not self.pingpong):
                 if is_tma_warp:
                     epi_store_pipeline.producer_tail()
+        tctx.flush()
     @cute.jit
     def load_AB(
         self,
         peek_ab_empty_status = Boolean(True)
         if 0 < k_tile_cnt:
             peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
         # TMA load
         for k_tile in cutlass.range(k_tile_cnt, unroll=1):
             # Wait for A/B buffers to be empty before loading into them
             # Also sets the transaction barrier for the A/B buffers
         peek_ab_empty_status = Boolean(True)
         if 0 < k_tile_cnt:
             peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
         # TMA load on B and cp.async on A
         for k_tile in cutlass.range(k_tile_cnt - 1, unroll=1):
             prefetch_out = ()
             if const_expr(prefetch_A is not None):  # Prefetch early, even before smem is free
             # Wait for A/B buffers to be empty before loading into them
             # Also sets the transaction barrier for the A/B buffers
             # A tiny bit faster to rotate the warp that does TMA
+            is_tma_warp = warp_idx == self.ab_load_warp_id + (k_tile % self.num_ab_load_warps)
             ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status, is_tma_warp)
             smem_idx = ab_producer_state.index
             # A bit faster to load B first while we calculate the indices for A
             prefetch_out = ()
             if const_expr(prefetch_A is not None):  # Prefetch early, even before smem is free
                 prefetch_out = (prefetch_A(k_tile, pred=True),)
+            is_tma_warp = warp_idx == self.ab_load_warp_id + k_tile % self.num_ab_load_warps
             ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status, is_tma_warp)
             smem_idx = ab_producer_state.index
             if is_tma_warp:
             ab_producer_state.advance()
         return ab_producer_state
+    @cute.jit
+    def _make_gather_A_copy(
+        self,
+        mA_mkl: cute.Tensor,
+        sA: cute.Tensor,
+        varlen_manager: VarlenManager,
+        tile_coord_mnkl,
+        batch_idx: Int32,
+    ):
+        """Create copy_A and prefetch_A for gather_A (shared by SM90/SM120 DMA).
+
+        The branch is chosen at trace time from ``varlen_manager.varlen_m``:
+
+        * varlen-M: the A index tensor is tiled along the first CTA tile extent and
+          ``copy_utils.gather_m_get_copy_fn`` supplies ``copy_A``; no prefetch
+          function is produced.
+        * otherwise: the A index tensor is divided by the third CTA tile extent and
+          ``copy_utils.gather_k_get_copy_fn`` supplies both ``copy_A`` and
+          ``prefetch_A``.
+
+        Args:
+            mA_mkl: source tensor for operand A.
+            sA: destination tensor handed to the copy helpers
+                (presumably the shared-memory staging buffer for A; confirm with caller).
+            varlen_manager: provides ``varlen_m``, the per-batch A index tensor and
+                the per-batch M/K lengths.
+            tile_coord_mnkl: tile coordinate; only element 0 is read here.
+            batch_idx: batch index forwarded to the ``varlen_manager`` lookups.
+
+        Returns:
+            ``(copy_A, prefetch_A)``; ``prefetch_A`` is ``None`` in the varlen-M case.
+        """
+        varlen_m = varlen_manager.varlen_m
+        mAIdx_mk = varlen_manager.offset_batch_AIdx(batch_idx)
+        if const_expr(varlen_m):
+            # Gather along M: take this tile's slice of the index tensor, keep A untiled.
+            gAIdx = cute.local_tile(mAIdx_mk, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0],))
+            mA_mk = mA_mkl
+        else:
+            # Gather along K: split the index tensor into tile-sized chunks and
+            # tile A along its first mode for the current tile coordinate.
+            gAIdx = cute.flat_divide(mAIdx_mk, (self.cta_tile_shape_mnk[2],))
+            mA_mk = cute.local_tile(
+                mA_mkl, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0], None)
+            )
+        len_m = varlen_manager.len_m(batch_idx)
+        len_k = varlen_manager.len_k(batch_idx)
+        # The tiled copy spans every thread of the A/B load warps (32 threads per warp).
+        tiled_copy_A = self._make_gmem_tiled_copy_A(
+            mA_mkl.element_type, self.a_layout, self.num_ab_load_warps * 32
+        )
+        # Thread index relative to the first thread of warp `ab_load_warp_id`.
+        dma_tidx = cute.arch.thread_idx()[0] - cute.arch.WARP_SIZE * self.ab_load_warp_id
+        thr_copy_A = tiled_copy_A.get_slice(dma_tidx)
+        copy_A, prefetch_A = None, None
+        if const_expr(varlen_m):
+            copy_A = copy_utils.gather_m_get_copy_fn(
+                thr_copy_A,
+                mA_mk,
+                sA,
+                gAIdx,
+                # Rows left from the start of this tile to the end of the batch's M extent.
+                limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
+                limit_k=len_k,
+            )
+        else:
+            copy_A, prefetch_A = copy_utils.gather_k_get_copy_fn(
+                thr_copy_A,
+                mA_mk,
+                sA,
+                gAIdx,
+                # Rows left from the start of this tile to the end of the batch's M extent.
+                limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
+                limit_k=len_k,
+            )
+        return copy_A, prefetch_A
     @cute.jit
     def mma(
         self,
         ab_pipeline: cutlass.pipeline.PipelineAsync,
         ab_read_state: cutlass.pipeline.PipelineState,
+        mma_fn: Callable,
         acc: cute.Tensor,
         acc_slow: Optional[cute.Tensor],
         k_tile_cnt: Int32,
         warp_group_idx: Int32,
+    ) -> cutlass.pipeline.PipelineState:
+        # Prologue MMAs
         k_pipe_mmas = 1
         ab_release_state = ab_read_state.clone()
         num_prologue_mma = min(k_pipe_mmas, k_tile_cnt)
         peek_ab_full_status = Boolean(True)
         if 0 < k_tile_cnt:
             peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_read_state)
+        zero_init = Boolean(True)
         for k_tile in cutlass.range(num_prologue_mma):
             # Wait for A/B buffer to be ready
             ab_pipeline.consumer_wait(ab_read_state, peek_ab_full_status)
+            mma_fn(A_idx=ab_read_state.index, B_idx=ab_read_state.index, zero_init=zero_init)
+            zero_init = Boolean(False)
             ab_read_state.advance()
             peek_ab_full_status = Boolean(True)
             if k_tile + 1 < k_tile_cnt:
             warpgroup.wait_group(0)
             acc_slow.store(acc.load())
+        # MAINLOOP
         for k_tile in cutlass.range(num_prologue_mma, k_tile_cnt, unroll=1):
             # Wait for TMA copies to complete
             ab_pipeline.consumer_wait(ab_read_state, peek_ab_full_status)
             if const_expr(self.fp8_slow_accum):
+                zero_init = Boolean(True)
+            mma_fn(A_idx=ab_read_state.index, B_idx=ab_read_state.index, zero_init=zero_init)
+            zero_init = Boolean(False)
             # Wait on the wgmma barrier for previous k_pipe_mmas wgmmas to complete
             if const_expr(not self.fp8_slow_accum):
                 warpgroup.wait_group(k_pipe_mmas)
             ab_release_state.advance()
         if const_expr(self.fp8_slow_accum):
             acc.store(acc_slow.load())
+        return ab_read_state
     @cute.jit
     def epilogue(
         self,
         params: EpilogueParams,
         epi_smem_tensors: Tuple[cute.Tensor, ...],
         epi_pipeline: cutlass.pipeline.PipelineAsync,
         epi_store_pipeline: cutlass.pipeline.PipelineAsync,
         epi_read_state: cutlass.pipeline.PipelineState,
     ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
         has_C = const_expr(tRS_rC is not None)
         has_D = const_expr(copy_D is not None)
+        # Setup postact output (returns None for default epilogue, context tuple for Act)
+        postact_ctx = self.epi_setup_postact(
+            params,
+            epi_smem_tensors,
+            tiled_copy_r2s,
+            tiled_copy_t2r,
+            tile_coord_mnkl,
+            varlen_manager,
+            tidx,
+        )
         epi_tile_shape = cute.zipped_divide(
             cute.make_layout(self.cta_tile_shape_mnk[:2]), epi_tile
         ).shape[1]
                     epi_pipeline.producer_commit(epi_producer_state)
                 epi_producer_state.advance()
         for epi_idx in cutlass.range_constexpr(epi_tile_num):
             # The global memory coordinate for the current epi tile
             gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
                 epi_pipeline.consumer_wait(epi_read_state)
                 cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
                 # Fence to make sure shared memory read is visible to TMA load
+                cute.arch.fence_view_async_shared()
                 cute.arch.sync_warp()
                 with cute.arch.elect_one():
                     epi_pipeline.consumer_release(epi_read_state)
                     copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
                     epi_pipeline.producer_commit(epi_producer_state)
                 epi_producer_state.advance()
+            tRS_rPostAct = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
+            # Convert and store postact if this epilogue produces one
+            if const_expr(postact_ctx is not None):
+                tRS_rPostAct_out = self.epi_convert_postact(
+                    tRS_rPostAct,
+                    epi_loop_tensors["sr_seed"],
+                    tidx,
+                    tile_coord_mnkl,
+                    num_prev_subtiles,
+                    epi_idx,
+                )
+            if is_tma_warp:
+                epi_store_pipeline.producer_acquire()
+            epilogue_barrier.arrive_and_wait()
             # Copy from D registers to shared memory
+            epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
             if const_expr(has_D):
+                if const_expr(
+                    self.rounding_mode == RoundingMode.RS
+                    and self.acc_dtype == cutlass.Float32
+                    and self.d_dtype == cutlass.BFloat16
+                ):
+                    seed = epi_loop_tensors["sr_seed"] + (
+                        tile_coord_mnkl[0] * 65537
+                        + tile_coord_mnkl[1] * 257
+                        + tile_coord_mnkl[3] * 17
+                        + (num_prev_subtiles + epi_idx) * 7
+                    )
+                    copy_utils.sr_cvt_copy(
+                        tiled_copy_r2s,
+                        tRS_rD,
+                        tRS_sD[None, None, None, epi_buffer],
+                        seed,
+                        tidx,
+                    )
+                else:
+                    copy_utils.cvt_copy(
+                        tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer]
+                    )
+            # Copy postact from registers to shared memory
+            if const_expr(postact_ctx is not None):
+                tiled_copy_postact_r2s, tRS_sPostAct, copy_postact = postact_ctx
+                cute.copy(
+                    tiled_copy_postact_r2s,
+                    tiled_copy_postact_r2s.retile(tRS_rPostAct_out),
+                    tRS_sPostAct[None, None, None, epi_buffer],
+                )
+            # Fence and barrier to make sure shared memory store is visible to TMA store
+            cute.arch.fence_view_async_shared()
+            epilogue_barrier.arrive_and_wait()
+            # Copy from shared memory to global memory
+            if is_tma_warp:
+                if const_expr(has_D):
+                    copy_D(src_idx=epi_buffer, dst_idx=gmem_coord)
+                if const_expr(postact_ctx is not None):
+                    copy_postact(src_idx=epi_buffer, dst_idx=gmem_coord)
+                epi_store_pipeline.producer_commit()
         self.epi_end(
             params,
         mD: Optional[cute.Tensor],
         scheduler_args,
         varlen_args,
+        epilogue_args,
     ):
         """Create scheduler arguments. Override in subclasses for custom schedulers."""
+        if const_expr(not self.is_persistent):
+            persistence_mode = PersistenceMode.NONE
+        else:
+            if const_expr(self.arch >= 100 and self.use_clc_persistence):
+                persistence_mode = PersistenceMode.CLC
+            elif const_expr(scheduler_args.tile_count_semaphore is not None):
+                persistence_mode = PersistenceMode.DYNAMIC
+            else:
+                persistence_mode = PersistenceMode.STATIC
         if const_expr(varlen_args.mCuSeqlensM is None):
             num_problems = (
                 mD.shape[2]
                 )
             )
             problem_shape_ntile_mnl = (
+                cute.ceil_div(cute.size(mA, mode=[0]), self.cta_tile_shape_mnk[0]),
+                cute.ceil_div(cute.size(mB, mode=[0]), self.cta_tile_shape_mnk[1]),
                 num_problems,
             )
             tile_sched_args = TileSchedulerArguments(
                 cluster_shape_mnk=self.cluster_shape_mnk,
                 tile_count_semaphore=scheduler_args.tile_count_semaphore,
                 batch_idx_permute=scheduler_args.batch_idx_permute,
+                persistence_mode=persistence_mode,
             )
         else:
+            assert (mD is not None) or (epilogue_args.mPostAct is not None) or (not self.gather_A)
             problem_shape_ntile_mnl = (
                 None,
+                cute.ceil_div(cute.size(mB, mode=[0]), self.cta_tile_shape_mnk[1]),
                 varlen_args.mCuSeqlensM.shape[0] - 1,
             )
             tile_sched_args = VarlenMTileSchedulerArguments(
                 tile_shape_mn=self.cta_tile_shape_mnk[:2],
                 cluster_shape_mnk=self.cluster_shape_mnk,
                 tile_count_semaphore=scheduler_args.tile_count_semaphore,
+                persistence_mode=persistence_mode,
             )
         return tile_sched_args
+    def epi_retile_acc(self, acc, tRS_rD, tiled_copy_r2s):
+        """Retile accumulator for epilogue subtile access. SM90 uses flat_divide.
+
+        The accumulator is divided by the layout of ``tRS_rD``. ``tiled_copy_r2s``
+        is accepted but not used by this implementation.
+        """
+        return cute.flat_divide(acc, tRS_rD.layout)
     @cute.jit
     def epi_load_acc_subtile(self, tRS_rAcc: cute.Tensor, tRS_rD: cute.Tensor, epi_idx: int):
+        cute.autovec_copy(tRS_rAcc[None, None, None, epi_idx], tRS_rD)
     @cute.jit
     def epi_begin(
         """Subclasses can override this"""
         return []
     @staticmethod
     def epi_smem_bytes_per_stage(
         args: Optional[EpilogueArguments],
         tRS_sD = thr_copy_r2s.partition_D(sD) if sD is not None else None
         sD_shape = sD.shape[:2] if sD is not None else self.epi_tile
         tRS_rD_shape = thr_copy_r2s.partition_S(cute.make_identity_tensor(sD_shape)).shape
+        tRS_rD = cute.make_rmem_tensor(tRS_rD_shape, self.acc_dtype)
         return tiled_copy_r2s, tRS_rD, tRS_sD
     def epilog_smem_load_and_partition(
         tiled_copy_s2r = cute.make_tiled_copy_S(copy_atom_s2r, tiled_copy_C_atom)
         thr_copy_s2r = tiled_copy_s2r.get_slice(tidx)
         tSR_sC = thr_copy_s2r.partition_S(sC)
+        tRS_rC = cute.make_rmem_tensor(tRS_rD_layout, dtype)
         tSR_rC = thr_copy_s2r.retile(tRS_rC)
         return tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC
         epi_tile: cute.Tile,
         sD: cute.Tensor,
         tile_coord_mnkl: cute.Coord,
     ) -> Tuple[cute.Tensor, cute.Tensor]:
         # (bM, bN)
         gD = cute.local_tile(mD_mn, tile_shape_mn, tile_coord_mnkl[:2])
             cta_layout=cute.make_layout(1),
             src_tensor=src_tensor,
             dst_tensor=dst_tensor,
         )
     def make_ab_pipeline(
             consumer_group=ab_pipeline_consumer_group,
             tx_count=self.num_tma_load_bytes,
             cta_layout_vmnk=cluster_layout_vmnk,
+            defer_sync=True,
         )
     def make_epi_pipeline(
             producer_group=epi_pipeline_producer_group,
             consumer_group=epi_pipeline_consumer_group,
             tx_count=tma_copy_c_bytes,
+            defer_sync=True,
         )
     def make_epi_store_pipeline(self):
         # Threads/warps participating in this pipeline
         sched_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
         cluster_size = cute.size(cluster_layout_mnk)
+        # Each warp will contribute 1 to the arrive count
         # If pingpong and varlen_k, then all 8 mma warps will participate in the scheduler barrier
         # at each round. If pingpong and not varlen_k, then only 4 mma warps will participate.
         consumer_arrive_cnt = (
             (self.mma_warp_groups if not (self.pingpong and not varlen_k) else 1) * 4
             + self.num_ab_load_warps
+        ) * cluster_size
         sched_pipeline_consumer_group = pipeline.CooperativeGroup(
             pipeline.Agent.Thread, consumer_arrive_cnt
         )
             consumer_group=sched_pipeline_consumer_group,
             # If there's cluster, the consumers must arrive at the mbar of CTA 0 in the cluster.
             consumer_mask=None if const_expr(cluster_size == 1) else 0,
+            defer_sync=True,
         )
     @classmethod
         epilogue_args: EpilogueArguments,
         smem_capacity: int,
         occupancy: int,
     ) -> Tuple[int, int]:
         """Computes the number of stages for A/B/C operands based on heuristics.
         """
         epi_stage = 4 if epi_tile[1] <= 16 else 2
+        d_bytes_per_stage = cute.size(epi_tile) * d_dtype.width // 8 if d_dtype is not None else 0
+        epi_bytes_per_stage = d_bytes_per_stage + cls.epi_smem_bytes_per_stage(
+            epilogue_args, cta_tile_shape_mnk, epi_tile
+        )
+        epi_bytes = epi_bytes_per_stage * epi_stage
         epi_c_stage = 0 if c_dtype is None else (4 if epi_tile[1] <= 16 else 2)
         if c_dtype is not None:
             epi_bytes += cute.size(epi_tile) * c_dtype.width // 8 * epi_c_stage
         # Refine epilogue stages:
         # Calculate remaining smem after allocating for A/B stages and reserved bytes
         # Add remaining unused smem to epilogue
+        if epi_bytes_per_stage > 0:
             epi_stage += (remaining_bytes - ab_bytes_per_stage * ab_stage) // epi_bytes_per_stage
         return ab_stage, epi_stage, epi_c_stage
         :rtype: bool
         """
         is_valid = True
+        if a_dtype not in {Float16, cutlass.BFloat16, cutlass.Float8E4M3FN, cutlass.Float8E5M2}:
             is_valid = False
         # tested b_dtype
+        if b_dtype not in {Float16, cutlass.BFloat16, cutlass.Float8E4M3FN, cutlass.Float8E5M2}:
             is_valid = False
         if acc_dtype not in {Float32, Float16}:
             is_valid = False

build/torch-cuda/quack/gemm_sq_reduce.py ADDED Viewed

	@@ -0,0 +1,259 @@

+# Copyright (c) 2025-2026, Tri Dao.
+# GEMM with column vector reduction of squared output and optional rowvec scaling:
+# D_raw = A @ B (+ C), reduce[m] = sum_n(D_raw[m,n]^2), D_out = D_raw * rowvec.
+from typing import NamedTuple, Optional
+from torch import Tensor
+import cutlass
+import cutlass.cute as cute
+from cutlass import Float32, const_expr
+from .cute_dsl_utils import (
+    mlir_namedtuple,
+    torch2cute_dtype_map,
+    get_device_capacity,
+    get_max_active_clusters,
+)
+from .epi_ops import ColVecReduce, colvec_reduce_accumulate, vec_multiply
+from .gemm_sm90 import GemmSm90
+from .gemm_sm100 import GemmSm100
+from .gemm_sm120 import GemmSm120
+from .gemm_default_epi import GemmDefaultEpiMixin
+from .rounding import RoundingMode
+from .compile_utils import make_fake_tensor as fake_tensor
+from .cache_utils import jit_cache
+from .gemm_tvm_ffi_utils import (
+    get_majors,
+    get_dtypes,
+    perm3d,
+    make_scheduler_args,
+    make_varlen_args,
+    make_fake_scheduler_args,
+    make_fake_varlen_args,
+    make_fake_gemm_tensors,
+    compile_gemm_kernel,
+)
+from . import utils as utils
+class GemmSqReduceMixin(GemmDefaultEpiMixin):
+    """GEMM + sq_reduce + optional rowvec scaling.
+    D_raw = A @ B (+ C), reduce[m] = sum_n(D_raw[m,n]^2), D_out = D_raw * rowvec.
+    The sq_sum is computed BEFORE the rowvec scaling.
+    """
+    # The default epilogue ops plus a column-vector reduction bound to "mColVecReduce".
+    _epi_ops = (*GemmDefaultEpiMixin._epi_ops, ColVecReduce("mColVecReduce"))
+    @mlir_namedtuple
+    class EpilogueArguments(NamedTuple):
+        # Every field is optional; `epi_visit_subtile` skips alpha/beta when they are None.
+        alpha: Optional[Float32 | cute.Tensor] = None
+        beta: Optional[Float32 | cute.Tensor] = None
+        mRowVecBroadcast: Optional[cute.Tensor] = None
+        mColVecBroadcast: Optional[cute.Tensor] = None
+        mColVecReduce: Optional[cute.Tensor] = None
+        # Compile-time constants (cutlass.Constexpr).
+        add_to_output: cutlass.Constexpr[bool] = False
+        rounding_mode: cutlass.Constexpr[int] = RoundingMode.RN
+        sr_seed: None = None
+    # EpilogueParams auto-generated from _epi_ops
+    def epi_to_underlying_arguments(self, args, *, loc=None, ip=None):
+        """Record the rounding mode on ``self`` and build ``EpilogueParams`` from ``args``.
+
+        The params are built from the dict returned by ``_epi_ops_to_params_dict``.
+        ``loc`` and ``ip`` are accepted but not used in this body.
+        """
+        self.rounding_mode = args.rounding_mode
+        d = self._epi_ops_to_params_dict(args)
+        return self.EpilogueParams(**d)
+    @cute.jit
+    def epi_visit_subtile(self, params, epi_loop_tensors, tRS_rD, tRS_rC=None):
+        """Apply the epilogue to one subtile held in ``tRS_rD`` (updated in place).
+
+        Order of operations: scale by alpha (if given), add C (scaled by beta if
+        given), accumulate the column-vector reduction, then multiply by the row
+        vector. Always returns ``None``.
+        """
+        tDrColVecReduce = epi_loop_tensors["mColVecReduce"]
+        tDrRowVec = epi_loop_tensors["mRowVecBroadcast"]
+        # Load accumulator, apply alpha/beta/C (skip rowvec/colvec — we handle rowvec below)
+        rD = tRS_rD.load()
+        if const_expr(hasattr(params, "alpha") and params.alpha is not None):
+            alpha = utils.load_scalar_or_pointer(params.alpha)
+            rD *= alpha
+        if const_expr(tRS_rC is not None):
+            # C is converted to the element type of tRS_rD before being added.
+            if const_expr(not hasattr(params, "beta") or params.beta is None):
+                rD += tRS_rC.load().to(tRS_rD.element_type)
+            else:
+                beta = utils.load_scalar_or_pointer(params.beta)
+                rD += beta * tRS_rC.load().to(tRS_rD.element_type)
+        # Write the combined value back so the helpers below read it from tRS_rD.
+        tRS_rD.store(rD)
+        # Accumulate sq_sum BEFORE rowvec scaling: reduce[m] += sum_n(D[m,n]^2)
+        # tRS_rD is passed both as the value and as rScale, which yields the square.
+        colvec_reduce_accumulate(self, tDrColVecReduce, tRS_rD, rScale=tRS_rD)
+        # Multiply by rowvec (norm_weight) AFTER sq_sum
+        # NOTE(review): the None argument is presumably the column-vector slot; confirm in epi_ops.
+        vec_multiply(self, tRS_rD, None, tDrRowVec)
+        return None
+class GemmSqReduceSm90(GemmSqReduceMixin, GemmSm90):
+    """The GemmSqReduceMixin epilogue combined with the GemmSm90 kernel."""
+    pass
+class GemmSqReduceSm100(GemmSqReduceMixin, GemmSm100):
+    """The GemmSqReduceMixin epilogue combined with the GemmSm100 kernel."""
+    pass
+class GemmSqReduceSm120(GemmSqReduceMixin, GemmSm120):
+    """The GemmSqReduceMixin epilogue combined with the GemmSm120 kernel."""
+    pass
+@jit_cache
+def _compile_gemm_sq_reduce(
+    a_dtype,
+    b_dtype,
+    d_dtype,
+    c_dtype,
+    a_major,
+    b_major,
+    d_major,
+    c_major,
+    tile_shape_mn,
+    cluster_shape_mnk,
+    pingpong,
+    persistent,
+    is_dynamic_persistent,
+    colvec_reduce_dtype,
+    colvec_reduce_ndim,
+    rowvec_dtype,
+    device_capacity,
+):
+    """Compile the GEMM + squared-reduce kernel for the given configuration.
+
+    Builds fake (placeholder) operand tensors from the dtypes and majors, picks
+    the kernel class from ``device_capacity[0]``, and returns the result of
+    ``compile_gemm_kernel``. No real tensor data is passed in.
+
+    NOTE(review): ``jit_cache`` presumably memoizes on the argument tuple, so
+    every argument should be hashable; confirm in cache_utils.
+    """
+    # Keyed by the first element of device_capacity; 10 and 11 share the SM100 class.
+    sm_to_cls = {
+        9: GemmSqReduceSm90,
+        10: GemmSqReduceSm100,
+        11: GemmSqReduceSm100,
+        12: GemmSqReduceSm120,
+    }
+    GemmCls = sm_to_cls[device_capacity[0]]
+    mA, mB, mD, mC, m, n, k, l = make_fake_gemm_tensors(
+        a_dtype,
+        b_dtype,
+        d_dtype,
+        c_dtype,
+        a_major,
+        b_major,
+        d_major,
+        c_major,
+    )
+    # Size of the last mode of the reduction output, obtained from cute.sym_int().
+    n_tiles = cute.sym_int()
+    if colvec_reduce_ndim == 3:
+        # Batched reduction output: (l, m, n_tiles).
+        mColVecReduce = fake_tensor(
+            colvec_reduce_dtype,
+            (l, m, n_tiles),
+            leading_dim=2,
+            divisibility=1,
+        )
+    else:
+        # Unbatched reduction output: (m, n_tiles).
+        mColVecReduce = fake_tensor(
+            colvec_reduce_dtype,
+            (m, n_tiles),
+            leading_dim=1,
+            divisibility=1,
+        )
+    # NOTE(review): rowvec_dtype is None when the caller has no rowvec; the behaviour of
+    # fake_tensor for a None dtype is not visible here; confirm in compile_utils.
+    mRowVec = fake_tensor(rowvec_dtype, (l, n), leading_dim=1, divisibility=4)
+    epi_args = GemmCls.EpilogueArguments(
+        mRowVecBroadcast=mRowVec,
+        mColVecReduce=mColVecReduce,
+    )
+    # The first flag is only true for dynamic persistence on SM90.
+    scheduler_args = make_fake_scheduler_args(
+        (is_dynamic_persistent and device_capacity[0] == 9), False, l
+    )
+    varlen_args = make_fake_varlen_args(False, False, False, None)
+    # NOTE(review): the literal False below is positional; its meaning is defined by
+    # compile_gemm_kernel's signature, which is not visible here.
+    return compile_gemm_kernel(
+        GemmCls,
+        a_dtype,
+        tile_shape_mn,
+        cluster_shape_mnk,
+        pingpong,
+        persistent,
+        False,
+        is_dynamic_persistent,
+        device_capacity,
+        mA,
+        mB,
+        mD,
+        mC,
+        epi_args,
+        scheduler_args,
+        varlen_args,
+    )
+def gemm_sq_reduce(
+    A: Tensor,  # (l, m, k)
+    B: Tensor,  # (l, n, k)
+    D: Tensor,  # (l, m, n)
+    C: Optional[Tensor],  # (l, m, n)
+    colvec_reduce: Tensor,  # (l, m, ceildiv(n, tile_n))
+    tile_count_semaphore: Optional[Tensor],  # (1,)
+    tile_M: int,
+    tile_N: int,
+    cluster_M: int,
+    cluster_N: int,
+    pingpong: bool = False,
+    persistent: bool = True,
+    is_dynamic_persistent: bool = False,
+    max_swizzle_size: int = 8,
+    rowvec: Optional[Tensor] = None,  # (l, n) — norm_weight
+) -> None:
+    """GEMM + sq_reduce + optional rowvec scaling.
+    D_raw = A @ B (+ C), colvec_reduce[m] = sum_n(D_raw[m,n]^2), D_out = D_raw * rowvec.
+
+    Compiles (or fetches) the kernel for the tensors' dtypes and majors and the
+    tile/cluster configuration, then launches it. Returns nothing; results are
+    presumably written into ``D`` and ``colvec_reduce`` by the kernel.
+
+    Args:
+        A, B, D, C: GEMM operands with the shapes noted in the signature; ``C`` may be None.
+        colvec_reduce: reduction output; its dtype and ``ndim`` are forwarded to the compile step.
+        tile_count_semaphore: must be provided for dynamic persistence on SM90.
+        tile_M, tile_N: tile shape passed to the compile step as ``(tile_M, tile_N)``.
+        cluster_M, cluster_N: cluster shape passed as ``(cluster_M, cluster_N, 1)``.
+        pingpong, persistent, is_dynamic_persistent: forwarded to the compile step.
+        max_swizzle_size: forwarded to ``make_scheduler_args``.
+        rowvec: optional row vector; only its dtype is used at compile time.
+
+    Raises:
+        AssertionError: if the device capability major is not 9, 10, 11 or 12, or if
+            dynamic persistence is requested on SM90 without a semaphore.
+        NotImplementedError: on devices with capability major 12.
+    """
+    device_capacity = get_device_capacity(A.device)
+    assert device_capacity[0] in [9, 10, 11, 12], "Only SM90, SM100, SM110, and SM120 are supported"
+    if device_capacity[0] == 12:
+        raise NotImplementedError("SM120 GEMM sq reduce epilogue is not yet supported")
+    # Majors come from the permuted tensors; dtypes come from the originals.
+    A_p, B_p, D_p, C_p = perm3d(A, B, D, C)
+    a_major, b_major, d_major, c_major = get_majors(A_p, B_p, D_p, C_p)
+    a_dtype, b_dtype, d_dtype, c_dtype = get_dtypes(A, B, D, C)
+    if is_dynamic_persistent and device_capacity[0] == 9:
+        assert tile_count_semaphore is not None, (
+            "Dynamic persistent tile scheduler in SM90 requires a semaphore in GMEM"
+        )
+    compiled_fn = _compile_gemm_sq_reduce(
+        a_dtype,
+        b_dtype,
+        d_dtype,
+        c_dtype,
+        a_major,
+        b_major,
+        d_major,
+        c_major,
+        (tile_M, tile_N),
+        (cluster_M, cluster_N, 1),
+        pingpong,
+        persistent,
+        is_dynamic_persistent,
+        torch2cute_dtype_map[colvec_reduce.dtype],
+        colvec_reduce.ndim,
+        torch2cute_dtype_map[rowvec.dtype] if rowvec is not None else None,
+        device_capacity,
+    )
+    # Imported at call time, so the flag's current value is read on every call.
+    from .cache_utils import COMPILE_ONLY
+    if COMPILE_ONLY:
+        # Compile-only mode: stop before launching the kernel.
+        return
+    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+    epi_args = GemmSqReduceMixin.EpilogueArguments(
+        mRowVecBroadcast=rowvec,
+        mColVecReduce=colvec_reduce,
+        add_to_output=None,  # Constexpr, pass None at runtime
+        rounding_mode=None,  # Constexpr, pass None at runtime
+    )
+    scheduler_args = make_scheduler_args(
+        max_active_clusters, max_swizzle_size, tile_count_semaphore
+    )
+    varlen_args = make_varlen_args(None, None, None)
+    # The SM100/SM110 entry point takes three trailing arguments, the others take one;
+    # all are passed as None here.
+    if device_capacity[0] in [10, 11]:
+        compiled_fn(A_p, B_p, D_p, C_p, epi_args, scheduler_args, varlen_args, None, None, None)
+    else:
+        compiled_fn(A_p, B_p, D_p, C_p, epi_args, scheduler_args, varlen_args, None)

build/torch-cuda/quack/gemm_symmetric.py CHANGED Viewed

@@ -1,25 +1,36 @@
 from typing import Tuple, Optional, Callable
-from functools import partial
 from torch import Tensor
-from .gemm_act import GemmActMixin, act_fn_map, gemm_act
 from .gemm_sm90 import GemmSm90
 from .gemm_sm100 import GemmSm100
 from .tile_scheduler import TriangularTileScheduler
-from .gemm_wrapper_utils import GemmWrapperBase
-from .cute_dsl_utils import get_device_capacity, get_max_active_clusters
 from .varlen_utils import VarlenManager
 from . import copy_utils as copy_utils
-import cutlass
-import cutlass.cute as cute
-import cutlass.torch as cutlass_torch
-from cutlass.cute.runtime import make_ptr
-from cutlass import Int32, Float32, Boolean, const_expr
-import cutlass.utils.hopper_helpers as sm90_utils_og
-import cutlass.utils.blackwell_helpers as sm100_utils
-from cutlass.cutlass_dsl import if_generate
-class GemmSymmetricMixin(GemmActMixin, GemmSm90):
     def get_scheduler_class(self, varlen_m: bool = False):
         return TriangularTileScheduler
@@ -28,7 +39,6 @@ class GemmSymmetricMixin(GemmActMixin, GemmSm90):
         self,
         params: GemmActMixin.EpilogueParams,
         epi_smem_tensors: Tuple[cute.Tensor, ...],
-        tma_desc_epi_ptrs: list[Optional[cute.Pointer]],
         epi_pipeline: cutlass.pipeline.PipelineAsync,
         epi_store_pipeline: cutlass.pipeline.PipelineAsync,
         epi_read_state: cutlass.pipeline.PipelineState,
@@ -55,31 +65,14 @@ class GemmSymmetricMixin(GemmActMixin, GemmSm90):
         has_C = const_expr(tRS_rC is not None)
         has_D = const_expr(copy_D is not None)
-        tma_atom_postact = params.tma_atom_postact
-        mPostAct_mnl = params.mPostAct_mnl
-        sRowVec, sColVec, sPostAct = epi_smem_tensors
-        get_smem_store_op = (
-            partial(sm100_utils.get_smem_store_op, tiled_tmem_load=tiled_copy_t2r)
-            if self.arch == 100
-            else sm90_utils_og.sm90_get_smem_store_op
-        )
-        copy_atom_postact_r2s = get_smem_store_op(
-            self.postact_layout, self.postact_dtype, self.acc_dtype
-        )
-        # tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
-        # tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_C_atom)
-        tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_r2s)
-        tRS_sPostAct = tiled_copy_postact_r2s.get_slice(tidx).partition_D(sPostAct)
-        (tma_desc_postact_ptr,) = tma_desc_epi_ptrs
-        batch_idx = tile_coord_mnkl[3]
-        copy_postact, _, _ = self.epilog_gmem_copy_and_partition(
-            tma_atom_postact,
-            varlen_manager.offset_batch_epi(mPostAct_mnl, batch_idx),
-            self.cta_tile_shape_postact_mn,
-            params.epi_tile_postact,
-            sPostAct,
             tile_coord_mnkl,
-            tma_desc_ptr=tma_desc_postact_ptr,
         )
         # We iterate over epi tiles in the N dimension first before the M dimension
@@ -111,30 +104,6 @@ class GemmSymmetricMixin(GemmActMixin, GemmSm90):
                     epi_pipeline.producer_commit(epi_producer_state)
                 epi_producer_state.advance()
-        def tma_store_fn(src_idx, dst_idx, tile_coord_mnkl):
-            pid_m = tile_coord_mnkl[0]
-            pid_n = tile_coord_mnkl[1]
-            # Fence and barrier to make sure shared memory store is visible to TMA store
-            cute.arch.fence_proxy(
-                cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-            )
-            epilogue_barrier.arrive_and_wait()
-            # Copy from shared memory to global memory
-            if is_tma_warp:
-                square_tile_m = pid_m // self.cluster_shape_mnk[0]
-                square_tile_n = pid_n // self.cluster_shape_mnk[1]
-                if const_expr(has_D):
-                    copy_D(src_idx=src_idx, dst_idx=dst_idx)
-                if square_tile_m != square_tile_n:  # don't write twice to the same tile
-                    copy_postact(src_idx=src_idx, dst_idx=dst_idx)
-            # Can't use if statement here, epi_store_pipeline object isn't captured somehow
-            if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_commit())
-            if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_acquire())
-            epilogue_barrier.arrive_and_wait()
-        delay_tma_store = True
-        src_idx_prev, dst_idx_prev = None, None
         for epi_idx in cutlass.range_constexpr(epi_tile_num):
             # The global memory coordinate for the current epi tile
             gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
@@ -145,9 +114,7 @@ class GemmSymmetricMixin(GemmActMixin, GemmSm90):
                 epi_pipeline.consumer_wait(epi_read_state)
                 cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
                 # Fence to make sure shared memory read is visible to TMA load
-                cute.arch.fence_proxy(
-                    cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
-                )
                 cute.arch.sync_warp()
                 with cute.arch.elect_one():
                     epi_pipeline.consumer_release(epi_read_state)
@@ -160,30 +127,61 @@ class GemmSymmetricMixin(GemmActMixin, GemmSm90):
                     epi_pipeline.producer_commit(epi_producer_state)
                 epi_producer_state.advance()
             tRS_rPostAct = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
-            epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
-            if const_expr(delay_tma_store):
-                if const_expr(epi_idx > 0):
-                    tma_store_fn(
-                        src_idx=src_idx_prev, dst_idx=dst_idx_prev, tile_coord_mnkl=tile_coord_mnkl
-                    )
-                src_idx_prev, dst_idx_prev = epi_buffer, gmem_coord
             # Copy from D registers to shared memory
             if const_expr(has_D):
-                copy_utils.cvt_copy(tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer])
             cute.copy(
                 tiled_copy_postact_r2s,
-                tiled_copy_postact_r2s.retile(tRS_rPostAct),
                 tRS_sPostAct[None, None, None, epi_buffer],
             )
-            if const_expr(not delay_tma_store):
-                tma_store_fn(
-                    src_idx=epi_buffer, dst_idx=gmem_coord, tile_coord_mnkl=tile_coord_mnkl
-                )
-        if const_expr(delay_tma_store):
-            tma_store_fn(
-                src_idx=src_idx_prev, dst_idx=dst_idx_prev, tile_coord_mnkl=tile_coord_mnkl
-            )
         self.epi_end(
             params,
@@ -207,6 +205,97 @@ class GemmSymmetricSm100(GemmSymmetricMixin, GemmSm100):
     pass
 def gemm_symmetric(
     A: Tensor,  # (l, m, k)
     B: Tensor,  # (l, m, k)
@@ -219,112 +308,87 @@ def gemm_symmetric(
     cluster_N: int,
     pingpong: bool = False,
     persistent: bool = True,
     max_swizzle_size: int = 8,
     alpha: float | Tensor = 1.0,
     beta: float | Tensor = 1.0,
 ) -> None:
-    # Tranpose D so the "activation" is a write to the mirrored tile
     PostAct = D.mT
-    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
-        A, B, D, C, additional_tensors={"PostAct": PostAct}
-    )
-    assert M == N, "M and N must be the same; symmetric gemm only supports square matrices"
-    GemmWrapperBase.permute_tensors(tensor_infos)
-    GemmWrapperBase.extract_dtypes(tensor_infos)
-    major_configs = {
-        "A": ("m", "k", "l"),
-        "B": ("n", "k", "l"),
-        "D": ("m", "n", "l"),
-        "C": ("m", "n", "l"),
-        "PostAct": ("m", "n", "l"),
-    }
-    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
     device_capacity = get_device_capacity(A.device)
-    assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
-    GemmCls = GemmSymmetricSm90 if device_capacity[0] == 9 else GemmSymmetricSm100
-    acc_dtype = Float32
     tile_shape_mn = (tile_M, tile_N)
     cluster_shape_mnk = (cluster_M, cluster_N, 1)
-    if not GemmCls.is_valid_dtypes(
-        tensor_infos["A"].dtype,
-        tensor_infos["B"].dtype,
-        acc_dtype,
-        tensor_infos["D"].dtype,
-        tensor_infos["A"].major,
-        tensor_infos["B"].major,
-    ):
-        raise TypeError("Skipping due to unsupported combination of types and majors")
-    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
-    GemmWrapperBase.create_cute_tensors({k: v for k, v in tensor_infos.items()}, major_configs)
-    def scalar_arg(scalar: float | Tensor):
-        if isinstance(scalar, float):
-            return Float32(scalar) if scalar != 1.0 else None
-        else:
-            assert isinstance(scalar, Tensor)
-            return make_ptr(Float32, scalar.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)
-    activation = None  # Equivalent to identity
-    act_fn = act_fn_map[activation]
-    epi_args = GemmCls.EpilogueArguments(
-        tensor_infos["PostAct"].cute_tensor, act_fn, scalar_arg(alpha), scalar_arg(beta)
-    )
-    scheduler_args = GemmWrapperBase.create_scheduler_args(
-        max_active_clusters, tile_count_semaphore, max_swizzle_size=max_swizzle_size
-    )
-    varlen_args = None
-    current_stream = cutlass_torch.current_stream()
-    compile_key = GemmWrapperBase.get_compile_key(
-        tensor_infos,
-        activation,
         tile_shape_mn,
         cluster_shape_mnk,
         pingpong,
         persistent,
-        tile_count_semaphore is not None,
         device_capacity,
-        max_swizzle_size,
-        2 if isinstance(alpha, Tensor) else (1 if alpha == 1.0 else 0),
-        2 if isinstance(beta, Tensor) else (1 if beta == 1.0 else 0),
-        key_tensor_names=("A", "B", "D", "PostAct", "C"),
-    )
-    cache = gemm_act.compile_cache
-    if compile_key not in cache:
-        if device_capacity[0] == 9:
-            GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
-        gemm_obj = GemmCls(
-            acc_dtype,
-            tensor_infos["A"].dtype,
-            tile_shape_mn,
-            cluster_shape_mnk,
-            gather_A=False,
-        )
-        cache[compile_key] = cute.compile(
-            gemm_obj,
-            tensor_infos["A"].cute_tensor,
-            tensor_infos["B"].cute_tensor,
-            tensor_infos["D"].cute_tensor,
-            tensor_infos["C"].cute_tensor,
-            epi_args,
-            scheduler_args,
-            varlen_args,
-            current_stream,
-        )
-    cache[compile_key](
-        tensor_infos["A"].cute_tensor,
-        tensor_infos["B"].cute_tensor,
-        tensor_infos["D"].cute_tensor,
-        tensor_infos["C"].cute_tensor,
-        epi_args,
-        scheduler_args,
-        varlen_args,
-        current_stream,
     )
-gemm_act.compile_cache = {}

 from typing import Tuple, Optional, Callable
 from torch import Tensor
+import cutlass
+import cutlass.cute as cute
+from cutlass import Int32, Float32, Boolean, const_expr
+from cutlass.cute.runtime import make_ptr
+from .compile_utils import make_fake_tensor as fake_tensor
+from .cute_dsl_utils import get_device_capacity, get_max_active_clusters, torch2cute_dtype_map
+from .activation import act_fn_map
+from .gemm_act import GemmActMixin
 from .gemm_sm90 import GemmSm90
 from .gemm_sm100 import GemmSm100
+from .gemm_sm120 import GemmSm120
+from .gemm_tvm_ffi_utils import (
+    div_for_dtype,
+    perm3d,
+    get_majors,
+    get_dtypes,
+    make_scheduler_args,
+    make_fake_scheduler_args,
+    compile_gemm_kernel,
+)
+from .cache_utils import jit_cache
 from .tile_scheduler import TriangularTileScheduler
 from .varlen_utils import VarlenManager
 from . import copy_utils as copy_utils
+from .rounding import RoundingMode
+class GemmSymmetricMixin(GemmActMixin):
     def get_scheduler_class(self, varlen_m: bool = False):
         return TriangularTileScheduler
         self,
         params: GemmActMixin.EpilogueParams,
         epi_smem_tensors: Tuple[cute.Tensor, ...],
         epi_pipeline: cutlass.pipeline.PipelineAsync,
         epi_store_pipeline: cutlass.pipeline.PipelineAsync,
         epi_read_state: cutlass.pipeline.PipelineState,
         has_C = const_expr(tRS_rC is not None)
         has_D = const_expr(copy_D is not None)
+        tiled_copy_postact_r2s, tRS_sPostAct, copy_postact = self.epi_setup_postact(
+            params,
+            epi_smem_tensors,
+            tiled_copy_r2s,
+            tiled_copy_t2r,
             tile_coord_mnkl,
+            varlen_manager,
+            tidx,
         )
         # We iterate over epi tiles in the N dimension first before the M dimension
                     epi_pipeline.producer_commit(epi_producer_state)
                 epi_producer_state.advance()
         for epi_idx in cutlass.range_constexpr(epi_tile_num):
             # The global memory coordinate for the current epi tile
             gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
                 epi_pipeline.consumer_wait(epi_read_state)
                 cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
                 # Fence to make sure shared memory read is visible to TMA load
+                cute.arch.fence_view_async_shared()
                 cute.arch.sync_warp()
                 with cute.arch.elect_one():
                     epi_pipeline.consumer_release(epi_read_state)
                     epi_pipeline.producer_commit(epi_producer_state)
                 epi_producer_state.advance()
             tRS_rPostAct = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
+            tRS_rPostAct_out = self.epi_convert_postact(
+                tRS_rPostAct,
+                epi_loop_tensors["sr_seed"],
+                tidx,
+                tile_coord_mnkl,
+                num_prev_subtiles,
+                epi_idx,
+            )
+            if is_tma_warp:
+                epi_store_pipeline.producer_acquire()
+            epilogue_barrier.arrive_and_wait()
             # Copy from D registers to shared memory
+            epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
             if const_expr(has_D):
+                if const_expr(
+                    self.rounding_mode == RoundingMode.RS
+                    and self.acc_dtype == cutlass.Float32
+                    and self.d_dtype == cutlass.BFloat16
+                ):
+                    seed = epi_loop_tensors["sr_seed"] + (
+                        tile_coord_mnkl[0] * 65537
+                        + tile_coord_mnkl[1] * 257
+                        + tile_coord_mnkl[3] * 17
+                        + (num_prev_subtiles + epi_idx) * 7
+                    )
+                    copy_utils.sr_cvt_copy(
+                        tiled_copy_r2s,
+                        tRS_rD,
+                        tRS_sD[None, None, None, epi_buffer],
+                        seed,
+                        tidx,
+                    )
+                else:
+                    copy_utils.cvt_copy(
+                        tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer]
+                    )
             cute.copy(
                 tiled_copy_postact_r2s,
+                tiled_copy_postact_r2s.retile(tRS_rPostAct_out),
                 tRS_sPostAct[None, None, None, epi_buffer],
             )
+            pid_m = tile_coord_mnkl[0]
+            pid_n = tile_coord_mnkl[1]
+            # Fence and barrier to make sure shared memory store is visible to TMA store
+            cute.arch.fence_view_async_shared()
+            epilogue_barrier.arrive_and_wait()
+            # Copy from shared memory to global memory
+            if is_tma_warp:
+                square_tile_m = pid_m // self.cluster_shape_mnk[0]
+                square_tile_n = pid_n // self.cluster_shape_mnk[1]
+                if const_expr(has_D):
+                    copy_D(src_idx=epi_buffer, dst_idx=gmem_coord)
+                if square_tile_m != square_tile_n:  # don't write twice to the same tile
+                    copy_postact(src_idx=epi_buffer, dst_idx=gmem_coord)
+                epi_store_pipeline.producer_commit()
         self.epi_end(
             params,
     pass
+# Symmetric-GEMM kernel class selected when device_capacity[0] == 12 (see the
+# sm_to_cls table in _compile_gemm_symmetric). It only combines
+# GemmSymmetricMixin with the GemmSm120 base and adds no members of its own.
+class GemmSymmetricSm120(GemmSymmetricMixin, GemmSm120):
+    pass
+@jit_cache
+def _compile_gemm_symmetric(
+    a_dtype,
+    b_dtype,
+    d_dtype,
+    c_dtype,
+    c_major,
+    postact_dtype,
+    a_major,
+    b_major,
+    d_major,
+    postact_major,
+    tile_shape_mn,
+    cluster_shape_mnk,
+    pingpong,
+    persistent,
+    is_dynamic_persistent,
+    alpha_mode,
+    beta_mode,
+    device_capacity,
+):
+    """Compile the symmetric GEMM kernel for one dtype/layout/tiling configuration.
+
+    Builds placeholder ("fake") operands with symbolic extents and hands them to
+    ``compile_gemm_kernel``. No real tensors are involved. The function is wrapped
+    in ``jit_cache``; presumably that memoizes the result per argument tuple --
+    confirm against ``cache_utils``.
+
+    Args:
+        a_dtype, b_dtype, d_dtype, postact_dtype: element types of A, B, D and PostAct.
+        c_dtype: element type of C. When falsy, the divisibility for C falls back to 1;
+            what ``fake_tensor`` returns for a None dtype is not visible here.
+        c_major, d_major, postact_major: ``"n"`` selects ``leading_dim=1``, anything else 0.
+        a_major, b_major: ``"k"`` selects ``leading_dim=1``, anything else 0.
+        tile_shape_mn, cluster_shape_mnk, pingpong, persistent: forwarded unchanged to
+            ``compile_gemm_kernel``.
+        is_dynamic_persistent: forwarded to ``compile_gemm_kernel``; additionally, only
+            when ``device_capacity[0] == 9``, it makes the fake scheduler args carry a
+            semaphore pointer.
+        alpha_mode, beta_mode: 0 -> scalar omitted (None), 1 -> ``Float32`` value,
+            any other value -> ``Float32`` global-memory pointer.
+        device_capacity: indexable; element 0 must be 9, 10, 11 or 12.
+
+    Returns:
+        Whatever ``compile_gemm_kernel`` returns for this configuration.
+
+    Raises:
+        KeyError: if ``device_capacity[0]`` is not a key of ``sm_to_cls``.
+    """
+    # Major compute-capability number -> kernel class. 10 and 11 share the SM100 class.
+    sm_to_cls = {
+        9: GemmSymmetricSm90,
+        10: GemmSymmetricSm100,
+        11: GemmSymmetricSm100,
+        12: GemmSymmetricSm120,
+    }
+    GemmCls = sm_to_cls[device_capacity[0]]
+    # Symmetric GEMM: m == n, so reuse the same sym_int for shape checking
+    m, k, l = cute.sym_int(), cute.sym_int(), cute.sym_int()
+    a_leading = 1 if a_major == "k" else 0
+    b_leading = 1 if b_major == "k" else 0
+    d_leading = 1 if d_major == "n" else 0
+    c_leading = 1 if c_major == "n" else 0
+    div_a, div_b = div_for_dtype(a_dtype), div_for_dtype(b_dtype)
+    # The conditional binds only to the second tuple element: div_c is 1 when c_dtype is falsy.
+    div_d, div_c = div_for_dtype(d_dtype), div_for_dtype(c_dtype) if c_dtype else 1
+    # B is declared with the same (m, k, l) extents as A, and D/C as (m, m, l).
+    mA = fake_tensor(a_dtype, (m, k, l), leading_dim=a_leading, divisibility=div_a)
+    mB = fake_tensor(b_dtype, (m, k, l), leading_dim=b_leading, divisibility=div_b)
+    mD = fake_tensor(d_dtype, (m, m, l), leading_dim=d_leading, divisibility=div_d)
+    mC = fake_tensor(c_dtype, (m, m, l), leading_dim=c_leading, divisibility=div_c)
+    # PostAct = D.mT, so it has the opposite major from D (m↔n swapped)
+    div_pa = div_for_dtype(postact_dtype)
+    postact_leading = 1 if postact_major == "n" else 0
+    mPostAct = fake_tensor(
+        postact_dtype, (m, m, l), leading_dim=postact_leading, divisibility=div_pa
+    )
+    def fake_scalar(mode):
+        # Placeholder for alpha/beta. Only the kind of argument matters at compile time,
+        # so the value 1.0 and the null address 0 are stand-ins.
+        if mode == 0:
+            return None
+        elif mode == 1:
+            return Float32(1.0)
+        else:
+            return make_ptr(Float32, 0, cute.AddressSpace.gmem, assumed_align=4)
+    activation = None  # identity
+    act_fn = act_fn_map[activation]
+    epi_args = GemmCls.EpilogueArguments(
+        mPostAct,
+        act_fn,
+        alpha=fake_scalar(alpha_mode),
+        beta=fake_scalar(beta_mode),
+    )
+    # Arguments are (has_semaphore, has_batch_idx_permute, l_sym): a semaphore is declared
+    # only for the dynamic persistent scheduler on SM90, and batch_idx_permute is never used.
+    scheduler_args = make_fake_scheduler_args(
+        (is_dynamic_persistent and device_capacity[0] == 9), False, l
+    )
+    varlen_args = None
+    return compile_gemm_kernel(
+        GemmCls,
+        a_dtype,
+        tile_shape_mn,
+        cluster_shape_mnk,
+        pingpong,
+        persistent,
+        # gather_A (7th positional parameter of compile_gemm_kernel) is always off here.
+        False,
+        is_dynamic_persistent,
+        device_capacity,
+        mA,
+        mB,
+        mD,
+        mC,
+        epi_args,
+        scheduler_args,
+        varlen_args,
+    )
 def gemm_symmetric(
     A: Tensor,  # (l, m, k)
     B: Tensor,  # (l, m, k)
     cluster_N: int,
     pingpong: bool = False,
     persistent: bool = True,
+    is_dynamic_persistent: bool = False,
     max_swizzle_size: int = 8,
     alpha: float | Tensor = 1.0,
     beta: float | Tensor = 1.0,
 ) -> None:
+    # Transpose D so the "activation" is a write to the mirrored tile
     PostAct = D.mT
+    A_p, B_p, D_p, C_p = perm3d(A, B, D, C)
+    PostAct_p = PostAct.permute(1, 2, 0) if PostAct.ndim == 3 else PostAct
+    a_major, b_major, d_major, c_major = get_majors(A_p, B_p, D_p, C_p)
+    a_dtype, b_dtype, d_dtype, c_dtype = get_dtypes(A, B, D, C)
+    postact_dtype = torch2cute_dtype_map[PostAct.dtype]
+    # PostAct = D.mT has swapped major: if D is n-major, PostAct is m-major
+    postact_major = "n" if PostAct_p.stride(1) == 1 else "m"
     device_capacity = get_device_capacity(A.device)
+    assert device_capacity[0] in [9, 10, 11, 12], "Only SM90, SM100, SM110, and SM120 are supported"
+    if is_dynamic_persistent and device_capacity[0] == 9:
+        assert tile_count_semaphore is not None, (
+            "Dynamic persistent tile scheduler in SM90 requires a semaphore in GMEM"
+        )
     tile_shape_mn = (tile_M, tile_N)
     cluster_shape_mnk = (cluster_M, cluster_N, 1)
+    alpha_mode = 2 if isinstance(alpha, Tensor) else (1 if alpha != 1.0 else 0)
+    beta_mode = 2 if isinstance(beta, Tensor) else (1 if beta != 1.0 else 0)
+    compiled_fn = _compile_gemm_symmetric(
+        a_dtype,
+        b_dtype,
+        d_dtype,
+        c_dtype,
+        c_major,
+        postact_dtype,
+        a_major,
+        b_major,
+        d_major,
+        postact_major,
         tile_shape_mn,
         cluster_shape_mnk,
         pingpong,
         persistent,
+        is_dynamic_persistent,
+        alpha_mode,
+        beta_mode,
         device_capacity,
     )
+    from .cache_utils import COMPILE_ONLY
+    if COMPILE_ONLY:
+        return
+    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
+    def scalar_arg(scalar, mode):
+        if mode == 0:
+            return None
+        elif mode == 1:
+            return Float32(scalar)
+        else:
+            return scalar.data_ptr()
+    epi_args = GemmActMixin.EpilogueArguments(
+        PostAct_p,
+        None,  # act_fn is Constexpr, baked in at compile time
+        alpha=scalar_arg(alpha, alpha_mode),
+        beta=scalar_arg(beta, beta_mode),
+        rounding_mode=None,
+        sr_seed=None,
+    )
+    scheduler_args = make_scheduler_args(
+        max_active_clusters,
+        max_swizzle_size,
+        tile_count_semaphore,
+    )
+    varlen_args = None
+    if device_capacity[0] in [10, 11]:
+        compiled_fn(A_p, B_p, D_p, C_p, epi_args, scheduler_args, varlen_args, None, None, None)
+    else:
+        compiled_fn(A_p, B_p, D_p, C_p, epi_args, scheduler_args, varlen_args, None)

build/torch-cuda/quack/gemm_tvm_ffi_utils.py ADDED Viewed

	@@ -0,0 +1,229 @@

+# Copyright (c) 2025, Tri Dao.
+# Shared utilities for TVM-FFI GEMM compilation.
+from functools import partial
+import cutlass.cute as cute
+from cutlass import Int32, Int64, Float32
+from cutlass.cute.runtime import make_ptr
+from .compile_utils import make_fake_tensor as fake_tensor
+from .cute_dsl_utils import torch2cute_dtype_map
+from .tile_scheduler import TileSchedulerOptions
+from .varlen_utils import VarlenArguments
+def div_for_dtype(dtype):
+    """Return how many elements of ``dtype`` fit in 128 bits (16 bytes).
+
+    16-byte alignment: divisibility in elements = 128 // dtype_width_bits.
+    The result is what this module passes as ``divisibility`` when building fake tensors.
+
+    Args:
+        dtype: object exposing a ``width`` attribute, treated here as the element size in bits.
+
+    Returns:
+        ``128 // dtype.width`` (integer floor division).
+    """
+    return 128 // dtype.width
+def perm3d_single(t, varlen_m=False):
+    """Permute a single 3D tensor from (L, *, *) to (*, *, L), skipping for varlen_m or 2D.
+
+    Args:
+        t: tensor-like object exposing ``ndim`` and ``permute``, or None.
+        varlen_m: when True, ``t`` is returned untouched.
+
+    Returns:
+        ``t.permute(1, 2, 0)`` when ``t`` is not None, has exactly 3 dimensions and
+        ``varlen_m`` is False; otherwise ``t`` itself (including None).
+    """
+    return t.permute(1, 2, 0) if t is not None and t.ndim == 3 and not varlen_m else t
+def perm3d(A, B, D, C, varlen_m=False, varlen_k=False):
+    """Permute 3D tensors from (L, *, *) to (*, *, L).
+
+    Which operands are permuted depends on the mode:
+      - ``varlen_m``: only B; A, D and C are returned as given.
+      - ``varlen_k``: only D and C; A and B are returned as given.
+      - neither: all four.
+    ``varlen_m`` is tested first, so it takes precedence if both flags are set.
+
+    Args:
+        A, B, D, C: tensor-like objects or None.
+        varlen_m, varlen_k: mode flags described above.
+
+    Returns:
+        The tuple ``(A, B, D, C)`` in that order, with the selected entries permuted.
+    """
+    def _perm(t):
+        # None and tensors that are not 3D pass through unchanged.
+        return t.permute(1, 2, 0) if t is not None and t.ndim == 3 else t
+    if varlen_m:
+        return A, _perm(B), D, C
+    elif varlen_k:
+        return A, B, _perm(D), _perm(C)
+    else:
+        return _perm(A), _perm(B), _perm(D), _perm(C)
+def get_major(t, dim0, dim1):
+    """Return the label of the contiguous one of the first two axes of ``t``.
+
+    Args:
+        t: tensor-like object exposing ``stride(i)``.
+        dim0: label returned when axis 1 does not have unit stride.
+        dim1: label returned when axis 1 has unit stride.
+
+    Returns:
+        ``dim1`` if ``t.stride(1) == 1``, else ``dim0``. The stride of axis 0 is never
+        inspected, so ``dim0`` is the fallback rather than a verified result.
+    """
+    return dim1 if t.stride(1) == 1 else dim0
+def get_majors(A_p, B_p, D_p, C_p):
+    """Classify the major (unit-stride) mode of each GEMM operand via ``get_major``.
+
+    The ``_p`` suffix suggests the inputs are the already-permuted tensors
+    (e.g. the output of ``perm3d``) -- confirm against callers.
+
+    Args:
+        A_p: labelled with axes ("m", "k").
+        B_p: labelled with axes ("n", "k").
+        D_p: labelled with axes ("m", "n").
+        C_p: labelled with axes ("m", "n"); may be None.
+
+    Returns:
+        ``(a_major, b_major, d_major, c_major)``; ``c_major`` is None when ``C_p`` is None.
+    """
+    a_major = get_major(A_p, "m", "k")
+    b_major = get_major(B_p, "n", "k")
+    d_major = get_major(D_p, "m", "n")
+    c_major = get_major(C_p, "m", "n") if C_p is not None else None
+    return a_major, b_major, d_major, c_major
+def get_dtypes(A, B, D, C):
+    """Translate the ``dtype`` of each operand through ``torch2cute_dtype_map``.
+
+    Args:
+        A, B, D: objects exposing ``dtype``; all three are required.
+        C: object exposing ``dtype``, or None.
+
+    Returns:
+        ``(a_dtype, b_dtype, d_dtype, c_dtype)``; ``c_dtype`` is None when ``C`` is None.
+
+    NOTE(review): a dtype missing from ``torch2cute_dtype_map`` presumably raises
+    KeyError (if the map is a plain dict) -- confirm in ``cute_dsl_utils``.
+    """
+    a_dtype = torch2cute_dtype_map[A.dtype]
+    b_dtype = torch2cute_dtype_map[B.dtype]
+    d_dtype = torch2cute_dtype_map[D.dtype]
+    c_dtype = torch2cute_dtype_map[C.dtype] if C is not None else None
+    return a_dtype, b_dtype, d_dtype, c_dtype
+def make_scheduler_args(
+    max_active_clusters, max_swizzle_size, tile_count_semaphore, batch_idx_permute=None
+):
+    """Build the runtime ``TileSchedulerOptions`` passed to a compiled kernel.
+
+    Args:
+        max_active_clusters: wrapped in ``Int32`` before being stored.
+        max_swizzle_size: stored as given (not wrapped).
+        tile_count_semaphore: object exposing ``data_ptr()``, or None. Only the raw
+            address returned by ``data_ptr()`` is stored.
+        batch_idx_permute: stored as given; defaults to None.
+
+    Returns:
+        A ``TileSchedulerOptions`` instance with ``raster_order`` set to None.
+    """
+    return TileSchedulerOptions(
+        max_active_clusters=Int32(max_active_clusters),
+        raster_order=None,
+        max_swizzle_size=max_swizzle_size,
+        tile_count_semaphore=(
+            tile_count_semaphore.data_ptr() if tile_count_semaphore is not None else None
+        ),
+        batch_idx_permute=batch_idx_permute,
+    )
+def make_fake_scheduler_args(has_semaphore, has_batch_idx_permute, l_sym):
+    """Build placeholder ``TileSchedulerOptions`` for compilation (no real data involved).
+
+    Args:
+        has_semaphore: when truthy, ``tile_count_semaphore`` is an ``Int32`` gmem pointer
+            created at address 0; otherwise None.
+        has_batch_idx_permute: when truthy, ``batch_idx_permute`` is a 1-D ``Int32`` fake
+            tensor of extent ``l_sym``; otherwise None.
+        l_sym: extent used for the fake ``batch_idx_permute`` tensor.
+
+    Returns:
+        A ``TileSchedulerOptions`` whose numeric fields hold the stand-in values
+        ``Int32(1)`` and ``Int32(8)``. ``raster_order`` is not passed here.
+    """
+    return TileSchedulerOptions(
+        max_active_clusters=Int32(1),
+        max_swizzle_size=Int32(8),
+        tile_count_semaphore=(
+            make_ptr(Int32, 0, cute.AddressSpace.gmem, assumed_align=4) if has_semaphore else None
+        ),
+        batch_idx_permute=(
+            fake_tensor(Int32, (l_sym,), leading_dim=0, divisibility=4)
+            if has_batch_idx_permute
+            else None
+        ),
+    )
+def make_varlen_args(cu_seqlens_m, cu_seqlens_k, A_idx):
+    """Bundle the variable-length inputs into ``VarlenArguments``.
+
+    Args:
+        cu_seqlens_m: stored as ``mCuSeqlensM``; may be None.
+        cu_seqlens_k: stored as ``mCuSeqlensK``; may be None.
+        A_idx: stored as ``mAIdx``; may be None.
+
+    Returns:
+        None when both ``cu_seqlens_m`` and ``cu_seqlens_k`` are None (``A_idx`` is
+        ignored in that case); otherwise a ``VarlenArguments`` holding the three
+        values unchanged.
+    """
+    if cu_seqlens_m is None and cu_seqlens_k is None:
+        return None
+    return VarlenArguments(
+        mCuSeqlensM=cu_seqlens_m,
+        mCuSeqlensK=cu_seqlens_k,
+        mAIdx=A_idx,
+    )
+def make_fake_varlen_args(varlen_m, varlen_k, gather_A, aidx_len):
+    """Build placeholder ``VarlenArguments`` for compilation.
+
+    Args:
+        varlen_m: when truthy, ``mCuSeqlensM`` is a 1-D ``Int32`` fake tensor; else None.
+        varlen_k: when truthy, ``mCuSeqlensK`` is a 1-D ``Int32`` fake tensor; else None.
+        gather_A: when truthy, ``mAIdx`` is a 1-D ``Int32`` fake tensor of extent
+            ``aidx_len``; else None.
+        aidx_len: extent used for the fake ``mAIdx`` tensor.
+
+    Returns:
+        None when neither ``varlen_m`` nor ``varlen_k`` is set (``gather_A`` is ignored
+        in that case); otherwise a ``VarlenArguments`` of fake tensors.
+    """
+    if not varlen_m and not varlen_k:
+        return None
+    # A single symbolic length is shared by both cu_seqlens placeholders.
+    num_seqlens = cute.sym_int()
+    return VarlenArguments(
+        mCuSeqlensM=(
+            fake_tensor(Int32, (num_seqlens,), leading_dim=0, divisibility=4) if varlen_m else None
+        ),
+        mCuSeqlensK=(
+            fake_tensor(Int32, (num_seqlens,), leading_dim=0, divisibility=4) if varlen_k else None
+        ),
+        mAIdx=(
+            fake_tensor(Int32, (aidx_len,), leading_dim=0, divisibility=4) if gather_A else None
+        ),
+    )
+def make_fake_gemm_tensors(
+    a_dtype,
+    b_dtype,
+    d_dtype,
+    c_dtype,
+    a_major,
+    b_major,
+    d_major,
+    c_major,
+    varlen_m=False,
+    varlen_k=False,
+    gather_A=False,
+):
+    """Create fake tensors for mA, mB, mD, mC with shared sym_ints.
+
+    Pass dtype=None to get None for that tensor (e.g. optional C).
+
+    Shapes per mode (``varlen_m`` is tested before ``varlen_k``):
+      - varlen_m: A is (a_m, k), B is (n, k, l), D and C are (m, n).
+      - varlen_k: A is (m, a_k), B is (n, k), D and C are (m, n, l).
+      - otherwise: A is (m, k, l), B is (n, k, l), D and C are (m, n, l).
+    With ``gather_A`` the gathered extent of A (``a_m`` or ``a_k``) gets its own symbol
+    instead of sharing ``m`` / ``k``. ``gather_A`` has no effect in the non-varlen case.
+
+    Args:
+        a_dtype, b_dtype: element types of A and B (required; used for divisibility).
+        d_dtype, c_dtype: element types of D and C; None gives divisibility 1.
+        a_major, b_major: ``"k"`` selects ``leading_dim=1``, anything else 0.
+        d_major, c_major: ``"n"`` selects ``leading_dim=1``, anything else 0.
+        varlen_m, varlen_k, gather_A: mode flags described above.
+
+    Returns:
+        (mA, mB, mD, mC, m, n, k, l).
+        When varlen_m, m is total_m (flattened M of D/C). When varlen_k, k is total_k.
+    """
+    a_leading = 1 if a_major == "k" else 0
+    b_leading = 1 if b_major == "k" else 0
+    d_leading = 1 if d_major == "n" else 0
+    c_leading = 1 if c_major == "n" else 0
+    m, n, k, l = cute.sym_int(), cute.sym_int(), cute.sym_int(), cute.sym_int()
+    div_a = div_for_dtype(a_dtype)
+    div_b = div_for_dtype(b_dtype)
+    div_d = div_for_dtype(d_dtype) if d_dtype is not None else 1
+    div_c = div_for_dtype(c_dtype) if c_dtype is not None else 1
+    if varlen_m:
+        # m is total_m in this case: the flattened M dimension of D/C
+        # (a fresh symbol replaces the one created above)
+        m = cute.sym_int()
+        a_m = cute.sym_int() if gather_A else m
+        mA = fake_tensor(a_dtype, (a_m, k), leading_dim=a_leading, divisibility=div_a)
+        mB = fake_tensor(b_dtype, (n, k, l), leading_dim=b_leading, divisibility=div_b)
+        mD = fake_tensor(d_dtype, (m, n), leading_dim=d_leading, divisibility=div_d)
+        mC = fake_tensor(c_dtype, (m, n), leading_dim=c_leading, divisibility=div_c)
+    elif varlen_k:
+        # k is total_k in this case: the flattened K dimension of A/B
+        # (a fresh symbol replaces the one created above)
+        k = cute.sym_int()
+        a_k = cute.sym_int() if gather_A else k
+        mA = fake_tensor(a_dtype, (m, a_k), leading_dim=a_leading, divisibility=div_a)
+        mB = fake_tensor(b_dtype, (n, k), leading_dim=b_leading, divisibility=div_b)
+        mD = fake_tensor(d_dtype, (m, n, l), leading_dim=d_leading, divisibility=div_d)
+        mC = fake_tensor(c_dtype, (m, n, l), leading_dim=c_leading, divisibility=div_c)
+    else:
+        mA = fake_tensor(a_dtype, (m, k, l), leading_dim=a_leading, divisibility=div_a)
+        mB = fake_tensor(b_dtype, (n, k, l), leading_dim=b_leading, divisibility=div_b)
+        mD = fake_tensor(d_dtype, (m, n, l), leading_dim=d_leading, divisibility=div_d)
+        mC = fake_tensor(c_dtype, (m, n, l), leading_dim=c_leading, divisibility=div_c)
+    return mA, mB, mD, mC, m, n, k, l
+def compile_gemm_kernel(
+    GemmCls,
+    a_dtype,
+    tile_shape_mn,
+    cluster_shape_mnk,
+    pingpong,
+    persistent,
+    gather_A,
+    is_dynamic_persistent,
+    device_capacity,
+    mA,
+    mB,
+    mD,
+    mC,
+    epi_args,
+    scheduler_args,
+    varlen_args,
+    post_init=None,
+    mSFA=None,
+    mSFB=None,
+    has_trace_ptr=False,
+    use_tma_gather=False,
+    concat_layout=None,
+):
+    """Build a ``GemmCls`` instance and ``cute.compile`` it with TVM-FFI enabled.
+
+    Architecture-specific constructor keywords are bound first with ``functools.partial``,
+    keyed on ``device_capacity[0]``:
+      - 9 or 12: ``pingpong`` and ``is_persistent=persistent``;
+      - 10 or 11: ``use_clc_persistence=is_dynamic_persistent`` and ``use_tma_gather``;
+      - any other value: ``GemmCls`` is used as passed in.
+    As a consequence ``pingpong``/``persistent`` are ignored for 10/11, and
+    ``is_dynamic_persistent``/``use_tma_gather`` are ignored for 9/12.
+
+    Args:
+        GemmCls: kernel class (or callable) to instantiate.
+        a_dtype: second positional constructor argument, after ``Float32``.
+        tile_shape_mn, cluster_shape_mnk: positional constructor arguments.
+        gather_A, concat_layout: keyword constructor arguments.
+        device_capacity: indexable; only element 0 is read.
+        mA, mB, mD, mC, epi_args, scheduler_args, varlen_args: forwarded to
+            ``cute.compile`` in this order.
+        post_init: optional callable invoked with the constructed kernel object before
+            compilation.
+        mSFA, mSFB: forwarded to ``cute.compile`` after the stream, except when
+            ``device_capacity[0]`` is 9 or 12, where they are dropped.
+        has_trace_ptr: compile with an ``Int64(0)`` trace-pointer placeholder when True,
+            with None otherwise.
+
+    Returns:
+        The object returned by ``cute.compile``.
+    """
+    if device_capacity[0] in [9, 12]:
+        GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
+    elif device_capacity[0] in [10, 11]:
+        GemmCls = partial(
+            GemmCls,
+            use_clc_persistence=is_dynamic_persistent,
+            use_tma_gather=use_tma_gather,
+        )
+    gemm_obj = GemmCls(
+        Float32,
+        a_dtype,
+        tile_shape_mn,
+        cluster_shape_mnk,
+        gather_A=gather_A,
+        concat_layout=concat_layout,
+    )
+    if post_init:
+        post_init(gemm_obj)
+    # Placeholder stream; use_tvm_ffi_env_stream=True presumably makes the compiled
+    # function take the stream from the TVM-FFI environment at call time -- confirm.
+    stream = cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True)
+    # The scale-factor operands are part of the call signature only outside 9/12.
+    sf_args = () if device_capacity[0] in (9, 12) else (mSFA, mSFB)
+    # Trace pointer: Optional[Int64]. Compile with Int64(0) when tracing is
+    # requested, None otherwise. TVM-FFI caches each variant separately.
+    trace_ptr = Int64(0) if has_trace_ptr else None
+    return cute.compile(
+        gemm_obj,
+        mA,
+        mB,
+        mD,
+        mC,
+        epi_args,
+        scheduler_args,
+        varlen_args,
+        stream,
+        *sf_args,
+        trace_ptr,
+        options="--enable-tvm-ffi",
+    )

build/torch-cuda/quack/gemm_wrapper_utils.py DELETED Viewed

@@ -1,317 +0,0 @@
-# Copyright (c) 2025, Tri Dao.
-from typing import Optional, Tuple, Dict, Any
-from dataclasses import dataclass
-import torch
-from torch import Tensor
-import cutlass.cute as cute
-from cutlass import Int32
-from cutlass.cute.runtime import from_dlpack, make_ptr
-from .cute_dsl_utils import torch2cute_dtype_map
-from .varlen_utils import VarlenArguments
-from .tile_scheduler import TileSchedulerOptions
-@dataclass
-class GemmTensorInfo:
-    tensor: Optional[Tensor]
-    dtype: Optional[Any] = None
-    major: Optional[str] = None
-    cute_tensor: Optional[cute.Tensor] = None
-class GemmWrapperBase:
-    @staticmethod
-    def validate_tensor(tensor: Tensor, name: str, ndim: int) -> None:
-        assert tensor.dim() == ndim and tensor.is_cuda, f"{name} must be a {ndim}D CUDA tensor"
-        assert tensor.dtype in torch2cute_dtype_map, f"Unsupported dtype for {name}"
-    @staticmethod
-    def validate_shape(tensor: Tensor, expected_shape: Tuple[int, ...], name: str) -> None:
-        assert tensor.shape == expected_shape, (
-            f"{name} must have shape {expected_shape}, got {tensor.shape}"
-        )
-    @staticmethod
-    def get_major_order(tensor: Tensor, dims: Tuple[str, str, str]) -> str:
-        # Tensor is already permuted to (dims[0], dims[1], dims[2])
-        # stride(1) == 1 means dims[1] is contiguous (innermost)
-        return dims[1] if tensor.stride(1) == 1 else dims[0]
-    @staticmethod
-    def create_cute_tensor(
-        tensor: Optional[Tensor],
-        major: Optional[str],
-        dims: Tuple[str, str, str],
-        assumed_align: int = 16,
-    ) -> Optional[cute.Tensor]:
-        if tensor is None:
-            return None
-        # Tensor is already permuted to (dims[0], dims[1], dims[2]) or (dim[0], dim[1])
-        # If major is dims[1], leading_dim is 1; if major is dims[0], leading_dim is 0
-        leading_dim = 1 if major == dims[1] else 0
-        return from_dlpack(tensor.detach(), assumed_align=assumed_align).mark_layout_dynamic(
-            leading_dim=leading_dim
-        )
-    @staticmethod
-    def validate_and_prepare_tensors(
-        A: Tensor,
-        B: Tensor,
-        D: Optional[Tensor] = None,
-        C: Optional[Tensor] = None,
-        additional_tensors: Optional[Dict[str, Tensor]] = None,
-        cu_seqlens_m: Optional[Tensor] = None,
-        cu_seqlens_k: Optional[Tensor] = None,
-        A_idx: Optional[Tensor] = None,
-    ) -> Tuple[int, int, int, int, Dict[str, GemmTensorInfo]]:
-        assert not (cu_seqlens_m is not None and cu_seqlens_k is not None), (
-            "Only one of cu_seqlens_m and cu_seqlens_k can be specified"
-        )
-        assert B.dtype == A.dtype, "A and B must have the same dtype"
-        # Validate A_idx if provided (for gather_A case)
-        gather_A = A_idx is not None
-        if gather_A:
-            assert cu_seqlens_m is not None or cu_seqlens_k is not None, (
-                "gather_A requires either varlen_m or varlen_k"
-            )
-            assert A_idx.dtype == torch.int32, f"A_idx must be int32, got {A_idx.dtype}"
-            assert A_idx.dim() == 1, f"A_idx must be 1D, got {A_idx.dim()}D"
-        # Determine mode and extract dimensions
-        if cu_seqlens_m is not None:
-            # varlen_m: A is (total_m, k) or (whatever, k) if gather_A, B is (l, n, k), D/C are (total_m, n)
-            assert A.dim() == 2, f"A must be 2D when using varlen_m, got {A.dim()}D"
-            assert B.dim() == 3, f"B must be 3D with varlen_m, got {B.dim()}D"
-            if gather_A:
-                # When gather_A, A can have any number of rows, we use A_idx.shape[0] as total_M
-                total_M = A_idx.shape[0]
-                _, K = A.shape
-            else:
-                total_M, K = A.shape
-            L, N, K_B = B.shape
-            assert K == K_B, f"K dimension mismatch: A has {K}, B has {K_B}"
-            assert cu_seqlens_m.shape == (L + 1,), (
-                f"cu_seqlens_m must have shape ({L + 1},), got {cu_seqlens_m.shape}"
-            )
-            M = total_M
-            dc_shape = (total_M, N)
-            dc_ndim = 2
-        elif cu_seqlens_k is not None:
-            # varlen_k: A is (m, total_k) or (m, whatever) if gather_A, B is (n, total_k), D/C are (l, m, n)
-            assert A.dim() == 2, f"A must be 2D when using varlen_k, got {A.dim()}D"
-            assert B.dim() == 2, f"B must be 2D with varlen_k, got {B.dim()}D"
-            if gather_A:
-                # When gather_A with varlen_k, A can have any number of columns, we use A_idx.shape[0] as total_K
-                M, _ = A.shape
-                total_K = A_idx.shape[0]
-            else:
-                M, total_K = A.shape
-            N, K_B = B.shape
-            assert total_K == K_B, f"K dimension mismatch: expected {total_K}, B has {K_B}"
-            L = cu_seqlens_k.shape[0] - 1
-            assert cu_seqlens_k.shape == (L + 1,), (
-                f"cu_seqlens_k must have shape ({L + 1},), got {cu_seqlens_k.shape}"
-            )
-            K = total_K
-            dc_shape = (L, M, N)
-            dc_ndim = 3
-        else:
-            # Normal case - all tensors must be 3D
-            GemmWrapperBase.validate_tensor(A, "A", 3)
-            GemmWrapperBase.validate_tensor(B, "B", 3)
-            L, M, K = A.shape
-            _, N, K_B = B.shape
-            assert K == K_B, f"K dimension mismatch: A has {K}, B has {K_B}"
-            GemmWrapperBase.validate_shape(B, (L, N, K), "B")
-            dc_shape = (L, M, N)
-            dc_ndim = 3
-        # Validate D and C shapes uniformly
-        for tensor, name in [(D, "D"), (C, "C")]:
-            if tensor is not None:
-                assert tensor.dim() == dc_ndim, (
-                    f"{name} must be {dc_ndim}D for this mode, got {tensor.dim()}D"
-                )
-                assert tensor.shape == dc_shape, (
-                    f"{name} shape {tensor.shape} doesn't match expected {dc_shape}"
-                )
-        tensors = {
-            "A": GemmTensorInfo(A),
-            "B": GemmTensorInfo(B),
-            "D": GemmTensorInfo(D),
-            "C": GemmTensorInfo(C),
-        }
-        if additional_tensors:
-            for name, tensor in additional_tensors.items():
-                if tensor is not None:
-                    assert tensor.dim() == dc_ndim, (
-                        f"{name} must be {dc_ndim}D for this mode, got {tensor.dim()}D"
-                    )
-                    assert tensor.shape == dc_shape, (
-                        f"{name} shape {tensor.shape} doesn't match expected {dc_shape}"
-                    )
-                tensors[name] = GemmTensorInfo(tensor)
-        return L, M, K, N, tensors
-    @staticmethod
-    def permute_tensors(
-        tensors: Dict[str, GemmTensorInfo], varlen_m: bool = False, varlen_k: bool = False
-    ) -> None:
-        # Determine which tensors need permutation
-        if varlen_m:
-            # Only B needs permutation (3D tensor)
-            tensors_to_permute = ["B"]
-        elif varlen_k:
-            # Only D and C need permutation (3D tensors)
-            tensors_to_permute = ["D", "C"]
-        else:
-            # All tensors need permutation
-            tensors_to_permute = None
-        # Apply permutation from (L, *, *) -> (*, *, L) for selected tensors
-        for name, info in tensors.items():
-            if info.tensor is not None and info.tensor.ndim == 3:
-                if tensors_to_permute is None or name in tensors_to_permute:
-                    info.tensor = info.tensor.permute(1, 2, 0)
-    @staticmethod
-    def extract_dtypes(tensors: Dict[str, GemmTensorInfo]) -> None:
-        for name, info in tensors.items():
-            if info.tensor is not None:
-                info.dtype = torch2cute_dtype_map[info.tensor.dtype]
-    @staticmethod
-    def determine_major_orders(
-        tensors: Dict[str, GemmTensorInfo], major_configs: Dict[str, Tuple[str, str, str]]
-    ) -> None:
-        for name, dims in major_configs.items():
-            if name in tensors and tensors[name].tensor is not None:
-                tensors[name].major = GemmWrapperBase.get_major_order(tensors[name].tensor, dims)
-    @staticmethod
-    def create_cute_tensors(
-        tensors: Dict[str, GemmTensorInfo], major_configs: Dict[str, Tuple[str, str, str]]
-    ) -> None:
-        for name, info in tensors.items():
-            if info.tensor is not None and name in major_configs:
-                info.cute_tensor = GemmWrapperBase.create_cute_tensor(
-                    info.tensor, info.major, major_configs[name]
-                )
-    @staticmethod
-    def create_scheduler_args(
-        max_active_clusters: int,
-        tile_count_semaphore: Optional[Tensor] = None,
-        batch_idx_permute: Optional[Tensor] = None,
-        max_swizzle_size: int = 8,
-    ) -> TileSchedulerOptions:
-        """Build the ``TileSchedulerOptions`` for a kernel launch.
-        Both optional tensors are forwarded as None when not supplied.
-        ``max_active_clusters`` and ``max_swizzle_size`` are wrapped in ``Int32``.
-        """
-        return TileSchedulerOptions(
-            Int32(max_active_clusters),
-            # The semaphore is passed as a raw Int32 gmem pointer (4-byte alignment assumed).
-            tile_count_semaphore=make_ptr(
-                Int32, tile_count_semaphore.data_ptr(), cute.AddressSpace.gmem, assumed_align=4
-            )
-            if tile_count_semaphore is not None
-            else None,
-            # The permutation is passed as a cute tensor with a dynamic layout.
-            batch_idx_permute=(
-                from_dlpack(batch_idx_permute, assumed_align=4).mark_layout_dynamic(leading_dim=0)
-            )
-            if batch_idx_permute is not None
-            else None,
-            max_swizzle_size=Int32(max_swizzle_size),
-        )
-    @staticmethod
-    def create_varlen_args(
-        cu_seqlens_m: Optional[Tensor],
-        cu_seqlens_k: Optional[Tensor],
-        A_idx: Optional[Tensor],
-        max_active_clusters: int,
-        cluster_shape_mnk: Tuple[int, int, int],
-        tensors: Dict[str, GemmTensorInfo],
-        num_epi_tensormaps: int = 0,
-        pingpong: bool = False,
-    ) -> Optional[Any]:
-        """Build the ``VarlenArguments`` for a variable-length GEMM.
-        Returns None when neither ``cu_seqlens_m`` nor ``cu_seqlens_k`` is given.
-        Otherwise wraps the supplied tensors with ``from_dlpack`` and, when at
-        least one tensormap is needed, allocates an uninitialized int64 buffer of
-        shape ``(num_blocks, num_tensormaps, 16)`` on the cu_seqlens device.
-        ``tensors`` must contain a "D" entry when ``cu_seqlens_m`` is given.
-        """
-        if cu_seqlens_m is None and cu_seqlens_k is None:
-            return None
-        # When varlen_m, we assume persistent=True
-        # Grid size depends on num_active_clusters and cluster size
-        # Only the M and N cluster extents enter cluster_size; cluster_shape_mnk[2] is not read.
-        cluster_size = cluster_shape_mnk[0] * cluster_shape_mnk[1]
-        num_blocks = max_active_clusters * cluster_size
-        # Calculate number of tensormaps needed
-        if cu_seqlens_m is not None:
-            # For varlen_m: need tensormaps for D and epilogue tensors
-            # pingpong doubles every count below.
-            num_tensormaps = num_epi_tensormaps * (1 if not pingpong else 2)
-            if tensors["D"].tensor is not None:
-                num_tensormaps += 1 if not pingpong else 2  # D tensormap
-        else:
-            # For varlen_k: need tensormaps for A & B
-            # Only one tensormap is requested when A_idx is supplied.
-            num_tensormaps = 2 if A_idx is None else 1
-        # Create tensormap buffer (each tensormap is 128 bytes = 16 int64s)
-        tensormap_size = 128 // 8  # 16 int64s
-        if num_tensormaps > 0:
-            device = cu_seqlens_m.device if cu_seqlens_m is not None else cu_seqlens_k.device
-            tensormaps = torch.empty(
-                (num_blocks, num_tensormaps, tensormap_size),
-                dtype=torch.int64,
-                device=device,
-            )
-            tensormaps_cute = from_dlpack(tensormaps, assumed_align=128).mark_compact_shape_dynamic(
-                mode=0, stride_order=(0, 1, 2)
-            )
-        else:
-            # No tensormaps needed (varlen_m with no D tensor and no epilogue tensormaps).
-            tensormaps_cute = None
-        return VarlenArguments(
-            mCuSeqlensM=(
-                from_dlpack(cu_seqlens_m, assumed_align=4).mark_layout_dynamic(leading_dim=0)
-                if cu_seqlens_m is not None
-                else None
-            ),
-            mCuSeqlensK=(
-                from_dlpack(cu_seqlens_k, assumed_align=4).mark_layout_dynamic(leading_dim=0)
-                if cu_seqlens_k is not None
-                else None
-            ),
-            mTensormaps=tensormaps_cute,
-            mAIdx=(
-                from_dlpack(A_idx, assumed_align=4).mark_layout_dynamic(leading_dim=0)
-                if A_idx is not None
-                else None
-            ),
-        )
-    @staticmethod
-    def get_compile_key(
-        tensors: Dict[str, GemmTensorInfo],
-        activation: Optional[str],
-        tile_shape_mn: Tuple[int, int],
-        cluster_shape_mnk: Tuple[int, int, int],
-        pingpong: bool,
-        persistent: bool,
-        has_semaphore: bool,
-        *args,
-        key_tensor_names: Tuple[str, ...] = ("A", "B", "D", "C"),
-    ) -> Tuple:
-        """Assemble a tuple identifying one kernel configuration.
-        Order of the parts: dtypes of the key tensors, activation, tile and
-        cluster shapes, major orders of the key tensors, the three boolean
-        flags, then any extra positional ``args``. Names in ``key_tensor_names``
-        that are absent from ``tensors`` contribute nothing to the key.
-        """
-        key_parts = []
-        for name in key_tensor_names:
-            if name in tensors:
-                key_parts.append(tensors[name].dtype)
-        key_parts.append(activation)
-        key_parts.extend([tile_shape_mn, cluster_shape_mnk])
-        # Second pass over the same names, this time collecting the major orders.
-        for name in key_tensor_names:
-            if name in tensors:
-                key_parts.append(tensors[name].major)
-        key_parts.extend([pingpong, persistent, has_semaphore])
-        key_parts.extend(args)
-        return tuple(key_parts)

build/torch-cuda/quack/layout_utils.py CHANGED Viewed

@@ -6,8 +6,6 @@ import cutlass.cute as cute
 from cutlass import Int32, const_expr
-from .utils import prmt
 def transpose_view(a: cute.Tensor) -> cute.Tensor:
     """Transpose the first two dimensions of a tensor on smem."""
@@ -20,6 +18,19 @@ def select(a: cute.Tensor, mode: list[int]) -> cute.Tensor:
     return cute.make_tensor(a.iterator, cute.select(a.layout, mode))
 def expand(a: cute.Tensor, dim: int, size: Int32 | int) -> cute.Tensor:
     shape = (*a.shape[:dim], size, *a.shape[dim:])
     stride = (*a.layout.stride[:dim], 0, *a.layout.stride[dim:])
@@ -55,8 +66,8 @@ def permute_gated_Cregs_b16(t: cute.Tensor) -> None:
         lower0 = lower if lane_03 else upper
         upper0 = cute.arch.shuffle_sync(upper0, offset=upper_idx, mask_and_clamp=mask_and_clamp)
         lower0 = cute.arch.shuffle_sync(lower0, offset=lower_idx, mask_and_clamp=mask_and_clamp)
-        t_u32[i * 2 + 0] = prmt(upper0, lower0, selector_upper)
-        t_u32[i * 2 + 1] = prmt(upper0, lower0, selector_lower)
 @cute.jit
@@ -154,41 +165,43 @@ def concat_layout(*layouts: cute.Layout) -> cute.Layout:
     )
-def convert_layout_acc_mn(acc_layout: cute.Layout) -> cute.Layout:
     """
     For Sm80, convert ((2, 2), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, MMA_N), ...).
     For Sm90, convert ((2, 2, V), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, V, MMA_N), ...).
     """
     acc_layout_col_major = cute.make_layout(acc_layout.shape)
-    acc_layout_mn = cute.make_layout(
         (
-            (acc_layout_col_major.shape[0][1], acc_layout_col_major.shape[1]),  # MMA_M
-            (
-                acc_layout_col_major.shape[0][0],
-                *acc_layout_col_major.shape[0][2:],
-                acc_layout_col_major.shape[2],
-            ),  # MMA_N
-            *acc_layout_col_major.shape[3:],
-        ),
-        stride=(
-            (acc_layout_col_major.stride[0][1], acc_layout_col_major.stride[1]),  # MMA_M
-            (
-                acc_layout_col_major.stride[0][0],
-                *acc_layout_col_major.stride[0][2:],
-                acc_layout_col_major.stride[2],
-            ),  # MMA_N
-            *acc_layout_col_major.stride[3:],
-        ),
     )
     return cute.composition(acc_layout, acc_layout_mn)
-def make_acc_tensor_mn_view(acc: cute.Tensor) -> cute.Tensor:
-    return cute.make_tensor(acc.iterator, convert_layout_acc_mn(acc.layout))
-def reshape_acc_to_mn(acc: cute.Tensor) -> cute.Tensor:
-    return cute.make_tensor(acc.iterator, convert_layout_acc_mn(acc.layout))
 @cute.jit
@@ -196,10 +209,12 @@ def convert_layout_acc_frgA(acc_layout: cute.Layout) -> cute.Layout:
     # For back to back gemm, convert layout of acc0 to gemm 1 accept layout.
     # For Sm80, as the mma instruction shape is 16x8x16, we need to convert from (4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
     # For Sm90, FP16/BF16, convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to ((2, 2, 2), MMA_M, (N / 16, MMA_N))
     # TODO: Sm90 FP8
     if const_expr(cute.rank(acc_layout.shape[0]) == 3):  # Sm90
         l = cute.logical_divide(
-            acc_layout, ((None, None, 2), None, None)
         )  # ((2, 2, (2, N / 16)), MMA_M, MMA_N)
         rA_mma_view = cute.make_layout(
             (
@@ -293,3 +308,77 @@ def mma_partition_A_vec(
     sVec_mma = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
     tC_sVec = make_acc_tensor_mn_view(thr_mma.partition_A(sVec_mma))
     return tC_sVec[None, 0, None] if const_expr(is_colvec) else tC_sVec[0, None, None]

 from cutlass import Int32, const_expr
 def transpose_view(a: cute.Tensor) -> cute.Tensor:
     """Transpose the first two dimensions of a tensor on smem."""
     return cute.make_tensor(a.iterator, cute.select(a.layout, mode))
+def concat_to_interleave(a: cute.Tensor, dim: int) -> cute.Tensor:
+    """Reshape a concat [first_half; second_half] layout to interleaved along `dim`.
+    Splits dimension `dim` (size 2N) into hierarchical (2, N) so that elements
+    from the first half and second half alternate: [first_0, second_0, first_1, ...].
+    Used to convert gated MLP weight layout from concat [gate; up] to interleaved.
+    Returns a new tensor on the same iterator as `a`; only the layout changes.
+    """
+    # Integer division: the size of `dim` is expected to be even.
+    half = cute.size(a, mode=[dim]) // 2
+    shape = (*a.shape[:dim], (2, half), *a.shape[dim + 1 :])
+    # The size-2 sub-mode jumps by `half` original elements (first vs. second half);
+    # the size-`half` sub-mode keeps the original stride.
+    stride = (*a.stride[:dim], (half * a.stride[dim], a.stride[dim]), *a.stride[dim + 1 :])
+    return cute.make_tensor(a.iterator, cute.make_layout(shape, stride=stride))
 def expand(a: cute.Tensor, dim: int, size: Int32 | int) -> cute.Tensor:
     shape = (*a.shape[:dim], size, *a.shape[dim:])
     stride = (*a.layout.stride[:dim], 0, *a.layout.stride[dim:])
         lower0 = lower if lane_03 else upper
         upper0 = cute.arch.shuffle_sync(upper0, offset=upper_idx, mask_and_clamp=mask_and_clamp)
         lower0 = cute.arch.shuffle_sync(lower0, offset=lower_idx, mask_and_clamp=mask_and_clamp)
+        t_u32[i * 2 + 0] = cute.arch.prmt(upper0, lower0, selector_upper)
+        t_u32[i * 2 + 1] = cute.arch.prmt(upper0, lower0, selector_lower)
 @cute.jit
     )
+def convert_layout_acc_mn(acc_layout: cute.Layout, transpose: bool = False) -> cute.Layout:
     """
     For Sm80, convert ((2, 2), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, MMA_N), ...).
     For Sm90, convert ((2, 2, V), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, V, MMA_N), ...).
+    If `transpose` is true, the first two modes of the result are swapped, so the
+    N-grouped mode comes first and the M-grouped mode second.
     """
+    # Column-major layout over the same shape; its strides are used to regroup modes below.
     acc_layout_col_major = cute.make_layout(acc_layout.shape)
+    shape = (
+        (acc_layout_col_major.shape[0][1], acc_layout_col_major.shape[1]),  # MMA_M
         (
+            acc_layout_col_major.shape[0][0],
+            *acc_layout_col_major.shape[0][2:],
+            acc_layout_col_major.shape[2],
+        ),  # MMA_N
+        *acc_layout_col_major.shape[3:],
+    )
+    stride = (
+        (acc_layout_col_major.stride[0][1], acc_layout_col_major.stride[1]),  # MMA_M
+        (
+            acc_layout_col_major.stride[0][0],
+            *acc_layout_col_major.stride[0][2:],
+            acc_layout_col_major.stride[2],
+        ),  # MMA_N
+        *acc_layout_col_major.stride[3:],
     )
+    if const_expr(transpose):
+        shape = (shape[1], shape[0], *shape[2:])
+        stride = (stride[1], stride[0], *stride[2:])
+    acc_layout_mn = cute.make_layout(shape, stride=stride)
+    # Compose so the regrouped coordinates map through the original accumulator layout.
     return cute.composition(acc_layout, acc_layout_mn)
+def make_acc_tensor_mn_view(acc: cute.Tensor, transpose: bool = False) -> cute.Tensor:
+    """Return a view of `acc` on the same iterator with the layout from `convert_layout_acc_mn`."""
+    return cute.make_tensor(acc.iterator, convert_layout_acc_mn(acc.layout, transpose=transpose))
+def reshape_acc_to_mn(acc: cute.Tensor, transpose: bool = False) -> cute.Tensor:
+    """Return a view of `acc` on the same iterator with the layout from `convert_layout_acc_mn`.
+    The body is identical to `make_acc_tensor_mn_view`.
+    """
+    return cute.make_tensor(acc.iterator, convert_layout_acc_mn(acc.layout, transpose=transpose))
 @cute.jit
     # For back to back gemm, convert layout of acc0 to gemm 1 accept layout.
     # For Sm80, as the mma instruction shape is 16x8x16, we need to convert from (4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
     # For Sm90, FP16/BF16, convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to ((2, 2, 2), MMA_M, (N / 16, MMA_N))
+    # If N / 8 is odd, we'll convert to ((2, 2, 1), MMA_M, N / 8, MMA_N).
     # TODO: Sm90 FP8
     if const_expr(cute.rank(acc_layout.shape[0]) == 3):  # Sm90
+        div = 2 if const_expr(acc_layout.shape[0][2] % 2 == 0) else 1
         l = cute.logical_divide(
+            acc_layout, ((None, None, div), None, None)
         )  # ((2, 2, (2, N / 16)), MMA_M, MMA_N)
         rA_mma_view = cute.make_layout(
             (
     sVec_mma = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
     tC_sVec = make_acc_tensor_mn_view(thr_mma.partition_A(sVec_mma))
     return tC_sVec[None, 0, None] if const_expr(is_colvec) else tC_sVec[0, None, None]
+def copy_partition_S_vec(
+    sVec: cute.Tensor, thr_copy: cute.core.ThrCopy, expand_shape: int, is_colvec: bool
+) -> cute.Tensor:
+    """Partition a staged vector as the source of `thr_copy`.
+    `sVec` must be rank 2 with unit stride in mode 0; mode 1 is treated as the stage.
+    The vector is broadcast (stride 0) to `expand_shape` along the other axis,
+    partitioned with `thr_copy.partition_S`, reshaped by `reshape_acc_to_mn`, and
+    the broadcast axis is then dropped by indexing it at 0.
+    """
+    assert cute.rank(sVec) == 2
+    assert sVec.stride[0] == 1
+    stage = sVec.shape[1]
+    # Column vector: data runs along mode 0; row vector: data runs along mode 1.
+    shape = (
+        (sVec.shape[0], expand_shape, stage)
+        if const_expr(is_colvec)
+        else (expand_shape, sVec.shape[0], stage)
+    )
+    # Stride 0 on the expanded axis makes it a broadcast over the same memory.
+    stride = (1, 0, sVec.stride[1]) if const_expr(is_colvec) else (0, 1, sVec.stride[1])
+    sVec_thr = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
+    tC_sVec = reshape_acc_to_mn(thr_copy.partition_S(sVec_thr))
+    return tC_sVec[None, 0, None] if const_expr(is_colvec) else tC_sVec[0, None, None]
+def copy_partition_D_vec(
+    sVec: cute.Tensor, thr_copy: cute.core.ThrCopy, expand_shape: int, is_colvec: bool
+) -> cute.Tensor:
+    """Partition a staged vector as the destination of `thr_copy`.
+    Same construction as `copy_partition_S_vec`, except the partitioning call is
+    `thr_copy.partition_D`. `sVec` must be rank 2 with unit stride in mode 0;
+    mode 1 is treated as the stage.
+    """
+    assert cute.rank(sVec) == 2
+    assert sVec.stride[0] == 1
+    stage = sVec.shape[1]
+    # Column vector: data runs along mode 0; row vector: data runs along mode 1.
+    shape = (
+        (sVec.shape[0], expand_shape, stage)
+        if const_expr(is_colvec)
+        else (expand_shape, sVec.shape[0], stage)
+    )
+    # Stride 0 on the expanded axis makes it a broadcast over the same memory.
+    stride = (1, 0, sVec.stride[1]) if const_expr(is_colvec) else (0, 1, sVec.stride[1])
+    sVec_thr = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
+    tC_sVec = reshape_acc_to_mn(thr_copy.partition_D(sVec_thr))
+    return tC_sVec[None, 0, None] if const_expr(is_colvec) else tC_sVec[0, None, None]
+def tile_atom_to_shape_SF_strided(
+    shape: cute.Shape,
+    sf_vec_size: int,
+    sf_strides,
+) -> cute.Layout:
+    """Build an SFA/SFB layout matching `shape` (A or B operand shape) but
+    honoring the scale tensor's actual strides instead of hardcoded packed
+    ones.
+    Mirrors `cutlass.utils.blockscaled_layout.tile_atom_to_shape_SF(shape,
+    sf_vec_size)`, except outer-mode strides come from `sf_strides` (pass
+    `mSFA.stride` / `mSFB.stride` directly). The inner 512-B atom
+    `((32, 4), (sf_vec_size, 4)) : ((16, 4), (0, 1))` is hardware-fixed.
+    Implementation uses `cute.blocked_product(atom, outer)`; `blocked_product`
+    scales the outer layout's strides by `cosize(atom) == 512`, so we divide
+    the byte strides by 512 (one tile) before handing them in.
+    Args:
+        shape: A/B operand shape. Rank-3 `(m/n, k, l)` or rank-2
+            `(total_mn, k)` (varlen_m).
+        sf_vec_size: Scale factor vector size (16 or 32).
+        sf_strides: Strides of the scale tensor, which has logical shape
+            `(L, rmn, rk, 512)` (rank 4). Only `sf_strides[0..2]` are used:
+            `sf_strides[1]` as the rmn stride, `sf_strides[2]` as the rk
+            stride, and `sf_strides[0]` as the L stride (only for rank-3
+            `shape`).
+    Returns:
+        The scale-factor layout; for rank-3 `shape` an L mode is appended last.
+    """
+    # Imported locally rather than at module top level.
+    from cutlass.utils.blockscaled_layout import BlockScaledBasicChunk
+    atom = BlockScaledBasicChunk(sf_vec_size).layout
+    # Number of atoms needed along m/n (128 rows each) and along k (4 scale vectors each).
+    rmn = cute.ceil_div(shape[0], 128)
+    rk = cute.ceil_div(shape[1], sf_vec_size * 4)
+    # NOTE(review): `// 512` truncates; assumes sf_strides[1] and sf_strides[2] are
+    # multiples of 512 -- TODO confirm against callers.
+    outer = cute.make_layout((rmn, rk), stride=(sf_strides[1] // 512, sf_strides[2] // 512))
+    sf_layout = cute.blocked_product(atom, outer)
+    if const_expr(len(shape) == 3):
+        # The L stride is used as given: this mode is appended after blocked_product.
+        sf_layout = cute.append(sf_layout, cute.make_layout(shape[2], stride=sf_strides[0]))
+    return sf_layout

build/torch-cuda/quack/linear.py ADDED Viewed

	@@ -0,0 +1,368 @@

+# Copyright (c) 2025, Tri Dao
+from functools import partial
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from .gemm_interface import gemm, gemm_add_inplace, gemm_act, gemm_dact
+from .gemm_interface import gemm_gated, gemm_dgated
+from .gemm_interface import act_to_pytorch_fn_map, gated_to_pytorch_fn_map
+def _ensure_contiguous(t):
+    """Ensure last-dim stride is 1. Under torch.compile use unconditional .contiguous()
+    (dynamo can't inspect strides on fake tensors); otherwise check first to avoid copies.
+    Returns `t` itself when no copy is needed, else the result of `t.contiguous()`.
+    """
+    if torch.compiler.is_compiling():
+        return t.contiguous()
+    # Eager mode: only the last-dim stride is checked, not full contiguity.
+    return t if t.stride(-1) == 1 else t.contiguous()
+def linear_fwd_convert_type(*tensors):
+    """Cast the given tensors to the CUDA autocast dtype when autocast is enabled.
+    Returns a tuple in the same order; when autocast is disabled the inputs are
+    returned unchanged.
+    """
+    autocast_dtype = torch.get_autocast_dtype("cuda")
+    if torch.is_autocast_enabled():
+        tensors = tuple(t.to(dtype=autocast_dtype) for t in tensors)
+    return tensors
+def linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad):
+    """Save for backward only the tensors that the required gradients will use.
+    `needs_x_w_grad` is a pair (needs_input_grad, needs_weight_grad). Reads
+    `ctx.fuse_grad_accum`, so that attribute must be set before this is called.
+    """
+    needs_input_grad, needs_weight_grad = needs_x_w_grad
+    # The weight is only used to compute the input gradient.
+    if not needs_input_grad:
+        weight, weight_og = None, None
+    # The input is only used to compute the weight gradient.
+    if not needs_weight_grad:
+        x = None
+    # weight_og is kept only for the fused gradient-accumulation path.
+    ctx.save_for_backward(x, weight, weight_og if ctx.fuse_grad_accum else None)
+def linear_bwd_compute_input_grad(ctx, dout, weight, matmul_fn):
+    """Return `matmul_fn(dout, weight)` if the first forward input needs a gradient, else None."""
+    if ctx.needs_input_grad[0]:
+        assert weight is not None
+        return matmul_fn(dout, weight)
+    else:
+        return None
+def linear_bwd_compute_weight_grad(ctx, dout, x, weight_og, matmul_fn, matmul_inplace_fn):
+    """Compute the weight gradient, optionally accumulating into `weight_og.grad`.
+    Returns None when the second forward input does not need a gradient. On the
+    fused path `matmul_inplace_fn(dout.T, x, weight_og.grad)` is called and the
+    existing `.grad` tensor is returned and detached from `weight_og`; otherwise
+    a fresh gradient of dtype `ctx.weight_dtype` is returned.
+    """
+    if ctx.needs_input_grad[1]:
+        assert x is not None
+        # Flatten leading dims so x is 2D.
+        x = x.reshape(-1, x.shape[-1])
+        # fuse_grad_accum is not compatible with torch.compile
+        # The fused path also needs an existing .grad tensor to accumulate into.
+        if not ctx.fuse_grad_accum or weight_og.grad is None or torch.compiler.is_compiling():
+            dweight = matmul_fn(dout.T, x, out_dtype=ctx.weight_dtype)
+        else:
+            # print("Using fuse grad accum in Linear", dout.shape, x.shape, weight_og.grad.shape)
+            matmul_inplace_fn(dout.T, x, weight_og.grad)
+            dweight = weight_og.grad
+            weight_og.grad = None  # So that pytorch doesn't add dweight to weight_og.grad again
+    else:
+        dweight = None
+    return dweight
+def _recompute_act_postact(preact, activation):
+    """Recompute postact from preact using the activation function (no GEMM)."""
+    # `activation` is a key into act_to_pytorch_fn_map; an unknown key raises KeyError.
+    return act_to_pytorch_fn_map[activation](preact)
+def _recompute_gated_postact(preact, activation):
+    """Recompute gated postact from interleaved preact (no GEMM)."""
+    # Even-indexed columns of the last dim are the first argument, odd-indexed the second.
+    return gated_to_pytorch_fn_map[activation](preact[..., ::2], preact[..., 1::2])
+# --- Ops bundles: matmul function configurations ---
+# Each ops class is a namespace holding the matmul functions for a specific variant
+# (tuned/untuned, act/gated, etc.). Passed as a non-tensor arg to apply() and stored on ctx.
+class _LinearOps:
+    """Base ops bundle for a plain linear layer.
+    Forward uses `gemm`; the backward matmuls use `gemm` / `gemm_add_inplace`
+    with `dynamic_scheduler=True`.
+    """
+    matmul_fwd_fn = gemm
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True)
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True)
+    # Used on the fused gradient-accumulation path.
+    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)
+class _LinearUntunedOps(_LinearOps):
+    """Ops bundle for a plain linear layer with `tuned=False` on every matmul.
+    All four functions are overridden so that no tuned variant is inherited
+    from `_LinearOps`.
+    """
+    matmul_fwd_fn = partial(gemm, tuned=False)
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, tuned=False)
+    # Without this override the fused grad-accum path would fall back to the
+    # inherited tuned `gemm_add_inplace`; the other untuned bundles pass tuned=False too.
+    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True, tuned=False)
+class _LinearActOps(_LinearOps):
+    # Forward is gemm_act; backward functions are inherited from _LinearOps.
+    matmul_fwd_fn = gemm_act
+class _LinearActUntunedOps(_LinearUntunedOps):
+    # Forward is gemm_act with tuned=False; backward functions come from _LinearUntunedOps.
+    matmul_fwd_fn = partial(gemm_act, tuned=False)
+class _LinearGatedOps(_LinearOps):
+    # Forward is gemm_gated; backward functions are inherited from _LinearOps.
+    matmul_fwd_fn = gemm_gated
+class _LinearGatedUntunedOps:
+    # Gated forward with tuned=False on all four functions.
+    # NOTE(review): unlike the other untuned bundles this class has no base class
+    # and spells out every function itself -- confirm that is intentional.
+    matmul_fwd_fn = partial(gemm_gated, tuned=False)
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True, tuned=False)
+class _LinearGatedConcatOps(_LinearGatedOps):
+    # Gated variant that passes `concat_layout` to every matmul. The tuple names
+    # differ per call: ("B", "bias") forward, ("B",) for dx, ("out",) for dw and
+    # ("C", "out") for the in-place dw.
+    matmul_fwd_fn = partial(gemm_gated, concat_layout=("B", "bias"))
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, concat_layout=("B",))
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, concat_layout=("out",))
+    matmul_bwd_dw_inplace = partial(
+        gemm_add_inplace, dynamic_scheduler=True, concat_layout=("C", "out")
+    )
+class _LinearGatedConcatUntunedOps(_LinearGatedUntunedOps):
+    # Same concat_layout arguments as _LinearGatedConcatOps, with tuned=False everywhere.
+    matmul_fwd_fn = partial(gemm_gated, tuned=False, concat_layout=("B", "bias"))
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, tuned=False, concat_layout=("B",))
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, tuned=False, concat_layout=("out",))
+    matmul_bwd_dw_inplace = partial(
+        gemm_add_inplace, dynamic_scheduler=True, tuned=False, concat_layout=("C", "out")
+    )
+class _DActLinearOps(_LinearOps):
+    # Linear that follows an activation: dx is computed by gemm_dact, and postact
+    # can be recomputed from preact without a GEMM.
+    matmul_bwd_dx = partial(gemm_dact, dynamic_scheduler=True)
+    # staticmethod so the plain function is not bound as a method when accessed.
+    recompute_postact = staticmethod(_recompute_act_postact)
+class _DActLinearUntunedOps(_LinearUntunedOps):
+    # Untuned counterpart of _DActLinearOps: gemm_dact with tuned=False for dx.
+    matmul_bwd_dx = partial(gemm_dact, dynamic_scheduler=True, tuned=False)
+    # staticmethod so the plain function is not bound as a method when accessed.
+    recompute_postact = staticmethod(_recompute_act_postact)
+class _DGatedLinearOps(_LinearOps):
+    # Linear that follows a gated activation: dx is computed by gemm_dgated, and
+    # postact can be recomputed from the interleaved preact without a GEMM.
+    matmul_bwd_dx = partial(gemm_dgated, dynamic_scheduler=True)
+    # staticmethod so the plain function is not bound as a method when accessed.
+    recompute_postact = staticmethod(_recompute_gated_postact)
+class _DGatedLinearUntunedOps(_LinearUntunedOps):
+    # Untuned counterpart of _DGatedLinearOps: gemm_dgated with tuned=False for dx.
+    matmul_bwd_dx = partial(gemm_dgated, dynamic_scheduler=True, tuned=False)
+    # staticmethod so the plain function is not bound as a method when accessed.
+    recompute_postact = staticmethod(_recompute_gated_postact)
+# --- Autograd Functions (all @staticmethod, torch.compile-compatible) ---
+class LinearFunc(torch.autograd.Function):
+    """Autograd function for `out = x @ weight.T (+ bias)`, with matmuls taken from `ops`."""
+    @staticmethod
+    def forward(ctx, x, weight, bias, fuse_grad_accum, ops):
+        """
+        x: (..., in_features)
+        weight: (out_features, in_features)
+        bias: (out_features,) or None
+        out: (..., out_features)
+        fuse_grad_accum: if True, backward may accumulate dweight into weight.grad in place
+        ops: ops bundle class providing matmul_fwd_fn / matmul_bwd_* functions
+        """
+        # Convert types while autocast is still enabled, then disable it for the body.
+        x, weight = linear_fwd_convert_type(x, weight)
+        with torch.amp.autocast("cuda", enabled=False):
+            ctx.weight_dtype = weight.dtype
+            ctx.fuse_grad_accum = fuse_grad_accum
+            ctx.ops = ops
+            # NOTE(review): weight_og is bound after the autocast cast above, so it is
+            # the same object as `weight` here -- confirm this is what the fused
+            # grad-accum path expects when autocast changes the dtype.
+            weight_og = weight
+            batch_shape = x.shape[:-1]
+            # Flatten leading dims so the matmul sees a 2D input.
+            x = x.reshape(-1, x.shape[-1])
+            out = ops.matmul_fwd_fn(x, weight.T, bias=bias)
+            linear_fwd_postprocess(
+                ctx, x, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2]
+            )
+            ctx.bias_dtype = bias.dtype if bias is not None else None
+            # Index 2 is `bias` in this forward's argument list.
+            ctx.compute_dbias = bias is not None and ctx.needs_input_grad[2]
+            return out.reshape(*batch_shape, out.shape[-1])
+    @staticmethod
+    def backward(ctx, dout):
+        """
+        dout: (..., out_features)
+        Returns (dx, dweight, dbias, None, None), one entry per forward argument.
+        """
+        with torch.amp.autocast("cuda", enabled=False):
+            ops = ctx.ops
+            x, weight, weight_og = ctx.saved_tensors  # weight_og is None if not ctx.fuse_grad_accum
+            batch_shape = dout.shape[:-1]
+            dout = _ensure_contiguous(dout.reshape(-1, dout.shape[-1]))
+            # Bias gradient is the sum of dout over the flattened batch dim.
+            dbias = dout.sum(0, dtype=ctx.bias_dtype) if ctx.compute_dbias else None
+            dx = linear_bwd_compute_input_grad(ctx, dout, weight, ops.matmul_bwd_dx)
+            dx = dx.reshape(*batch_shape, dx.shape[-1]) if dx is not None else None
+            dweight = linear_bwd_compute_weight_grad(
+                ctx, dout, x, weight_og, ops.matmul_bwd_dw, ops.matmul_bwd_dw_inplace
+            )
+            return dx, dweight, dbias, None, None
+def linear_func(x, weight, bias=None, fuse_grad_accum=False, tuned=True):
+    """Apply `LinearFunc` with the tuned or untuned ops bundle, selected by `tuned`."""
+    ops = _LinearOps if tuned else _LinearUntunedOps
+    return LinearFunc.apply(x, weight, bias, fuse_grad_accum, ops)
+class LinearActFunc(torch.autograd.Function):
+    """Autograd function for a linear layer whose forward kernel also applies an activation.
+    Used for both the plain-activation and gated ops bundles.
+    """
+    @staticmethod
+    def forward(ctx, x, weight, activation, bias, store_preact, fuse_grad_accum, ops):
+        """
+        x: (..., in_features)
+        weight: (out_features, in_features)
+        bias: (out_features,) or None
+        out: (..., out_features)
+        Return both out and post-activation, but only out is differentiable.
+        `out` may be None (it is passed through unreshaped in that case).
+        """
+        x, weight = linear_fwd_convert_type(x, weight)
+        with torch.amp.autocast("cuda", enabled=False):
+            ctx.weight_dtype = weight.dtype
+            ctx.fuse_grad_accum = fuse_grad_accum
+            ctx.ops = ops
+            # Bound after the autocast cast, so this is the same object as `weight`.
+            weight_og = weight
+            batch_shape = x.shape[:-1]
+            x = x.reshape(-1, x.shape[-1])
+            out, postact = ops.matmul_fwd_fn(
+                x, weight.T, bias=bias, activation=activation, store_preact=store_preact
+            )
+            linear_fwd_postprocess(
+                ctx, x, weight, weight_og, needs_x_w_grad=ctx.needs_input_grad[:2]
+            )
+            if out is not None:
+                out = out.reshape(*batch_shape, out.shape[-1])
+            ctx.bias_dtype = bias.dtype if bias is not None else None
+            # Index 3 is `bias` in this forward's argument list.
+            ctx.compute_dbias = bias is not None and ctx.needs_input_grad[3]
+            # NOTE(review): the tensor marked here is the pre-reshape `postact`, while the
+            # value returned below is `postact.reshape(...)` -- confirm the marking
+            # applies to the returned output.
+            ctx.mark_non_differentiable(postact)
+            ctx.set_materialize_grads(False)  # We don't want to materialize grads for postact
+            return out, postact.reshape(*batch_shape, postact.shape[-1])
+    @staticmethod
+    def backward(ctx, dout, *args):
+        """
+        dout: (..., out_features), gradient w.r.t. `out`.
+        *args: gradient slot for postact, ignored.
+        Returns (dx, dweight, None, dbias, None, None, None), one entry per forward argument.
+        """
+        with torch.amp.autocast("cuda", enabled=False):
+            ops = ctx.ops
+            x, weight, weight_og = ctx.saved_tensors
+            batch_shape = dout.shape[:-1]
+            dout = _ensure_contiguous(dout.reshape(-1, dout.shape[-1]))
+            dbias = dout.sum(0, dtype=ctx.bias_dtype) if ctx.compute_dbias else None
+            dx = linear_bwd_compute_input_grad(ctx, dout, weight, ops.matmul_bwd_dx)
+            dx = dx.reshape(*batch_shape, dx.shape[-1]) if dx is not None else None
+            dweight = linear_bwd_compute_weight_grad(
+                ctx, dout, x, weight_og, ops.matmul_bwd_dw, ops.matmul_bwd_dw_inplace
+            )
+            return dx, dweight, None, dbias, None, None, None
+def linear_act_func(
+    x, weight, activation, bias=None, store_preact=True, fuse_grad_accum=False, tuned=True
+):
+    """Apply `LinearActFunc` with the activation ops bundle (tuned or untuned).
+    Returns the `(out, postact)` pair produced by `LinearActFunc`.
+    """
+    ops = _LinearActOps if tuned else _LinearActUntunedOps
+    return LinearActFunc.apply(x, weight, activation, bias, store_preact, fuse_grad_accum, ops)
+def linear_gated_func(
+    x,
+    weight,
+    activation,
+    bias=None,
+    store_preact=True,
+    fuse_grad_accum=False,
+    tuned=True,
+    concat_layout=False,
+):
+    """Apply `LinearActFunc` with one of the four gated ops bundles.
+    The bundle is chosen by `concat_layout` (concat vs. default gated ops) and
+    `tuned`. Returns the `(out, postact)` pair produced by `LinearActFunc`.
+    """
+    if concat_layout:
+        ops = _LinearGatedConcatOps if tuned else _LinearGatedConcatUntunedOps
+    else:
+        ops = _LinearGatedOps if tuned else _LinearGatedUntunedOps
+    # The gated path reuses LinearActFunc; only the ops bundle differs.
+    return LinearActFunc.apply(x, weight, activation, bias, store_preact, fuse_grad_accum, ops)
+class DActLinearFunc(torch.autograd.Function):
+    """Autograd function for a linear layer applied to an activation output.
+    Forward multiplies `x` (the post-activation) by the weight; the gradient is
+    returned for `preact`, not for `x`. `preact` is saved in place of `x`, and
+    backward obtains the post-activation again from it.
+    """
+    @staticmethod
+    def forward(ctx, preact, weight, x, activation, bias, fuse_grad_accum, ops):
+        """
+        x: (..., in_features)
+        weight: (out_features, in_features)
+        bias: (out_features,) or None
+        out: (..., out_features)
+        Takes in an extra preact argument which is the pre-activation, to be used in the backward pass.
+        """
+        # Only x and weight go through the autocast cast; preact is saved as given.
+        x, weight = linear_fwd_convert_type(x, weight)
+        with torch.amp.autocast("cuda", enabled=False):
+            ctx.weight_dtype = weight.dtype
+            ctx.fuse_grad_accum = fuse_grad_accum
+            ctx.ops = ops
+            # Bound after the autocast cast, so this is the same object as `weight`.
+            weight_og = weight
+            batch_shape = x.shape[:-1]
+            x = x.reshape(-1, x.shape[-1])
+            out = ops.matmul_fwd_fn(x, weight.T, bias=bias)
+            # Store preact instead of x, we will recompute x (postact) in backward.
+            # dpreact needs gemm_dact(dout, weight, preact) → needs both weight and preact.
+            # dweight needs postact: if dpreact is also needed, postact comes from gemm_dact;
+            # otherwise we can recompute postact = act(preact) cheaply without weight.
+            need_preact = ctx.needs_input_grad[0] or ctx.needs_input_grad[1]
+            need_weight = ctx.needs_input_grad[0]  # only gemm_dact needs weight
+            # linear_fwd_postprocess drops weight when its first flag is False and
+            # drops the saved activation when its second flag is False.
+            linear_fwd_postprocess(
+                ctx, preact, weight, weight_og, needs_x_w_grad=(need_weight, need_preact)
+            )
+            ctx.activation = activation
+            ctx.bias_dtype = bias.dtype if bias is not None else None
+            # Index 4 is `bias` in this forward's argument list.
+            ctx.compute_dbias = bias is not None and ctx.needs_input_grad[4]
+            return out.reshape(*batch_shape, out.shape[-1])
+    @staticmethod
+    def backward(ctx, dout):
+        """
+        dout: (..., out_features)
+        Returns (dpreact, dweight, None, None, dbias, None, None), one entry per forward argument.
+        """
+        with torch.amp.autocast("cuda", enabled=False):
+            ops = ctx.ops
+            # weight_og is None if not ctx.fuse_grad_accum
+            preact, weight, weight_og = ctx.saved_tensors
+            batch_shape = dout.shape[:-1]
+            dout = _ensure_contiguous(dout.reshape(-1, dout.shape[-1]))
+            dbias = dout.sum(0, dtype=ctx.bias_dtype) if ctx.compute_dbias else None
+            if ctx.needs_input_grad[0]:
+                # Need dpreact: gemm_dact(dout, weight, preact) → (dpreact, postact)
+                preact = preact.reshape(-1, preact.shape[-1])
+                assert weight is not None
+                dpreact, x = ops.matmul_bwd_dx(dout, weight, preact, activation=ctx.activation)
+            elif ctx.needs_input_grad[1]:
+                # Only need dweight: recompute postact from preact cheaply (no GEMM needed)
+                preact = preact.reshape(-1, preact.shape[-1])
+                # `ops` must provide recompute_postact (the _DAct*/_DGated* bundles do).
+                x = ops.recompute_postact(preact, ctx.activation)
+                dpreact = None
+            else:
+                dpreact, x = None, None
+            dpreact = (
+                dpreact.reshape(*batch_shape, dpreact.shape[-1]) if dpreact is not None else None
+            )
+            dweight = linear_bwd_compute_weight_grad(
+                ctx, dout, x, weight_og, ops.matmul_bwd_dw, ops.matmul_bwd_dw_inplace
+            )
+            return dpreact, dweight, None, None, dbias, None, None
+def act_linear_func(preact, weight, x, activation, bias=None, fuse_grad_accum=False, tuned=True):
+    """Apply `DActLinearFunc` with the (non-gated) activation-backward ops bundle."""
+    ops = _DActLinearOps if tuned else _DActLinearUntunedOps
+    return DActLinearFunc.apply(preact, weight, x, activation, bias, fuse_grad_accum, ops)
+def gated_linear_func(preact, weight, x, activation, bias=None, fuse_grad_accum=False, tuned=True):
+    """Apply `DActLinearFunc` with the gated activation-backward ops bundle."""
+    ops = _DGatedLinearOps if tuned else _DGatedLinearUntunedOps
+    return DActLinearFunc.apply(preact, weight, x, activation, bias, fuse_grad_accum, ops)
+class Linear(nn.Linear):
+    """`nn.Linear` subclass that routes eligible inputs through `linear_func`.
+    Note that `bias` defaults to False here. `fuse_grad_accum` is stored on the
+    module and forwarded to `linear_func` on every call.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = False,
+        device=None,
+        dtype=None,
+        fuse_grad_accum: bool = False,
+    ) -> None:
+        super().__init__(in_features, out_features, bias=bias, device=device, dtype=dtype)
+        self.fuse_grad_accum = fuse_grad_accum
+    def forward(self, input: Tensor) -> Tensor:
+        """Use `linear_func` for CUDA inputs when both feature sizes are multiples of 8;
+        otherwise fall back to `F.linear`."""
+        if input.is_cuda and self.in_features % 8 == 0 and self.out_features % 8 == 0:
+            return linear_func(input, self.weight, self.bias, fuse_grad_accum=self.fuse_grad_accum)
+        else:
+            return F.linear(input, self.weight, self.bias)

build/torch-cuda/quack/linear_cross_entropy.py ADDED Viewed

	@@ -0,0 +1,275 @@

+# Copyright (c) 2025, Tri Dao
+from typing import Optional, Literal
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.amp import custom_fwd, custom_bwd
+from .cross_entropy import cross_entropy, cross_entropy_fwd_out
+from .gemm_interface import gemm, gemm_add, gemm_add_inplace
+from .linear import linear_fwd_convert_type
+def linear_cross_entropy_func(
+    x: Tensor,  # (..., d)
+    weight: Tensor,  # (V, d)
+    bias: Optional[Tensor],  # (V,) or None
+    target: Tensor,  # (...,), int or long
+    ignore_index: int = -100,
+    reduction: Literal["none", "mean", "sum"] = "mean",
+    inplace_backward: bool = False,
+) -> Tensor:
+    """Linear projection followed by this package's `cross_entropy`.
+    The full logits tensor is materialized with `F.linear` before the loss is computed.
+    """
+    y = F.linear(x, weight, bias)  # (..., V)
+    return cross_entropy(
+        y, target, ignore_index=ignore_index, reduction=reduction, inplace_backward=inplace_backward
+    )
+def linear_cross_entropy_func_ref(
+    x: Tensor,  # (..., d)
+    weight: Tensor,  # (V, d)
+    bias: Optional[Tensor],  # (V,) or None
+    target: Tensor,  # (...,), int or long
+    ignore_index: int = -100,
+    reduction: Literal["none", "mean", "sum"] = "mean",
+) -> Tensor:
+    """Pure-PyTorch version: `F.linear` followed by `F.cross_entropy`.
+    Takes the same arguments as `linear_cross_entropy_func` except `inplace_backward`.
+    """
+    y = F.linear(x, weight, bias)  # (..., V)
+    return F.cross_entropy(y, target, ignore_index=ignore_index, reduction=reduction)
+def chunked_linear_cross_entropy_fwd(
+    x: Tensor,  # (B*L, d) where B is batch, L is seqlen
+    weight: Tensor,  # (V, d) where V is vocab size
+    target: Tensor,  # (B*L,)
+    chunk_size: int = 4096,
+    ignore_index: int = -100,
+    tuned: bool = True,
+) -> tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor]]:
+    """
+    Chunked forward pass for linear cross entropy.
+    Splits input along batch dimension, computes matmul and cross_entropy_fwd
+    for each chunk, stores dx for each chunk, and accumulates dw.
+    Only one (chunk_size, V) logits buffer is ever allocated; it is reused for
+    every chunk and also receives the logits gradient in place.
+    Args:
+        x: (B*L, d) input rows; B*L must be a multiple of 8.
+        weight: (V, d) projection weight.
+        target: (B*L,) class indices; must be 1-D so that it splits in lockstep with x.
+        chunk_size: rows processed per chunk; must be a multiple of 8.
+        ignore_index: forwarded to cross_entropy_fwd_out.
+        tuned: forwarded to the gemm helpers used for dw.
+    Returns:
+        loss: (B*L,) loss values
+        dx: (B*L, d) gradient w.r.t. input
+        dw: (V, d) gradient w.r.t. weight (accumulated across chunks except last),
+            float32; None when there is only a single chunk
+        last_dlogits_chunk: (chunk_len, V) gradient of last chunk's logits (for deferred dw computation)
+        last_x_chunk: (chunk_len, d) last chunk's input (for deferred dw computation)
+    """
+    B_L, d = x.shape
+    V, _ = weight.shape
+    device = x.device
+    # Ceil division: the last chunk may be shorter than chunk_size.
+    num_chunks = (B_L + chunk_size - 1) // chunk_size
+    # Since we use gemm with TMA we require some alignment
+    assert chunk_size % 8 == 0, "chunk_size must be multiple of 8"
+    assert B_L % 8 == 0
+    # Pre-allocate outputs
+    loss = torch.empty(B_L, device=device, dtype=torch.float32)
+    # Single scratch buffer shared by all chunks (logits, then overwritten by dlogits).
+    logits_chunk_preallocated = torch.empty((chunk_size, V), device=device, dtype=x.dtype)
+    dx = torch.empty_like(x)
+    # Last chunk of dw will be deferred to the backward pass
+    dw = torch.empty_like(weight, dtype=torch.float32) if num_chunks > 1 else None
+    last_dlogits_chunk = None
+    last_x_chunk = None
+    # Process in chunks
+    # loss and dx are split too, so each chunk writes straight into views of the outputs.
+    for i, (x_chunk, target_chunk, loss_chunk, dx_chunk) in enumerate(
+        zip(*(t.split(chunk_size) for t in (x, target, loss, dx)))
+    ):
+        chunk_len = x_chunk.shape[0]
+        logits_chunk = logits_chunk_preallocated[:chunk_len]  # (chunk_len, V)
+        torch.mm(x_chunk, weight.mT, out=logits_chunk)
+        # Compute cross entropy forward with gradients
+        # dlogits aliases the logits buffer, so the gradient replaces the logits in place.
+        dlogits_chunk = logits_chunk  # inplace_backward
+        cross_entropy_fwd_out(
+            logits_chunk,
+            target_chunk,
+            None,  # target_logit
+            loss=loss_chunk,
+            lse=None,  # we don't need lse here
+            dx=dlogits_chunk,
+            ignore_index=ignore_index,
+        )
+        # Compute dx for this chunk: dlogits @ weight
+        torch.mm(dlogits_chunk, weight, out=dx_chunk)  # (chunk_len, d)
+        # Compute dw for all chunks except the last
+        if i == num_chunks - 1:
+            # Last chunk: save for backward pass
+            # Safe to keep a view of the scratch buffer: no later chunk overwrites it.
+            last_dlogits_chunk = dlogits_chunk
+            last_x_chunk = x_chunk
+        elif i == 0:
+            # First chunk: dw = dlogits.T @ x_chunk
+            gemm(dlogits_chunk.T, x_chunk, out=dw, tuned=tuned)
+        else:
+            # Middle chunks: dw += dlogits.T @ x_chunk
+            gemm_add_inplace(dlogits_chunk.T, x_chunk, dw, tuned=tuned)
+    return loss, dx, dw, last_dlogits_chunk, last_x_chunk
+class ChunkedLinearCrossEntropyFunction(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd(device_type="cuda")
+    def forward(
+        ctx,
+        x: Tensor,
+        weight: Tensor,
+        target: Tensor,
+        ignore_index: int = -100,
+        reduction: Literal["mean", "sum"] = "mean",
+        chunk_size: int = 4096,
+        tuned: bool = True,
+    ):
+        """
+        Forward pass computes loss and stores dx and dw for backward.
+        """
+        ctx.weight_dtype = weight.dtype
+        x, weight = linear_fwd_convert_type(x, weight)
+        batch_shape = x.shape[:-1]
+        x = x.reshape(-1, x.shape[-1])
+        # TODO: don't need to compute bwd if neither x nor weight requires grad, or not training
+        loss, dx, dw, last_dlogits_chunk, last_x_chunk = chunked_linear_cross_entropy_fwd(
+            x, weight, target, chunk_size, ignore_index, tuned=tuned
+        )
+        loss_sum = loss.sum()
+        loss_scale = None if reduction == "sum" else 1.0 / (target != ignore_index).sum().float()
+        ctx.save_for_backward(dx, dw, last_dlogits_chunk, last_x_chunk, loss_scale)
+        ctx.batch_shape = batch_shape
+        ctx.ignore_index = ignore_index
+        ctx.reduction = reduction
+        ctx.tuned = tuned
+        return loss_sum if loss_scale is None else loss_sum * loss_scale
+    @staticmethod
+    @custom_bwd(device_type="cuda")
+    def backward(ctx, dloss):
+        """
+        Backward pass scales pre-computed gradients by dloss and completes
+        the last chunk's dw computation.
+        dloss is a scalar.
+        """
+        dx, dw, last_dlogits_chunk, last_x_chunk, loss_scale = ctx.saved_tensors
+        tuned = ctx.tuned
+        if loss_scale is not None:
+            dloss = dloss * loss_scale
+        # TODO: the case where x or weight doesn't require grad
+        dx.mul_(dloss)
+        dx = dx.reshape(*ctx.batch_shape, dx.shape[-1])
+        # Complete dw computation: dw = dloss * dw + dloss * (last_dlogits_chunk.T @ last_x_chunk)
+        if dw is None:
+            # Only had one chunk, compute dw directly with dloss scaling
+            dw = gemm(
+                last_dlogits_chunk.T,
+                last_x_chunk,
+                out_dtype=ctx.weight_dtype,
+                alpha=dloss,
+                tuned=tuned,
+            )
+        else:
+            # Add last chunk's contribution with dloss scaling
+            # dw = dloss * dw + dloss * (last_dlogits_chunk.T @ last_x_chunk)
+            # We use alpha=dloss, beta=dloss
+            if ctx.weight_dtype == dw.dtype:
+                gemm_add_inplace(
+                    last_dlogits_chunk.T, last_x_chunk, dw, alpha=dloss, beta=dloss, tuned=tuned
+                )
+            else:
+                dw = gemm_add(
+                    last_dlogits_chunk.T,
+                    last_x_chunk,
+                    dw,
+                    alpha=dloss,
+                    beta=dloss,
+                    out_dtype=ctx.weight_dtype,
+                    tuned=tuned,
+                )
+        return dx, dw, None, None, None, None, None
+def chunked_linear_cross_entropy(
+    x: Tensor,
+    weight: Tensor,
+    target: Tensor,
+    chunk_size: int = 4096,
+    ignore_index: int = -100,
+    reduction: Literal["mean", "sum"] = "mean",
+    tuned: bool = True,
+) -> Tensor:
+    """
+    Chunked linear cross entropy with automatic differentiation support.
+    Args:
+        x: Input tensor of shape (B*L, d)
+        weight: Weight tensor of shape (V, d)
+        target: Target indices of shape (B*L,)
+        chunk_size: Size of chunks to process
+        ignore_index: Index to ignore in loss computation
+        reduction: Type of reduction to apply
+        tuned: Whether to use tuned kernels
+    Returns:
+        Loss tensor with specified reduction
+    """
+    if reduction not in ["mean", "sum"]:
+        raise ValueError(f"Invalid reduction: {reduction}")
+    loss = ChunkedLinearCrossEntropyFunction.apply(
+        x, weight, target, ignore_index, reduction, chunk_size, tuned
+    )
+    return loss
+class LinearCrossEntropy(nn.Linear):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = False,
+        ignore_index: int = -100,
+        reduction: Literal["none", "mean", "sum"] = "mean",
+        chunk_size: Optional[int] = None,
+        inplace_backward: bool = False,
+        tuned: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        super().__init__(in_features, out_features, bias=bias, device=device, dtype=dtype)
+        self.ignore_index = ignore_index
+        self.reduction = reduction
+        self.chunk_size = chunk_size
+        self.inplace_backward = inplace_backward
+        self.tuned = tuned
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        if (
+            self.bias is None
+            and input.is_cuda
+            and input.stride(-1) == 1
+            and self.in_features % 8 == 0
+            and self.out_features % 8 == 0
+            and input.shape[:-1].numel() % 8 == 0
+            and self.chunk_size is not None
+            and self.chunk_size % 8 == 0
+            and self.reduction in ["mean", "sum"]
+        ):
+            return chunked_linear_cross_entropy(
+                input,
+                self.weight,
+                target,
+                chunk_size=self.chunk_size,
+                ignore_index=self.ignore_index,
+                reduction=self.reduction,
+                tuned=self.tuned,
+            )
+        else:
+            return linear_cross_entropy_func(
+                input,
+                self.weight,
+                self.bias,
+                target,
+                ignore_index=self.ignore_index,
+                reduction=self.reduction,
+                inplace_backward=self.inplace_backward,
+            )

build/torch-cuda/quack/mlp.py ADDED Viewed

	@@ -0,0 +1,331 @@

+# Copyright (c) 2025, Tri Dao
+from typing import Literal
+from functools import partial
+import torch
+import torch.nn as nn
+from torch import Tensor
+from einops import rearrange
+from .linear import linear_act_func, act_linear_func
+from .linear import linear_gated_func, gated_linear_func
+from .linear import linear_fwd_convert_type
+from .linear import _recompute_act_postact, _recompute_gated_postact
+from .activation import gate_fn_map
+from .gemm_interface import (
+    act_to_pytorch_fn_map,
+    gated_to_pytorch_fn_map,
+    gemm,
+    gemm_add_inplace,
+    gemm_gated,
+    gemm_dgated,
+    gemm_act,
+    gemm_dact,
+)
+# Activation names accepted by the MLP constructor's type hint. Whether a name
+# is treated as gated is decided at runtime via membership in gate_fn_map.
+Activation = Literal[
+    "gelu_tanh_approx",
+    "relu",
+    "relu_sq",
+    "swiglu",
+    "swiglu_oai",
+    "reglu",
+    "geglu",
+    "glu",
+]
+# --- Ops bundles for MLP recompute variants ---
+class _MLPOps:
+    """Tuned, non-gated op bundle consumed by MLPRecomputeFunc.
+    The class object itself is passed around (never instantiated), so the
+    attributes are looked up on the class and called as plain functions.
+    """
+    matmul_fwd = gemm
+    matmul_fwd_act = gemm_act
+    # All backward GEMMs opt into dynamic_scheduler=True.
+    matmul_bwd_dact = partial(gemm_dact, dynamic_scheduler=True)
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True)
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True)
+    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)
+    # Used in backward when only dW2 is needed.
+    recompute_postact = staticmethod(_recompute_act_postact)
+class _MLPUntunedOps:
+    """Same op bundle as _MLPOps, with every GEMM bound to tuned=False."""
+    matmul_fwd = partial(gemm, tuned=False)
+    matmul_fwd_act = partial(gemm_act, tuned=False)
+    matmul_bwd_dact = partial(gemm_dact, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True, tuned=False)
+    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True, tuned=False)
+    recompute_postact = staticmethod(_recompute_act_postact)
+class _MLPGatedOps(_MLPOps):
+    """Tuned bundle for gated activations: overrides only the activation-aware ops."""
+    matmul_fwd_act = gemm_gated
+    matmul_bwd_dact = partial(gemm_dgated, dynamic_scheduler=True)
+    recompute_postact = staticmethod(_recompute_gated_postact)
+class _MLPGatedUntunedOps(_MLPUntunedOps):
+    """Untuned (tuned=False) counterpart of _MLPGatedOps."""
+    matmul_fwd_act = partial(gemm_gated, tuned=False)
+    matmul_bwd_dact = partial(gemm_dgated, dynamic_scheduler=True, tuned=False)
+    recompute_postact = staticmethod(_recompute_gated_postact)
+class _MLPGatedConcatOps(_MLPGatedOps):
+    """Tuned gated bundle for the concat weight layout.
+    Adds dW1-specific ops and a recompute_fwd override; MLPRecomputeFunc.backward
+    picks these up via getattr and falls back to the generic ops otherwise.
+    """
+    matmul_fwd_act = partial(gemm_gated, concat_layout=("B",))
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, concat_layout=("B",))
+    matmul_bwd_dw1 = partial(gemm, dynamic_scheduler=True, concat_layout=("out",))
+    # In-place accumulation: both the accumulator input and the output are tagged.
+    matmul_bwd_dw1_inplace = partial(
+        gemm_add_inplace, dynamic_scheduler=True, concat_layout=("C", "out")
+    )
+    recompute_fwd = partial(gemm, concat_layout=("B",))
+class _MLPGatedConcatUntunedOps(_MLPGatedUntunedOps):
+    matmul_fwd_act = partial(gemm_gated, tuned=False, concat_layout=("B",))
+    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True, tuned=False, concat_layout=("B",))
+    matmul_bwd_dw1 = partial(gemm, dynamic_scheduler=True, tuned=False, concat_layout=("out",))
+    matmul_bwd_dw1_inplace = partial(
+        gemm_add_inplace, dynamic_scheduler=True, tuned=False, concat_layout=("out",)
+    )
+    recompute_fwd = partial(gemm, tuned=False, concat_layout=("B",))
+class MLPRecomputeFunc(torch.autograd.Function):
+    """MLP with activation recomputation: saves only x (not preact) to reduce memory.
+    In backward, recomputes preact = x @ W1.T (one extra matmul) instead of loading it
+    from saved tensors. This trades compute for memory:
+      - Saves: batch * 2 * hidden * dtype_size bytes of activation memory
+      - Costs: one extra GEMM (x @ W1.T) during backward
+    Ops class selects between non-gated (gemm_act/gemm_dact) and gated (gemm_gated/gemm_dgated)
+    variants, as well as tuned/untuned.
+    Biases are not supported: forward takes only x and the two weights.
+    """
+    @staticmethod
+    def forward(ctx, x, weight1, weight2, activation, fuse_grad_accum, ops):
+        """Compute act(x @ W1.T) @ W2.T, saving only x and the weights for backward.
+        Args:
+            x: (..., in_features) input; leading dims are flattened for the GEMMs.
+            weight1: first-layer weight, used as weight1.T.
+            weight2: second-layer weight, used as weight2.T.
+            activation: activation name forwarded to ops.matmul_fwd_act.
+            fuse_grad_accum: if True, backward may accumulate into existing .grad buffers.
+            ops: one of the op-bundle classes defined in this module.
+        Returns:
+            Output with x's leading dims and the second layer's output width.
+        """
+        x, weight1, weight2 = linear_fwd_convert_type(x, weight1, weight2)
+        with torch.amp.autocast("cuda", enabled=False):
+            ctx.weight_dtype = weight1.dtype
+            ctx.fuse_grad_accum = fuse_grad_accum
+            ctx.activation = activation
+            ctx.ops = ops
+            # NOTE(review): these are captured after linear_fwd_convert_type; if that
+            # helper returns cast copies under autocast, weight*_og would not be the
+            # original parameters (and weight_dtype would be the cast dtype) — confirm.
+            weight1_og, weight2_og = weight1, weight2
+            batch_shape = x.shape[:-1]
+            x_flat = x.reshape(-1, x.shape[-1])
+            # preact is intentionally discarded; it is recomputed in backward.
+            _preact, postact = ops.matmul_fwd_act(x_flat, weight1.T, activation=activation)
+            out = ops.matmul_fwd(postact, weight2.T)
+            # Save only x and weights — no preact (the whole point of recompute)
+            needs_input_grad = ctx.needs_input_grad
+            any_grad = needs_input_grad[0] or needs_input_grad[1] or needs_input_grad[2]
+            need_dact = needs_input_grad[0] or needs_input_grad[1]  # gemm_dact for dpreact
+            saved_x = x if any_grad else None  # recompute preact = x @ W1.T
+            saved_w1 = weight1 if any_grad else None  # recompute + dx
+            saved_w2 = weight2 if need_dact else None  # only gemm_dact needs W2
+            # The *_og entries are only needed to reach .grad for fused accumulation.
+            ctx.save_for_backward(
+                saved_x,
+                saved_w1,
+                saved_w2,
+                weight1_og if fuse_grad_accum else None,
+                weight2_og if fuse_grad_accum else None,
+            )
+            return out.reshape(*batch_shape, out.shape[-1])
+    @staticmethod
+    def backward(ctx, dout):
+        """Recompute preact, then produce dx, dW1 and dW2 as requested.
+        Returns one entry per forward input:
+        (dx, dweight1, dweight2, None, None, None).
+        """
+        with torch.amp.autocast("cuda", enabled=False):
+            ops = ctx.ops
+            x, weight1, weight2, weight1_og, weight2_og = ctx.saved_tensors
+            batch_shape = dout.shape[:-1]
+            dout = dout.reshape(-1, dout.shape[-1]).contiguous()
+            # Recompute preact = x @ W1.T (the extra matmul we trade for memory)
+            x_flat = x.reshape(-1, x.shape[-1]) if x is not None else None
+            need_dact = ctx.needs_input_grad[0] or ctx.needs_input_grad[1]
+            any_grad = need_dact or ctx.needs_input_grad[2]
+            # concat ops override recompute_fwd to produce interleaved preact matching forward
+            recompute_fwd = getattr(ops, "recompute_fwd", ops.matmul_fwd)
+            if need_dact:
+                preact = recompute_fwd(x_flat, weight1.T)
+                # gemm_dact computes: dpreact = d_act(dout @ W2, preact) AND recomputes postact
+                dpreact, postact = ops.matmul_bwd_dact(
+                    dout, weight2, preact, activation=ctx.activation
+                )
+            elif any_grad:
+                # Only dW2 needed: recompute postact from preact cheaply (no gemm_dact)
+                preact = recompute_fwd(x_flat, weight1.T)
+                postact = ops.recompute_postact(preact, ctx.activation)
+                dpreact = None
+            else:
+                # Nothing requires grad: both weight-grad helpers below return None.
+                dpreact, postact = None, None
+            # dW2 = dout.T @ postact
+            dweight2 = _compute_weight_grad(
+                ctx,
+                dout,
+                postact,
+                weight2_og,
+                ops.matmul_bwd_dw,
+                ops.matmul_bwd_dw_inplace,
+                ctx.needs_input_grad[2],
+            )
+            # dx = dpreact @ W1
+            if ctx.needs_input_grad[0]:
+                dx = ops.matmul_bwd_dx(dpreact, weight1)
+                dx = dx.reshape(*batch_shape, dx.shape[-1])
+            else:
+                dx = None
+            # dW1 = dpreact.T @ x (use dw1 ops if available, e.g. concat layout)
+            dw1_fn = getattr(ops, "matmul_bwd_dw1", ops.matmul_bwd_dw)
+            dw1_inplace_fn = getattr(ops, "matmul_bwd_dw1_inplace", ops.matmul_bwd_dw_inplace)
+            dweight1 = _compute_weight_grad(
+                ctx,
+                dpreact,
+                x_flat,
+                weight1_og,
+                dw1_fn,
+                dw1_inplace_fn,
+                ctx.needs_input_grad[1],
+            )
+            return dx, dweight1, dweight2, None, None, None
+def _compute_weight_grad(ctx, dout, x, weight_og, matmul_fn, matmul_inplace_fn, needs_grad):
+    if not needs_grad:
+        return None
+    x = x.reshape(-1, x.shape[-1])
+    if not ctx.fuse_grad_accum or weight_og.grad is None or torch.compiler.is_compiling():
+        return matmul_fn(dout.T, x, out_dtype=ctx.weight_dtype)
+    else:
+        matmul_inplace_fn(dout.T, x, weight_og.grad)
+        dweight = weight_og.grad
+        weight_og.grad = None
+        return dweight
+def mlp_func(
+    x,
+    weight1,
+    weight2,
+    activation: str,
+    bias1=None,
+    bias2=None,
+    fuse_grad_accum=False,
+    tuned=True,
+    recompute=False,
+    concat_layout=False,
+):
+    gated = activation in gate_fn_map
+    if concat_layout:
+        assert gated, "concat_layout is only supported for gated MLP"
+    if recompute:
+        if concat_layout:
+            ops = _MLPGatedConcatOps if tuned else _MLPGatedConcatUntunedOps
+        elif gated:
+            ops = _MLPGatedOps if tuned else _MLPGatedUntunedOps
+        else:
+            ops = _MLPOps if tuned else _MLPUntunedOps
+        return MLPRecomputeFunc.apply(x, weight1, weight2, activation, fuse_grad_accum, ops)
+    fc1_fn = linear_gated_func if gated else linear_act_func
+    fc2_fn = gated_linear_func if gated else act_linear_func
+    preact, postact = fc1_fn(
+        x,
+        weight1,
+        activation,
+        bias=bias1,
+        store_preact=torch.is_grad_enabled(),
+        fuse_grad_accum=fuse_grad_accum,
+        tuned=tuned,
+        **({"concat_layout": concat_layout} if concat_layout and gated else {}),
+    )
+    out = fc2_fn(
+        preact,
+        weight2,
+        postact,
+        activation=activation,
+        bias=bias2,
+        fuse_grad_accum=fuse_grad_accum,
+        tuned=tuned,
+    )
+    return out
+class MLP(nn.Module):
+    """Two-layer MLP (fc1 -> activation -> fc2) with a fused fast path.
+    For gated activations fc1 produces 2 * hidden_features outputs. forward()
+    uses mlp_func when its preconditions hold and a plain PyTorch
+    implementation otherwise.
+    """
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        bias1=False,
+        bias2=False,
+        activation: Activation = "gelu_tanh_approx",
+        multiple_of=1,
+        device=None,
+        dtype=None,
+        fuse_grad_accum: bool = False,
+        tuned: bool = True,
+        recompute: bool = False,
+        concat_layout: bool = False,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        out_features = out_features if out_features is not None else in_features
+        self.activation = activation
+        self.gated = activation in gate_fn_map
+        assert not concat_layout or self.gated, "concat_layout is only supported for gated MLP"
+        # Default hidden width: 8/3 * in_features (truncated) when gated, else 4 * in_features.
+        if hidden_features is None:
+            hidden_features = int(8 / 3 * in_features) if self.gated else 4 * in_features
+        # Round hidden_features up to the next multiple of multiple_of.
+        if multiple_of > 1:
+            hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
+        fc1_out = 2 * hidden_features if self.gated else hidden_features
+        self.fc1 = nn.Linear(in_features, fc1_out, bias=bias1, **factory_kwargs)
+        if self.gated:
+            # Pair of (split, merge) callables attached to the fc1 weight; they view
+            # the weight as its two halves according to the layout in use.
+            # Presumably consumed by a Muon optimizer implementation — not used in
+            # this file; confirm against the consumer.
+            if concat_layout:
+                # Concat layout: rows [0, d) and [d, 2d) are the two halves.
+                self.fc1.weight._muon_reshape_functions = (
+                    lambda w: rearrange(w, "(two d) e -> two d e", two=2),
+                    lambda w: rearrange(w, "two d e -> (two d) e"),
+                )
+            else:
+                # Interleaved layout: even rows and odd rows are the two halves.
+                self.fc1.weight._muon_reshape_functions = (
+                    lambda w: rearrange(w, "(d two) e -> two d e", two=2),
+                    lambda w: rearrange(w, "two d e -> (d two) e"),
+                )
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
+        self.fuse_grad_accum = fuse_grad_accum
+        self.tuned = tuned
+        self.recompute = recompute
+        self.concat_layout = concat_layout
+    def forward(self, input: Tensor) -> Tensor:
+        """Run the MLP, via mlp_func when eligible and plain PyTorch ops otherwise."""
+        # Allow bias in the fused path during inference (fwd-only, no bwd).
+        bias_ok = not torch.is_grad_enabled() or (self.fc1.bias is None and self.fc2.bias is None)
+        if (
+            bias_ok
+            and input.is_cuda
+            and input.stride(-1) == 1
+            and self.fc1.in_features % 8 == 0
+            and self.fc1.out_features % (16 if self.gated else 8) == 0
+            and self.fc2.out_features % 8 == 0
+        ):
+            return mlp_func(
+                input,
+                self.fc1.weight,
+                self.fc2.weight,
+                activation=self.activation,
+                bias1=self.fc1.bias,
+                bias2=self.fc2.bias,
+                fuse_grad_accum=self.fuse_grad_accum,
+                tuned=self.tuned,
+                recompute=self.recompute,
+                concat_layout=self.concat_layout,
+            )
+        else:
+            # Reference path built from standard PyTorch ops.
+            y = self.fc1(input)
+            if self.gated:
+                if self.concat_layout:
+                    # Concat layout: first half of the features is gate, second half is up.
+                    gate, up = y.chunk(2, dim=-1)
+                    y = gated_to_pytorch_fn_map[self.activation](gate, up)
+                else:
+                    # Interleaved layout: even features are the first operand, odd the second.
+                    y = gated_to_pytorch_fn_map[self.activation](y[..., ::2], y[..., 1::2])
+            else:
+                y = act_to_pytorch_fn_map[self.activation](y)
+            return self.fc2(y)

build/torch-cuda/quack/mx_utils.py ADDED Viewed

	@@ -0,0 +1,269 @@

+"""Minimal MX / NVFP4 quantization + scale swizzling utilities.
+Ported from torchao (BSD-3) to avoid the runtime dependency:
+  torchao/prototype/mx_formats/{mx_tensor, nvfp4_tensor, utils, constants}.py
+  torchao/prototype/custom_fp_utils.py
+  torchao/prototype/mx_formats/kernels.py
+All quantizers are pure-PyTorch. Use the `to_mx_compiled` / `to_mxfp4_compiled` /
+`to_nvfp4_compiled` module-level handles if you want torch.compile-generated
+Triton kernels (much faster on big tensors; one-time compile overhead).
+Only the FLOOR scaling mode is ported (torchao's default for MX formats).
+"""
+import torch
+# --- FP8 E4M3 constants ---
+F8E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0
+F8E4M3_MAX_POW2 = 8  # floor(log2(448))
+# --- E8M0 (exponent-only scale) constants ---
+E8M0_EXPONENT_BIAS = 127
+E8M0_EXPONENT_NAN_VAL = 255  # biased-exponent byte used as the NaN sentinel
+# --- FP32 layout constants ---
+F32_EXP_BIAS = 127
+F32_MIN_NORMAL = 2 ** (-F32_EXP_BIAS + 1)  # 2**-126
+MBITS_F32 = 23
+EBITS_F32 = 8
+# FP4 E2M1 constants
+F4_E2M1_MAX = 6.0
+F4_E2M1_MAX_POW2 = 2  # floor(log2(6))
+F4_E2M1_MAX_INT = 7  # 3-bit magnitude mask
+EBITS_F4_E2M1, MBITS_F4_E2M1 = 2, 1
+# NOTE: despite the name, this is finfo.tiny (smallest positive normal), not finfo.eps.
+E4M3_EPS = torch.finfo(torch.float8_e4m3fn).tiny
+def _n_ones(n: int) -> int:
+    return (1 << n) - 1
+def to_mx(data_hp: torch.Tensor, block_size: int = 32):
+    """MXFP8-e4m3 quantization with FLOOR scaling.
+    Each run of block_size elements along the last dim shares one power-of-two
+    scale derived from the exponent of the block's max absolute value.
+    Args:
+        data_hp: (..., K) bf16 or fp32 tensor, contiguous, K % block_size == 0.
+        block_size: number of elements sharing one scale.
+    Returns:
+        qdata: (..., K) float8_e4m3fn
+        scale: (..., K // block_size) float8_e8m0fnu
+    """
+    assert data_hp.dtype in (torch.bfloat16, torch.float32)
+    assert data_hp.shape[-1] % block_size == 0
+    assert data_hp.is_contiguous()
+    orig_shape = data_hp.shape
+    # View as (..., num_blocks, block_size) so reductions run per block.
+    data_hp = data_hp.reshape(*orig_shape[:-1], orig_shape[-1] // block_size, block_size)
+    max_abs = torch.amax(torch.abs(data_hp), -1).unsqueeze(-1)
+    data_hp = data_hp.to(torch.float32)
+    max_abs = max_abs.to(torch.float32)
+    # NOTE: the scale computation below is the same sequence as
+    # _compute_e8m0_scale_floor(max_abs, F8E4M3_MAX_POW2) defined later in this module.
+    # FLOOR scaling: extract biased exponent of max_abs via bit-shift
+    max_abs_int32 = max_abs.view(torch.int32)
+    extracted_pow2 = ((torch.bitwise_right_shift(max_abs_int32, MBITS_F32)) & 0xFF) - F32_EXP_BIAS
+    # Shift so the block max lands at the top exponent of e4m3.
+    scale_e8m0_unbiased = extracted_pow2 - F8E4M3_MAX_POW2
+    scale_e8m0_unbiased = torch.clamp(
+        scale_e8m0_unbiased, min=-E8M0_EXPONENT_BIAS, max=E8M0_EXPONENT_BIAS + 1
+    )
+    scale_e8m0_biased = (scale_e8m0_unbiased + E8M0_EXPONENT_BIAS).to(torch.uint8)
+    # restore NaN sentinel (uint8 cast drops NaN)
+    scale_e8m0_biased = torch.where(torch.isnan(max_abs), E8M0_EXPONENT_NAN_VAL, scale_e8m0_biased)
+    # reconstruct fp32 scale from biased exponent
+    scale_fp32 = (torch.bitwise_left_shift(scale_e8m0_biased.to(torch.int32), MBITS_F32)).view(
+        torch.float32
+    )
+    # avoid 2**-127 being flushed to 0 (pytorch #125557)
+    scale_fp32 = torch.clamp(scale_fp32, min=F32_MIN_NORMAL)
+    data_lp = data_hp / scale_fp32
+    # eager fp8 cast is unsaturated; clamp explicitly
+    # (to_blocked in this module uses torch.compiler.is_compiling() for the same kind of check.)
+    if not torch._dynamo.is_compiling():
+        data_lp = torch.clamp(data_lp, min=-F8E4M3_MAX, max=F8E4M3_MAX)
+    qdata = data_lp.to(torch.float8_e4m3fn).reshape(orig_shape)
+    # Reinterpret the biased-exponent bytes as e8m0 and drop the keepdim axis.
+    scale = scale_e8m0_biased.view(torch.float8_e8m0fnu).squeeze(-1)
+    return qdata, scale
+def _f32_to_floatx_unpacked(x: torch.Tensor, ebits: int, mbits: int) -> torch.Tensor:
+    """FP32 -> sub-byte float (uint8, code in low bits). Verbatim from torchao.
+    Round-to-nearest-even via magic-adder; saturation on overflow; no NaN.
+    Args:
+        x: float32 tensor.
+        ebits: exponent bits of the target format.
+        mbits: mantissa bits of the target format (1 + ebits + mbits <= 8).
+    Returns:
+        uint8 tensor of the same shape holding sign|exponent|mantissa in the low bits.
+    """
+    assert x.dtype == torch.float
+    assert 1 + ebits + mbits <= 8
+    # Derived constants of the target format.
+    exp_bias = _n_ones(ebits - 1)
+    max_int = _n_ones(ebits + mbits)  # all magnitude bits set: the saturated code
+    sign_mask = 1 << (ebits + mbits)
+    magic_adder = _n_ones(MBITS_F32 - mbits - 1)
+    max_normal = 2 ** (_n_ones(ebits) - exp_bias) * (_n_ones(mbits + 1) / (2**mbits))
+    min_normal = 2 ** (1 - exp_bias)
+    denorm_exp = (F32_EXP_BIAS - exp_bias) + (MBITS_F32 - mbits) + 1
+    denorm_mask_int = denorm_exp << MBITS_F32
+    denorm_mask_float = torch.tensor(denorm_mask_int, dtype=torch.int32).view(torch.float32)
+    # Strip the sign bit; x becomes |x| in a new tensor (the caller's input is not modified).
+    x = x.view(torch.int32)
+    sign = x & 0x80000000
+    x = x ^ sign
+    x = x.view(torch.float)
+    # Classify every element as saturating, denormal or normal in the target format.
+    saturate_mask = x >= max_normal
+    denormal_mask = torch.logical_and(torch.logical_not(saturate_mask), x < min_normal)
+    normal_mask = torch.logical_not(torch.logical_or(saturate_mask, denormal_mask))
+    # Denormal path: add a float constant, then subtract its integer bit pattern.
+    denormal_x = x + denorm_mask_float
+    denormal_x = denormal_x.view(torch.int32)
+    denormal_x -= denorm_mask_int
+    denormal_x = denormal_x.to(torch.uint8)
+    # Normal path. normal_x is a view sharing storage with x, so the in-place adds
+    # below also overwrite x; the masks and denormal_x were computed beforehand.
+    normal_x = x.view(torch.int32)
+    mant_odd = (normal_x >> (MBITS_F32 - mbits)) & 1
+    # Re-bias the exponent and add the rounding constant in one integer add.
+    val_to_add = ((exp_bias - F32_EXP_BIAS) << MBITS_F32) + magic_adder
+    normal_x += val_to_add
+    normal_x += mant_odd
+    normal_x = normal_x >> (MBITS_F32 - mbits)
+    normal_x = normal_x.to(torch.uint8)
+    # Start from the saturated code everywhere, then fill in denormal/normal results.
+    x = torch.full_like(x, max_int, dtype=torch.uint8)
+    x = torch.where(denormal_mask, denormal_x, x)
+    x = torch.where(normal_mask, normal_x, x)
+    # Move the fp32 sign bit down to the target sign position and merge it in.
+    sign_lp = sign >> (MBITS_F32 + EBITS_F32 - mbits - ebits)
+    sign_lp = sign_lp.to(torch.uint8)
+    sign_lp = sign_lp & sign_mask
+    x = x | sign_lp
+    return x.to(torch.uint8)
+def _pack_uint4(uint8_data: torch.Tensor) -> torch.Tensor:
+    """Pack 4-bit uint8 values in pairs: pair (a,b) -> byte (b<<4 | a)."""
+    shape = uint8_data.shape
+    assert shape[-1] % 2 == 0
+    uint8_data = uint8_data.contiguous().view(-1)
+    return (uint8_data[::2] | uint8_data[1::2] << 4).view(*shape[:-1], shape[-1] // 2)
+def _compute_e8m0_scale_floor(max_abs: torch.Tensor, target_max_pow2: int) -> torch.Tensor:
+    """Return biased E8M0 scale (uint8) for FLOOR-mode MX quantization.
+    Args:
+        max_abs: per-block max absolute values; viewed as int32 below, so it must be
+            a 4-byte dtype (callers in this module pass float32).
+        target_max_pow2: power-of-two exponent of the target format's max value.
+    """
+    # Extract the biased fp32 exponent field and unbias it.
+    max_abs_int32 = max_abs.view(torch.int32)
+    extracted_pow2 = ((torch.bitwise_right_shift(max_abs_int32, MBITS_F32)) & 0xFF) - F32_EXP_BIAS
+    scale_unbiased = extracted_pow2 - target_max_pow2
+    scale_unbiased = torch.clamp(
+        scale_unbiased, min=-E8M0_EXPONENT_BIAS, max=E8M0_EXPONENT_BIAS + 1
+    )
+    scale_biased = (scale_unbiased + E8M0_EXPONENT_BIAS).to(torch.uint8)
+    # Blocks whose max is NaN get the E8M0 NaN sentinel.
+    scale_biased = torch.where(torch.isnan(max_abs), E8M0_EXPONENT_NAN_VAL, scale_biased)
+    return scale_biased
+def to_mxfp4(x: torch.Tensor, block_size: int = 32):
+    """MXFP4 quantization: E2M1 data + E8M0 per-block scales, FLOOR scaling.
+    Args:
+        x: (..., K) bf16/fp16/fp32, contiguous, K % block_size == 0.
+        block_size: number of elements sharing one scale.
+    Returns:
+        qdata_packed: uint8, shape (..., K // 2). Two FP4 values per byte
+                      (first -> low nibble, second -> high nibble).
+        scale: float8_e8m0fnu, shape (..., K // block_size).
+    """
+    assert x.dtype in (torch.bfloat16, torch.float16, torch.float32)
+    assert x.shape[-1] % block_size == 0
+    assert x.is_contiguous()
+    orig_shape = x.shape
+    # View as (..., num_blocks, block_size) so reductions run per block.
+    data_hp = x.reshape(*orig_shape[:-1], orig_shape[-1] // block_size, block_size)
+    max_abs = torch.amax(torch.abs(data_hp), -1).unsqueeze(-1)
+    data_hp = data_hp.to(torch.float32)
+    max_abs = max_abs.to(torch.float32)
+    scale_biased = _compute_e8m0_scale_floor(max_abs, F4_E2M1_MAX_POW2)
+    # Rebuild the fp32 scale by placing the biased exponent in the exponent field.
+    scale_fp32 = (torch.bitwise_left_shift(scale_biased.to(torch.int32), MBITS_F32)).view(
+        torch.float32
+    )
+    # Same guard as in to_mx: keep the scale at or above the smallest normal fp32.
+    scale_fp32 = torch.clamp(scale_fp32, min=F32_MIN_NORMAL)
+    data_lp = data_hp / scale_fp32
+    data_lp = data_lp.reshape(orig_shape)
+    # Encode to 4-bit E2M1 codes (one per uint8), then pack two codes per byte.
+    data_lp = _f32_to_floatx_unpacked(data_lp.float(), EBITS_F4_E2M1, MBITS_F4_E2M1)
+    data_lp = _pack_uint4(data_lp)
+    scale = scale_biased.view(torch.float8_e8m0fnu).squeeze(-1)
+    return data_lp, scale
+def nvfp4_per_tensor_scale(amax: torch.Tensor) -> torch.Tensor:
+    """NVFP4 per-tensor scale: amax / (F8E4M3_MAX * F4_E2M1_MAX) = amax / 2688."""
+    return amax.to(torch.float32) / (F8E4M3_MAX * F4_E2M1_MAX)
+def to_nvfp4(x: torch.Tensor, block_size: int = 16, per_tensor_scale=None):
+    """NVFP4 quantization: E2M1 data + E4M3 per-block scales + optional fp32 per-tensor scale.
+    Args:
+        x: (..., K) bf16/fp32, contiguous, K % 16 == 0.
+        block_size: must be 16.
+        per_tensor_scale: scalar fp32 tensor, or None (uses 1.0 / returns unit).
+    Returns:
+        qdata_packed: uint8, shape (..., K // 2)
+        scale: float8_e4m3fn, shape (..., K // 16)
+        per_tensor_scale: scalar fp32 tensor (1.0 if None was passed)
+    """
+    assert x.dtype in (torch.bfloat16, torch.float32)
+    assert x.shape[-1] % block_size == 0
+    assert x.is_contiguous()
+    assert block_size == 16, "NVFP4 requires block_size=16"
+    orig_shape = x.shape
+    # View as (..., num_blocks, block_size) in fp32 so reductions run per block.
+    data_hp = x.float().reshape(*orig_shape[:-1], orig_shape[-1] // block_size, block_size)
+    max_abs = torch.amax(torch.abs(data_hp), dim=-1)
+    # Per-block scale that maps the block max onto the largest E2M1 value.
+    block_scale = max_abs / F4_E2M1_MAX
+    if per_tensor_scale is None:
+        block_scale_fp8 = torch.clamp(block_scale, min=E4M3_EPS, max=F8E4M3_MAX).to(
+            torch.float8_e4m3fn
+        )
+        # Use the rounded fp8 scale (not the exact one) for the reciprocal.
+        recip = 1.0 / block_scale_fp8.to(torch.float32)
+        returned_pts = torch.tensor(1.0, dtype=torch.float32, device=x.device)
+    else:
+        # Two-level scaling: block scales are stored relative to the per-tensor scale.
+        scaled = block_scale.to(torch.float32) / per_tensor_scale
+        block_scale_fp8 = torch.clamp(scaled, min=E4M3_EPS, max=F8E4M3_MAX).to(torch.float8_e4m3fn)
+        recip = (1.0 / per_tensor_scale) / block_scale_fp8.to(torch.float32)
+        returned_pts = per_tensor_scale.to(torch.float32)
+    data_scaled = data_hp * recip.unsqueeze(-1)
+    # Clamp into the E2M1 range before encoding.
+    data_scaled = torch.clamp(data_scaled, -F4_E2M1_MAX, F4_E2M1_MAX)
+    data_scaled = data_scaled.view(orig_shape)
+    # Encode to 4-bit E2M1 codes (one per uint8), then pack two codes per byte.
+    data_lp = _f32_to_floatx_unpacked(data_scaled.float(), EBITS_F4_E2M1, MBITS_F4_E2M1)
+    data_lp = _pack_uint4(data_lp)
+    return data_lp, block_scale_fp8, returned_pts
+# ---------------------------------------------------------------------------
+# torch.compile-wrapped fast paths. Generates fused Triton quant kernels via
+# Inductor. dynamic=True avoids recompilation on shape changes.
+# The eager implementations stay available under their un-suffixed names
+# (to_mx / to_mxfp4 / to_nvfp4); these bindings only add compiled variants.
+# ---------------------------------------------------------------------------
+to_mx_compiled = torch.compile(to_mx, dynamic=True)
+to_mxfp4_compiled = torch.compile(to_mxfp4, dynamic=True)
+to_nvfp4_compiled = torch.compile(to_nvfp4, dynamic=True)
+def _ceil_div(a, b):
+    """Return ``a`` divided by ``b``, rounded up (for positive ``b``)."""
+    # Bias the numerator by (b - 1) so that floor division rounds up.
+    biased = a + b - 1
+    return biased // b
+def to_blocked(input_matrix: torch.Tensor) -> torch.Tensor:
+    """Rearrange a 2-D (H, W) e8m0 scale tensor into the 128x4 blocked layout
+    that cuBLAS expects for MXFP8 _scaled_mm.
+    The input is zero-padded up to multiples of 128 rows and 4 columns when
+    needed (and always while torch.compile is tracing). The result is a flat
+    1-D tensor with 32*ceil(H/128) * 16*ceil(W/4) elements.
+    """
+    num_rows, num_cols = input_matrix.shape
+    row_tiles = _ceil_div(num_rows, 128)
+    col_tiles = _ceil_div(num_cols, 4)
+    full_shape = (row_tiles * 128, col_tiles * 4)
+    # Keep is_compiling() first: it short-circuits the shape comparison while tracing.
+    if torch.compiler.is_compiling() or (num_rows, num_cols) != full_shape:
+        source = torch.zeros(
+            full_shape,
+            device=input_matrix.device,
+            dtype=input_matrix.dtype,
+        )
+        source[:num_rows, :num_cols] = input_matrix
+    else:
+        source = input_matrix
+    # Split into (row_tile, 128, col_tile, 4) and bring the two tile indices to the front.
+    tiles = source.view(row_tiles, 128, col_tiles, 4).permute(0, 2, 1, 3)
+    # Within each 128x4 tile: view as 4 groups of 32 rows, then interleave into 32x16.
+    swizzled = tiles.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
+    return swizzled.flatten()

build/torch-cuda/quack/nvmmh_heuristic.py ADDED Viewed

	@@ -0,0 +1,172 @@

+# Copyright (c) 2025, Tri Dao.
+"""nvMatmulHeuristics-based config selection for GEMM.
+Queries NVIDIA's analytic heuristic library to pick tile/cluster dims based on
+problem shape, then selects swap_ab by comparing estimated runtimes for both
+orientations.
+"""
+import logging
+import torch
+from .gemm_config import GemmConfig
+logger = logging.getLogger(__name__)
+_nvmmh_available = None
+_iface = None
+_hw_descriptors = {}  # gpu_enum -> hw descriptor
+def _get_iface():
+    """Return the shared nvMatmulHeuristics interface, creating it on first use.
+    The outcome of the first attempt is remembered in ``_nvmmh_available``;
+    later calls return the cached ``_iface``, which is ``None`` when importing
+    or constructing the interface failed.
+    """
+    global _nvmmh_available, _iface
+    if _nvmmh_available is None:
+        try:
+            from nvMatmulHeuristics import (
+                NvMatmulHeuristicsInterface,
+                NvMatmulHeuristicsTarget,
+            )
+            _iface = NvMatmulHeuristicsInterface(
+                backend=NvMatmulHeuristicsTarget.CUTLASS3,
+                precision="BSB",  # overridden per-call
+            )
+        except Exception as e:
+            # Best effort: any failure just disables the heuristic path.
+            logger.debug(f"nvMatmulHeuristics not available: {e}")
+            _nvmmh_available = False
+            _iface = None
+        else:
+            _nvmmh_available = True
+    return _iface
+def _get_hw(device_capacity):
+    """Return the cached nvMMH hardware descriptor for ``device_capacity``.
+    Only capacities 9 (mapped to H100_SXM) and 10 (mapped to B200) are known.
+    Returns ``None`` when the interface is unavailable, the capacity is not
+    mapped, or descriptor creation fails; a creation failure is cached as ``None``.
+    """
+    try:
+        return _hw_descriptors[device_capacity]
+    except KeyError:
+        pass  # not created yet; build it below
+    try:
+        from nvMatmulHeuristics import (
+            NvMatmulHeuristicsNvidiaGpu,
+            NvMatmulHeuristicsMatmulLayout,
+        )
+        iface = _get_iface()
+        if iface is None:
+            return None
+        known_gpus = {
+            9: NvMatmulHeuristicsNvidiaGpu.H100_SXM,
+            10: NvMatmulHeuristicsNvidiaGpu.B200,
+        }
+        if device_capacity not in known_gpus:
+            return None
+        descriptor = iface.createHardwareDescriptor()
+        iface.setHardwarePredefinedGpu(descriptor, known_gpus[device_capacity])
+        # Load discovery sets for TN_ROW_MAJOR and TN_COL_MAJOR
+        layouts = (
+            NvMatmulHeuristicsMatmulLayout.TN_ROW_MAJOR,
+            NvMatmulHeuristicsMatmulLayout.TN_COL_MAJOR,
+        )
+        for layout in layouts:
+            iface.loadInternalDiscoverySet(layout, descriptor)
+        _hw_descriptors[device_capacity] = descriptor
+        return descriptor
+    except Exception as e:
+        logger.debug(f"Failed to create hardware descriptor: {e}")
+        _hw_descriptors[device_capacity] = None
+        return None
+# Maps the torch dtype of operand A to the precision string handed to nvMMH.
+# dtypes missing from this table make nvmmh_default_config return None.
+# NOTE(review): the three letters presumably encode input/compute/output types
+# (B=bf16, H=fp16, S=fp32) — confirm against the nvMatmulHeuristics docs.
+_TORCH_DTYPE_TO_NVMMH_PRECISION = {
+    torch.bfloat16: "BSB",
+    torch.float16: "HSH",
+    torch.float32: "SSS",
+}
+def _query_top1(iface, hw, m, n, k, layout, precision):
+    """Query nvMMH for the top-1 kernel config of an (m, n, k) problem.
+    ``iface.precision`` is temporarily set to ``precision`` for the query and is
+    always restored afterwards, including when the query raises.
+    Returns ``(tile_m, tile_n, cl_m, cl_n, est_runtime)``, or ``None`` when the
+    query yields no result or fails for any reason.
+    """
+    try:
+        original_precision = iface.precision
+        iface.precision = precision
+        try:
+            results = iface.get_with_mnk(
+                m=m,
+                n=n,
+                k=k,
+                matmulLayout=layout,
+                count=1,
+                hardware_descriptor=hw,
+            )
+        finally:
+            # The interface is shared process-wide; never leave it with the
+            # per-call precision if the query blows up.
+            iface.precision = original_precision
+        if not results:
+            return None
+        cfg = results[0]["kernel"]
+        return cfg.cta_tile_m, cfg.cta_tile_n, cfg.cluster_m, cfg.cluster_n, results[0]["runtime"]
+    except Exception:
+        # Best effort: the caller falls back to a hardcoded default config.
+        return None
+def nvmmh_default_config(A, B, device_capacity):
+    """Use nvMatmulHeuristics to pick a GemmConfig based on problem shape.
+    Queries both normal (M,N,K) with row-major output and swapped (N,M,K) with
+    col-major output, picks the orientation with lower estimated runtime.
+    Returns None if nvMatmulHeuristics is unavailable, the device capacity or
+    dtype of A is unsupported, or neither query produced a result, letting the
+    caller fall back to the hardcoded default.
+    """
+    iface = _get_iface()
+    if iface is None:
+        return None
+    # Import only after the interface is known to exist, so a missing library
+    # yields the documented None instead of an ImportError.
+    try:
+        from nvMatmulHeuristics import NvMatmulHeuristicsMatmulLayout
+    except ImportError:
+        return None
+    hw = _get_hw(device_capacity)
+    if hw is None:
+        return None
+    precision = _TORCH_DTYPE_TO_NVMMH_PRECISION.get(A.dtype)
+    if precision is None:
+        return None
+    # Extract M, N, K from tensor shapes
+    # A: (M, K) or (L, M, K), B: (K, N) or (L, K, N)
+    m = A.shape[-2] if A.ndim >= 2 else A.shape[0]
+    k = A.shape[-1]
+    n = B.shape[-1]
+    # Query normal orientation: D(M,N) row-major
+    normal = _query_top1(iface, hw, m, n, k, NvMatmulHeuristicsMatmulLayout.TN_ROW_MAJOR, precision)
+    # Query swapped orientation: D(N,M) col-major
+    swapped = _query_top1(
+        iface, hw, n, m, k, NvMatmulHeuristicsMatmulLayout.TN_COL_MAJOR, precision
+    )
+    if normal is None and swapped is None:
+        return None
+    # Pick orientation with lower estimated runtime. If only one query
+    # succeeded, use it regardless of its runtime value, so a missing result
+    # is never indexed.
+    use_swapped = swapped is not None and (normal is None or swapped[4] < normal[4])
+    if use_swapped:
+        tile_m, tile_n, cl_m, cl_n = swapped[:4]
+        swap_ab = True
+    else:
+        tile_m, tile_n, cl_m, cl_n = normal[:4]
+        swap_ab = False
+    # SM90: pingpong only works with tile_m <= 128
+    # SM100: no pingpong
+    pingpong = (device_capacity == 9) and (tile_m <= 128)
+    return GemmConfig(
+        tile_m=tile_m,
+        tile_n=tile_n,
+        pingpong=pingpong,
+        cluster_m=cl_m,
+        cluster_n=cl_n,
+        swap_ab=swap_ab,
+        max_swizzle_size=8,
+        device_capacity=device_capacity,
+    )

build/torch-cuda/quack/pipeline.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2025, Tri Dao.
 from typing import Optional
 from dataclasses import dataclass
@@ -6,9 +6,51 @@ from dataclasses import dataclass
 import cutlass.cute as cute
 from cutlass import Boolean, Int32, const_expr
 from cutlass.cutlass_dsl import if_generate, and_, dsl_user_op
-from cutlass.pipeline import MbarrierArray, CooperativeGroup, PipelineOp, pipeline_init_wait
-from cutlass.pipeline import PipelineAsync, PipelineTmaAsync, PipelineState, PipelineUserType
-from cutlass.pipeline import PipelineTmaUmma
 class PipelineStateWAdvance(PipelineState):
@@ -33,99 +75,236 @@ def make_pipeline_state(type: PipelineUserType, stages: int):
     Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1.
     """
     if type is PipelineUserType.Producer:
-        return PipelineStateWAdvance(
-            stages,
-            Int32(0),
-            Int32(0),
-            Int32(1),
-        )
     elif type is PipelineUserType.Consumer:
-        return PipelineStateWAdvance(
-            stages,
-            Int32(0),
-            Int32(0),
-            Int32(0),
-        )
     else:
         assert False, "Error: invalid PipelineUserType specified for make_pipeline_state."
 @dataclass(frozen=True)
-class PipelineTmaCpAsync(PipelineTmaAsync):
     """
-    PipelineTmaCpAsync is used for CpAsync + TMA producers and AsyncThread consumers
     """
     @staticmethod
     def create(
-        *,
-        num_stages: int,
-        producer_group: CooperativeGroup,
-        consumer_group: CooperativeGroup,
-        tx_count: int,
-        barrier_storage: cute.Pointer = None,
-        cta_layout_vmnk: Optional[cute.Layout] = None,
-        tidx: Optional[Int32] = None,
     ):
-        """
-        This helper function computes any necessary attributes and returns an instance of PipelineTmaAsync.
-        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
-        :type barrier_storage: cute.Pointer
-        :param num_stages: Number of buffer stages for this pipeline
-        :type num_stages: Int32
-        :param producer_group: CooperativeGroup for the producer agent
-        :type producer_group: CooperativeGroup
-        :param consumer_group: CooperativeGroup for the consumer agent
-        :type consumer_group: CooperativeGroup
-        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
-        :type tx_count: int
-        :param cta_layout_vmnk: Layout of the cluster shape
-        :type cta_layout_vmnk: cute.Layout | None
-        :param tidx: thread index to consumer async threads
-        :type tidx: Int32 | None
-        """
-        if not isinstance(barrier_storage, cute.Pointer):
-            raise ValueError(
-                f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
-            )
-        producer_type = PipelineOp.TmaLoad
-        consumer_type = PipelineOp.AsyncThread
-        producer = (producer_type, producer_group)
-        consumer = (consumer_type, consumer_group)
-        sync_object_full = PipelineAsync._make_sync_object(
-            barrier_storage.align(min_align=8), num_stages, producer, tx_count
         )
-        sync_object_empty = PipelineAsync._make_sync_object(
-            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
         )
-        if tidx is None:
-            tidx, _, _ = cute.arch.thread_idx()
-        if cta_layout_vmnk is None:
-            cta_layout_vmnk = cute.make_layout((1, 1, 1, 1))
-        (
-            dst_rank,
-            is_signalling_thread,
-        ) = PipelineTmaAsync.init_empty_barrier_arrive_signal(cta_layout_vmnk, tidx)
-        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
-            dst_rank = None
         else:
-            dst_rank = dst_rank
-        producer_mask = None
-        pipeline_init_wait(cta_layout_vmnk)
-        return PipelineTmaCpAsync(
-            sync_object_full,
-            sync_object_empty,
-            num_stages,
-            producer_mask,
-            dst_rank,
-            is_signalling_thread,
-        )
     @dsl_user_op
     def producer_acquire(
@@ -133,30 +312,115 @@ class PipelineTmaCpAsync(PipelineTmaAsync):
         state: PipelineState,
         try_acquire_token: Optional[Boolean] = None,
         is_tma_warp: Optional[Boolean] = True,
         *,
         loc=None,
         ip=None,
     ):
         """
-        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier.
         """
         if_generate(
             try_acquire_token is None or try_acquire_token == 0,
             lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
         )
         # This is the difference between this and PipelineTmaAsync: we could have multiple
         # warps calling this, but only 1 warp should do the arrive on the full barrier
         if_generate(
             is_tma_warp,
             lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
         )
     @dsl_user_op
     def producer_cpasync_commit(self, state: PipelineState, *, loc=None, ip=None):
-        """
-        We need the mbarrier to track the completion of cp.async
-        """
-        cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip)
 class MbarrierArrayWDropCount(MbarrierArray):
@@ -204,13 +468,17 @@ class MbarrierArrayWDropCount(MbarrierArray):
         )
 @dataclass(frozen=True)
-class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
     """
     PipelineTmaCpAsync is used for CpAsync + TMA producers and UMMA consumers
     (e.g. Blackwell mainloops)
     """
     @staticmethod
     def create(
         *,
@@ -220,28 +488,34 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
         tx_count: int,
         barrier_storage: cute.Pointer = None,
         cta_layout_vmnk: Optional[cute.Layout] = None,
-        producer_drop_count: Optional[Int32] = None,
         mcast_mode_mn: tuple[int, int] = (1, 1),
     ):
-        """
-        This helper function computes any necessary attributes and returns an instance of PipelineTmaUmma.
-        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
-        :type barrier_storage: cute.Pointer
         :param num_stages: Number of buffer stages for this pipeline
-        :type num_stages: Int32
-        :param producer_group: `CooperativeGroup` for the producer agent
         :type producer_group: CooperativeGroup
-        :param consumer_group: `CooperativeGroup` for the consumer agent
         :type consumer_group: CooperativeGroup
         :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
         :type tx_count: int
         :param cta_layout_vmnk: Layout of the cluster shape
-        :type cta_layout_vmnk: cute.Layout | None
         :param mcast_mode_mn: Tuple specifying multicast modes for m and n dimensions (each 0 or 1)
         :type mcast_mode_mn: tuple[int, int], optional
         """
         if not isinstance(barrier_storage, cute.Pointer):
-            raise ValueError(
                 f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
             )
@@ -257,29 +531,44 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
             producer,
             tx_count,
             drop_count=producer_drop_count,
         )
-        sync_object_empty = PipelineTmaUmma._make_sync_object(
-            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
         )
-        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
             # No mcast mask if not using clusters
             producer_mask = None
             # All threadblocks are leaders if not using clusters
             is_leader_cta = True
         else:
-            producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(cta_layout_vmnk, mcast_mode_mn)
-            is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk)
         cta_group = (
             cute.nvgpu.tcgen05.CtaGroup.ONE
-            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1
             else cute.nvgpu.tcgen05.CtaGroup.TWO
         )
         consumer_mask = producer_mask
-        pipeline_init_wait(cta_layout_vmnk)
         return PipelineTmaCpAsyncUmma(
             sync_object_full,
@@ -308,12 +597,16 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
         if_generate(
             try_acquire_token is None or try_acquire_token == 0,
             lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
         )
         # This is the difference between this and PipelineTmaAsync: we could have multiple
         # warps calling this, but only 1 warp should do the arrive on the full barrier
         if_generate(
             and_(self.is_leader_cta, is_tma_warp),
             lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
         )
     @dsl_user_op
@@ -321,4 +614,6 @@ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
         """
         We need the mbarrier to track the completion of cp.async
         """
-        cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip)

+# Copyright (c) 2025-2026, Tri Dao.
 from typing import Optional
 from dataclasses import dataclass
 import cutlass.cute as cute
 from cutlass import Boolean, Int32, const_expr
 from cutlass.cutlass_dsl import if_generate, and_, dsl_user_op
+from cutlass.pipeline import MbarrierArray, CooperativeGroup, PipelineOp
+from cutlass.pipeline import PipelineState, PipelineUserType
+from cutlass.pipeline import Agent, agent_sync
+from cutlass.pipeline import NamedBarrier as NamedBarrierOg
+from cutlass.pipeline import PipelineAsync as PipelineAsyncOg
+from cutlass.pipeline import PipelineCpAsync as PipelineCpAsyncOg
+from cutlass.pipeline import PipelineTmaAsync as PipelineTmaAsyncOg
+from cutlass.pipeline import PipelineTmaUmma as PipelineTmaUmmaOg
+from cutlass.pipeline import PipelineUmmaAsync as PipelineUmmaAsyncOg
+from cutlass.pipeline import PipelineAsyncUmma as PipelineAsyncUmmaOg
+# ── Shared helpers ───────────────────────────────────────────────────────────
+def _override_create(parent_cls, child_cls):
+    """Build a ``create`` staticmethod that delegates to ``parent_cls.create``
+    and hands back the resulting object re-classed as ``child_cls``."""
+    def create(*args, **kwargs):
+        instance = parent_cls.create(*args, **kwargs)
+        # Frozen dataclasses reject normal attribute assignment, so swap the
+        # class through object.__setattr__ instead.
+        object.__setattr__(instance, "__class__", child_cls)
+        return instance
+    return staticmethod(create)
+def _make_state(index: Int32, phase: Int32) -> PipelineState:
+    """Wrap a raw (index, phase) pair in a PipelineState.
+    ``stages`` and ``count`` are filled with zeros because callers do not use them.
+    """
+    unused_count = Int32(0)
+    return PipelineState(stages=0, count=unused_count, index=index, phase=phase)
+def _call_with_elect_one(parent_method, self, state, elect_one, syncwarp, loc, ip):
+    """Optionally wrap a parent pipeline method call in sync_warp + elect_one.
+    Args:
+        parent_method: unbound method invoked as ``parent_method(self, state, loc=loc, ip=ip)``.
+        self: pipeline instance forwarded to ``parent_method``.
+        state: pipeline state forwarded to ``parent_method``.
+        elect_one: when true, the call runs inside ``cute.arch.elect_one()``.
+        syncwarp: when true (and ``elect_one`` is true), ``cute.arch.sync_warp()`` is issued first.
+        loc, ip: forwarded unchanged to ``parent_method``.
+    """
+    # Both flags go through const_expr, i.e. they are treated as compile-time
+    # choices rather than runtime values.
+    if const_expr(elect_one):
+        if const_expr(syncwarp):
+            cute.arch.sync_warp()
+        with cute.arch.elect_one():
+            parent_method(self, state, loc=loc, ip=ip)
+    else:
+        parent_method(self, state, loc=loc, ip=ip)
+# ── Pipeline state ──────────────────────────────────────────────────────────
 class PipelineStateWAdvance(PipelineState):
     Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1.
     """
     if type is PipelineUserType.Producer:
+        return PipelineStateWAdvance(stages, Int32(0), Int32(0), Int32(1))
     elif type is PipelineUserType.Consumer:
+        return PipelineStateWAdvance(stages, Int32(0), Int32(0), Int32(0))
     else:
         assert False, "Error: invalid PipelineUserType specified for make_pipeline_state."
+# ── Mixin: _w_index / _w_index_phase variants ───────────────────────────────
+class _PipelineIndexPhaseMixin:
+    """Mixin providing _w_index_phase / _w_index methods that delegate to PipelineState-based parents.
+    Each method builds a temporary PipelineState via ``_make_state`` and calls the
+    matching ``self.<method>``, so overrides defined on the concrete class are used.
+    """
+    @dsl_user_op
+    def producer_acquire_w_index_phase(
+        self,
+        index: Int32,
+        phase: Int32,
+        try_acquire_token: Optional[Boolean] = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        """Call ``producer_acquire`` with a state built from ``index`` and ``phase``."""
+        state = _make_state(index, phase)
+        self.producer_acquire(state, try_acquire_token, loc=loc, ip=ip)
+    @dsl_user_op
+    def producer_commit_w_index(self, index: Int32, *, loc=None, ip=None):
+        """Call ``producer_commit`` for stage ``index``."""
+        # Phase is a placeholder zero here — presumably commit ignores it; TODO confirm.
+        state = _make_state(index, Int32(0))
+        self.producer_commit(state, loc=loc, ip=ip)
+    @dsl_user_op
+    def consumer_wait_w_index_phase(
+        self,
+        index: Int32,
+        phase: Int32,
+        try_wait_token: Optional[Boolean] = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        """Call ``consumer_wait`` with a state built from ``index`` and ``phase``."""
+        state = _make_state(index, phase)
+        self.consumer_wait(state, try_wait_token, loc=loc, ip=ip)
+    @dsl_user_op
+    def consumer_release_w_index(self, index: Int32, *, loc=None, ip=None):
+        """Call ``consumer_release`` for stage ``index``."""
+        # Phase is a placeholder zero here — presumably release ignores it; TODO confirm.
+        state = _make_state(index, Int32(0))
+        self.consumer_release(state, loc=loc, ip=ip)
+# ── NamedBarrier ─────────────────────────────────────────────────────────────
+@dataclass(frozen=True)
+class NamedBarrier(NamedBarrierOg):
+    """NamedBarrier with ``*_w_index`` variants that address barrier ``barrier_id + index``."""
+    # Placeholder only: the class name is not bound yet inside its own body, so
+    # the real factory is assigned right after the class definition.
+    create = _override_create(NamedBarrierOg, None)  # patched below
+    @dsl_user_op
+    def arrive_w_index(self, index: Int32, *, loc=None, ip=None) -> None:
+        """
+        Calls ``cute.arch.barrier_arrive`` on barrier ``self.barrier_id + index``
+        with ``self.num_threads`` participating threads.
+        The aligned flavor of arrive is used when all threads in the CTA will execute the
+        same instruction. See PTX documentation.
+        """
+        cute.arch.barrier_arrive(
+            barrier_id=self.barrier_id + index,
+            number_of_threads=self.num_threads,
+            loc=loc,
+            ip=ip,
+        )
+    @dsl_user_op
+    def arrive_and_wait_w_index(self, index: Int32, *, loc=None, ip=None) -> None:
+        """Calls ``cute.arch.barrier`` on barrier ``self.barrier_id + index``
+        with ``self.num_threads`` participating threads."""
+        cute.arch.barrier(
+            barrier_id=self.barrier_id + index,
+            number_of_threads=self.num_threads,
+            loc=loc,
+            ip=ip,
+        )
+# Replace the placeholder with a factory that re-classes the parent's object.
+# NOTE(review): this relies on NamedBarrierOg exposing a ``create`` — confirm.
+NamedBarrier.create = _override_create(NamedBarrierOg, NamedBarrier)
+# ── PipelineAsync ────────────────────────────────────────────────────────────
 @dataclass(frozen=True)
+class PipelineAsync(_PipelineIndexPhaseMixin, PipelineAsyncOg):
     """
+    PipelineAsync with optional elect_one for producer_commit and consumer_release.
+    When elect_one_*=True (set at create time), only one elected thread per warp
+    signals the barrier arrive. This is useful when the mask count is set to 1 per warp.
+    Args (to create):
+        elect_one_commit: If True, only elected thread signals producer_commit.
+        syncwarp_before_commit: If True (default), issue syncwarp before elect_one.
+        elect_one_release: If True, only elected thread signals consumer_release.
+        syncwarp_before_release: If True (default), issue syncwarp before elect_one.
+            Set syncwarp to False when threads are already converged (e.g. after wgmma wait_group).
     """
+    # Class-level defaults; create() overwrites them per instance.
+    _elect_one_commit: bool = False
+    _syncwarp_before_commit: bool = True
+    _elect_one_release: bool = False
+    _syncwarp_before_release: bool = True
     @staticmethod
     def create(
+        *args,
+        elect_one_commit: bool = False,
+        syncwarp_before_commit: bool = True,
+        elect_one_release: bool = False,
+        syncwarp_before_release: bool = True,
+        **kwargs,
     ):
+        """Build the pipeline via ``PipelineAsyncOg.create(*args, **kwargs)``, re-class
+        it to ``PipelineAsync`` and record the four elect_one / syncwarp options."""
+        obj = PipelineAsyncOg.create(*args, **kwargs)
+        # The dataclass is frozen, hence object.__setattr__ for every assignment.
+        object.__setattr__(obj, "__class__", PipelineAsync)
+        object.__setattr__(obj, "_elect_one_commit", elect_one_commit)
+        object.__setattr__(obj, "_syncwarp_before_commit", syncwarp_before_commit)
+        object.__setattr__(obj, "_elect_one_release", elect_one_release)
+        object.__setattr__(obj, "_syncwarp_before_release", syncwarp_before_release)
+        return obj
+    @dsl_user_op
+    def producer_commit(self, state: PipelineState, *, loc=None, ip=None):
+        """Parent ``producer_commit``, optionally restricted to one elected thread."""
+        _call_with_elect_one(
+            PipelineAsyncOg.producer_commit,
+            self,
+            state,
+            self._elect_one_commit,
+            self._syncwarp_before_commit,
+            loc,
+            ip,
+        )
+    @dsl_user_op
+    def consumer_release(self, state: PipelineState, *, loc=None, ip=None):
+        """Parent ``consumer_release``, optionally restricted to one elected thread."""
+        _call_with_elect_one(
+            PipelineAsyncOg.consumer_release,
+            self,
+            state,
+            self._elect_one_release,
+            self._syncwarp_before_release,
+            loc,
+            ip,
+        )
+    # _w_index variants inherited from _PipelineIndexPhaseMixin, which delegate
+    # to producer_commit / consumer_release above.
+# ── PipelineCpAsync ──────────────────────────────────────────────────────────
+@dataclass(frozen=True)
+class PipelineCpAsync(_PipelineIndexPhaseMixin, PipelineCpAsyncOg):
+    """PipelineCpAsync with optional elect_one for consumer_release.
+    Args (to create):
+        elect_one_release: If True, consumer_release runs inside elect_one.
+        syncwarp_before_release: If True (default), issue syncwarp before elect_one.
+    """
+    # Class-level defaults; create() overwrites them per instance.
+    _elect_one_release: bool = False
+    _syncwarp_before_release: bool = True
+    @staticmethod
+    def create(
+        *args,
+        elect_one_release: bool = False,
+        syncwarp_before_release: bool = True,
+        **kwargs,
+    ):
+        """Build the pipeline via ``PipelineCpAsyncOg.create(*args, **kwargs)``, re-class
+        it to ``PipelineCpAsync`` and record the release options."""
+        obj = PipelineCpAsyncOg.create(*args, **kwargs)
+        # The dataclass is frozen, hence object.__setattr__ for every assignment.
+        object.__setattr__(obj, "__class__", PipelineCpAsync)
+        object.__setattr__(obj, "_elect_one_release", elect_one_release)
+        object.__setattr__(obj, "_syncwarp_before_release", syncwarp_before_release)
+        return obj
+    @dsl_user_op
+    def consumer_release(self, state: PipelineState, *, loc=None, ip=None):
+        """Parent ``consumer_release``, optionally restricted to one elected thread."""
+        _call_with_elect_one(
+            PipelineCpAsyncOg.consumer_release,
+            self,
+            state,
+            self._elect_one_release,
+            self._syncwarp_before_release,
+            loc,
+            ip,
         )
+    # _w_index variants inherited from _PipelineIndexPhaseMixin.
+# ── PipelineTmaAsync ────────────────────────────────────────────────────────
+@dataclass(frozen=True)
+class PipelineTmaAsync(_PipelineIndexPhaseMixin, PipelineTmaAsyncOg):
+    """Override producer_acquire to take in extra_tx_count parameter."""
+    @dsl_user_op
+    def producer_acquire(
+        self,
+        state: PipelineState,
+        try_acquire_token: Optional[Boolean] = None,
+        extra_tx_count: int = 0,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        """
+        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks.
+        The wait on the empty barrier is skipped unless ``try_acquire_token`` is None or 0.
+        With ``extra_tx_count == 0`` the full barrier gets a plain ``arrive``; otherwise
+        it gets ``arrive_and_expect_tx`` with ``sync_object_full.tx_count + extra_tx_count``.
+        """
+        if_generate(
+            try_acquire_token is None or try_acquire_token == 0,
+            lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
+        # extra_tx_count goes through const_expr, i.e. it selects the branch at compile time.
+        if const_expr(extra_tx_count == 0):
+            self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip)
         else:
+            # NOTE(review): this branch does not pass producer_mask — confirm intended.
+            tx_count = self.sync_object_full.tx_count + extra_tx_count
+            self.sync_object_full.arrive_and_expect_tx(state.index, tx_count, loc=loc, ip=ip)
+# Make create() return this subclass instead of the upstream class.
+PipelineTmaAsync.create = _override_create(PipelineTmaAsyncOg, PipelineTmaAsync)
+# ── PipelineTmaUmma ─────────────────────────────────────────────────────────
+@dataclass(frozen=True)
+class PipelineTmaUmma(_PipelineIndexPhaseMixin, PipelineTmaUmmaOg):
+    """Override producer_acquire to take in extra_tx_count parameter."""
     @dsl_user_op
     def producer_acquire(
         state: PipelineState,
         try_acquire_token: Optional[Boolean] = None,
         is_tma_warp: Optional[Boolean] = True,
+        extra_tx_count: int = 0,
         *,
         loc=None,
         ip=None,
     ):
         """
+        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks.
         """
         if_generate(
             try_acquire_token is None or try_acquire_token == 0,
             lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
+        )
+        # This is the difference between this and PipelineTmaAsync: we could have multiple
+        # warps calling this, but only 1 warp should do the arrive on the full barrier
+        if const_expr(extra_tx_count == 0):
+            if_generate(
+                and_(self.is_leader_cta, is_tma_warp),
+                lambda: self.sync_object_full.arrive(
+                    state.index, self.producer_mask, loc=loc, ip=ip
+                ),
+                loc=loc,
+                ip=ip,
+            )
+        else:
+            tx_count = self.sync_object_full.tx_count + extra_tx_count
+            if_generate(
+                and_(self.is_leader_cta, is_tma_warp),
+                lambda: self.sync_object_full.arrive_and_expect_tx(
+                    state.index, tx_count, loc=loc, ip=ip
+                ),
+                loc=loc,
+                ip=ip,
+            )
+PipelineTmaUmma.create = _override_create(PipelineTmaUmmaOg, PipelineTmaUmma)
+# ── PipelineUmmaAsync ───────────────────────────────────────────────────────
+@dataclass(frozen=True)
+class PipelineUmmaAsync(_PipelineIndexPhaseMixin, PipelineUmmaAsyncOg):
+    """Upstream PipelineUmmaAsync plus the _w_index / _w_index_phase mixin methods."""
+    pass
+# Make create() return this subclass instead of the upstream class.
+PipelineUmmaAsync.create = _override_create(PipelineUmmaAsyncOg, PipelineUmmaAsync)
+# ── PipelineAsyncUmma ───────────────────────────────────────────────────────
+@dataclass(frozen=True)
+class PipelineAsyncUmma(_PipelineIndexPhaseMixin, PipelineAsyncUmmaOg):
+    """Upstream PipelineAsyncUmma plus the _w_index / _w_index_phase mixin methods."""
+    pass
+# Make create() return this subclass instead of the upstream class.
+PipelineAsyncUmma.create = _override_create(PipelineAsyncUmmaOg, PipelineAsyncUmma)
+# ── PipelineTmaCpAsync ──────────────────────────────────────────────────────
+@dataclass(frozen=True)
+class PipelineTmaCpAsync(_PipelineIndexPhaseMixin, PipelineTmaAsyncOg):
+    """
+    PipelineTmaCpAsync is used for CpAsync + TMA producers and AsyncThread consumers.
+    Compared to PipelineTmaAsync, producer_acquire gates the full-barrier arrive on is_tma_warp.
+    """
+    @dsl_user_op
+    def producer_acquire(
+        self,
+        state: PipelineState,
+        try_acquire_token: Optional[Boolean] = None,
+        is_tma_warp: Optional[Boolean] = True,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        """Wait on the empty barrier (unless ``try_acquire_token`` is set and non-zero),
+        then arrive on the full barrier only when ``is_tma_warp`` holds."""
+        if_generate(
+            try_acquire_token is None or try_acquire_token == 0,
+            lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
         # This is the difference between this and PipelineTmaAsync: we could have multiple
         # warps calling this, but only 1 warp should do the arrive on the full barrier
         if_generate(
             is_tma_warp,
             lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
     @dsl_user_op
     def producer_cpasync_commit(self, state: PipelineState, *, loc=None, ip=None):
+        """We need the mbarrier to track the completion of cp.async."""
+        cute.arch.cp_async_mbarrier_arrive_noinc(
+            self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip
+        )
+# Make create() return this subclass instead of the upstream PipelineTmaAsync.
+PipelineTmaCpAsync.create = _override_create(PipelineTmaAsyncOg, PipelineTmaCpAsync)
+# ── MbarrierArrayWDropCount ─────────────────────────────────────────────────
 class MbarrierArrayWDropCount(MbarrierArray):
         )
+# ── PipelineTmaCpAsyncUmma ──────────────────────────────────────────────────
 @dataclass(frozen=True)
+class PipelineTmaCpAsyncUmma(PipelineTmaUmmaOg):
     """
     PipelineTmaCpAsync is used for CpAsync + TMA producers and UMMA consumers
     (e.g. Blackwell mainloops)
     """
+    @dsl_user_op
     @staticmethod
     def create(
         *,
         tx_count: int,
         barrier_storage: cute.Pointer = None,
         cta_layout_vmnk: Optional[cute.Layout] = None,
         mcast_mode_mn: tuple[int, int] = (1, 1),
+        defer_sync: bool = False,
+        producer_drop_count: Optional[Int32] = None,
+        loc=None,
+        ip=None,
     ):
+        """Creates and initializes a new PipelineTmaUmma instance.
         :param num_stages: Number of buffer stages for this pipeline
+        :type num_stages: int
+        :param producer_group: CooperativeGroup for the producer agent
         :type producer_group: CooperativeGroup
+        :param consumer_group: CooperativeGroup for the consumer agent
         :type consumer_group: CooperativeGroup
         :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
         :type tx_count: int
+        :param barrier_storage: Pointer to the shared memory address for this pipeline's mbarriers
+        :type barrier_storage: cute.Pointer, optional
         :param cta_layout_vmnk: Layout of the cluster shape
+        :type cta_layout_vmnk: cute.Layout, optional
         :param mcast_mode_mn: Tuple specifying multicast modes for m and n dimensions (each 0 or 1)
         :type mcast_mode_mn: tuple[int, int], optional
+        :raises TypeError: If barrier_storage is not a cute.Pointer instance
+        :return: A new PipelineTmaCpAsyncUmma instance configured with the provided parameters
+        :rtype: PipelineTmaCpAsyncUmma
         """
         if not isinstance(barrier_storage, cute.Pointer):
+            raise TypeError(
                 f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
             )
             producer,
             tx_count,
             drop_count=producer_drop_count,
+            loc=loc,
+            ip=ip,
         )
+        sync_object_empty = PipelineTmaUmmaOg._make_sync_object(
+            barrier_storage.align(min_align=8) + num_stages,
+            num_stages,
+            consumer,
+            loc=loc,
+            ip=ip,
         )
+        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, loc=loc, ip=ip) == 1:
             # No mcast mask if not using clusters
             producer_mask = None
             # All threadblocks are leaders if not using clusters
             is_leader_cta = True
         else:
+            producer_mask = PipelineTmaUmmaOg._compute_mcast_arrival_mask(
+                cta_layout_vmnk, mcast_mode_mn, loc=loc, ip=ip
+            )
+            is_leader_cta = PipelineTmaUmmaOg._compute_is_leader_cta(
+                cta_layout_vmnk, loc=loc, ip=ip
+            )
         cta_group = (
             cute.nvgpu.tcgen05.CtaGroup.ONE
+            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0], loc=loc, ip=ip) == 1
             else cute.nvgpu.tcgen05.CtaGroup.TWO
         )
         consumer_mask = producer_mask
+        if not defer_sync:
+            cute.arch.mbarrier_init_fence()
+            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, loc=loc, ip=ip) == 1:
+                agent_sync(Agent.ThreadBlock)
+            else:
+                agent_sync(Agent.ThreadBlockCluster, is_relaxed=True)
         return PipelineTmaCpAsyncUmma(
             sync_object_full,
         if_generate(
             try_acquire_token is None or try_acquire_token == 0,
             lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
         # This is the difference between this and PipelineTmaAsync: we could have multiple
         # warps calling this, but only 1 warp should do the arrive on the full barrier
         if_generate(
             and_(self.is_leader_cta, is_tma_warp),
             lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
+            loc=loc,
+            ip=ip,
         )
     @dsl_user_op
         """
         We need the mbarrier to track the completion of cp.async
         """
+        cute.arch.cp_async_mbarrier_arrive_noinc(
+            self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip
+        )

build/torch-cuda/quack/reduce.py CHANGED Viewed

@@ -196,9 +196,9 @@ def online_softmax_reduce(
                     )
                 cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
                 num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
-                max_x_single_warp = cute.make_fragment(num_iter, Float32)
                 max_x_single_warp.fill(-Float32.inf)
-                sum_exp_x_single_warp = cute.make_fragment(num_iter, Float32)
                 sum_exp_x_single_warp.fill(0.0)
                 for i in cutlass.range_constexpr(num_iter):
                     idx = lane_idx + i * cute.arch.WARP_SIZE

                     )
                 cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
                 num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
+                max_x_single_warp = cute.make_rmem_tensor(num_iter, Float32)
                 max_x_single_warp.fill(-Float32.inf)
+                sum_exp_x_single_warp = cute.make_rmem_tensor(num_iter, Float32)
                 sum_exp_x_single_warp.fill(0.0)
                 for i in cutlass.range_constexpr(num_iter):
                     idx = lane_idx + i * cute.arch.WARP_SIZE

build/torch-cuda/quack/rms_final_reduce.py ADDED Viewed

	@@ -0,0 +1,181 @@

+# Copyright (c) 2025-2026, Tri Dao.
+# Given a 2D array of partial squared sums, compute rstd[m] = rsqrt(sum_n(x[m,n]) * scale + eps).
+# This is the second kernel in a gemm_rms fused pipeline where the first GEMM kernel
+# writes per-tile partial sums of squares.
+import math
+from typing import Type
+import cuda.bindings.driver as cuda
+import cutlass
+import cutlass.cute as cute
+from cutlass import Float32, const_expr
+import torch
+from ._ops_compat import add_quack_op_namespace_prefix
+from torch import Tensor
+from . import copy_utils as copy_utils
+from .compile_utils import make_fake_tensor as fake_tensor
+from .reduce import row_reduce
+from .reduction_base import ReductionBase
+from .cache_utils import jit_cache
+from .cute_dsl_utils import torch2cute_dtype_map
+class RmsFinalReduce(ReductionBase):
+    """Reduce partial squared sums and compute rstd: rstd[m] = rsqrt(sum_n(x[m,n]) * scale + eps).
+    Inherits from ReductionBase for tiled copy, reduction buffer, and cluster support.
+
+    Each row of the 2D input is summed along its second dimension, and one Float32-computed
+    value per row is written to the 1D output. This subclass always sets ``cluster_n`` to 1
+    (see ``_set_cluster_n``).
+    """
+    def __init__(self, dtype: Type[cutlass.Numeric], N: int):
+        """Store the input element type and row length N; the base class is set up with stage=1."""
+        super().__init__(dtype, N, stage=1)
+    def _threads_per_row(self):
+        """Return the number of threads cooperating on one row, chosen from N.
+
+        The first threshold with ``N <= limit`` wins: 64 -> 8, 128 -> 16, 3072 -> 32,
+        6144 -> 64, 16384 -> 128; anything larger uses 256.
+        """
+        N = self.N
+        for limit, threads in [(64, 8), (128, 16), (3072, 32), (6144, 64), (16384, 128)]:
+            if N <= limit:
+                return threads
+        return 256
+    def _set_cluster_n(self):
+        """Fix the cluster size along N to 1."""
+        self.cluster_n = 1
+    # Host-side entry point: derives the copy tiling from N and the dtype width, then launches
+    # `kernel` on `stream` with one block per tile of rows.
+    #   mX:    2D input of partial sums; its element type must equal self.dtype.
+    #   mRstd: 1D output, indexed by row.
+    #   scale: multiplier applied to each row sum.
+    #   eps:   added to the scaled sum before the reciprocal square root.
+    @cute.jit
+    def __call__(
+        self,
+        mX: cute.Tensor,
+        mRstd: cute.Tensor,
+        scale: Float32,
+        eps: Float32,
+        stream: cuda.CUstream,
+    ):
+        assert mX.element_type == self.dtype
+        self._set_cluster_n()
+        # Largest element count that divides N and does not exceed 128 bits' worth of elements
+        # (presumably so each copy is at most one 128-bit access -- confirm in _get_tiled_copy).
+        vecsize = math.gcd(self.N, 128 // self.dtype.width)
+        tiled_copy, tiler_mn, threads_per_row = self._get_tiled_copy(vecsize=vecsize)
+        num_threads = tiled_copy.size
+        self.kernel(mX, mRstd, scale, eps, tiler_mn, tiled_copy, threads_per_row).launch(
+            # 1D grid: enough blocks to cover all rows, tiler_mn[0] rows per block.
+            grid=[cute.ceil_div(mX.shape[0], tiler_mn[0]), 1, 1],
+            block=[num_threads, 1, 1],
+            stream=stream,
+        )
+    # Device kernel: each block loads its tile of rows, sums every row, and writes
+    # rsqrt(row_sum * scale + eps) to mRstd for the rows that are in bounds.
+    @cute.kernel
+    def kernel(
+        self,
+        mX: cute.Tensor,
+        mRstd: cute.Tensor,
+        scale: Float32,
+        eps: Float32,
+        tiler_mn: cute.Shape,
+        tiled_copy: cute.TiledCopy,
+        threads_per_row: cutlass.Constexpr[int],
+    ):
+        tidx, _, _ = cute.arch.thread_idx()
+        bidx, _, _ = cute.arch.block_idx()
+        tv_layout = tiled_copy.layout_tv_tiled
+        # Scratch space (and barrier pointer) handed to row_reduce below.
+        smem = cutlass.utils.SmemAllocator()
+        reduction_buffer, mbar_ptr = self._allocate_reduction_buffer_and_mbar(smem, tv_layout)
+        shape = mX.shape
+        # Identity tensor: each element holds its own (row, col) coordinate, used for bounds checks.
+        idX = cute.make_identity_tensor(shape)
+        # This block's tile of the input and of the matching coordinates.
+        gX = cute.local_tile(mX, tiler_mn, (bidx, 0))
+        cX = cute.local_tile(idX, tiler_mn, (bidx, 0))
+        thr_copy = tiled_copy.get_slice(tidx)
+        tXgX = thr_copy.partition_S(gX)
+        tXcX = thr_copy.partition_S(cX)[(0, None), None, None]
+        # Per-thread staging tensor, zero-filled so elements that are never loaded add 0 to the sum.
+        tXrX = cute.make_rmem_tensor_like(tXgX)
+        cute.filter_zeros(tXrX).fill(0)
+        # A column predicate is only built when the tile width differs from the column count.
+        is_even_N = const_expr(shape[1] == tiler_mn[1])
+        tXpX = (
+            copy_utils.predicate_k(thr_copy.partition_S(cX), limit=shape[1])
+            if not is_even_N
+            else None
+        )
+        # Row coordinate of this thread's first element; rows past the end are not loaded.
+        row = tXcX[0][0]
+        if row < shape[0]:
+            copy_utils.copy(tXgX, tXrX, pred=tXpX)
+        # Accumulate in Float32 regardless of the input element type.
+        x = tXrX.load().to(Float32)
+        sum_x = row_reduce(
+            x,
+            cute.ReductionOp.ADD,
+            threads_per_row,
+            reduction_buffer[None, None, 0],
+            mbar_ptr,
+            init_val=0.0,
+        )
+        rstd = cute.math.rsqrt(sum_x * scale + eps, fastmath=True)
+        # Only threads whose first element sits at column 0 of an in-bounds row store the result.
+        if tXcX[0][1] == 0 and row < shape[0]:
+            mRstd[row] = rstd
+@jit_cache
+def _compile_rms_final_reduce(dtype, N):
+    batch_sym = cute.sym_int()
+    div = math.gcd(N, 128 // dtype.width)
+    x_cute = fake_tensor(dtype, (batch_sym, N), div)
+    rstd_cute = fake_tensor(Float32, (batch_sym,))
+    return cute.compile(
+        RmsFinalReduce(dtype, N),
+        x_cute,
+        rstd_cute,
+        Float32(0),  # scale
+        Float32(0),  # eps
+        cute.runtime.make_fake_stream(use_tvm_ffi_env_stream=True),
+        options="--enable-tvm-ffi",
+    )
+@torch.library.custom_op(
+    add_quack_op_namespace_prefix("rms_final_reduce_out"),
+    mutates_args=("rstd",),
+    device_types="cuda",
+)
+def _rms_final_reduce_out(
+    x: Tensor,
+    rstd: Tensor,
+    scale: float,
+    eps: float,
+) -> None:
+    """Compute rstd[m] = rsqrt(sum_n(x[m, n]) * scale + eps)."""
+    x_dtype = torch2cute_dtype_map[x.dtype]
+    N = x.shape[1]
+    compiled_fn = _compile_rms_final_reduce(x_dtype, N)
+    compiled_fn(x, rstd, scale, eps)
+@_rms_final_reduce_out.register_fake
+def _rms_final_reduce_out_fake(x, rstd, scale, eps):
+    from .cache_utils import COMPILE_ONLY
+    if COMPILE_ONLY and not isinstance(x.shape[0], torch.SymInt):
+        x_dtype = torch2cute_dtype_map[x.dtype]
+        _compile_rms_final_reduce(x_dtype, x.shape[1])
+def rms_final_reduce(
+    x: Tensor,  # (M, N) partial squared sums
+    scale: float,  # typically 1.0 / total_columns
+    eps: float = 1e-6,
+) -> Tensor:
+    """Compute rstd[m] = rsqrt(sum_n(x[m, n]) * scale + eps)."""
+    assert x.ndim == 2
+    M = x.shape[0]
+    rstd = torch.empty(M, dtype=torch.float32, device=x.device)
+    from .cache_utils import COMPILE_ONLY
+    if COMPILE_ONLY:
+        return rstd
+    _rms_final_reduce_out(x, rstd, scale, eps)
+    return rstd