refactor: remove Triton kernels, add hidden_clamp to unscored ops

- Remove all Triton kernel code (fwd/bwd kernels, autotune configs,
  triton import); it is replaced by the CUDA kernels in
  activation/grouped_poly_norm.cu
- Add a hidden_clamp parameter to the unscored C++ ops (forward/backward)
  so that both the scored and unscored paths support clamping
- Update register_fake, the autograd Function, and dispatch for the
  unscored ops
- Replace HAS_TRITON with _has_cuda_ops in the tests

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (5)

activation/grouped_poly_norm.cu +6 -4
tests/test_fused_mul_grouped_poly_norm.py +10 -10
torch-ext/activation/grouped_poly_norm.py +29 -401
torch-ext/torch_binding.cpp +3 -3
torch-ext/torch_binding.h +5 -3

activation/grouped_poly_norm.cu CHANGED Viewed

@@ -609,8 +609,9 @@ std::tuple<torch::Tensor, torch::Tensor>
 grouped_poly_norm_forward(
     const torch::Tensor &input, const torch::Tensor &mul,
     const torch::Tensor &weight, const torch::Tensor &bias,
-    const torch::Tensor &offsets, double eps, int64_t expert_offset) {
-  return _fwd_impl(input, mul, weight, bias, offsets, nullptr, eps, expert_offset, -1.0);
 }
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
@@ -618,11 +619,12 @@ grouped_poly_norm_backward(
     const torch::Tensor &grad_output, const torch::Tensor &input,
     const torch::Tensor &mul, const torch::Tensor &weight,
     const torch::Tensor &bias, const torch::Tensor &offsets,
-    const torch::Tensor &inv_rms, double eps, int64_t expert_offset) {
   const int64_t N = input.size(0);
   auto [ig, mg, wg, bg, _] = _bwd_impl(
       grad_output, input, mul, weight, bias, offsets, inv_rms,
-      nullptr, nullptr, N, eps, expert_offset, -1.0);
   return {ig, mg, wg, bg};
 }

 grouped_poly_norm_forward(
     const torch::Tensor &input, const torch::Tensor &mul,
     const torch::Tensor &weight, const torch::Tensor &bias,
+    const torch::Tensor &offsets, double eps, int64_t expert_offset,
+    double hidden_clamp) {
+  return _fwd_impl(input, mul, weight, bias, offsets, nullptr, eps, expert_offset, hidden_clamp);
 }
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
     const torch::Tensor &grad_output, const torch::Tensor &input,
     const torch::Tensor &mul, const torch::Tensor &weight,
     const torch::Tensor &bias, const torch::Tensor &offsets,
+    const torch::Tensor &inv_rms, double eps, int64_t expert_offset,
+    double hidden_clamp) {
   const int64_t N = input.size(0);
   auto [ig, mg, wg, bg, _] = _bwd_impl(
       grad_output, input, mul, weight, bias, offsets, inv_rms,
+      nullptr, nullptr, N, eps, expert_offset, hidden_clamp);
   return {ig, mg, wg, bg};
 }

tests/test_fused_mul_grouped_poly_norm.py CHANGED Viewed

@@ -2,11 +2,11 @@ import pytest
 import torch
 from grouped_poly_norm import (
-    HAS_TRITON,
     fused_mul_grouped_poly_norm_ref,
 )
-if HAS_TRITON:
     from grouped_poly_norm import fused_mul_grouped_poly_norm
 from .utils import assert_close
@@ -95,7 +95,7 @@ def _run_triton(input_t, mul_t, weight, bias, offsets, expert_offset=0,
     return grads + (s.grad,) if s is not None else grads + (None,)
-@pytest.mark.skipif(not HAS_TRITON, reason="Triton not available")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("num_experts", NUM_EXPERTS_LIST)
@@ -134,7 +134,7 @@ def test_fused_mul_grouped_poly_norm_forward(
         assert_close(out_ref, out_tri, atol=1e-2, rtol=1e-2)
-@pytest.mark.skipif(not HAS_TRITON, reason="Triton not available")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("num_experts", NUM_EXPERTS_LIST)
@@ -173,7 +173,7 @@ def test_fused_mul_grouped_poly_norm_backward(
     assert_close(b_grad_ref, b_grad_tri, atol=atol, rtol=rtol)
-@pytest.mark.skipif(not HAS_TRITON, reason="Triton not available")
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("expert_offset", EXPERT_OFFSETS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
@@ -236,7 +236,7 @@ def test_fused_mul_grouped_poly_norm_zero_token_experts(
             f"but got max={b_grad_tri[wi].abs().max().item()}")
-@pytest.mark.skipif(not HAS_TRITON, reason="Triton not available")
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("expert_offset", EXPERT_OFFSETS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
@@ -265,7 +265,7 @@ def test_fused_mul_grouped_poly_norm_no_nan_inf(
 # ---------------------------------------------------------------------------
 # Scores tests
 # ---------------------------------------------------------------------------
-@pytest.mark.skipif(not HAS_TRITON, reason="Triton not available")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("num_experts", [8, 48])
@@ -289,7 +289,7 @@ def test_fused_mul_grouped_poly_norm_scores_forward(
     assert_close(out_ref, out_tri, atol=atol, rtol=rtol)
-@pytest.mark.skipif(not HAS_TRITON, reason="Triton not available")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("num_experts", [8, 48])
@@ -326,7 +326,7 @@ def test_fused_mul_grouped_poly_norm_scores_backward(
 CLAMP_VALUES = [10.0, 1.0, 0.5]
-@pytest.mark.skipif(not HAS_TRITON, reason="Triton not available")
 @pytest.mark.parametrize("num_tokens", [4096])
 @pytest.mark.parametrize("d", [256, 1280])
 @pytest.mark.parametrize("num_experts", [8])
@@ -353,7 +353,7 @@ def test_fused_mul_grouped_poly_norm_hidden_clamp_forward(
     assert_close(out_ref, out_tri, atol=atol, rtol=rtol)
-@pytest.mark.skipif(not HAS_TRITON, reason="Triton not available")
 @pytest.mark.parametrize("num_tokens", [4096])
 @pytest.mark.parametrize("d", [256, 1280])
 @pytest.mark.parametrize("num_experts", [8])

 import torch
 from grouped_poly_norm import (
+    _has_cuda_ops,
     fused_mul_grouped_poly_norm_ref,
 )
+if _has_cuda_ops:
     from grouped_poly_norm import fused_mul_grouped_poly_norm
 from .utils import assert_close
     return grads + (s.grad,) if s is not None else grads + (None,)
+@pytest.mark.skipif(not _has_cuda_ops, reason="CUDA ops not available")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("num_experts", NUM_EXPERTS_LIST)
         assert_close(out_ref, out_tri, atol=1e-2, rtol=1e-2)
+@pytest.mark.skipif(not _has_cuda_ops, reason="CUDA ops not available")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("num_experts", NUM_EXPERTS_LIST)
     assert_close(b_grad_ref, b_grad_tri, atol=atol, rtol=rtol)
+@pytest.mark.skipif(not _has_cuda_ops, reason="CUDA ops not available")
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("expert_offset", EXPERT_OFFSETS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
             f"but got max={b_grad_tri[wi].abs().max().item()}")
+@pytest.mark.skipif(not _has_cuda_ops, reason="CUDA ops not available")
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("expert_offset", EXPERT_OFFSETS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 # ---------------------------------------------------------------------------
 # Scores tests
 # ---------------------------------------------------------------------------
+@pytest.mark.skipif(not _has_cuda_ops, reason="CUDA ops not available")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("num_experts", [8, 48])
     assert_close(out_ref, out_tri, atol=atol, rtol=rtol)
+@pytest.mark.skipif(not _has_cuda_ops, reason="CUDA ops not available")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("num_experts", [8, 48])
 CLAMP_VALUES = [10.0, 1.0, 0.5]
+@pytest.mark.skipif(not _has_cuda_ops, reason="CUDA ops not available")
 @pytest.mark.parametrize("num_tokens", [4096])
 @pytest.mark.parametrize("d", [256, 1280])
 @pytest.mark.parametrize("num_experts", [8])
     assert_close(out_ref, out_tri, atol=atol, rtol=rtol)
+@pytest.mark.skipif(not _has_cuda_ops, reason="CUDA ops not available")
 @pytest.mark.parametrize("num_tokens", [4096])
 @pytest.mark.parametrize("d", [256, 1280])
 @pytest.mark.parametrize("num_experts", [8])

torch-ext/activation/grouped_poly_norm.py CHANGED Viewed

@@ -1,49 +1,26 @@
-"""Triton-accelerated Grouped FusedMulPolyNorm for MoE.
-Fuses the entire PolyNorm computation into two Triton kernels (fwd + bwd),
-eliminating multiple intermediate tensors and kernel launches.
 PolyNorm formula (per row):
     poly = w[0] * rms_norm(x^3) + w[1] * rms_norm(x^2) + w[2] * rms_norm(x) + bias
-    output = poly * mul
 where rms_norm(x) = x / sqrt(mean(x^2, dim=-1) + eps)
-Performance optimizations:
-  - @triton.autotune selects optimal BLOCK_D, num_warps, and num_stages per
-    hidden dimension.
-  - Single-tile specialization: when D <= BLOCK_D, all data stays in registers
-    across the reduction and output phases, eliminating redundant global reads.
-  - Multi-tile software pipelining: explicit num_stages in autotune configs
-    enables overlapping memory loads with computation across loop iterations.
-  - In-kernel binary search for expert mapping: eliminates 2 PyTorch kernel
-    launches (torch.arange + torch.bucketize) per forward/backward call.
-  - Backward 2-pass optimization: pass 1 merges RMS statistics computation
-    with dot product accumulation, pass 2 computes gradients. This reduces
-    memory traffic compared to a naive 3-pass approach.
-Forward kernel: one program per row, tiles over D dimension.
-  - Computes x, x^2, x^3 in registers
-  - Computes three RMS norms in a single pass (shared variance reduction)
-  - Applies polynomial weights + bias + mul in-place
-Backward kernel: one program per row, tiles over D dimension.
-  - Recomputes forward intermediates from saved inputs (activation recomputation)
-  - 2-pass: (1) RMS stats + dot products + bias grad, (2) grad_input + grad_mul + weight grads
-  - Weight/bias gradients use tl.atomic_add for cross-row accumulation
 """
 import torch
 from torch import Tensor
-try:
-    import triton
-    import triton.language as tl
-    HAS_TRITON = True
-except ImportError:
-    HAS_TRITON = False
 # Try to load CUDA ops at module level
 _ops = None
 try:
@@ -61,14 +38,15 @@ _has_cuda_ops = _ops is not None and hasattr(_ops, 'grouped_poly_norm_forward')
 if _has_cuda_ops:
     try:
         @torch.library.register_fake("_activation::grouped_poly_norm_forward")
-        def _fwd_fake(input, mul, weight, bias, offsets, eps, expert_offset):
             return (torch.empty_like(input),
                     torch.empty(input.shape[0], 3, dtype=torch.float32,
                                 device=input.device))
         @torch.library.register_fake("_activation::grouped_poly_norm_backward")
         def _bwd_fake(grad_output, input, mul, weight, bias, offsets, inv_rms,
-                       eps, expert_offset):
             return (torch.empty_like(input),
                     torch.empty_like(mul),
                     torch.empty_like(weight),
@@ -164,383 +142,32 @@ def fused_mul_grouped_poly_norm_ref(
 # ---------------------------------------------------------------------------
-# Triton kernel implementation
 # ---------------------------------------------------------------------------
-if HAS_TRITON:
-    # --- Autotune configurations ---
-    _GROUPED_POLYNORM_FWD_CONFIGS = [
-        triton.Config({"BLOCK_D": 128}, num_warps=2, num_stages=2),
-        triton.Config({"BLOCK_D": 128}, num_warps=4, num_stages=3),
-        triton.Config({"BLOCK_D": 256}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_D": 256}, num_warps=4, num_stages=3),
-        triton.Config({"BLOCK_D": 256}, num_warps=8, num_stages=2),
-        triton.Config({"BLOCK_D": 256}, num_warps=8, num_stages=4),
-        triton.Config({"BLOCK_D": 512}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_D": 512}, num_warps=4, num_stages=3),
-        triton.Config({"BLOCK_D": 512}, num_warps=4, num_stages=4),
-        triton.Config({"BLOCK_D": 512}, num_warps=8, num_stages=2),
-        triton.Config({"BLOCK_D": 512}, num_warps=8, num_stages=3),
-        triton.Config({"BLOCK_D": 512}, num_warps=8, num_stages=4),
-        triton.Config({"BLOCK_D": 512}, num_warps=16, num_stages=2),
-        triton.Config({"BLOCK_D": 1024}, num_warps=8, num_stages=2),
-        triton.Config({"BLOCK_D": 1024}, num_warps=8, num_stages=3),
-        triton.Config({"BLOCK_D": 1024}, num_warps=16, num_stages=2),
-        triton.Config({"BLOCK_D": 2048}, num_warps=4, num_stages=1),
-        triton.Config({"BLOCK_D": 2048}, num_warps=8, num_stages=1),
-        triton.Config({"BLOCK_D": 2048}, num_warps=16, num_stages=1),
-        triton.Config({"BLOCK_D": 2048}, num_warps=32, num_stages=1),
-    ]
-    _GROUPED_POLYNORM_BWD_CONFIGS = [
-        # Low-warp configs for high SM occupancy (latency hiding)
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 1}, num_warps=2, num_stages=1),
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 1}, num_warps=4, num_stages=1),
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 2}, num_warps=2, num_stages=1),
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 2}, num_warps=4, num_stages=1),
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 4}, num_warps=2, num_stages=1),
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 4}, num_warps=4, num_stages=1),
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 8}, num_warps=2, num_stages=1),
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 8}, num_warps=4, num_stages=1),
-        # Medium-warp configs
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 1}, num_warps=8, num_stages=1),
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 2}, num_warps=8, num_stages=1),
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 4}, num_warps=8, num_stages=1),
-        triton.Config({"BLOCK_D": 2048, "BLOCK_N": 8}, num_warps=8, num_stages=1),
-        # Multi-tile configs (BLOCK_D=1024 for D=1280 -> 2 tiles, no mask waste)
-        triton.Config({"BLOCK_D": 1024, "BLOCK_N": 1}, num_warps=2, num_stages=2),
-        triton.Config({"BLOCK_D": 1024, "BLOCK_N": 1}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_D": 1024, "BLOCK_N": 2}, num_warps=2, num_stages=2),
-        triton.Config({"BLOCK_D": 1024, "BLOCK_N": 2}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_D": 1024, "BLOCK_N": 4}, num_warps=4, num_stages=2),
-        triton.Config({"BLOCK_D": 1024, "BLOCK_N": 8}, num_warps=4, num_stages=2),
-    ]
-    @triton.autotune(
-        configs=_GROUPED_POLYNORM_FWD_CONFIGS,
-        key=["D"],
-    )
-    @triton.jit
-    def _grouped_polynorm_fwd_kernel(
-        input_ptr,
-        mul_ptr,
-        weight_ptr,
-        bias_ptr,
-        offsets_ptr,
-        output_ptr,
-        inv_rms_ptr,
-        N,
-        D,
-        num_experts,
-        eps,
-        expert_offset,
-        stride_input_row,
-        stride_mul_row,
-        stride_out_row,
-        BLOCK_D: tl.constexpr,
-    ):
-        """Forward kernel: one program per row. Saves inv_rms for backward."""
-        row = tl.program_id(0)
-        if row >= N:
-            return
-        # Binary search for expert index (12 iters covers up to 4096 experts)
-        lo = 0
-        hi = num_experts
-        for _ in range(12):
-            if lo < hi:
-                mid = (lo + hi) // 2
-                if tl.load(offsets_ptr + mid) <= row:
-                    lo = mid + 1
-                else:
-                    hi = mid
-        eidx = lo + expert_offset
-        w0 = tl.load(weight_ptr + eidx * 3 + 0).to(tl.float32)
-        w1 = tl.load(weight_ptr + eidx * 3 + 1).to(tl.float32)
-        w2 = tl.load(weight_ptr + eidx * 3 + 2).to(tl.float32)
-        b = tl.load(bias_ptr + eidx).to(tl.float32)
-        input_row_ptr = input_ptr + row * stride_input_row
-        mul_row_ptr = mul_ptr + row * stride_mul_row
-        out_row_ptr = output_ptr + row * stride_out_row
-        D_float = D.to(tl.float32)
-        # --- Single-tile path ---
-        if D <= BLOCK_D:
-            d_offs = tl.arange(0, BLOCK_D)
-            mask = d_offs < D
-            x = tl.load(input_row_ptr + d_offs, mask=mask,
-                         other=0.0).to(tl.float32)
-            m = tl.load(mul_row_ptr + d_offs, mask=mask,
-                         other=0.0).to(tl.float32)
-            x2 = x * x
-            x3 = x2 * x
-            inv_rms_x = 1.0 / tl.sqrt(tl.sum(x2) / D_float + eps)
-            inv_rms_x2 = 1.0 / tl.sqrt(tl.sum(x2 * x2) / D_float + eps)
-            inv_rms_x3 = 1.0 / tl.sqrt(tl.sum(x3 * x3) / D_float + eps)
-            # Save inv_rms for backward
-            tl.store(inv_rms_ptr + row * 3 + 0, inv_rms_x)
-            tl.store(inv_rms_ptr + row * 3 + 1, inv_rms_x2)
-            tl.store(inv_rms_ptr + row * 3 + 2, inv_rms_x3)
-            # Pre-multiply scalar weight * inv_rms to save 1 FMA per element
-            w0_inv = w0 * inv_rms_x3
-            w1_inv = w1 * inv_rms_x2
-            w2_inv = w2 * inv_rms_x
-            poly = x3 * w0_inv + x2 * w1_inv + x * w2_inv + b
-            tl.store(out_row_ptr + d_offs, poly * m, mask=mask)
-        else:
-            # --- Multi-tile: two-pass approach ---
-            sum_x2 = tl.zeros((), dtype=tl.float32)
-            sum_x4 = tl.zeros((), dtype=tl.float32)
-            sum_x6 = tl.zeros((), dtype=tl.float32)
-            for d_start in range(0, D, BLOCK_D):
-                d_offs = d_start + tl.arange(0, BLOCK_D)
-                mask = d_offs < D
-                x = tl.load(input_row_ptr + d_offs, mask=mask,
-                             other=0.0).to(tl.float32)
-                x2 = x * x
-                x3 = x2 * x
-                sum_x2 += tl.sum(x2)
-                sum_x4 += tl.sum(x2 * x2)
-                sum_x6 += tl.sum(x3 * x3)
-            inv_rms_x = 1.0 / tl.sqrt(sum_x2 / D_float + eps)
-            inv_rms_x2 = 1.0 / tl.sqrt(sum_x4 / D_float + eps)
-            inv_rms_x3 = 1.0 / tl.sqrt(sum_x6 / D_float + eps)
-            # Save inv_rms for backward
-            tl.store(inv_rms_ptr + row * 3 + 0, inv_rms_x)
-            tl.store(inv_rms_ptr + row * 3 + 1, inv_rms_x2)
-            tl.store(inv_rms_ptr + row * 3 + 2, inv_rms_x3)
-            # Pre-multiply scalar weight * inv_rms
-            w0_inv = w0 * inv_rms_x3
-            w1_inv = w1 * inv_rms_x2
-            w2_inv = w2 * inv_rms_x
-            for d_start in range(0, D, BLOCK_D):
-                d_offs = d_start + tl.arange(0, BLOCK_D)
-                mask = d_offs < D
-                x = tl.load(input_row_ptr + d_offs, mask=mask,
-                             other=0.0).to(tl.float32)
-                m = tl.load(mul_row_ptr + d_offs, mask=mask,
-                             other=0.0).to(tl.float32)
-                x2 = x * x
-                x3 = x2 * x
-                poly = x3 * w0_inv + x2 * w1_inv + x * w2_inv + b
-                tl.store(out_row_ptr + d_offs, poly * m, mask=mask)
-    @triton.jit
-    def _grouped_polynorm_bwd_kernel(
-        grad_out_ptr,
-        input_ptr,
-        mul_ptr,
-        weight_ptr,
-        bias_ptr,
-        offsets_ptr,
-        inv_rms_ptr,
-        grad_input_ptr,
-        grad_mul_ptr,
-        grad_weight_ptr,
-        grad_bias_ptr,
-        N,
-        D,
-        num_experts,
-        eps,
-        expert_offset,
-        stride_row,
-        BLOCK_D: tl.constexpr,
-        BLOCK_N: tl.constexpr,
-    ):
-        """Backward kernel: BLOCK_N rows per program. Loads saved inv_rms.
-        Each program processes BLOCK_N consecutive rows. Since MoE tokens
-        are sorted by expert, consecutive rows often share the same expert,
-        allowing weight/bias load reuse and amortized binary search.
-        """
-        pid = tl.program_id(0)
-        row_start = pid * BLOCK_N
-        D_float = D.to(tl.float32)
-        d_offs = tl.arange(0, BLOCK_D)
-        d_mask = d_offs < D
-        for row_off in tl.static_range(BLOCK_N):
-            row = row_start + row_off
-            if row < N:
-                # Binary search for expert index
-                lo = 0
-                hi = num_experts
-                for _ in range(12):
-                    if lo < hi:
-                        mid = (lo + hi) // 2
-                        if tl.load(offsets_ptr + mid) <= row:
-                            lo = mid + 1
-                        else:
-                            hi = mid
-                eidx = lo + expert_offset
-                w0 = tl.load(weight_ptr + eidx * 3 + 0).to(tl.float32)
-                w1 = tl.load(weight_ptr + eidx * 3 + 1).to(tl.float32)
-                w2 = tl.load(weight_ptr + eidx * 3 + 2).to(tl.float32)
-                b_val = tl.load(bias_ptr + eidx).to(tl.float32)
-                input_row_ptr = input_ptr + row * stride_row
-                mul_row_ptr = mul_ptr + row * stride_row
-                grad_out_row_ptr = grad_out_ptr + row * stride_row
-                grad_input_row_ptr = grad_input_ptr + row * stride_row
-                grad_mul_row_ptr = grad_mul_ptr + row * stride_row
-                # --- Single-tile path ---
-                if D <= BLOCK_D:
-                    x = tl.load(input_row_ptr + d_offs, mask=d_mask,
-                                 other=0.0).to(tl.float32)
-                    m = tl.load(mul_row_ptr + d_offs, mask=d_mask,
-                                 other=0.0).to(tl.float32)
-                    go = tl.load(grad_out_row_ptr + d_offs, mask=d_mask,
-                                  other=0.0).to(tl.float32)
-                    x2 = x * x
-                    x3 = x2 * x
-                    # Load saved inv_rms from forward
-                    inv_rms_x = tl.load(inv_rms_ptr + row * 3 + 0)
-                    inv_rms_x2 = tl.load(inv_rms_ptr + row * 3 + 1)
-                    inv_rms_x3 = tl.load(inv_rms_ptr + row * 3 + 2)
-                    w0_inv = w0 * inv_rms_x3
-                    w1_inv = w1 * inv_rms_x2
-                    w2_inv = w2 * inv_rms_x
-                    dpoly = go * m
-                    sum_dpoly_x = tl.sum(dpoly * x)
-                    sum_dpoly_x2 = tl.sum(dpoly * x2)
-                    sum_dpoly_x3 = tl.sum(dpoly * x3)
-                    grad_b_acc = tl.sum(dpoly)
-                    grad_w0_acc = inv_rms_x3 * sum_dpoly_x3
-                    grad_w1_acc = inv_rms_x2 * sum_dpoly_x2
-                    grad_w2_acc = inv_rms_x * sum_dpoly_x
-                    coeff_x = w2 * sum_dpoly_x * inv_rms_x * inv_rms_x / D_float
-                    coeff_x2 = w1 * sum_dpoly_x2 * inv_rms_x2 * inv_rms_x2 / D_float
-                    coeff_x3 = w0 * sum_dpoly_x3 * inv_rms_x3 * inv_rms_x3 / D_float
-                    # grad_mul
-                    poly = x3 * w0_inv + x2 * w1_inv + x * w2_inv
-                    tl.store(grad_mul_row_ptr + d_offs, go * (poly + b_val),
-                             mask=d_mask)
-                    # grad_input
-                    g = inv_rms_x * (w2 * dpoly - x * coeff_x)
-                    g += 2.0 * x * inv_rms_x2 * (w1 * dpoly - x2 * coeff_x2)
-                    g += 3.0 * x2 * inv_rms_x3 * (w0 * dpoly - x3 * coeff_x3)
-                    tl.store(grad_input_row_ptr + d_offs, g, mask=d_mask)
-                    tl.atomic_add(grad_weight_ptr + eidx * 3 + 0, grad_w0_acc)
-                    tl.atomic_add(grad_weight_ptr + eidx * 3 + 1, grad_w1_acc)
-                    tl.atomic_add(grad_weight_ptr + eidx * 3 + 2, grad_w2_acc)
-                    tl.atomic_add(grad_bias_ptr + eidx, grad_b_acc)
-                else:
-                    # --- Multi-tile: dot products pass ---
-                    # Load saved inv_rms from forward
-                    inv_rms_x = tl.load(inv_rms_ptr + row * 3 + 0)
-                    inv_rms_x2 = tl.load(inv_rms_ptr + row * 3 + 1)
-                    inv_rms_x3 = tl.load(inv_rms_ptr + row * 3 + 2)
-                    sum_dpoly_x = tl.zeros((), dtype=tl.float32)
-                    sum_dpoly_x2 = tl.zeros((), dtype=tl.float32)
-                    sum_dpoly_x3 = tl.zeros((), dtype=tl.float32)
-                    grad_b_acc = tl.zeros((), dtype=tl.float32)
-                    for d_start in range(0, D, BLOCK_D):
-                        tile_offs = d_start + d_offs
-                        tile_mask = tile_offs < D
-                        x = tl.load(input_row_ptr + tile_offs, mask=tile_mask,
-                                     other=0.0).to(tl.float32)
-                        m = tl.load(mul_row_ptr + tile_offs, mask=tile_mask,
-                                     other=0.0).to(tl.float32)
-                        go = tl.load(grad_out_row_ptr + tile_offs,
-                                     mask=tile_mask, other=0.0).to(tl.float32)
-                        x2 = x * x
-                        x3 = x2 * x
-                        dpoly = go * m
-                        sum_dpoly_x += tl.sum(dpoly * x)
-                        sum_dpoly_x2 += tl.sum(dpoly * x2)
-                        sum_dpoly_x3 += tl.sum(dpoly * x3)
-                        grad_b_acc += tl.sum(dpoly)
-                    w0_inv = w0 * inv_rms_x3
-                    w1_inv = w1 * inv_rms_x2
-                    w2_inv = w2 * inv_rms_x
-                    grad_w0_acc = inv_rms_x3 * sum_dpoly_x3
-                    grad_w1_acc = inv_rms_x2 * sum_dpoly_x2
-                    grad_w2_acc = inv_rms_x * sum_dpoly_x
-                    coeff_x = w2 * sum_dpoly_x * inv_rms_x * inv_rms_x / D_float
-                    coeff_x2 = w1 * sum_dpoly_x2 * inv_rms_x2 * inv_rms_x2 / D_float
-                    coeff_x3 = w0 * sum_dpoly_x3 * inv_rms_x3 * inv_rms_x3 / D_float
-                    for d_start in range(0, D, BLOCK_D):
-                        tile_offs = d_start + d_offs
-                        tile_mask = tile_offs < D
-                        x = tl.load(input_row_ptr + tile_offs, mask=tile_mask,
-                                     other=0.0).to(tl.float32)
-                        m = tl.load(mul_row_ptr + tile_offs, mask=tile_mask,
-                                     other=0.0).to(tl.float32)
-                        go = tl.load(grad_out_row_ptr + tile_offs,
-                                     mask=tile_mask, other=0.0).to(tl.float32)
-                        x2 = x * x
-                        x3 = x2 * x
-                        poly = x3 * w0_inv + x2 * w1_inv + x * w2_inv
-                        tl.store(grad_mul_row_ptr + tile_offs,
-                                 go * (poly + b_val), mask=tile_mask)
-                        dpoly = go * m
-                        g = inv_rms_x * (w2 * dpoly - x * coeff_x)
-                        g += (2.0 * x * inv_rms_x2 *
-                              (w1 * dpoly - x2 * coeff_x2))
-                        g += (3.0 * x2 * inv_rms_x3 *
-                              (w0 * dpoly - x3 * coeff_x3))
-                        tl.store(grad_input_row_ptr + tile_offs, g,
-                                 mask=tile_mask)
-                    tl.atomic_add(grad_weight_ptr + eidx * 3 + 0, grad_w0_acc)
-                    tl.atomic_add(grad_weight_ptr + eidx * 3 + 1, grad_w1_acc)
-                    tl.atomic_add(grad_weight_ptr + eidx * 3 + 2, grad_w2_acc)
-                    tl.atomic_add(grad_bias_ptr + eidx, grad_b_acc)
     class _GroupedPolyNormFn(torch.autograd.Function):
         """Without scores — follows poly_norm.py pattern."""
         @staticmethod
-        def forward(input, mul, weight, bias, offsets, eps, expert_offset):
             input = input.contiguous()
             mul = mul.contiguous()
             output, inv_rms = _ops.grouped_poly_norm_forward(
-                input, mul, weight, bias, offsets, eps, expert_offset)
             return output, inv_rms
         @staticmethod
         def setup_context(ctx, inputs, output):
-            input, mul, weight, bias, offsets, eps, expert_offset = inputs
             _, inv_rms = output
             ctx.save_for_backward(input, mul, weight, bias, offsets, inv_rms)
             ctx.eps = eps
             ctx.expert_offset = expert_offset
         @staticmethod
         def backward(ctx, grad_output, _grad_inv_rms):
@@ -548,8 +175,8 @@ if HAS_TRITON:
             grad_output = grad_output.contiguous()
             gi, gm, gw, gb = _ops.grouped_poly_norm_backward(
                 grad_output, input, mul, weight, bias, offsets, inv_rms,
-                ctx.eps, ctx.expert_offset)
-            return gi, gm, gw, gb, None, None, None
     class _GroupedPolyNormScoredFn(torch.autograd.Function):
         """With scores — same pattern, adds scores + hidden_clamp."""
@@ -622,7 +249,8 @@ if HAS_TRITON:
                 expert_offset, clamp_val)
         else:
             output, _ = _GroupedPolyNormFn.apply(
-                input, mul, weight, bias, offsets, eps, expert_offset)
         return output
 else:
@@ -639,5 +267,5 @@ else:
         hidden_clamp: float | None = None,
     ) -> Tensor:
         raise RuntimeError(
-            "Triton is not available. Install triton to use "
-            "fused_mul_grouped_poly_norm.")

+"""Grouped FusedMulPolyNorm for MoE — CUDA kernel with autograd wrappers.
+Fuses the entire PolyNorm computation into CUDA kernels (fwd + bwd),
+with optional scores multiplication and hidden_clamp fusion.
 PolyNorm formula (per row):
     poly = w[0] * rms_norm(x^3) + w[1] * rms_norm(x^2) + w[2] * rms_norm(x) + bias
+    output = poly * mul * score
+    output = clamp(output, -hidden_clamp, hidden_clamp)  (if enabled)
 where rms_norm(x) = x / sqrt(mean(x^2, dim=-1) + eps)
+CUDA kernel (activation/grouped_poly_norm.cu):
+  - Vectorized loads (width=8 for bf16/fp16, width=4 for fp32)
+  - In-kernel binary search for expert mapping
+  - 2-pass forward (RMS stats + output), 2-pass backward (dot products + grads)
+  - Scores and hidden_clamp fused in-kernel (no extra kernel launches)
+  - Weight/bias gradients via atomicAdd
 """
 import torch
 from torch import Tensor
 # Try to load CUDA ops at module level
 _ops = None
 try:
 if _has_cuda_ops:
     try:
         @torch.library.register_fake("_activation::grouped_poly_norm_forward")
+        def _fwd_fake(input, mul, weight, bias, offsets, eps, expert_offset,
+                       hidden_clamp):
             return (torch.empty_like(input),
                     torch.empty(input.shape[0], 3, dtype=torch.float32,
                                 device=input.device))
         @torch.library.register_fake("_activation::grouped_poly_norm_backward")
         def _bwd_fake(grad_output, input, mul, weight, bias, offsets, inv_rms,
+                       eps, expert_offset, hidden_clamp):
             return (torch.empty_like(input),
                     torch.empty_like(mul),
                     torch.empty_like(weight),
 # ---------------------------------------------------------------------------
+# CUDA kernel autograd functions
 # ---------------------------------------------------------------------------
+if _has_cuda_ops:
     class _GroupedPolyNormFn(torch.autograd.Function):
         """Without scores — follows poly_norm.py pattern."""
         @staticmethod
+        def forward(input, mul, weight, bias, offsets, eps, expert_offset,
+                    hidden_clamp):
             input = input.contiguous()
             mul = mul.contiguous()
             output, inv_rms = _ops.grouped_poly_norm_forward(
+                input, mul, weight, bias, offsets, eps, expert_offset,
+                hidden_clamp)
             return output, inv_rms
         @staticmethod
         def setup_context(ctx, inputs, output):
+            (input, mul, weight, bias, offsets, eps, expert_offset,
+             hidden_clamp) = inputs
             _, inv_rms = output
             ctx.save_for_backward(input, mul, weight, bias, offsets, inv_rms)
             ctx.eps = eps
             ctx.expert_offset = expert_offset
+            ctx.hidden_clamp = hidden_clamp
         @staticmethod
         def backward(ctx, grad_output, _grad_inv_rms):
             grad_output = grad_output.contiguous()
             gi, gm, gw, gb = _ops.grouped_poly_norm_backward(
                 grad_output, input, mul, weight, bias, offsets, inv_rms,
+                ctx.eps, ctx.expert_offset, ctx.hidden_clamp)
+            return gi, gm, gw, gb, None, None, None, None
     class _GroupedPolyNormScoredFn(torch.autograd.Function):
         """With scores — same pattern, adds scores + hidden_clamp."""
                 expert_offset, clamp_val)
         else:
             output, _ = _GroupedPolyNormFn.apply(
+                input, mul, weight, bias, offsets, eps, expert_offset,
+                clamp_val)
         return output
 else:
         hidden_clamp: float | None = None,
     ) -> Tensor:
         raise RuntimeError(
+            "CUDA ops not available. Build with setup.py or kernel-builder "
+            "to use fused_mul_grouped_poly_norm.")

torch-ext/torch_binding.cpp CHANGED Viewed

@@ -49,18 +49,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("fused_add_rms_norm_backward", torch::kCUDA,
            &fused_add_rms_norm_backward);
-  // grouped_poly_norm (without scores)
   ops.def("grouped_poly_norm_forward("
           "Tensor input, Tensor mul, Tensor weight, "
           "Tensor bias, Tensor offsets, "
-          "float eps, int expert_offset) -> (Tensor, Tensor)");
   ops.impl("grouped_poly_norm_forward", torch::kCUDA,
            &grouped_poly_norm_forward);
   ops.def("grouped_poly_norm_backward("
           "Tensor grad_output, Tensor input, Tensor mul, Tensor weight, "
           "Tensor bias, Tensor offsets, Tensor inv_rms, "
-          "float eps, int expert_offset) -> (Tensor, Tensor, Tensor, Tensor)");
   ops.impl("grouped_poly_norm_backward", torch::kCUDA,
            &grouped_poly_norm_backward);

   ops.impl("fused_add_rms_norm_backward", torch::kCUDA,
            &fused_add_rms_norm_backward);
+  // grouped_poly_norm (without scores, hidden_clamp < 0 = disabled)
   // Schema types map onto the C++ declarations in torch_binding.h:
   // `float` -> double, `int` -> int64_t.
   // Forward returns two tensors; the Python caller unpacks them as
   // (output, inv_rms).
   ops.def("grouped_poly_norm_forward("
           "Tensor input, Tensor mul, Tensor weight, "
           "Tensor bias, Tensor offsets, "
+          "float eps, int expert_offset, float hidden_clamp) -> (Tensor, Tensor)");
   // Only a CUDA implementation is registered in the lines visible here.
   ops.impl("grouped_poly_norm_forward", torch::kCUDA,
            &grouped_poly_norm_forward);
   // Backward takes grad_output, the forward tensor inputs and inv_rms, plus the
   // same eps / expert_offset / hidden_clamp scalars, and returns four tensors.
   ops.def("grouped_poly_norm_backward("
           "Tensor grad_output, Tensor input, Tensor mul, Tensor weight, "
           "Tensor bias, Tensor offsets, Tensor inv_rms, "
+          "float eps, int expert_offset, float hidden_clamp) -> (Tensor, Tensor, Tensor, Tensor)");
   ops.impl("grouped_poly_norm_backward", torch::kCUDA,
            &grouped_poly_norm_backward);

torch-ext/torch_binding.h CHANGED Viewed

@@ -36,19 +36,21 @@ std::tuple<torch::Tensor, torch::Tensor> fused_add_rms_norm_backward(
     const torch::Tensor &input, const torch::Tensor &weight, double eps,
     bool need_input_grad);
-// Without scores
 std::tuple<torch::Tensor, torch::Tensor>
 grouped_poly_norm_forward(
     const torch::Tensor &input, const torch::Tensor &mul,
     const torch::Tensor &weight, const torch::Tensor &bias,
-    const torch::Tensor &offsets, double eps, int64_t expert_offset);
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
 grouped_poly_norm_backward(
     const torch::Tensor &grad_output, const torch::Tensor &input,
     const torch::Tensor &mul, const torch::Tensor &weight,
     const torch::Tensor &bias, const torch::Tensor &offsets,
-    const torch::Tensor &inv_rms, double eps, int64_t expert_offset);
 // With scores (hidden_clamp < 0 = disabled)
 std::tuple<torch::Tensor, torch::Tensor>

     const torch::Tensor &input, const torch::Tensor &weight, double eps,
     bool need_input_grad);
+// Without scores (hidden_clamp < 0 = disabled)
 // Forward pass of the unscored grouped poly-norm op. Returns two tensors; the
 // Python wrapper unpacks them as (output, inv_rms).
 // NOTE(review): tensor shapes and dtypes are not visible from this header —
 // see the definition in grouped_poly_norm.cu.
 std::tuple<torch::Tensor, torch::Tensor>
 grouped_poly_norm_forward(
     const torch::Tensor &input, const torch::Tensor &mul,
     const torch::Tensor &weight, const torch::Tensor &bias,
+    const torch::Tensor &offsets, double eps, int64_t expert_offset,
+    double hidden_clamp);
 // Backward pass of the unscored grouped poly-norm op. Takes grad_output, the
 // forward tensor inputs and the inv_rms produced by the forward pass, and
 // returns four tensors — per the definition in grouped_poly_norm.cu these are
 // {ig, mg, wg, bg}, which the Python autograd wrapper returns as the gradients
 // for input, mul, weight and bias.
 // hidden_clamp presumably follows the forward convention (< 0 = disabled) —
 // confirm against the .cu definition.
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
 grouped_poly_norm_backward(
     const torch::Tensor &grad_output, const torch::Tensor &input,
     const torch::Tensor &mul, const torch::Tensor &weight,
     const torch::Tensor &bias, const torch::Tensor &offsets,
+    const torch::Tensor &inv_rms, double eps, int64_t expert_offset,
+    double hidden_clamp);
 // With scores (hidden_clamp < 0 = disabled)
 std::tuple<torch::Tensor, torch::Tensor>