fix: rename stale references and clean up Triton remnants

- Fix broken imports in benchmarks (grouped_fused_mul_poly_norm → fused_mul_grouped_poly_norm)
- Rename GroupedTritonModule → GroupedCUDAModule in benchmarks
- Rename _run_triton → _run_cuda in tests, update docstrings
- Remove stale Triton comments in tests
- Fix return type annotations in __init__.py (None → torch.Tensor)
- Update README with new function name
- Remove TRITON_PRINT_AUTOTUNING env var from profile_bwd.py

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (5)

README.md +1 -1
benchmarks/cases/grouped_mul_poly.py +21 -8
benchmarks/profile_bwd.py +146 -0
tests/test_fused_mul_grouped_poly_norm.py +9 -11
torch-ext/activation/__init__.py +4 -4

README.md CHANGED Viewed

@@ -58,7 +58,7 @@ Activation is a python package that contains custom CUDA-based activation kernel
     - Fused as:
       ```python
-      out = grouped_fused_mul_poly_norm(x, mul, weight, bias, offsets, eps)
       ```
 ## Usage

     - Fused as:
       ```python
+      out = fused_mul_grouped_poly_norm(x, mul, weight, bias, offsets, eps)
       ```
 ## Usage

benchmarks/cases/grouped_mul_poly.py CHANGED Viewed

@@ -1,15 +1,19 @@
 import torch
 import torch._functorch.config
 from common.diff_engine import DiffCase
 torch._functorch.config.donated_buffer = False
 from grouped_poly_norm import (
-    grouped_fused_mul_poly_norm,
-    grouped_fused_mul_poly_norm_ref,
 )
-NUM_EXPERTS = 384
 class GroupedRefModule(torch.nn.Module):
@@ -24,13 +28,13 @@ class GroupedRefModule(torch.nn.Module):
         self.expert_offset = expert_offset
     def forward(self, x, mul):
-        return grouped_fused_mul_poly_norm_ref(x, mul, self.weight, self.bias,
                                                self.offsets, self.eps,
                                                expert_offset=self.expert_offset)
-class GroupedTritonModule(torch.nn.Module):
-    """Wraps the Triton kernel for grouped FusedMulPolyNorm."""
     def __init__(self, weight, bias, offsets, eps, expert_offset=0):
         super().__init__()
@@ -41,7 +45,7 @@ class GroupedTritonModule(torch.nn.Module):
         self.expert_offset = expert_offset
     def forward(self, x, mul):
-        return grouped_fused_mul_poly_norm(x, mul, self.weight, self.bias,
                                            self.offsets, self.eps,
                                            expert_offset=self.expert_offset)
@@ -105,13 +109,22 @@ class GroupedMulPoly(DiffCase):
         return torch.compile(m)
     def make_cuda(self, I):
-        return GroupedTritonModule(
             I["weight"].detach().clone(),
             I["bias"].detach().clone(),
             I["offsets"],
             I["eps"],
         )
     def forward(self, obj, I):
         return obj(I["x"], I["mul"])

 import torch
 import torch._functorch.config
+import torch._inductor
 from common.diff_engine import DiffCase
 torch._functorch.config.donated_buffer = False
 from grouped_poly_norm import (
+    fused_mul_grouped_poly_norm,
+    fused_mul_grouped_poly_norm_ref,
 )
+# 384 / 8 (EP) = 48 experts per rank
+# total_tokens = bs * sl, which equals per-rank tokens
+# since top_k=8 and EP=8, each rank sees all tokens once
+NUM_EXPERTS = 48
 class GroupedRefModule(torch.nn.Module):
         self.expert_offset = expert_offset
     def forward(self, x, mul):
+        return fused_mul_grouped_poly_norm_ref(x, mul, self.weight, self.bias,
                                                self.offsets, self.eps,
                                                expert_offset=self.expert_offset)
+class GroupedCUDAModule(torch.nn.Module):
+    """Wraps the CUDA kernel for grouped FusedMulPolyNorm."""
     def __init__(self, weight, bias, offsets, eps, expert_offset=0):
         super().__init__()
         self.expert_offset = expert_offset
     def forward(self, x, mul):
+        return fused_mul_grouped_poly_norm(x, mul, self.weight, self.bias,
                                            self.offsets, self.eps,
                                            expert_offset=self.expert_offset)
         return torch.compile(m)
     def make_cuda(self, I):
+        return GroupedCUDAModule(
             I["weight"].detach().clone(),
             I["bias"].detach().clone(),
             I["offsets"],
             I["eps"],
         )
+    def make_compiled_cuda(self, I):
+        m = GroupedCUDAModule(
+            I["weight"].detach().clone(),
+            I["bias"].detach().clone(),
+            I["offsets"],
+            I["eps"],
+        )
+        return torch.compile(m)
     def forward(self, obj, I):
         return obj(I["x"], I["mul"])

benchmarks/profile_bwd.py ADDED Viewed

	@@ -0,0 +1,146 @@

+"""Profiling script for grouped polynorm backward kernel using torch.profiler."""
+import argparse
+import torch
+import torch.cuda
+from torch.profiler import profile, ProfilerActivity
+from grouped_poly_norm import fused_mul_grouped_poly_norm
+torch.set_default_device("cuda")
+def make_inputs(N, D, num_experts):
+    torch.manual_seed(42)
+    probs = torch.ones(num_experts) / num_experts
+    assignments = torch.multinomial(probs, N, replacement=True)
+    counts = torch.bincount(assignments, minlength=num_experts).tolist()
+    offsets = torch.cumsum(
+        torch.tensor(counts, dtype=torch.int32), dim=0)
+    x = torch.randn(N, D, dtype=torch.bfloat16, requires_grad=True) * 0.5
+    m = torch.randn(N, D, dtype=torch.bfloat16, requires_grad=True) * 0.5
+    w = (torch.ones(num_experts, 3, dtype=torch.bfloat16) / 3
+         ).requires_grad_(True)
+    b = (torch.randn(num_experts, 1, dtype=torch.bfloat16) * 0.01
+         ).requires_grad_(True)
+    return x, m, w, b, offsets
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--tokens", type=int, default=4096)
+    parser.add_argument("--dim", type=int, default=1280)
+    parser.add_argument("--experts", type=int, default=48)
+    parser.add_argument("--output", type=str, default="/tmp/profile")
+    args = parser.parse_args()
+    N, D, num_experts = args.tokens, args.dim, args.experts
+    # Warmup (fresh inputs each time to avoid graph reuse issues)
+    for _ in range(3):
+        x, m, w, b, offsets = make_inputs(N, D, num_experts)
+        out = fused_mul_grouped_poly_norm(x, m, w, b, offsets)
+        out.sum().backward()
+    torch.cuda.synchronize()
+    # Profiled: mimic do_bench — forward once, backward multiple times with retain_graph
+    x, m, w, b, offsets = make_inputs(N, D, num_experts)
+    out = fused_mul_grouped_poly_norm(x, m, w, b, offsets)
+    gin = [x, m] + [w, b]
+    g = [torch.randn_like(out)]
+    # Warmup backward
+    for _ in range(5):
+        torch.autograd.grad(out, gin, g, retain_graph=True, allow_unused=True)
+    torch.cuda.synchronize()
+    with profile(
+        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+        record_shapes=True,
+        with_stack=True,
+    ) as prof:
+        for _ in range(100):
+            torch.autograd.grad(out, gin, g, retain_graph=True, allow_unused=True)
+        torch.cuda.synchronize()
+    # Print kernel-level stats
+    print(f"\n=== Kernel Table (N={N}, D={D}) ===")
+    print(prof.key_averages().table(
+        sort_by="cuda_time_total", row_limit=20))
+    # Export chrome trace
+    trace_path = f"{args.output}_trace_N{N}.json"
+    prof.export_chrome_trace(trace_path)
+    print(f"\nTrace exported to {trace_path}")
+    # === Occupancy analysis from Triton kernel metadata ===
+    print(f"\n=== Occupancy Analysis ===")
+    props = torch.cuda.get_device_properties(0)
+    print(f"GPU: {props.name}")
+    print(f"SMs: {props.multi_processor_count}")
+    print(f"Max threads/SM: {props.max_threads_per_multi_processor}")
+    print(f"Regs/SM: {props.regs_per_multiprocessor}")
+    print(f"Shared mem/block: {props.shared_memory_per_block} bytes")
+    # Get register info from Triton compiled cubins
+    try:
+        import glob
+        import json
+        import subprocess
+        cache_dir = os.path.expanduser("~/.triton/cache")
+        # Find metadata JSON files
+        json_files = sorted(glob.glob(f"{cache_dir}/**/*.json", recursive=True),
+                            key=os.path.getmtime, reverse=True)
+        print(f"\nFound {len(json_files)} compiled kernel metadata files")
+        for jf in json_files[:10]:
+            try:
+                with open(jf) as f:
+                    meta = json.load(f)
+                if isinstance(meta, dict):
+                    n_regs = meta.get('num_regs', meta.get('n_regs', None))
+                    n_spills = meta.get('num_spills', meta.get('n_spills', None))
+                    name = meta.get('name', os.path.basename(jf))
+                    shared = meta.get('shared', None)
+                    if n_regs is not None:
+                        print(f"  {name}: regs={n_regs}, spills={n_spills}, shared={shared}")
+            except Exception:
+                pass
+        # Also try cuobjdump on recent cubins
+        cubin_files = sorted(glob.glob(f"{cache_dir}/**/*.cubin", recursive=True),
+                             key=os.path.getmtime, reverse=True)
+        print(f"\nFound {len(cubin_files)} cubins, inspecting latest:")
+        for cb in cubin_files[:5]:
+            try:
+                result = subprocess.run(
+                    ["cuobjdump", "-res-usage", cb],
+                    capture_output=True, text=True, timeout=5)
+                if result.returncode == 0 and result.stdout.strip():
+                    print(f"\n  {os.path.basename(cb)}:")
+                    for line in result.stdout.strip().split('\n'):
+                        print(f"    {line}")
+            except Exception as e:
+                print(f"  cuobjdump failed: {e}")
+                break
+    except Exception as e:
+        print(f"Cache inspection error: {e}")
+    # Calculate theoretical occupancy for different register counts
+    print("\n=== Theoretical Occupancy (num_warps=4, 128 threads/block) ===")
+    threads_per_block = 128
+    max_threads = props.max_threads_per_multi_processor
+    total_regs = props.regs_per_multiprocessor
+    for n_regs in [64, 96, 128, 160, 192, 224, 256]:
+        regs_per_block = n_regs * threads_per_block
+        max_blocks_by_regs = total_regs // regs_per_block
+        max_blocks_by_threads = max_threads // threads_per_block
+        blocks = min(max_blocks_by_regs, max_blocks_by_threads, 32)
+        active_threads = blocks * threads_per_block
+        occupancy = active_threads / max_threads * 100
+        print(f"  {n_regs:3d} regs/thread -> {blocks:2d} blocks/SM -> "
+              f"{active_threads:4d} threads -> {occupancy:.1f}% occupancy")
+if __name__ == "__main__":
+    main()

tests/test_fused_mul_grouped_poly_norm.py CHANGED Viewed

@@ -17,8 +17,6 @@ D = [256, 1280]
 NUM_EXPERTS_LIST = [8, 384]
 EXPERT_OFFSETS = [0, 4]
 SEEDS = [0]
-# Triton kernels launch on the current CUDA device and do not
-# auto-dispatch to the tensor's device like CUDA extensions.
 # Only test on cuda:0 to avoid cross-device issues.
 CUDA_DEVICES = ["cuda:0"]
@@ -77,9 +75,9 @@ def _run_ref(input_t, mul_t, weight, bias, offsets, expert_offset=0,
     return grads + (s.grad,) if s is not None else grads + (None,)
-def _run_triton(input_t, mul_t, weight, bias, offsets, expert_offset=0,
                 scores=None, hidden_clamp=None):
-    """Run Triton/CUDA forward + backward, return output and grads."""
     inp = input_t.clone().detach().requires_grad_(True)
     m = mul_t.clone().detach().requires_grad_(True)
     w = weight.clone().detach().requires_grad_(True)
@@ -112,7 +110,7 @@ def test_fused_mul_grouped_poly_norm_forward(
     seed: int,
     device: str,
 ) -> None:
-    """Triton forward output should match PyTorch reference."""
     torch.set_default_device(device)
     input_t, mul_t, weight, bias, offsets = _make_inputs(
         num_tokens, d, num_experts, dtype, device, seed,
@@ -151,7 +149,7 @@ def test_fused_mul_grouped_poly_norm_backward(
     seed: int,
     device: str,
 ) -> None:
-    """Triton backward gradients should match PyTorch reference."""
     torch.set_default_device(device)
     input_t, mul_t, weight, bias, offsets = _make_inputs(
         num_tokens, d, num_experts, dtype, device, seed,
@@ -159,7 +157,7 @@ def test_fused_mul_grouped_poly_norm_backward(
     _, inp_grad_ref, mul_grad_ref, w_grad_ref, b_grad_ref, _ = _run_ref(
         input_t, mul_t, weight, bias, offsets, expert_offset=expert_offset)
-    _, inp_grad_tri, mul_grad_tri, w_grad_tri, b_grad_tri, _ = _run_triton(
         input_t, mul_t, weight, bias, offsets, expert_offset=expert_offset)
     if dtype == torch.float32:
@@ -213,7 +211,7 @@ def test_fused_mul_grouped_poly_norm_zero_token_experts(
     _, _, _, w_grad_ref, b_grad_ref, _ = _run_ref(input_t, mul_t, weight, bias,
                                                    offsets,
                                                    expert_offset=expert_offset)
-    _, _, _, w_grad_tri, b_grad_tri, _ = _run_triton(input_t, mul_t, weight, bias,
                                                       offsets,
                                                       expert_offset=expert_offset)
@@ -250,7 +248,7 @@ def test_fused_mul_grouped_poly_norm_no_nan_inf(
     input_t, mul_t, weight, bias, offsets = _make_inputs(
         4096, 256, 8, dtype, device, expert_offset=expert_offset)
-    out, inp_grad, mul_grad, w_grad, b_grad, _ = _run_triton(
         input_t, mul_t, weight, bias, offsets, expert_offset=expert_offset)
     assert not out.isnan().any(), "Output contains NaN"
@@ -306,7 +304,7 @@ def test_fused_mul_grouped_poly_norm_scores_backward(
     out_ref, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
         input_t, mul_t, weight, bias, offsets, scores=scores)
-    out_tri, ig_tri, mg_tri, wg_tri, bg_tri, sg_tri = _run_triton(
         input_t, mul_t, weight, bias, offsets, scores=scores)
     atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (5e-2, 5e-2)
@@ -372,7 +370,7 @@ def test_fused_mul_grouped_poly_norm_hidden_clamp_backward(
     out_ref, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
         input_t, mul_t, weight, bias, offsets,
         scores=scores, hidden_clamp=hidden_clamp)
-    out_tri, ig_tri, mg_tri, wg_tri, bg_tri, sg_tri = _run_triton(
         input_t, mul_t, weight, bias, offsets,
         scores=scores, hidden_clamp=hidden_clamp)

 NUM_EXPERTS_LIST = [8, 384]
 EXPERT_OFFSETS = [0, 4]
 SEEDS = [0]
 # Only test on cuda:0 to avoid cross-device issues.
 CUDA_DEVICES = ["cuda:0"]
     return grads + (s.grad,) if s is not None else grads + (None,)
+def _run_cuda(input_t, mul_t, weight, bias, offsets, expert_offset=0,
                 scores=None, hidden_clamp=None):
+    """Run CUDA forward + backward, return output and grads."""
     inp = input_t.clone().detach().requires_grad_(True)
     m = mul_t.clone().detach().requires_grad_(True)
     w = weight.clone().detach().requires_grad_(True)
     seed: int,
     device: str,
 ) -> None:
+    """CUDA forward output should match PyTorch reference."""
     torch.set_default_device(device)
     input_t, mul_t, weight, bias, offsets = _make_inputs(
         num_tokens, d, num_experts, dtype, device, seed,
     seed: int,
     device: str,
 ) -> None:
+    """CUDA backward gradients should match PyTorch reference."""
     torch.set_default_device(device)
     input_t, mul_t, weight, bias, offsets = _make_inputs(
         num_tokens, d, num_experts, dtype, device, seed,
     _, inp_grad_ref, mul_grad_ref, w_grad_ref, b_grad_ref, _ = _run_ref(
         input_t, mul_t, weight, bias, offsets, expert_offset=expert_offset)
+    _, inp_grad_tri, mul_grad_tri, w_grad_tri, b_grad_tri, _ = _run_cuda(
         input_t, mul_t, weight, bias, offsets, expert_offset=expert_offset)
     if dtype == torch.float32:
     _, _, _, w_grad_ref, b_grad_ref, _ = _run_ref(input_t, mul_t, weight, bias,
                                                    offsets,
                                                    expert_offset=expert_offset)
+    _, _, _, w_grad_tri, b_grad_tri, _ = _run_cuda(input_t, mul_t, weight, bias,
                                                       offsets,
                                                       expert_offset=expert_offset)
     input_t, mul_t, weight, bias, offsets = _make_inputs(
         4096, 256, 8, dtype, device, expert_offset=expert_offset)
+    out, inp_grad, mul_grad, w_grad, b_grad, _ = _run_cuda(
         input_t, mul_t, weight, bias, offsets, expert_offset=expert_offset)
     assert not out.isnan().any(), "Output contains NaN"
     out_ref, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
         input_t, mul_t, weight, bias, offsets, scores=scores)
+    out_tri, ig_tri, mg_tri, wg_tri, bg_tri, sg_tri = _run_cuda(
         input_t, mul_t, weight, bias, offsets, scores=scores)
     atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (5e-2, 5e-2)
     out_ref, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
         input_t, mul_t, weight, bias, offsets,
         scores=scores, hidden_clamp=hidden_clamp)
+    out_tri, ig_tri, mg_tri, wg_tri, bg_tri, sg_tri = _run_cuda(
         input_t, mul_t, weight, bias, offsets,
         scores=scores, hidden_clamp=hidden_clamp)

torch-ext/activation/__init__.py CHANGED Viewed

@@ -12,7 +12,7 @@ def poly_norm(
     weight: torch.Tensor,
     bias: torch.Tensor,
     eps: float = 1e-6,
-) -> None:
     return PolyNormFunction.apply(x, weight, bias, eps)
@@ -22,7 +22,7 @@ def fused_mul_poly_norm(
     weight: torch.Tensor,
     bias: torch.Tensor,
     eps: float = 1e-6,
-) -> None:
     return FusedMulPolyNormFunction.apply(x, mul, weight, bias, eps)
@@ -30,7 +30,7 @@ def rms_norm(
     x: torch.Tensor,
     weight: torch.Tensor,
     eps: float = 1e-6,
-) -> None:
     return RMSNormFunction.apply(x, weight, eps)
@@ -39,7 +39,7 @@ def fused_add_rms_norm(
     residual: torch.Tensor,
     weight: torch.Tensor,
     eps: float = 1e-6,
-) -> None:
     return FusedAddRMSNormFunction.apply(x, residual, weight, eps)

     weight: torch.Tensor,
     bias: torch.Tensor,
     eps: float = 1e-6,
+) -> torch.Tensor:
     return PolyNormFunction.apply(x, weight, bias, eps)
     weight: torch.Tensor,
     bias: torch.Tensor,
     eps: float = 1e-6,
+) -> torch.Tensor:
     return FusedMulPolyNormFunction.apply(x, mul, weight, bias, eps)
     x: torch.Tensor,
     weight: torch.Tensor,
     eps: float = 1e-6,
+) -> torch.Tensor:
     return RMSNormFunction.apply(x, weight, eps)
     residual: torch.Tensor,
     weight: torch.Tensor,
     eps: float = 1e-6,
+) -> torch.Tensor:
     return FusedAddRMSNormFunction.apply(x, residual, weight, eps)