Kernels
wyldecat Claude Opus 4.6 committed on
Commit
e74d98f
·
1 Parent(s): cdaaf4f

Add torch.compile, CUDA graph, and compiled momentum [skip-build]

Browse files

- Newton-Schulz: per-shape torch.compile caching + CUDA graph replay
- Batched momentum: separately compiled nesterov/non-nesterov functions
- Batched Newton-Schulz for MoE experts (bmm/baddbmm)
- Triton matmul_transpose cleanup
- Inline uneven shard handling, remove small_param_numel_threshold
- Raise dynamo recompile_limit for test suite

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

test/conftest.py CHANGED
@@ -9,6 +9,11 @@ from transformers import AutoModelForCausalLM
9
  logger = logging.getLogger(__name__)
10
  logging.basicConfig(level=logging.INFO)
11
 
 
 
 
 
 
12
  SEED = 0xdeadbeef
13
 
14
 
 
9
  logger = logging.getLogger(__name__)
10
  logging.basicConfig(level=logging.INFO)
11
 
12
+ # Raise dynamo recompile limit so that compiled momentum (batch_pre_ortho)
13
+ # does not fall back to eager mode when the test suite runs 30+ model
14
+ # configurations with different tensor shapes in a single process.
15
+ torch._dynamo.config.recompile_limit = 64
16
+
17
  SEED = 0xdeadbeef
18
 
19
 
torch-ext/optimizer/core.py CHANGED
@@ -1,9 +1,9 @@
1
  import logging
2
  import math
3
  from dataclasses import dataclass
 
4
 
5
  import torch
6
- import torch.distributed as dist
7
  from torch.distributed import ProcessGroup
8
  from torch.distributed.tensor import DTensor
9
 
@@ -31,26 +31,71 @@ class _muon_state:
31
  qk_clip_state: torch.Tensor | None = None
32
 
33
 
34
- def update_g(optimizer_state, p, g, group, momentum):
35
- """Apply momentum update to gradient.
 
 
 
 
 
 
36
 
37
- Args:
38
- optimizer_state: The optimizer's state dict (self.state in Muon).
39
- p: Parameter tensor.
40
- g: Gradient tensor.
41
- group: Parameter group dict.
42
- momentum: Momentum coefficient.
43
 
44
- Returns:
45
- Momentum-updated gradient tensor.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  """
47
- state = optimizer_state[p]
48
- buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
49
- torch.add(g, buf, alpha=momentum, out=buf)
50
- if group["nesterov"]:
51
- g.add_(buf, alpha=momentum)
52
- return g
53
- return buf
 
 
 
 
 
 
 
 
 
 
 
54
 
55
 
56
  def update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -63,14 +108,13 @@ def update_p(p, u, lr, adjusted_lr, weight_decay):
63
  adjusted_lr: Size-adjusted learning rate.
64
  weight_decay: Weight decay coefficient.
65
  """
66
- if isinstance(p, torch.nn.Parameter):
67
- # apply weight decay
68
- p.data.mul_(1 - lr * weight_decay)
69
- # apply update
70
- p.data.add_(u, alpha=-adjusted_lr)
71
- else:
72
- p.mul_(1 - lr * weight_decay)
73
- p.add_(u, alpha=-adjusted_lr)
74
 
75
 
76
  def adjust_lr_for_muon(lr, param_shape):
@@ -147,7 +191,7 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
147
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
148
 
149
  muon_params, muon_names = [], []
150
- non_muon_params = []
151
 
152
  for n, p in model.named_parameters():
153
  if not p.requires_grad:
@@ -157,6 +201,10 @@ def get_default_muon_param_groups(model, is_muon_func=None, expert_keys=None):
157
  muon_names.append(n)
158
  else:
159
  non_muon_params.append(p)
 
 
 
 
160
 
161
  return [
162
  {
 
1
  import logging
2
  import math
3
  from dataclasses import dataclass
4
+ from typing import List
5
 
6
  import torch
 
7
  from torch.distributed import ProcessGroup
8
  from torch.distributed.tensor import DTensor
9
 
 
31
  qk_clip_state: torch.Tensor | None = None
32
 
33
 
34
+ def _batch_momentum(
35
+ grads: List[torch.Tensor],
36
+ momentum_bufs: List[torch.Tensor],
37
+ momentum: torch.Tensor,
38
+ ) -> None:
39
+ """Batched momentum update (no nesterov)."""
40
+ torch._foreach_mul_(momentum_bufs, momentum)
41
+ torch._foreach_add_(momentum_bufs, grads)
42
 
 
 
 
 
 
 
43
 
44
+ def _batch_momentum_nesterov(
45
+ grads: List[torch.Tensor],
46
+ momentum_bufs: List[torch.Tensor],
47
+ momentum: torch.Tensor,
48
+ ) -> None:
49
+ """Batched momentum update with nesterov correction."""
50
+ torch._foreach_mul_(momentum_bufs, momentum)
51
+ torch._foreach_add_(momentum_bufs, grads)
52
+ nesterov_terms = torch._foreach_mul(momentum_bufs, momentum)
53
+ torch._foreach_add_(grads, nesterov_terms)
54
+
55
+
56
# Cache of torch.compile'd batched momentum functions, keyed by the
# nesterov flag (True -> nesterov variant, False -> plain variant);
# populated lazily by batch_pre_ortho.
_compiled_momentum: dict[bool, callable] = {}
# Module-level switch for the compiled path; flip via set_momentum_compile().
_use_momentum_compile = True


def set_momentum_compile(enabled: bool):
    """Toggle torch.compile for batched momentum.

    Args:
        enabled: True to use (and lazily build) compiled momentum
            functions in batch_pre_ortho; False to run them eagerly.
    """
    global _use_momentum_compile
    _use_momentum_compile = enabled
64
+
65
+
66
def batch_pre_ortho(
    grads: List[torch.Tensor],
    momentum_bufs: List[torch.Tensor],
    momentum: torch.Tensor,
    nesterov: bool,
) -> None:
    """Run the batched momentum step that precedes orthogonalization.

    Mirrors dion's ``muon_update_pre_orthogonalize``.
    Inputs must be plain CUDA tensors (not DTensor).
    Mutates ``momentum_bufs`` in place, and also ``grads`` when
    ``nesterov`` is True.

    The nesterov and plain variants are compiled as two separate
    functions so the boolean branch never causes a graph break.
    """
    if nesterov:
        fn = _batch_momentum_nesterov
    else:
        fn = _batch_momentum
    if _use_momentum_compile:
        compiled = _compiled_momentum.get(nesterov)
        if compiled is None:
            compiled = torch.compile(fn)
            _compiled_momentum[nesterov] = compiled
        fn = compiled
    fn(grads, momentum_bufs, momentum)
87
+
88
+
89
+ def _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay):
90
+ """Weight-decay + update on plain tensors.
91
+
92
+ Not compiled: per-param @torch.compile caused ~0.25ms TorchDynamo cache
93
+ lookup per call × 256+ params = massive overhead. The pipeline path uses
94
+ batched _foreach_* ops instead; this function remains for base() and
95
+ distributed_muon().
96
+ """
97
+ p_data.mul_(1 - lr * weight_decay)
98
+ p_data.add_(u_data, alpha=-adjusted_lr)
99
 
100
 
101
  def update_p(p, u, lr, adjusted_lr, weight_decay):
 
108
  adjusted_lr: Size-adjusted learning rate.
109
  weight_decay: Weight decay coefficient.
110
  """
111
+ # Unwrap Parameter -> underlying data tensor.
112
+ p_data = p.data if isinstance(p, torch.nn.Parameter) else p
113
+ # Unwrap DTensor -> local CUDA tensor for compiled kernel.
114
+ if isinstance(p_data, DTensor):
115
+ p_data = p_data._local_tensor
116
+ u_data = u._local_tensor if isinstance(u, DTensor) else u
117
+ _update_p_impl(p_data, u_data, lr, adjusted_lr, weight_decay)
 
118
 
119
 
120
  def adjust_lr_for_muon(lr, param_shape):
 
191
  is_muon_func = lambda n, x: default_is_muon(n, x, expert_keys)
192
 
193
  muon_params, muon_names = [], []
194
+ non_muon_params, non_muon_names = [], []
195
 
196
  for n, p in model.named_parameters():
197
  if not p.requires_grad:
 
201
  muon_names.append(n)
202
  else:
203
  non_muon_params.append(p)
204
+ non_muon_names.append(n)
205
+
206
+ logger.info("[param_groups] expert_keys=%s, Muon=%d, AdamW=%d",
207
+ expert_keys, len(muon_names), len(non_muon_names))
208
 
209
  return [
210
  {
torch-ext/optimizer/distributed/utils.py CHANGED
@@ -72,12 +72,6 @@ def get_slices_of_dtensor(
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
75
- if curr_size % num_chunks != 0:
76
- raise NotImplementedError(
77
- f"Dimension size {curr_size} is not divisible "
78
- f"by number of ranks {num_chunks} for shard "
79
- f"placement on dim {shard_dim}. (shape: {target.shape})")
80
-
81
  # Compute indices for this level of sharding
82
  if isinstance(placement, _StridedShard):
83
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
 
72
  else:
73
  curr_size = target.size()[shard_dim]
74
 
 
 
 
 
 
 
75
  # Compute indices for this level of sharding
76
  if isinstance(placement, _StridedShard):
77
  _shard_size, offsets = _StridedShard.local_shard_size_and_offset(
torch-ext/optimizer/matmul_transpose_triton.py CHANGED
@@ -43,6 +43,7 @@ def get_autotune_config():
43
  @triton.autotune(
44
  configs=get_autotune_config(),
45
  key=['M', 'K'],
 
46
  )
47
  @triton.jit
48
  def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
@@ -102,16 +103,10 @@ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
102
  tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
 
104
 
105
- def matmul_transpose_assign(d_in, d_out):
106
- assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
- assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
- assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
- assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
- assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
- assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
- assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
- "First dimension of `d_in` must match first and second dimension of `d_out`"
114
-
115
  d_in = d_in.contiguous()
116
  M, K = d_in.shape
117
  grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
@@ -119,3 +114,9 @@ def matmul_transpose_assign(d_in, d_out):
119
  with torch.cuda.device(d_in.device.index):
120
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
  d_out.stride(0), d_out.stride(1))
 
 
 
 
 
 
 
43
  @triton.autotune(
44
  configs=get_autotune_config(),
45
  key=['M', 'K'],
46
+ restore_value=['y'],
47
  )
48
  @triton.jit
49
  def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
 
103
  tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
104
 
105
 
106
+ @torch.library.custom_op("muon::matmul_transpose_assign",
107
+ mutates_args=("d_out", ))
108
+ def matmul_transpose_assign(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
109
+ """Compute d_out = d_in @ d_in.T using an optimized Triton kernel."""
 
 
 
 
 
 
110
  d_in = d_in.contiguous()
111
  M, K = d_in.shape
112
  grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
 
114
  with torch.cuda.device(d_in.device.index):
115
  mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
116
  d_out.stride(0), d_out.stride(1))
117
+
118
+
119
+ @matmul_transpose_assign.register_fake
120
+ def _(d_in: torch.Tensor, d_out: torch.Tensor) -> None:
121
+ """FakeTensor impl: d_out is already allocated, mutation is declared."""
122
+ pass
torch-ext/optimizer/newton_schulz.py CHANGED
@@ -162,3 +162,75 @@ def _zeropower_via_newtonschulz5(G, steps):
162
  X = X.T
163
 
164
  return X
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  X = X.T
163
 
164
  return X
165
+
166
+
167
@torch.no_grad()
def _zeropower_via_newtonschulz5_batched(G, steps):
    """Batched polar factor computation for 3D (E, out, in) tensors.

    Same algorithm as ``_zeropower_via_newtonschulz5`` but uses
    ``torch.bmm`` / ``torch.baddbmm`` instead of the 2D Triton kernel,
    processing all E expert matrices in a single batched call.

    Args:
        G: Stacked expert matrices of shape (E, out, in), dtype COMM_DTYPE.
        steps: Number of Newton-Schulz iterations to run.

    Returns:
        Tensor of the same shape as ``G`` holding the iterated result for
        each expert matrix.
    """
    assert len(G.shape) == 3
    assert G.dtype == COMM_DTYPE
    X = G

    # Work on the wide orientation (rows <= cols); transposed back below.
    if G.size(1) > G.size(2):
        X = X.transpose(-2, -1)

    # Per-expert Frobenius norm.
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)

    # _coeffs_list holds per-step (a, b, c) triples; when more steps are
    # requested than triples available, the last triple is repeated.
    hs = _coeffs_list[:steps] + list(
        repeat(_coeffs_list[-1], steps - len(_coeffs_list)))
    for a, b, c in hs:
        # buf1 = X X^T; buf2 = buf1 buf1^T; buf1 <- b*buf1 + c*buf2;
        # X <- a*X + buf1 @ X (fused via baddbmm).
        buf1 = torch.bmm(X, X.transpose(-2, -1))
        buf2 = torch.bmm(buf1, buf1.transpose(-2, -1))
        buf1.mul_(b).add_(buf2, alpha=c)
        X = torch.baddbmm(X, buf1, X, alpha=1.0, beta=a)

    # Undo the orientation swap for tall inputs.
    if G.size(1) > G.size(2):
        X = X.transpose(-2, -1)

    return X
197
+
198
+
199
# Per-shape cache of torch.compile'd Newton-Schulz callables. Keyed by the
# full input shape: 2D keys come from zeropower_via_newtonschulz5, 3D keys
# from the batched variant, so the two entry points never collide.
_ns_per_shape: dict[tuple[int, ...], callable] = {}
# Module-level switch for the compiled path; flip via set_ns_compile().
_use_compile = True


def set_ns_compile(enabled: bool):
    """Toggle torch.compile for Newton-Schulz iteration.

    Args:
        enabled: True to use (and lazily build) per-shape compiled
            Newton-Schulz functions; False to run the eager versions.
    """
    global _use_compile
    _use_compile = enabled
207
+
208
+
209
def _compiled_ns(fn, shape):
    """Return a torch.compile'd wrapper of ``fn`` cached per input shape.

    Shared by the 2D and batched 3D entry points (the duplicated cache
    logic previously lived in both). Each distinct shape gets its own
    compiled callable so CUDA graph replay never observes a shape change.
    """
    if shape not in _ns_per_shape:
        _ns_per_shape[shape] = torch.compile(fn,
                                             options={
                                                 "triton.cudagraphs": True,
                                                 "shape_padding": False
                                             })
    return _ns_per_shape[shape]


def zeropower_via_newtonschulz5(G, steps=5):
    """Compile-cached Newton-Schulz iteration for a single 2D tensor.

    Falls back to the eager implementation when compilation is disabled
    via set_ns_compile(False).
    """
    if not _use_compile:
        return _zeropower_via_newtonschulz5(G, steps)
    fn = _compiled_ns(_zeropower_via_newtonschulz5, G.shape)
    # Mark a new CUDA-graph step so graph-owned memory from the previous
    # call may be reused; clone() copies the result out of that memory.
    torch.compiler.cudagraph_mark_step_begin()
    return fn(G, steps).clone()


def zeropower_via_newtonschulz5_batched(G, steps=5):
    """Compile-cached batched Newton-Schulz for 3D expert tensors."""
    if not _use_compile:
        return _zeropower_via_newtonschulz5_batched(G, steps)
    fn = _compiled_ns(_zeropower_via_newtonschulz5_batched, G.shape)
    torch.compiler.cudagraph_mark_step_begin()
    return fn(G, steps).clone()
torch-ext/optimizer/qk_clip.py CHANGED
@@ -102,23 +102,27 @@ def compute_scales(p, qk_clip_state):
102
  threshold = qk_clip_state.threshold
103
  logit = qk_clip_state.logit
104
 
105
- H_global = p.shape[0] // head_dim
106
- scales_full = torch.ones(H_global, device=p.data.device)
107
- scaling = 0
108
-
109
  for logit_idx, head_idx in enumerate(indices):
110
  v_ele = float(logit[logit_idx])
111
  if v_ele > threshold:
112
  new_scale = math.sqrt(threshold / v_ele)
113
- if new_scale < scales_full[head_idx]:
114
- scales_full[head_idx] = new_scale
115
  logger.info(
116
  f"[{kind}] Head {head_idx} exceeded threshold "
117
  f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
118
  )
119
- scaling += 1
120
 
121
- return scales_full if scaling > 0 else None
 
 
 
 
 
 
 
122
 
123
 
124
  def qk_clip(p, scales, head_dim):
 
102
  threshold = qk_clip_state.threshold
103
  logit = qk_clip_state.logit
104
 
105
+ # Check if any head exceeds threshold before allocating.
106
+ head_scales = {}
 
 
107
  for logit_idx, head_idx in enumerate(indices):
108
  v_ele = float(logit[logit_idx])
109
  if v_ele > threshold:
110
  new_scale = math.sqrt(threshold / v_ele)
111
+ if head_idx not in head_scales or new_scale < head_scales[head_idx]:
112
+ head_scales[head_idx] = new_scale
113
  logger.info(
114
  f"[{kind}] Head {head_idx} exceeded threshold "
115
  f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
116
  )
 
117
 
118
+ if not head_scales:
119
+ return None
120
+
121
+ H_global = p.shape[0] // head_dim
122
+ scales_full = torch.ones(H_global, device=p.data.device)
123
+ for head_idx, scale in head_scales.items():
124
+ scales_full[head_idx] = scale
125
+ return scales_full
126
 
127
 
128
  def qk_clip(p, scales, head_dim):