Kernels
TaehyunKim commited on
Commit
1a3da4d
·
unverified ·
2 Parent(s): d65066c e93bd1e

Merge pull request #9 from MotifTechnologies/all2all_gather_scatter

Browse files
Files changed (41) hide show
  1. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  2. build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_9c21645_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  3. build/torch27-cxx11-cu118-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  4. build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +319 -104
  5. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  6. build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  7. build/torch27-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  8. build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +319 -104
  9. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  10. build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  11. build/torch27-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  12. build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +319 -104
  13. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  14. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_9c21645_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  15. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  16. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +319 -104
  17. build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  18. build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  19. build/torch28-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  20. build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py +319 -104
  21. build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  22. build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  23. build/torch28-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  24. build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py +319 -104
  25. build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py +3 -3
  26. build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  27. build/torch28-cxx11-cu129-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  28. build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py +319 -104
  29. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  30. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_9c21645_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  31. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  32. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py +319 -104
  33. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +3 -3
  34. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_9c21645_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  35. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  36. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py +319 -104
  37. test/test_muon/muon.py +0 -1
  38. test/test_muon/optimizer +1 -0
  39. test/test_muon/test.py +1 -1
  40. torch-ext/optimizer/matmul_transpose_triton.py +128 -0
  41. torch-ext/optimizer/muon.py +319 -104
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_9c21645_dirty
3
- ops = torch.ops._optimizer_9c21645_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_9c21645_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_15336dc_dirty
3
+ ops = torch.ops._optimizer_15336dc_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_15336dc_dirty::{op_name}"
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_9c21645_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf8b97161714dff91953d26ae0bf59ebc9f3653ce57a3998723cc08aa97b71e6
3
  size 1787368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94a28c3602d8c7a6b216976b1fb09cdd1e9f61bfc9359a80f41b5b628efdfc28
3
  size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
+ def get_autotune_config():
29
+ return [
30
+ triton.Config(
31
+ {
32
+ 'BLOCK_SIZE_M': blk_m,
33
+ 'BLOCK_SIZE_K': blk_k,
34
+ 'GROUP_SIZE_M': grp_sz
35
+ },
36
+ num_stages=n_stages,
37
+ num_warps=n_warps) for blk_m in [32, 64, 128]
38
+ for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
+ for n_warps in [4, 8]
40
+ ]
41
+
42
+
43
+ @triton.autotune(
44
+ configs=get_autotune_config(),
45
+ key=['M', 'K'],
46
+ )
47
+ @triton.jit
48
+ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
49
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
50
+ GROUP_SIZE_M: tl.constexpr):
51
+ """
52
+ Core kernel jit function of matmul_transpose that computes y = x @ x.T
53
+ The code is a simple adaptation from the triton `matmul` tutorial:
54
+ https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
55
+ """
56
+ pid = tl.program_id(axis=0)
57
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
58
+ num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
59
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
60
+ group_id = pid // num_pid_in_group
61
+ first_pid_m = group_id * GROUP_SIZE_M
62
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
63
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
64
+ pid_n = (pid % num_pid_in_group) // group_size_m
65
+ if pid_m > pid_n:
66
+ return
67
+
68
+ offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
69
+ offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
71
+ # we use a & b ptrs to denote different rows of x.
72
+ a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
73
+ b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
+
75
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
76
+
77
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
78
+ a = tl.load(a_ptrs,
79
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
80
+ other=0.0)
81
+ b = tl.load(b_ptrs,
82
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
83
+ other=0.0)
84
+ accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
85
+ a_ptrs += BLOCK_SIZE_K * stride_xk
86
+ b_ptrs += BLOCK_SIZE_K * stride_xk
87
+ # use dtype.element_ty to accommodate different input datatypes as in cpp templates
88
+ # https://github.com/triton-lang/triton/issues/2252
89
+ c = accumulator.to(x.dtype.element_ty)
90
+
91
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
92
+ offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
+ c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
94
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
95
+ tl.store(c_ptrs, c, mask=c_mask)
96
+
97
+ # transpose and copy
98
+ if pid_m < pid_n:
99
+ ct_ptrs = y + stride_ym * offs_cn[:,
100
+ None] + stride_yn * offs_cm[None, :]
101
+ ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
102
+ tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
+ def matmul_transpose_assign(d_in, d_out):
106
+ assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
+ assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
+ assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
+ assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
+ assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
+ assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
+ assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
+ "First dimension of `d_in` must match first and second dimension of `d_out`"
114
+
115
+ d_in = d_in.contiguous()
116
+ M, K = d_in.shape
117
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
118
+ M, META['BLOCK_SIZE_M']), )
119
+ with torch.cuda.device(d_in.device.index):
120
+ mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
+ d_out.stride(0), d_out.stride(1))
122
+
123
+
124
+ def matmul_transpose(d_in):
125
+ M, _ = d_in.shape
126
+ d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
127
+ matmul_transpose_assign(d_in, d_out)
128
+ return d_out
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -8,14 +8,19 @@ import torch
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
  # Muon's Newton–Schulz iteration causes high variance in singular values
17
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
 
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
21
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -27,13 +32,15 @@ def _zeropower_via_newtonschulz5(G, steps):
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
- assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
 
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
 
37
  # Perform the NS iterations
38
  for a, b, c in [
39
  (4.0848, -6.8946, 2.9270),
@@ -42,13 +49,10 @@ def _zeropower_via_newtonschulz5(G, steps):
42
  (2.8769, -3.1427, 1.2046),
43
  (2.8366, -3.0525, 1.2012),
44
  ]:
45
- A = X @ X.T
46
- # B = (
47
- # b * A + c * A @ A
48
- # )
49
- B = torch.addmm(A, A, A, alpha=c, beta=b)
50
- # X = a * X + B @ X
51
- X = torch.addmm(X, B, X, alpha=1.0, beta=a)
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
@@ -69,51 +73,142 @@ class _muon_state:
69
  qk_clip_state = None
70
 
71
 
 
 
 
 
 
 
 
 
72
  @torch.no_grad()
73
- def _gather(p, state, rank, comm_stream, none_grad):
74
  """
75
- Gather the gradients to worker_rank.
76
- If none_grad is True, free p.grad after the gather.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
  with torch.cuda.stream(comm_stream):
79
- g = p.grad
 
80
 
81
- if rank == state.worker_rank:
82
- num_ranks = dist.get_world_size(group=state.process_group)
83
- gather_list = [
84
- torch.empty_like(g.to_local(), dtype=torch.bfloat16)
85
- for _ in range(num_ranks)
86
- ]
87
- else:
88
- gather_list = None
89
-
90
- g = g.to(torch.bfloat16)
91
- torch.distributed.gather(
92
- g.to_local(),
93
- dst=state.worker_rank,
94
- gather_list=gather_list,
95
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  )
97
- if rank == state.worker_rank:
98
- if state.gathered_grad is not None:
99
- raise RuntimeError(
100
- "Gather event already exists, which should not happen.")
101
- state.gathered_grad = torch.cat(gather_list, dim=0)
102
- state.gather_event = torch.cuda.Event()
103
- state.gather_event.record()
104
- else:
105
- state.gathered_grad = None
106
- state.gather_event = None
107
- gather_list = None
108
- if none_grad:
109
- # We can safely free p.grad without calling record_stream:
110
- # p.grad.to_local().record_stream(comm_stream)
111
- # Explanation:
112
- # 1. p.grad is created on the default stream, but the default stream
113
- # is synchronized with the comm stream later.
114
- # 2. There is no further activity on the default stream before the optimizer finishes.
115
- # Therefore, it is safe to free p.grad directly on the comm stream.
116
- p.grad = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @torch.no_grad()
@@ -127,45 +222,145 @@ def _compute_u(p, state, steps, rank, compute_stream):
127
  raise RuntimeError("Gather event must be set before compute.")
128
  compute_stream.wait_event(state.gather_event)
129
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
 
130
  state.computed_u = u
131
- state.scattered_u = torch.empty_like(p.to_local(),
132
- dtype=torch.bfloat16)
133
- state.compute_event = torch.cuda.Event()
134
- state.compute_event.record()
135
- u = None
136
 
137
 
138
  @torch.no_grad()
139
- def _scatter(p, state, rank, comm_stream):
140
  """
141
- Scatter the computed_u from worker_rank to all ranks.
 
142
  """
 
 
 
 
 
 
 
 
 
 
143
 
 
 
 
 
144
  with torch.cuda.stream(comm_stream):
145
- if state.compute_event is None:
146
- raise RuntimeError("Compute event must be set before scatter.")
147
- comm_stream.wait_event(state.compute_event)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if rank == state.worker_rank:
150
- num_ranks = dist.get_world_size(group=state.process_group)
151
- # Clear the gathered gradient to free memory
152
- state.gathered_grad = None
 
 
 
 
153
 
154
- u = state.computed_u
155
- scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
156
- scatter_list = [s.contiguous() for s in scatter_list]
 
 
 
 
 
 
157
  else:
158
- scatter_list = None
 
 
159
 
160
- torch.distributed.scatter(
161
- state.scattered_u,
162
- scatter_list=scatter_list,
163
- src=state.worker_rank,
164
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- state.scatter_event = torch.cuda.Event()
167
- state.scatter_event.record()
168
- scatter_list = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -321,6 +516,11 @@ class Muon(torch.optim.Optimizer):
321
  "head_dim": 128,
322
  "threshold": 100
323
  }
 
 
 
 
 
324
  """
325
 
326
  def __init__(self,
@@ -339,7 +539,8 @@ class Muon(torch.optim.Optimizer):
339
  "k_indices": [],
340
  "head_dim": 128,
341
  "threshold": 100
342
- }):
 
343
  defaults = dict(
344
  lr=lr,
345
  weight_decay=weight_decay,
@@ -363,15 +564,13 @@ class Muon(torch.optim.Optimizer):
363
 
364
  super().__init__(params, defaults)
365
 
366
- if dist.is_initialized():
367
- self.rank = dist.get_rank()
368
- else:
369
- self.rank = None
370
 
371
  self.comm_stream = torch.cuda.Stream()
372
  self.compute_stream = torch.cuda.Stream()
373
  self.debug = debug
374
  self.clip_config = clip_config
 
375
 
376
  def _calc_flops(self, G, steps):
377
  assert len(G.shape) == 2
@@ -444,11 +643,18 @@ class Muon(torch.optim.Optimizer):
444
  if mesh is None:
445
  mesh = p.device_mesh
446
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
 
 
 
 
 
447
  elif mesh != p.device_mesh:
448
  raise ValueError("All parameters must be on the same mesh.")
449
 
 
450
  param_to_state[id(p)] = _muon_state()
451
- param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
 
452
  param_to_state[id(p)].process_group = process_group
453
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
454
  param_to_state[id(p)].qk_clip_state = qk_clip_state
@@ -478,7 +684,7 @@ class Muon(torch.optim.Optimizer):
478
  else:
479
  g = buf
480
 
481
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
482
  steps=group["ns_steps"])
483
 
484
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
@@ -493,15 +699,12 @@ class Muon(torch.optim.Optimizer):
493
  def _update_g(self, p, g, group, momentum):
494
  # calc update
495
  state = self.state[p]
496
- if "momentum_buffer" not in state:
497
- state["momentum_buffer"] = torch.zeros_like(g)
498
- buf = state["momentum_buffer"]
499
- buf.mul_(momentum).add_(g)
500
  if group["nesterov"]:
501
- g = g.add(buf, alpha=momentum)
502
- else:
503
- g = buf
504
- return g
505
 
506
  @staticmethod
507
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -585,11 +788,17 @@ class Muon(torch.optim.Optimizer):
585
  param_to_state, ordered_params = self.init_state_and_assign_params(
586
  names, params, group, qk_logits)
587
 
588
- def enqueue_gathers(start_idx, chunk_size):
589
- for p in ordered_params[start_idx:start_idx + chunk_size]:
590
- state = param_to_state[id(p)]
591
- _gather(p, state, self.rank, self.comm_stream,
592
- group["none_grad"])
 
 
 
 
 
 
593
 
594
  def enqueue_computes(start_idx, chunk_size):
595
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +806,14 @@ class Muon(torch.optim.Optimizer):
597
  _compute_u(p, state, group["ns_steps"], self.rank,
598
  self.compute_stream)
599
 
600
- def enqueue_scatters(start_idx, chunk_size):
601
- for p in ordered_params[start_idx:start_idx + chunk_size]:
602
- state = param_to_state[id(p)]
603
- _scatter(p, state, self.rank, self.comm_stream)
 
 
 
 
604
 
605
  def enqueue_update_param(start_idx, chunk_size):
606
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +828,16 @@ class Muon(torch.optim.Optimizer):
615
  # Wait grad update
616
  self.comm_stream.wait_stream(torch.cuda.current_stream())
617
 
618
- enqueue_gathers(0, chunk_size)
 
 
 
 
619
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
620
- enqueue_computes(i, chunk_size)
621
- if i > 0:
622
- enqueue_update_param(i - chunk_size, chunk_size)
623
- enqueue_gathers(i + chunk_size, chunk_size)
624
- enqueue_scatters(i, chunk_size)
625
- enqueue_update_param(i, chunk_size)
626
 
627
  # Wait the last update_param to finish
628
  torch.cuda.current_stream().wait_stream(self.compute_stream)
 
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
  # Muon's Newton–Schulz iteration causes high variance in singular values
21
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
  @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
  def _zeropower_via_newtonschulz5(G, steps):
25
  """
26
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
39
  X = X.T
40
  # Ensure spectral norm is at most 1
41
  X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
  # Perform the NS iterations
45
  for a, b, c in [
46
  (4.0848, -6.8946, 2.9270),
 
49
  (2.8769, -3.1427, 1.2046),
50
  (2.8366, -3.0525, 1.2012),
51
  ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
 
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
73
  qk_clip_state = None
74
 
75
 
76
+ def split_elems_for_src(param, src_rank, num_ranks) -> int:
77
+ rows = param.shape[0]
78
+ cols = int(param.numel() // rows)
79
+ base, rem = divmod(rows, num_ranks)
80
+ my_rows = base + (1 if src_rank < rem else 0)
81
+ return my_rows * cols
82
+
83
+
84
  @torch.no_grad()
85
+ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
86
  """
87
+ Pre-allocate gathered_grad buffer on compute_stream
88
+ before launching all2all gather
89
+ """
90
+ with torch.cuda.stream(compute_stream):
91
+ for p in params:
92
+ state = param_to_state[id(p)]
93
+ if rank == state.worker_rank:
94
+ num_ranks = dist.get_world_size(group=state.process_group)
95
+ state.gathered_grad = torch.empty(p.grad.numel(),
96
+ dtype=COMM_DTYPE,
97
+ device="cuda")
98
+ else:
99
+ state.gathered_grad = None
100
+
101
+ alloc_event = torch.cuda.Event()
102
+ alloc_event.record(compute_stream)
103
+ return alloc_event
104
+
105
+
106
+ @torch.no_grad()
107
+ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
108
+ alloc_event):
109
+ """
110
+ All2all gathers shards so each owner rank reconstructs its full gradient
111
  """
112
  with torch.cuda.stream(comm_stream):
113
+ process_group = param_to_state[id(params[0])].process_group
114
+ num_ranks = dist.get_world_size(group=process_group)
115
 
116
+ # Construct sending buffers
117
+ per_dst = [[] for _ in range(num_ranks)]
118
+ send_counts = [0] * num_ranks
119
+
120
+ for p in params:
121
+ state = param_to_state[id(p)]
122
+ dst = state.worker_rank
123
+ assert dst < num_ranks
124
+ shard_elems = split_elems_for_src(p, rank, num_ranks)
125
+ g = p.grad
126
+ g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
+ assert g.numel() == shard_elems
128
+ per_dst[dst].append(g)
129
+ send_counts[dst] += shard_elems
130
+
131
+ assert all(
132
+ len(v) > 0
133
+ for v in per_dst), "all params should be sharded to all devices"
134
+
135
+ send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
136
+ owned_params = [
137
+ p for p in params if param_to_state[id(p)].worker_rank == rank
138
+ ]
139
+
140
+ # Compute receive sizes and allocate receiving buffers
141
+ recv_counts = [0] * num_ranks
142
+
143
+ for src in range(num_ranks):
144
+ total = 0
145
+ for p in owned_params:
146
+ state = param_to_state[id(p)]
147
+ assert state.worker_rank == rank
148
+ total += split_elems_for_src(p, src, num_ranks)
149
+ recv_counts[src] = total
150
+
151
+ recv_total = sum(recv_counts)
152
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
153
+
154
+ #All2All
155
+ dist.all_to_all_single(
156
+ recv_buf,
157
+ send_buf,
158
+ output_split_sizes=recv_counts,
159
+ input_split_sizes=send_counts,
160
+ group=process_group,
161
  )
162
+
163
+ # Reconstructs gathered grad from the received buffer
164
+ #
165
+ # recv_buf (num ranks = 3)
166
+ #
167
+ # From rank 0 From rank 1 From rank 2
168
+ # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
169
+ #
170
+ # Outer loop:
171
+ # rank 0 -> rank 1 -> rank2
172
+ #
173
+ # Inner loop:
174
+ # p1_n -> p2_n -> p3_n
175
+
176
+ comm_stream.wait_event(alloc_event)
177
+
178
+ off = 0
179
+ write_offsets = {id(p): 0 for p in owned_params}
180
+ for src in range(num_ranks):
181
+ if recv_counts[src] == 0:
182
+ continue
183
+
184
+ block = recv_counts[src]
185
+ inner_off = 0
186
+ for p in owned_params:
187
+ state = param_to_state[id(p)]
188
+ assert state.worker_rank == rank
189
+ n = split_elems_for_src(p, src, num_ranks)
190
+ assert n > 0
191
+
192
+ sg = recv_buf.narrow(0, off + inner_off, n)
193
+ woff = write_offsets[id(p)]
194
+ dst = state.gathered_grad.narrow(0, woff, n)
195
+ dst.copy_(sg)
196
+
197
+ write_offsets[id(p)] += n
198
+ inner_off += n
199
+ off += block
200
+
201
+ for p in params:
202
+ state = param_to_state[id(p)]
203
+ if state.worker_rank == rank:
204
+ state.gathered_grad = state.gathered_grad.view_as(p)
205
+ state.gather_event = torch.cuda.Event()
206
+ state.gather_event.record(comm_stream)
207
+ else:
208
+ state.gathered_grad = None
209
+ state.gather_event = None
210
+ if none_grad:
211
+ p.grad = None
212
 
213
 
214
  @torch.no_grad()
 
222
  raise RuntimeError("Gather event must be set before compute.")
223
  compute_stream.wait_event(state.gather_event)
224
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
225
+ state.gathered_grad = None
226
  state.computed_u = u
227
+ state.compute_event = torch.cuda.Event()
228
+ state.compute_event.record()
229
+ else:
230
+ state.computed_u = None
231
+ state.compute_event = None
232
 
233
 
234
  @torch.no_grad()
235
+ def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
236
  """
237
+ Pre-allocate scattered_u buffer on compute_stream
238
+ before launching all2all gather
239
  """
240
+ with torch.cuda.stream(compute_stream):
241
+ for p in params:
242
+ state = param_to_state[id(p)]
243
+ state.scattered_u = torch.empty_like(p.to_local(),
244
+ dtype=COMM_DTYPE)
245
+
246
+ alloc_event = torch.cuda.Event()
247
+ alloc_event.record(compute_stream)
248
+ return alloc_event
249
+
250
 
251
+ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
252
+ """
253
+ All2all scatters full gradients to all ranks
254
+ """
255
  with torch.cuda.stream(comm_stream):
256
+ process_group = param_to_state[id(params[0])].process_group
257
+ num_ranks = dist.get_world_size(group=process_group)
258
+ owned_params = [
259
+ p for p in params if param_to_state[id(p)].worker_rank == rank
260
+ ]
261
+
262
+ # Construct sending buffer
263
+ per_dst = [[] for _ in range(num_ranks)]
264
+ send_counts = [0] * num_ranks
265
+
266
+ if owned_params:
267
+ for p in owned_params:
268
+ state = param_to_state[id(p)]
269
+ if state.compute_event is None:
270
+ raise RuntimeError(
271
+ "Compute event must be set before scatter.")
272
+ comm_stream.wait_event(state.compute_event)
273
+ state.gathered_grad = None
274
 
275
+ assert state.computed_u is not None
276
+
277
+ u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)
278
+
279
+ offset = 0
280
+ for dst in range(num_ranks):
281
+ n = split_elems_for_src(p, dst, num_ranks)
282
+ assert n > 0
283
 
284
+ su = u_full.narrow(0, offset, n)
285
+ per_dst[dst].append(su)
286
+ send_counts[dst] += n
287
+ offset += n
288
+
289
+ assert offset == u_full.numel()
290
+
291
+ if any(len(v) > 0 for v in per_dst):
292
+ send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
293
  else:
294
+ # all_to_all requires participation from all ranks
295
+ # Even non-owner ranks must join the collective call
296
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
297
 
298
+ # Compute receive sizes and allocate receiving buffers
299
+ recv_counts = [0] * num_ranks
300
+
301
+ for src in range(num_ranks):
302
+ total = 0
303
+ for p in params:
304
+ state = param_to_state[id(p)]
305
+ if state.worker_rank != src:
306
+ continue
307
+ total += split_elems_for_src(p, rank, num_ranks)
308
+ recv_counts[src] = total
309
+
310
+ recv_total = sum(recv_counts)
311
+ assert recv_total > 0
312
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
313
+
314
+ #All2All
315
+ dist.all_to_all_single(
316
+ recv_buf,
317
+ send_buf,
318
+ output_split_sizes=recv_counts,
319
+ input_split_sizes=send_counts,
320
+ group=process_group,
321
  )
322
+
323
+ # Copy to pre-allocated scattered_u buffer from the received buffer
324
+ #
325
+ # recv_buf (num ranks = 3, local_rank = 0)
326
+ #
327
+ # From rank 0 From rank 1 From rank 2
328
+ # | p1_0, p2_0, p3_0 | p4_0 | p5_0, p6_0 |
329
+ #
330
+ # Outer loop:
331
+ # rank 0 -> rank 1 -> rank2
332
+ #
333
+ # Inner loop:
334
+ # src(0) : p1_0 -> p2_0 -> p3_0
335
+ # src(1) : p4_0
336
+ # src(2) : p5_0 -> p6_0
337
+
338
+ comm_stream.wait_event(alloc_event)
339
+
340
+ off = 0
341
+ for src in range(num_ranks):
342
+ block = recv_counts[src]
343
+ if block == 0:
344
+ continue
345
+
346
+ inner_off = 0
347
+ for p in params:
348
+ state = param_to_state[id(p)]
349
+ if state.worker_rank != src:
350
+ continue
351
+ n = split_elems_for_src(p, rank, num_ranks)
352
+ assert n > 0
353
+
354
+ flat_local = recv_buf.narrow(0, off + inner_off,
355
+ n).view_as(p.to_local())
356
+ state.scattered_u.copy_(flat_local)
357
+
358
+ state.scatter_event = torch.cuda.Event()
359
+ state.scatter_event.record(comm_stream)
360
+ inner_off += n
361
+
362
+ assert inner_off == block
363
+ off += block
364
 
365
 
366
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
516
  "head_dim": 128,
517
  "threshold": 100
518
  }
519
+ overlap_step : How many all2all gather, compute operations are launched in advance
520
+ before the corresponding all2all scatter steps begin.
521
+ A higher overlap_step increases memory usage but can improve
522
+ performance by overlapping communication.
523
+ Parallel muon only.
524
  """
525
 
526
  def __init__(self,
 
539
  "k_indices": [],
540
  "head_dim": 128,
541
  "threshold": 100
542
+ },
543
+ overlap_step=5):
544
  defaults = dict(
545
  lr=lr,
546
  weight_decay=weight_decay,
 
564
 
565
  super().__init__(params, defaults)
566
 
567
+ self.rank = None
 
 
 
568
 
569
  self.comm_stream = torch.cuda.Stream()
570
  self.compute_stream = torch.cuda.Stream()
571
  self.debug = debug
572
  self.clip_config = clip_config
573
+ self.overlap_step = overlap_step
574
 
575
  def _calc_flops(self, G, steps):
576
  assert len(G.shape) == 2
 
643
  if mesh is None:
644
  mesh = p.device_mesh
645
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
646
+ local_rank = dist.get_rank(group=process_group)
647
+ if self.rank is None:
648
+ self.rank = dist.get_rank(group=process_group)
649
+ else:
650
+ assert self.rank == local_rank
651
  elif mesh != p.device_mesh:
652
  raise ValueError("All parameters must be on the same mesh.")
653
 
654
+ num_ranks = dist.get_world_size(group=process_group)
655
  param_to_state[id(p)] = _muon_state()
656
+ param_to_state[id(
657
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
658
  param_to_state[id(p)].process_group = process_group
659
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
660
  param_to_state[id(p)].qk_clip_state = qk_clip_state
 
684
  else:
685
  g = buf
686
 
687
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
688
  steps=group["ns_steps"])
689
 
690
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
699
  def _update_g(self, p, g, group, momentum):
700
  # calc update
701
  state = self.state[p]
702
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
703
+ torch.add(g, buf, alpha=momentum, out=buf)
 
 
704
  if group["nesterov"]:
705
+ g.add_(buf, alpha=momentum)
706
+ return g
707
+ return buf
 
708
 
709
  @staticmethod
710
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
788
  param_to_state, ordered_params = self.init_state_and_assign_params(
789
  names, params, group, qk_logits)
790
 
791
+ assert self.rank is not None
792
+
793
+ def enqueue_all2all_gather(start_idx, chunk_size):
794
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
795
+ if target_params:
796
+ alloc_event = _alloc_gathered_grad(target_params,
797
+ param_to_state, self.rank,
798
+ self.compute_stream)
799
+ _all2all_gather(target_params, param_to_state, self.rank,
800
+ self.comm_stream, group["none_grad"],
801
+ alloc_event)
802
 
803
  def enqueue_computes(start_idx, chunk_size):
804
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
806
  _compute_u(p, state, group["ns_steps"], self.rank,
807
  self.compute_stream)
808
 
809
+ def enqueue_all2all_scatter(start_idx, chunk_size):
810
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
811
+ if target_params:
812
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
813
+ self.rank,
814
+ self.compute_stream)
815
+ _all2all_scatter(target_params, param_to_state, self.rank,
816
+ self.comm_stream, alloc_event)
817
 
818
  def enqueue_update_param(start_idx, chunk_size):
819
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
828
  # Wait grad update
829
  self.comm_stream.wait_stream(torch.cuda.current_stream())
830
 
831
+ overlap_step = self.overlap_step
832
+ for i in range(0, overlap_step):
833
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
834
+ enqueue_computes(i * chunk_size, chunk_size)
835
+
836
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
837
+ enqueue_all2all_scatter(i, chunk_size)
838
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
839
+ enqueue_update_param(i, chunk_size)
840
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
 
 
841
 
842
  # Wait the last update_param to finish
843
  torch.cuda.current_stream().wait_stream(self.compute_stream)
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_9c21645_dirty
3
- ops = torch.ops._optimizer_9c21645_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_9c21645_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_15336dc_dirty
3
+ ops = torch.ops._optimizer_15336dc_dirty
4
 
5
def add_op_namespace_prefix(op_name: str):
    """
    Qualify `op_name` with the compiled extension's torch ops namespace.
    """
    namespace = "_optimizer_15336dc_dirty"
    return f"{namespace}::{op_name}"
build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42ae6ac1cf967d7d23cac7930c8db635105f60631220a60b9cee060d082f40ae
3
  size 1824256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ca6ca8225dc9b7888566f5c7fd824234a3b4ac76718a5d18e6c75ca7acd488d
3
  size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
def get_autotune_config():
    """
    Enumerate the autotune search space for `mmt_kernel`.

    Cartesian product of block sizes, pipeline stages and warp counts,
    in the same nesting order as the original comprehension.
    """
    configs = []
    for block_m in (32, 64, 128):
        for block_k in (32, 64):
            for group_size in (8, ):
                for stages in (3, 4, 5):
                    for warps in (4, 8):
                        configs.append(
                            triton.Config(
                                {
                                    'BLOCK_SIZE_M': block_m,
                                    'BLOCK_SIZE_K': block_k,
                                    'GROUP_SIZE_M': group_size
                                },
                                num_stages=stages,
                                num_warps=warps))
    return configs
41
+
42
+
43
@triton.autotune(
    configs=get_autotune_config(),
    key=['M', 'K'],
)
@triton.jit
def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
               GROUP_SIZE_M: tl.constexpr):
    """
    Core kernel jit function of matmul_transpose that computes y = x @ x.T
    The code is a simple adaptation from the triton `matmul` tutorial:
    https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html

    Because y is symmetric, only output tiles with pid_m <= pid_n are
    computed; each off-diagonal tile is mirrored into the lower triangle
    before the kernel exits.
    """
    # Map the 1D program id to a 2D (pid_m, pid_n) tile using grouped
    # ordering for better L2 reuse (as in the triton matmul tutorial).
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # Strictly-lower-triangle tiles are produced by the mirrored store of
    # the matching upper tile, so those programs exit immediately.
    if pid_m > pid_n:
        return

    offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # we use a & b ptrs to denote different rows of x.
    a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
    b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)

    # March along K in BLOCK_SIZE_K steps, accumulating a @ b.T in fp32.
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        b = tl.load(b_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
        a_ptrs += BLOCK_SIZE_K * stride_xk
        b_ptrs += BLOCK_SIZE_K * stride_xk
    # use dtype.element_ty to accommodate different input datatypes as in cpp templates
    # https://github.com/triton-lang/triton/issues/2252
    c = accumulator.to(x.dtype.element_ty)

    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
    tl.store(c_ptrs, c, mask=c_mask)

    # transpose and copy: mirror this tile into the lower triangle. Skipped
    # for diagonal tiles (pid_m == pid_n), which would overwrite themselves.
    if pid_m < pid_n:
        ct_ptrs = y + stride_ym * offs_cn[:, None] + stride_yn * offs_cm[None, :]
        ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
        tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
def matmul_transpose_assign(d_in, d_out):
    """
    Launch the triton kernel computing `d_out = d_in @ d_in.T` in place.

    Args:
        d_in: 2D CUDA tensor of shape (M, K).
        d_out: 2D CUDA tensor of shape (M, M), same dtype and device as
            `d_in`; overwritten with the symmetric product.
    """
    assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
    assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
    assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
    assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
    assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
    assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
    # Bug fix: previously compared d_out.size(0) against itself, so a
    # non-square d_out slipped through. x @ x.T is (M, M) by construction.
    assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
        "First dimension of `d_in` must match first and second dimension of `d_out`"

    d_in = d_in.contiguous()
    M, K = d_in.shape
    # One program per pair of M-tiles; the kernel itself skips the lower
    # triangle, so roughly half of these programs exit early.
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
        M, META['BLOCK_SIZE_M']), )
    # Launch on d_in's device explicitly (autotuning / current device may differ).
    with torch.cuda.device(d_in.device.index):
        mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
                         d_out.stride(0), d_out.stride(1))
122
+
123
+
124
def matmul_transpose(d_in):
    """
    Return `d_in @ d_in.T` as a freshly allocated (M, M) tensor.
    """
    num_rows, _ = d_in.shape
    result = torch.empty((num_rows, num_rows),
                         device=d_in.device,
                         dtype=d_in.dtype)
    matmul_transpose_assign(d_in, result)
    return result
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -8,14 +8,19 @@ import torch
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
  # Muon's Newton–Schulz iteration causes high variance in singular values
17
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
 
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
21
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -27,13 +32,15 @@ def _zeropower_via_newtonschulz5(G, steps):
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
- assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
 
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
 
37
  # Perform the NS iterations
38
  for a, b, c in [
39
  (4.0848, -6.8946, 2.9270),
@@ -42,13 +49,10 @@ def _zeropower_via_newtonschulz5(G, steps):
42
  (2.8769, -3.1427, 1.2046),
43
  (2.8366, -3.0525, 1.2012),
44
  ]:
45
- A = X @ X.T
46
- # B = (
47
- # b * A + c * A @ A
48
- # )
49
- B = torch.addmm(A, A, A, alpha=c, beta=b)
50
- # X = a * X + B @ X
51
- X = torch.addmm(X, B, X, alpha=1.0, beta=a)
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
@@ -69,51 +73,142 @@ class _muon_state:
69
  qk_clip_state = None
70
 
71
 
 
 
 
 
 
 
 
 
72
  @torch.no_grad()
73
- def _gather(p, state, rank, comm_stream, none_grad):
74
  """
75
- Gather the gradients to worker_rank.
76
- If none_grad is True, free p.grad after the gather.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
  with torch.cuda.stream(comm_stream):
79
- g = p.grad
 
80
 
81
- if rank == state.worker_rank:
82
- num_ranks = dist.get_world_size(group=state.process_group)
83
- gather_list = [
84
- torch.empty_like(g.to_local(), dtype=torch.bfloat16)
85
- for _ in range(num_ranks)
86
- ]
87
- else:
88
- gather_list = None
89
-
90
- g = g.to(torch.bfloat16)
91
- torch.distributed.gather(
92
- g.to_local(),
93
- dst=state.worker_rank,
94
- gather_list=gather_list,
95
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  )
97
- if rank == state.worker_rank:
98
- if state.gathered_grad is not None:
99
- raise RuntimeError(
100
- "Gather event already exists, which should not happen.")
101
- state.gathered_grad = torch.cat(gather_list, dim=0)
102
- state.gather_event = torch.cuda.Event()
103
- state.gather_event.record()
104
- else:
105
- state.gathered_grad = None
106
- state.gather_event = None
107
- gather_list = None
108
- if none_grad:
109
- # We can safely free p.grad without calling record_stream:
110
- # p.grad.to_local().record_stream(comm_stream)
111
- # Explanation:
112
- # 1. p.grad is created on the default stream, but the default stream
113
- # is synchronized with the comm stream later.
114
- # 2. There is no further activity on the default stream before the optimizer finishes.
115
- # Therefore, it is safe to free p.grad directly on the comm stream.
116
- p.grad = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @torch.no_grad()
@@ -127,45 +222,145 @@ def _compute_u(p, state, steps, rank, compute_stream):
127
  raise RuntimeError("Gather event must be set before compute.")
128
  compute_stream.wait_event(state.gather_event)
129
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
 
130
  state.computed_u = u
131
- state.scattered_u = torch.empty_like(p.to_local(),
132
- dtype=torch.bfloat16)
133
- state.compute_event = torch.cuda.Event()
134
- state.compute_event.record()
135
- u = None
136
 
137
 
138
  @torch.no_grad()
139
- def _scatter(p, state, rank, comm_stream):
140
  """
141
- Scatter the computed_u from worker_rank to all ranks.
 
142
  """
 
 
 
 
 
 
 
 
 
 
143
 
 
 
 
 
144
  with torch.cuda.stream(comm_stream):
145
- if state.compute_event is None:
146
- raise RuntimeError("Compute event must be set before scatter.")
147
- comm_stream.wait_event(state.compute_event)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if rank == state.worker_rank:
150
- num_ranks = dist.get_world_size(group=state.process_group)
151
- # Clear the gathered gradient to free memory
152
- state.gathered_grad = None
 
 
 
 
153
 
154
- u = state.computed_u
155
- scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
156
- scatter_list = [s.contiguous() for s in scatter_list]
 
 
 
 
 
 
157
  else:
158
- scatter_list = None
 
 
159
 
160
- torch.distributed.scatter(
161
- state.scattered_u,
162
- scatter_list=scatter_list,
163
- src=state.worker_rank,
164
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- state.scatter_event = torch.cuda.Event()
167
- state.scatter_event.record()
168
- scatter_list = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -321,6 +516,11 @@ class Muon(torch.optim.Optimizer):
321
  "head_dim": 128,
322
  "threshold": 100
323
  }
 
 
 
 
 
324
  """
325
 
326
  def __init__(self,
@@ -339,7 +539,8 @@ class Muon(torch.optim.Optimizer):
339
  "k_indices": [],
340
  "head_dim": 128,
341
  "threshold": 100
342
- }):
 
343
  defaults = dict(
344
  lr=lr,
345
  weight_decay=weight_decay,
@@ -363,15 +564,13 @@ class Muon(torch.optim.Optimizer):
363
 
364
  super().__init__(params, defaults)
365
 
366
- if dist.is_initialized():
367
- self.rank = dist.get_rank()
368
- else:
369
- self.rank = None
370
 
371
  self.comm_stream = torch.cuda.Stream()
372
  self.compute_stream = torch.cuda.Stream()
373
  self.debug = debug
374
  self.clip_config = clip_config
 
375
 
376
  def _calc_flops(self, G, steps):
377
  assert len(G.shape) == 2
@@ -444,11 +643,18 @@ class Muon(torch.optim.Optimizer):
444
  if mesh is None:
445
  mesh = p.device_mesh
446
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
 
 
 
 
 
447
  elif mesh != p.device_mesh:
448
  raise ValueError("All parameters must be on the same mesh.")
449
 
 
450
  param_to_state[id(p)] = _muon_state()
451
- param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
 
452
  param_to_state[id(p)].process_group = process_group
453
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
454
  param_to_state[id(p)].qk_clip_state = qk_clip_state
@@ -478,7 +684,7 @@ class Muon(torch.optim.Optimizer):
478
  else:
479
  g = buf
480
 
481
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
482
  steps=group["ns_steps"])
483
 
484
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
@@ -493,15 +699,12 @@ class Muon(torch.optim.Optimizer):
493
  def _update_g(self, p, g, group, momentum):
494
  # calc update
495
  state = self.state[p]
496
- if "momentum_buffer" not in state:
497
- state["momentum_buffer"] = torch.zeros_like(g)
498
- buf = state["momentum_buffer"]
499
- buf.mul_(momentum).add_(g)
500
  if group["nesterov"]:
501
- g = g.add(buf, alpha=momentum)
502
- else:
503
- g = buf
504
- return g
505
 
506
  @staticmethod
507
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -585,11 +788,17 @@ class Muon(torch.optim.Optimizer):
585
  param_to_state, ordered_params = self.init_state_and_assign_params(
586
  names, params, group, qk_logits)
587
 
588
- def enqueue_gathers(start_idx, chunk_size):
589
- for p in ordered_params[start_idx:start_idx + chunk_size]:
590
- state = param_to_state[id(p)]
591
- _gather(p, state, self.rank, self.comm_stream,
592
- group["none_grad"])
 
 
 
 
 
 
593
 
594
  def enqueue_computes(start_idx, chunk_size):
595
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +806,14 @@ class Muon(torch.optim.Optimizer):
597
  _compute_u(p, state, group["ns_steps"], self.rank,
598
  self.compute_stream)
599
 
600
- def enqueue_scatters(start_idx, chunk_size):
601
- for p in ordered_params[start_idx:start_idx + chunk_size]:
602
- state = param_to_state[id(p)]
603
- _scatter(p, state, self.rank, self.comm_stream)
 
 
 
 
604
 
605
  def enqueue_update_param(start_idx, chunk_size):
606
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +828,16 @@ class Muon(torch.optim.Optimizer):
615
  # Wait grad update
616
  self.comm_stream.wait_stream(torch.cuda.current_stream())
617
 
618
- enqueue_gathers(0, chunk_size)
 
 
 
 
619
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
620
- enqueue_computes(i, chunk_size)
621
- if i > 0:
622
- enqueue_update_param(i - chunk_size, chunk_size)
623
- enqueue_gathers(i + chunk_size, chunk_size)
624
- enqueue_scatters(i, chunk_size)
625
- enqueue_update_param(i, chunk_size)
626
 
627
  # Wait the last update_param to finish
628
  torch.cuda.current_stream().wait_stream(self.compute_stream)
 
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
  # Muon's Newton–Schulz iteration causes high variance in singular values
21
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
  @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
  def _zeropower_via_newtonschulz5(G, steps):
25
  """
26
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
39
  X = X.T
40
  # Ensure spectral norm is at most 1
41
  X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
  # Perform the NS iterations
45
  for a, b, c in [
46
  (4.0848, -6.8946, 2.9270),
 
49
  (2.8769, -3.1427, 1.2046),
50
  (2.8366, -3.0525, 1.2012),
51
  ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
 
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
73
  qk_clip_state = None
74
 
75
 
76
def split_elems_for_src(param, src_rank, num_ranks) -> int:
    """
    Number of elements of `param` owned by shard `src_rank` when the first
    dimension is split as evenly as possible across `num_ranks` ranks.

    The first `rows % num_ranks` ranks each receive one extra row.
    """
    total_rows = param.shape[0]
    row_elems = int(param.numel() // total_rows)
    even_rows, extra = divmod(total_rows, num_ranks)
    shard_rows = even_rows + 1 if src_rank < extra else even_rows
    return shard_rows * row_elems
82
+
83
+
84
@torch.no_grad()
def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
    """
    Pre-allocate the full-size `gathered_grad` buffers on `compute_stream`
    before the all2all gather is launched on the communication stream.

    Only the owner rank (`worker_rank`) of a parameter allocates a buffer;
    all other ranks clear the field. Returns a CUDA event recorded on
    `compute_stream` that the comm stream must wait on before writing
    into these buffers.
    """
    with torch.cuda.stream(compute_stream):
        for p in params:
            state = param_to_state[id(p)]
            if rank == state.worker_rank:
                # Removed a dead `num_ranks = dist.get_world_size(...)`
                # local that was never used.
                # Flat buffer; reshaped to the parameter's shape once the
                # gather has filled it (see _all2all_gather).
                state.gathered_grad = torch.empty(p.grad.numel(),
                                                  dtype=COMM_DTYPE,
                                                  device="cuda")
            else:
                state.gathered_grad = None

    alloc_event = torch.cuda.Event()
    alloc_event.record(compute_stream)
    return alloc_event
104
+
105
+
106
+ @torch.no_grad()
107
+ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
108
+ alloc_event):
109
+ """
110
+ All2all gathers shards so each owner rank reconstructs its full gradient
111
  """
112
  with torch.cuda.stream(comm_stream):
113
+ process_group = param_to_state[id(params[0])].process_group
114
+ num_ranks = dist.get_world_size(group=process_group)
115
 
116
+ # Construct sending buffers
117
+ per_dst = [[] for _ in range(num_ranks)]
118
+ send_counts = [0] * num_ranks
119
+
120
+ for p in params:
121
+ state = param_to_state[id(p)]
122
+ dst = state.worker_rank
123
+ assert dst < num_ranks
124
+ shard_elems = split_elems_for_src(p, rank, num_ranks)
125
+ g = p.grad
126
+ g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
+ assert g.numel() == shard_elems
128
+ per_dst[dst].append(g)
129
+ send_counts[dst] += shard_elems
130
+
131
+ assert all(
132
+ len(v) > 0
133
+ for v in per_dst), "all params should be sharded to all devices"
134
+
135
+ send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
136
+ owned_params = [
137
+ p for p in params if param_to_state[id(p)].worker_rank == rank
138
+ ]
139
+
140
+ # Compute receive sizes and allocate receiving buffers
141
+ recv_counts = [0] * num_ranks
142
+
143
+ for src in range(num_ranks):
144
+ total = 0
145
+ for p in owned_params:
146
+ state = param_to_state[id(p)]
147
+ assert state.worker_rank == rank
148
+ total += split_elems_for_src(p, src, num_ranks)
149
+ recv_counts[src] = total
150
+
151
+ recv_total = sum(recv_counts)
152
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
153
+
154
+ #All2All
155
+ dist.all_to_all_single(
156
+ recv_buf,
157
+ send_buf,
158
+ output_split_sizes=recv_counts,
159
+ input_split_sizes=send_counts,
160
+ group=process_group,
161
  )
162
+
163
+ # Reconstructs gathered grad from the received buffer
164
+ #
165
+ # recv_buf (num ranks = 3)
166
+ #
167
+ # From rank 0 From rank 1 From rank 2
168
+ # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
169
+ #
170
+ # Outer loop:
171
+ # rank 0 -> rank 1 -> rank2
172
+ #
173
+ # Inner loop:
174
+ # p1_n -> p2_n -> p3_n
175
+
176
+ comm_stream.wait_event(alloc_event)
177
+
178
+ off = 0
179
+ write_offsets = {id(p): 0 for p in owned_params}
180
+ for src in range(num_ranks):
181
+ if recv_counts[src] == 0:
182
+ continue
183
+
184
+ block = recv_counts[src]
185
+ inner_off = 0
186
+ for p in owned_params:
187
+ state = param_to_state[id(p)]
188
+ assert state.worker_rank == rank
189
+ n = split_elems_for_src(p, src, num_ranks)
190
+ assert n > 0
191
+
192
+ sg = recv_buf.narrow(0, off + inner_off, n)
193
+ woff = write_offsets[id(p)]
194
+ dst = state.gathered_grad.narrow(0, woff, n)
195
+ dst.copy_(sg)
196
+
197
+ write_offsets[id(p)] += n
198
+ inner_off += n
199
+ off += block
200
+
201
+ for p in params:
202
+ state = param_to_state[id(p)]
203
+ if state.worker_rank == rank:
204
+ state.gathered_grad = state.gathered_grad.view_as(p)
205
+ state.gather_event = torch.cuda.Event()
206
+ state.gather_event.record(comm_stream)
207
+ else:
208
+ state.gathered_grad = None
209
+ state.gather_event = None
210
+ if none_grad:
211
+ p.grad = None
212
 
213
 
214
  @torch.no_grad()
 
222
  raise RuntimeError("Gather event must be set before compute.")
223
  compute_stream.wait_event(state.gather_event)
224
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
225
+ state.gathered_grad = None
226
  state.computed_u = u
227
+ state.compute_event = torch.cuda.Event()
228
+ state.compute_event.record()
229
+ else:
230
+ state.computed_u = None
231
+ state.compute_event = None
232
 
233
 
234
@torch.no_grad()
def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
    """
    Pre-allocate each parameter's `scattered_u` shard buffer on
    `compute_stream` before launching the all2all scatter.

    Every rank allocates a buffer sized like its local shard of the
    parameter, regardless of ownership. Returns a CUDA event recorded on
    `compute_stream`; the comm stream waits on it before writing into
    these buffers.
    """
    # NOTE(review): `rank` is accepted for signature symmetry with
    # _alloc_gathered_grad but is not used here.
    with torch.cuda.stream(compute_stream):
        for p in params:
            state = param_to_state[id(p)]
            state.scattered_u = torch.empty_like(p.to_local(),
                                                 dtype=COMM_DTYPE)

    alloc_event = torch.cuda.Event()
    alloc_event.record(compute_stream)
    return alloc_event
+ return alloc_event
249
+
250
 
251
def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
    """
    All2all scatters the orthogonalized updates back to all ranks.

    Each owner rank slices its full `computed_u` into per-destination row
    shards and sends them; every rank receives its own shard of every
    parameter and copies it into the pre-allocated `scattered_u` buffer.

    `alloc_event` was recorded by `_alloc_scattered_u` on the compute
    stream; the comm stream waits on it before writing into the buffers.
    """
    # NOTE(review): unlike the sibling helpers this function carries no
    # @torch.no_grad() decorator — confirm whether that is intentional.
    with torch.cuda.stream(comm_stream):
        # All params in one chunk are assumed to share a single process
        # group (taken from the first param).
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Construct sending buffer: owners split their full update into
        # one row-shard per destination rank.
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        if owned_params:
            for p in owned_params:
                state = param_to_state[id(p)]
                if state.compute_event is None:
                    raise RuntimeError(
                        "Compute event must be set before scatter.")
                # Do not read computed_u before _compute_u finished it.
                comm_stream.wait_event(state.compute_event)
                state.gathered_grad = None

                assert state.computed_u is not None

                u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)

                offset = 0
                for dst in range(num_ranks):
                    n = split_elems_for_src(p, dst, num_ranks)
                    assert n > 0

                    su = u_full.narrow(0, offset, n)
                    per_dst[dst].append(su)
                    send_counts[dst] += n
                    offset += n

                assert offset == u_full.numel()

        if any(len(v) > 0 for v in per_dst):
            send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        else:
            # all_to_all requires participation from all ranks.
            # Even non-owner ranks must join the collective call.
            send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")

        # Compute receive sizes and allocate receiving buffers: from each
        # source rank we receive our shard of every param that rank owns.
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                total += split_elems_for_src(p, rank, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        assert recv_total > 0
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        # All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Copy to pre-allocated scattered_u buffer from the received buffer.
        #
        # recv_buf (num ranks = 3, local_rank = 0)
        #
        #   From rank 0        From rank 1   From rank 2
        # | p1_0, p2_0, p3_0 | p4_0        | p5_0, p6_0 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank2
        #
        # Inner loop:
        #   src(0) : p1_0 -> p2_0 -> p3_0
        #   src(1) : p4_0
        #   src(2) : p5_0 -> p6_0

        # scattered_u buffers were allocated on the compute stream; do not
        # write into them before that allocation has completed.
        comm_stream.wait_event(alloc_event)

        off = 0
        for src in range(num_ranks):
            block = recv_counts[src]
            if block == 0:
                continue

            inner_off = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                n = split_elems_for_src(p, rank, num_ranks)
                assert n > 0

                flat_local = recv_buf.narrow(0, off + inner_off,
                                             n).view_as(p.to_local())
                state.scattered_u.copy_(flat_local)

                # Per-param event: _update_param waits on this before
                # consuming scattered_u.
                state.scatter_event = torch.cuda.Event()
                state.scatter_event.record(comm_stream)
                inner_off += n

            assert inner_off == block
            off += block
 
365
 
366
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
516
  "head_dim": 128,
517
  "threshold": 100
518
  }
519
+ overlap_step : How many all2all gather, compute operations are launched in advance
520
+ before the corresponding all2all scatter steps begin.
521
+ A higher overlap_step increases memory usage but can improve
522
+ performance by overlapping communication.
523
+ Parallel muon only.
524
  """
525
 
526
  def __init__(self,
 
539
  "k_indices": [],
540
  "head_dim": 128,
541
  "threshold": 100
542
+ },
543
+ overlap_step=5):
544
  defaults = dict(
545
  lr=lr,
546
  weight_decay=weight_decay,
 
564
 
565
  super().__init__(params, defaults)
566
 
567
+ self.rank = None
 
 
 
568
 
569
  self.comm_stream = torch.cuda.Stream()
570
  self.compute_stream = torch.cuda.Stream()
571
  self.debug = debug
572
  self.clip_config = clip_config
573
+ self.overlap_step = overlap_step
574
 
575
  def _calc_flops(self, G, steps):
576
  assert len(G.shape) == 2
 
643
  if mesh is None:
644
  mesh = p.device_mesh
645
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
646
+ local_rank = dist.get_rank(group=process_group)
647
+ if self.rank is None:
648
+ self.rank = dist.get_rank(group=process_group)
649
+ else:
650
+ assert self.rank == local_rank
651
  elif mesh != p.device_mesh:
652
  raise ValueError("All parameters must be on the same mesh.")
653
 
654
+ num_ranks = dist.get_world_size(group=process_group)
655
  param_to_state[id(p)] = _muon_state()
656
+ param_to_state[id(
657
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
658
  param_to_state[id(p)].process_group = process_group
659
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
660
  param_to_state[id(p)].qk_clip_state = qk_clip_state
 
684
  else:
685
  g = buf
686
 
687
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
688
  steps=group["ns_steps"])
689
 
690
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
699
    def _update_g(self, p, g, group, momentum):
        """
        Fold gradient `g` into the momentum buffer and return the update
        direction.

        The buffer update is buf <- g + momentum * buf (equivalent to
        buf.mul_(momentum).add_(g)). With nesterov, `g` is additionally
        updated in place to g + momentum * buf and returned; otherwise
        the momentum buffer itself is returned.
        """
        state = self.state[p]
        # Lazily create the per-parameter momentum buffer on first use.
        buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
        torch.add(g, buf, alpha=momentum, out=buf)
        if group["nesterov"]:
            # NOTE(review): in-place update of `g` (presumably p's
            # gradient) — callers are expected to consume it immediately
            # within this optimizer step; confirm no later reader of p.grad.
            g.add_(buf, alpha=momentum)
            return g
        return buf
 
708
 
709
  @staticmethod
710
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
788
  param_to_state, ordered_params = self.init_state_and_assign_params(
789
  names, params, group, qk_logits)
790
 
791
+ assert self.rank is not None
792
+
793
+ def enqueue_all2all_gather(start_idx, chunk_size):
794
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
795
+ if target_params:
796
+ alloc_event = _alloc_gathered_grad(target_params,
797
+ param_to_state, self.rank,
798
+ self.compute_stream)
799
+ _all2all_gather(target_params, param_to_state, self.rank,
800
+ self.comm_stream, group["none_grad"],
801
+ alloc_event)
802
 
803
  def enqueue_computes(start_idx, chunk_size):
804
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
806
  _compute_u(p, state, group["ns_steps"], self.rank,
807
  self.compute_stream)
808
 
809
+ def enqueue_all2all_scatter(start_idx, chunk_size):
810
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
811
+ if target_params:
812
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
813
+ self.rank,
814
+ self.compute_stream)
815
+ _all2all_scatter(target_params, param_to_state, self.rank,
816
+ self.comm_stream, alloc_event)
817
 
818
  def enqueue_update_param(start_idx, chunk_size):
819
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
828
  # Wait grad update
829
  self.comm_stream.wait_stream(torch.cuda.current_stream())
830
 
831
+ overlap_step = self.overlap_step
832
+ for i in range(0, overlap_step):
833
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
834
+ enqueue_computes(i * chunk_size, chunk_size)
835
+
836
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
837
+ enqueue_all2all_scatter(i, chunk_size)
838
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
839
+ enqueue_update_param(i, chunk_size)
840
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
 
 
841
 
842
  # Wait the last update_param to finish
843
  torch.cuda.current_stream().wait_stream(self.compute_stream)
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_9c21645_dirty
3
- ops = torch.ops._optimizer_9c21645_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_9c21645_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_15336dc_dirty
3
+ ops = torch.ops._optimizer_15336dc_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dae71b7e998e72130093a86f8c983c3379510e23525e3cdcd4afe5c21bf4d3db
3
  size 1883344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e06baa32b0950126ee192654bd9f7adc79cc05d8ec39d2078c70d62ee81fdcd5
3
  size 1883344
build/torch27-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
+ def get_autotune_config():
29
+ return [
30
+ triton.Config(
31
+ {
32
+ 'BLOCK_SIZE_M': blk_m,
33
+ 'BLOCK_SIZE_K': blk_k,
34
+ 'GROUP_SIZE_M': grp_sz
35
+ },
36
+ num_stages=n_stages,
37
+ num_warps=n_warps) for blk_m in [32, 64, 128]
38
+ for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
+ for n_warps in [4, 8]
40
+ ]
41
+
42
+
43
+ @triton.autotune(
44
+ configs=get_autotune_config(),
45
+ key=['M', 'K'],
46
+ )
47
+ @triton.jit
48
+ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
49
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
50
+ GROUP_SIZE_M: tl.constexpr):
51
+ """
52
+ Core kernel jit function of matmul_transpose that computes y = x @ x.T
53
+ The code is a simple adaptation from the triton `matmul` tutorial:
54
+ https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
55
+ """
56
+ pid = tl.program_id(axis=0)
57
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
58
+ num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
59
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
60
+ group_id = pid // num_pid_in_group
61
+ first_pid_m = group_id * GROUP_SIZE_M
62
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
63
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
64
+ pid_n = (pid % num_pid_in_group) // group_size_m
65
+ if pid_m > pid_n:
66
+ return
67
+
68
+ offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
69
+ offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
71
+ # we use a & b ptrs to denote different rows of x.
72
+ a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
73
+ b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
+
75
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
76
+
77
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
78
+ a = tl.load(a_ptrs,
79
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
80
+ other=0.0)
81
+ b = tl.load(b_ptrs,
82
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
83
+ other=0.0)
84
+ accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
85
+ a_ptrs += BLOCK_SIZE_K * stride_xk
86
+ b_ptrs += BLOCK_SIZE_K * stride_xk
87
+ # use dtype.element_ty to accommodate different input datatypes as in cpp templates
88
+ # https://github.com/triton-lang/triton/issues/2252
89
+ c = accumulator.to(x.dtype.element_ty)
90
+
91
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
92
+ offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
+ c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
94
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
95
+ tl.store(c_ptrs, c, mask=c_mask)
96
+
97
+ # transpose and copy
98
+ if pid_m < pid_n:
99
+ ct_ptrs = y + stride_ym * offs_cn[:,
100
+ None] + stride_yn * offs_cm[None, :]
101
+ ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
102
+ tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
+ def matmul_transpose_assign(d_in, d_out):
106
+ assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
+ assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
+ assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
+ assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
+ assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
+ assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
+ assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
+ "First dimension of `d_in` must match first and second dimension of `d_out`"
114
+
115
+ d_in = d_in.contiguous()
116
+ M, K = d_in.shape
117
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
118
+ M, META['BLOCK_SIZE_M']), )
119
+ with torch.cuda.device(d_in.device.index):
120
+ mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
+ d_out.stride(0), d_out.stride(1))
122
+
123
+
124
+ def matmul_transpose(d_in):
125
+ M, _ = d_in.shape
126
+ d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
127
+ matmul_transpose_assign(d_in, d_out)
128
+ return d_out
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -8,14 +8,19 @@ import torch
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
  # Muon's Newton–Schulz iteration causes high variance in singular values
17
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
 
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
21
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -27,13 +32,15 @@ def _zeropower_via_newtonschulz5(G, steps):
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
- assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
 
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
 
37
  # Perform the NS iterations
38
  for a, b, c in [
39
  (4.0848, -6.8946, 2.9270),
@@ -42,13 +49,10 @@ def _zeropower_via_newtonschulz5(G, steps):
42
  (2.8769, -3.1427, 1.2046),
43
  (2.8366, -3.0525, 1.2012),
44
  ]:
45
- A = X @ X.T
46
- # B = (
47
- # b * A + c * A @ A
48
- # )
49
- B = torch.addmm(A, A, A, alpha=c, beta=b)
50
- # X = a * X + B @ X
51
- X = torch.addmm(X, B, X, alpha=1.0, beta=a)
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
@@ -69,51 +73,142 @@ class _muon_state:
69
  qk_clip_state = None
70
 
71
 
 
 
 
 
 
 
 
 
72
  @torch.no_grad()
73
- def _gather(p, state, rank, comm_stream, none_grad):
74
  """
75
- Gather the gradients to worker_rank.
76
- If none_grad is True, free p.grad after the gather.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
  with torch.cuda.stream(comm_stream):
79
- g = p.grad
 
80
 
81
- if rank == state.worker_rank:
82
- num_ranks = dist.get_world_size(group=state.process_group)
83
- gather_list = [
84
- torch.empty_like(g.to_local(), dtype=torch.bfloat16)
85
- for _ in range(num_ranks)
86
- ]
87
- else:
88
- gather_list = None
89
-
90
- g = g.to(torch.bfloat16)
91
- torch.distributed.gather(
92
- g.to_local(),
93
- dst=state.worker_rank,
94
- gather_list=gather_list,
95
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  )
97
- if rank == state.worker_rank:
98
- if state.gathered_grad is not None:
99
- raise RuntimeError(
100
- "Gather event already exists, which should not happen.")
101
- state.gathered_grad = torch.cat(gather_list, dim=0)
102
- state.gather_event = torch.cuda.Event()
103
- state.gather_event.record()
104
- else:
105
- state.gathered_grad = None
106
- state.gather_event = None
107
- gather_list = None
108
- if none_grad:
109
- # We can safely free p.grad without calling record_stream:
110
- # p.grad.to_local().record_stream(comm_stream)
111
- # Explanation:
112
- # 1. p.grad is created on the default stream, but the default stream
113
- # is synchronized with the comm stream later.
114
- # 2. There is no further activity on the default stream before the optimizer finishes.
115
- # Therefore, it is safe to free p.grad directly on the comm stream.
116
- p.grad = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @torch.no_grad()
@@ -127,45 +222,145 @@ def _compute_u(p, state, steps, rank, compute_stream):
127
  raise RuntimeError("Gather event must be set before compute.")
128
  compute_stream.wait_event(state.gather_event)
129
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
 
130
  state.computed_u = u
131
- state.scattered_u = torch.empty_like(p.to_local(),
132
- dtype=torch.bfloat16)
133
- state.compute_event = torch.cuda.Event()
134
- state.compute_event.record()
135
- u = None
136
 
137
 
138
  @torch.no_grad()
139
- def _scatter(p, state, rank, comm_stream):
140
  """
141
- Scatter the computed_u from worker_rank to all ranks.
 
142
  """
 
 
 
 
 
 
 
 
 
 
143
 
 
 
 
 
144
  with torch.cuda.stream(comm_stream):
145
- if state.compute_event is None:
146
- raise RuntimeError("Compute event must be set before scatter.")
147
- comm_stream.wait_event(state.compute_event)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if rank == state.worker_rank:
150
- num_ranks = dist.get_world_size(group=state.process_group)
151
- # Clear the gathered gradient to free memory
152
- state.gathered_grad = None
 
 
 
 
153
 
154
- u = state.computed_u
155
- scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
156
- scatter_list = [s.contiguous() for s in scatter_list]
 
 
 
 
 
 
157
  else:
158
- scatter_list = None
 
 
159
 
160
- torch.distributed.scatter(
161
- state.scattered_u,
162
- scatter_list=scatter_list,
163
- src=state.worker_rank,
164
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- state.scatter_event = torch.cuda.Event()
167
- state.scatter_event.record()
168
- scatter_list = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -321,6 +516,11 @@ class Muon(torch.optim.Optimizer):
321
  "head_dim": 128,
322
  "threshold": 100
323
  }
 
 
 
 
 
324
  """
325
 
326
  def __init__(self,
@@ -339,7 +539,8 @@ class Muon(torch.optim.Optimizer):
339
  "k_indices": [],
340
  "head_dim": 128,
341
  "threshold": 100
342
- }):
 
343
  defaults = dict(
344
  lr=lr,
345
  weight_decay=weight_decay,
@@ -363,15 +564,13 @@ class Muon(torch.optim.Optimizer):
363
 
364
  super().__init__(params, defaults)
365
 
366
- if dist.is_initialized():
367
- self.rank = dist.get_rank()
368
- else:
369
- self.rank = None
370
 
371
  self.comm_stream = torch.cuda.Stream()
372
  self.compute_stream = torch.cuda.Stream()
373
  self.debug = debug
374
  self.clip_config = clip_config
 
375
 
376
  def _calc_flops(self, G, steps):
377
  assert len(G.shape) == 2
@@ -444,11 +643,18 @@ class Muon(torch.optim.Optimizer):
444
  if mesh is None:
445
  mesh = p.device_mesh
446
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
 
 
 
 
 
447
  elif mesh != p.device_mesh:
448
  raise ValueError("All parameters must be on the same mesh.")
449
 
 
450
  param_to_state[id(p)] = _muon_state()
451
- param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
 
452
  param_to_state[id(p)].process_group = process_group
453
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
454
  param_to_state[id(p)].qk_clip_state = qk_clip_state
@@ -478,7 +684,7 @@ class Muon(torch.optim.Optimizer):
478
  else:
479
  g = buf
480
 
481
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
482
  steps=group["ns_steps"])
483
 
484
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
@@ -493,15 +699,12 @@ class Muon(torch.optim.Optimizer):
493
  def _update_g(self, p, g, group, momentum):
494
  # calc update
495
  state = self.state[p]
496
- if "momentum_buffer" not in state:
497
- state["momentum_buffer"] = torch.zeros_like(g)
498
- buf = state["momentum_buffer"]
499
- buf.mul_(momentum).add_(g)
500
  if group["nesterov"]:
501
- g = g.add(buf, alpha=momentum)
502
- else:
503
- g = buf
504
- return g
505
 
506
  @staticmethod
507
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -585,11 +788,17 @@ class Muon(torch.optim.Optimizer):
585
  param_to_state, ordered_params = self.init_state_and_assign_params(
586
  names, params, group, qk_logits)
587
 
588
- def enqueue_gathers(start_idx, chunk_size):
589
- for p in ordered_params[start_idx:start_idx + chunk_size]:
590
- state = param_to_state[id(p)]
591
- _gather(p, state, self.rank, self.comm_stream,
592
- group["none_grad"])
 
 
 
 
 
 
593
 
594
  def enqueue_computes(start_idx, chunk_size):
595
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +806,14 @@ class Muon(torch.optim.Optimizer):
597
  _compute_u(p, state, group["ns_steps"], self.rank,
598
  self.compute_stream)
599
 
600
- def enqueue_scatters(start_idx, chunk_size):
601
- for p in ordered_params[start_idx:start_idx + chunk_size]:
602
- state = param_to_state[id(p)]
603
- _scatter(p, state, self.rank, self.comm_stream)
 
 
 
 
604
 
605
  def enqueue_update_param(start_idx, chunk_size):
606
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +828,16 @@ class Muon(torch.optim.Optimizer):
615
  # Wait grad update
616
  self.comm_stream.wait_stream(torch.cuda.current_stream())
617
 
618
- enqueue_gathers(0, chunk_size)
 
 
 
 
619
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
620
- enqueue_computes(i, chunk_size)
621
- if i > 0:
622
- enqueue_update_param(i - chunk_size, chunk_size)
623
- enqueue_gathers(i + chunk_size, chunk_size)
624
- enqueue_scatters(i, chunk_size)
625
- enqueue_update_param(i, chunk_size)
626
 
627
  # Wait the last update_param to finish
628
  torch.cuda.current_stream().wait_stream(self.compute_stream)
 
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
  # Muon's Newton–Schulz iteration causes high variance in singular values
21
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
  @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
  def _zeropower_via_newtonschulz5(G, steps):
25
  """
26
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
39
  X = X.T
40
  # Ensure spectral norm is at most 1
41
  X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
  # Perform the NS iterations
45
  for a, b, c in [
46
  (4.0848, -6.8946, 2.9270),
 
49
  (2.8769, -3.1427, 1.2046),
50
  (2.8366, -3.0525, 1.2012),
51
  ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
 
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
73
  qk_clip_state = None
74
 
75
 
76
def split_elems_for_src(param, src_rank, num_ranks) -> int:
    """Element count of ``param``'s shard owned by ``src_rank``.

    Rows (dim 0) are split as evenly as possible across ``num_ranks``;
    the first ``rows % num_ranks`` ranks each take one extra row.
    """
    total_rows = param.shape[0]
    elems_per_row = int(param.numel() // total_rows)
    even_rows, leftover = divmod(total_rows, num_ranks)
    rows_here = even_rows + (1 if src_rank < leftover else 0)
    return rows_here * elems_per_row
82
+
83
+
84
  @torch.no_grad()
85
+ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
86
  """
87
+ Pre-allocate gathered_grad buffer on compute_stream
88
+ before launching all2all gather
89
+ """
90
+ with torch.cuda.stream(compute_stream):
91
+ for p in params:
92
+ state = param_to_state[id(p)]
93
+ if rank == state.worker_rank:
94
+ num_ranks = dist.get_world_size(group=state.process_group)
95
+ state.gathered_grad = torch.empty(p.grad.numel(),
96
+ dtype=COMM_DTYPE,
97
+ device="cuda")
98
+ else:
99
+ state.gathered_grad = None
100
+
101
+ alloc_event = torch.cuda.Event()
102
+ alloc_event.record(compute_stream)
103
+ return alloc_event
104
+
105
+
106
@torch.no_grad()
def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
                    alloc_event):
    """
    All2all gathers shards so each owner rank reconstructs its full gradient.

    Every rank sends its local shard of every parameter's gradient to that
    parameter's worker_rank; afterwards each owner rank copies the received
    shards into the pre-allocated ``state.gathered_grad`` buffer (allocated
    by ``_alloc_gathered_grad``; ``alloc_event`` guards that allocation).
    If ``none_grad`` is True, ``p.grad`` is freed after the exchange.
    """
    with torch.cuda.stream(comm_stream):
        # All params are assumed to share one process group (first's group).
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)

        # Construct sending buffers
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        for p in params:
            state = param_to_state[id(p)]
            dst = state.worker_rank
            assert dst < num_ranks
            # Size of this rank's local shard of p (row-sharded on dim 0).
            shard_elems = split_elems_for_src(p, rank, num_ranks)
            g = p.grad
            g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
            assert g.numel() == shard_elems
            per_dst[dst].append(g)
            send_counts[dst] += shard_elems

        assert all(
            len(v) > 0
            for v in per_dst), "all params should be sharded to all devices"

        # One flat buffer: shards grouped by destination rank, in order.
        send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Compute receive sizes and allocate receiving buffers
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in owned_params:
                state = param_to_state[id(p)]
                assert state.worker_rank == rank
                total += split_elems_for_src(p, src, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        # NOTE(review): allocated on the current CUDA device ("cuda"),
        # not p.device — confirm they always coincide.
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        # All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Reconstructs gathered grad from the received buffer
        #
        # recv_buf (num ranks = 3)
        #
        #   From rank 0         From rank 1         From rank 2
        # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank2
        #
        # Inner loop:
        #   p1_n -> p2_n -> p3_n

        # Destination buffers must exist before the copies below.
        comm_stream.wait_event(alloc_event)

        off = 0
        write_offsets = {id(p): 0 for p in owned_params}
        for src in range(num_ranks):
            if recv_counts[src] == 0:
                continue

            block = recv_counts[src]
            inner_off = 0
            for p in owned_params:
                state = param_to_state[id(p)]
                assert state.worker_rank == rank
                n = split_elems_for_src(p, src, num_ranks)
                assert n > 0

                # Copy src's shard of p into the right slice of the flat
                # gathered_grad buffer (shards land in rank order).
                sg = recv_buf.narrow(0, off + inner_off, n)
                woff = write_offsets[id(p)]
                dst = state.gathered_grad.narrow(0, woff, n)
                dst.copy_(sg)

                write_offsets[id(p)] += n
                inner_off += n
            off += block

        for p in params:
            state = param_to_state[id(p)]
            if state.worker_rank == rank:
                # Flat buffer -> parameter shape; record completion so the
                # compute stream can wait on gather_event.
                state.gathered_grad = state.gathered_grad.view_as(p)
                state.gather_event = torch.cuda.Event()
                state.gather_event.record(comm_stream)
            else:
                state.gathered_grad = None
                state.gather_event = None
            if none_grad:
                # Safe to free here: the shard has been copied into send_buf
                # on this stream already.
                p.grad = None
212
 
213
 
214
  @torch.no_grad()
 
222
  raise RuntimeError("Gather event must be set before compute.")
223
  compute_stream.wait_event(state.gather_event)
224
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
225
+ state.gathered_grad = None
226
  state.computed_u = u
227
+ state.compute_event = torch.cuda.Event()
228
+ state.compute_event.record()
229
+ else:
230
+ state.computed_u = None
231
+ state.compute_event = None
232
 
233
 
234
@torch.no_grad()
def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
    """
    Pre-allocate each parameter's local ``scattered_u`` buffer on
    compute_stream before launching the all2all scatter.

    Every rank allocates a COMM_DTYPE buffer shaped like its local shard
    (``p.to_local()``).  Returns a CUDA event recorded on ``compute_stream``
    that the comm stream waits on before writing into the buffers.
    ``rank`` is unused here; the signature mirrors ``_alloc_gathered_grad``.
    """
    with torch.cuda.stream(compute_stream):
        for p in params:
            state = param_to_state[id(p)]
            state.scattered_u = torch.empty_like(p.to_local(),
                                                 dtype=COMM_DTYPE)

    alloc_event = torch.cuda.Event()
    alloc_event.record(compute_stream)
    return alloc_event
249
+
250
 
251
def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
    """
    All2all scatters the computed orthogonalized updates back to all ranks.

    Each owner rank splits its full ``computed_u`` into per-destination row
    shards; after the exchange every rank copies its received shards into
    the pre-allocated ``state.scattered_u`` buffers (guarded by
    ``alloc_event`` from ``_alloc_scattered_u``).

    NOTE(review): unlike ``_all2all_gather`` this function is not wrapped in
    ``@torch.no_grad()`` — confirm whether that is intentional.
    """
    with torch.cuda.stream(comm_stream):
        # All params are assumed to share one process group (first's group).
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Construct sending buffer
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        if owned_params:
            for p in owned_params:
                state = param_to_state[id(p)]
                if state.compute_event is None:
                    raise RuntimeError(
                        "Compute event must be set before scatter.")
                # u must be fully computed before we read it for sending.
                comm_stream.wait_event(state.compute_event)
                state.gathered_grad = None

                assert state.computed_u is not None

                u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)

                # Slice u row-shard by row-shard, one slice per destination.
                offset = 0
                for dst in range(num_ranks):
                    n = split_elems_for_src(p, dst, num_ranks)
                    assert n > 0

                    su = u_full.narrow(0, offset, n)
                    per_dst[dst].append(su)
                    send_counts[dst] += n
                    offset += n

                assert offset == u_full.numel()

        if any(len(v) > 0 for v in per_dst):
            send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        else:
            # all_to_all requires participation from all ranks
            # Even non-owner ranks must join the collective call
            send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")

        # Compute receive sizes and allocate receiving buffers
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                total += split_elems_for_src(p, rank, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        assert recv_total > 0
        # NOTE(review): allocated on the current CUDA device ("cuda"),
        # not p.device — confirm they always coincide.
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        # All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Copy to pre-allocated scattered_u buffer from the received buffer
        #
        # recv_buf (num ranks = 3, local_rank = 0)
        #
        #   From rank 0         From rank 1   From rank 2
        # | p1_0, p2_0, p3_0 | p4_0        | p5_0, p6_0 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank2
        #
        # Inner loop:
        #   src(0) : p1_0 -> p2_0 -> p3_0
        #   src(1) : p4_0
        #   src(2) : p5_0 -> p6_0

        # Destination buffers must exist before the copies below.
        comm_stream.wait_event(alloc_event)

        off = 0
        for src in range(num_ranks):
            block = recv_counts[src]
            if block == 0:
                continue

            inner_off = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                n = split_elems_for_src(p, rank, num_ranks)
                assert n > 0

                # Received shard matches this rank's local slice of p.
                flat_local = recv_buf.narrow(0, off + inner_off,
                                             n).view_as(p.to_local())
                state.scattered_u.copy_(flat_local)

                # NOTE(review): a fresh scatter_event is recorded per owned
                # param inside the loop — each param's event marks its own
                # copy; confirm consumers wait on the right one.
                state.scatter_event = torch.cuda.Event()
                state.scatter_event.record(comm_stream)
                inner_off += n

            assert inner_off == block
            off += block
364
 
365
 
366
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
516
  "head_dim": 128,
517
  "threshold": 100
518
  }
519
+ overlap_step : How many all2all gather, compute operations are launched in advance
520
+ before the corresponding all2all scatter steps begin.
521
+ A higher overlap_step increases memory usage but can improve
522
+ performance by overlapping communication.
523
+ Parallel muon only.
524
  """
525
 
526
  def __init__(self,
 
539
  "k_indices": [],
540
  "head_dim": 128,
541
  "threshold": 100
542
+ },
543
+ overlap_step=5):
544
  defaults = dict(
545
  lr=lr,
546
  weight_decay=weight_decay,
 
564
 
565
  super().__init__(params, defaults)
566
 
567
+ self.rank = None
 
 
 
568
 
569
  self.comm_stream = torch.cuda.Stream()
570
  self.compute_stream = torch.cuda.Stream()
571
  self.debug = debug
572
  self.clip_config = clip_config
573
+ self.overlap_step = overlap_step
574
 
575
  def _calc_flops(self, G, steps):
576
  assert len(G.shape) == 2
 
643
  if mesh is None:
644
  mesh = p.device_mesh
645
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
646
+ local_rank = dist.get_rank(group=process_group)
647
+ if self.rank is None:
648
+ self.rank = dist.get_rank(group=process_group)
649
+ else:
650
+ assert self.rank == local_rank
651
  elif mesh != p.device_mesh:
652
  raise ValueError("All parameters must be on the same mesh.")
653
 
654
+ num_ranks = dist.get_world_size(group=process_group)
655
  param_to_state[id(p)] = _muon_state()
656
+ param_to_state[id(
657
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
658
  param_to_state[id(p)].process_group = process_group
659
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
660
  param_to_state[id(p)].qk_clip_state = qk_clip_state
 
684
  else:
685
  g = buf
686
 
687
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
688
  steps=group["ns_steps"])
689
 
690
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
699
    def _update_g(self, p, g, group, momentum):
        # calc update
        state = self.state[p]
        # Lazily create the momentum buffer on first use for this param.
        buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
        # buf <- g + momentum * buf  (momentum accumulation, written in place).
        torch.add(g, buf, alpha=momentum, out=buf)
        if group["nesterov"]:
            # Nesterov look-ahead: g + momentum * buf.
            # NOTE(review): this mutates the caller's gradient tensor `g`
            # in place (the pre-merge version used non-mutating `g.add`) —
            # confirm no caller reuses the original gradient values.
            g.add_(buf, alpha=momentum)
            return g
        return buf
 
708
 
709
  @staticmethod
710
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
788
  param_to_state, ordered_params = self.init_state_and_assign_params(
789
  names, params, group, qk_logits)
790
 
791
+ assert self.rank is not None
792
+
793
+ def enqueue_all2all_gather(start_idx, chunk_size):
794
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
795
+ if target_params:
796
+ alloc_event = _alloc_gathered_grad(target_params,
797
+ param_to_state, self.rank,
798
+ self.compute_stream)
799
+ _all2all_gather(target_params, param_to_state, self.rank,
800
+ self.comm_stream, group["none_grad"],
801
+ alloc_event)
802
 
803
  def enqueue_computes(start_idx, chunk_size):
804
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
806
  _compute_u(p, state, group["ns_steps"], self.rank,
807
  self.compute_stream)
808
 
809
+ def enqueue_all2all_scatter(start_idx, chunk_size):
810
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
811
+ if target_params:
812
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
813
+ self.rank,
814
+ self.compute_stream)
815
+ _all2all_scatter(target_params, param_to_state, self.rank,
816
+ self.comm_stream, alloc_event)
817
 
818
  def enqueue_update_param(start_idx, chunk_size):
819
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
828
  # Wait grad update
829
  self.comm_stream.wait_stream(torch.cuda.current_stream())
830
 
831
+ overlap_step = self.overlap_step
832
+ for i in range(0, overlap_step):
833
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
834
+ enqueue_computes(i * chunk_size, chunk_size)
835
+
836
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
837
+ enqueue_all2all_scatter(i, chunk_size)
838
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
839
+ enqueue_update_param(i, chunk_size)
840
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
 
 
841
 
842
  # Wait the last update_param to finish
843
  torch.cuda.current_stream().wait_stream(self.compute_stream)
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_9c21645_dirty
3
- ops = torch.ops._optimizer_9c21645_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_9c21645_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_15336dc_dirty
3
+ ops = torch.ops._optimizer_15336dc_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_15336dc_dirty::{op_name}"
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_9c21645_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41492cb1479920b654768a5597d88670dd0caeedbdcd73fd63afa31ffc6961d6
3
  size 1749776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7cf2f7b8519dbc3f20e9d151914b55e56d10c012e2232d550b7c8d262746d71
3
  size 1749776
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
+ def get_autotune_config():
29
+ return [
30
+ triton.Config(
31
+ {
32
+ 'BLOCK_SIZE_M': blk_m,
33
+ 'BLOCK_SIZE_K': blk_k,
34
+ 'GROUP_SIZE_M': grp_sz
35
+ },
36
+ num_stages=n_stages,
37
+ num_warps=n_warps) for blk_m in [32, 64, 128]
38
+ for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
+ for n_warps in [4, 8]
40
+ ]
41
+
42
+
43
+ @triton.autotune(
44
+ configs=get_autotune_config(),
45
+ key=['M', 'K'],
46
+ )
47
+ @triton.jit
48
+ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
49
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
50
+ GROUP_SIZE_M: tl.constexpr):
51
+ """
52
+ Core kernel jit function of matmul_transpose that computes y = x @ x.T
53
+ The code is a simple adaptation from the triton `matmul` tutorial:
54
+ https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
55
+ """
56
+ pid = tl.program_id(axis=0)
57
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
58
+ num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
59
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
60
+ group_id = pid // num_pid_in_group
61
+ first_pid_m = group_id * GROUP_SIZE_M
62
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
63
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
64
+ pid_n = (pid % num_pid_in_group) // group_size_m
65
+ if pid_m > pid_n:
66
+ return
67
+
68
+ offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
69
+ offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
71
+ # we use a & b ptrs to denote different rows of x.
72
+ a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
73
+ b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
+
75
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
76
+
77
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
78
+ a = tl.load(a_ptrs,
79
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
80
+ other=0.0)
81
+ b = tl.load(b_ptrs,
82
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
83
+ other=0.0)
84
+ accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
85
+ a_ptrs += BLOCK_SIZE_K * stride_xk
86
+ b_ptrs += BLOCK_SIZE_K * stride_xk
87
+ # use dtype.element_ty to accommodate different input datatypes as in cpp templates
88
+ # https://github.com/triton-lang/triton/issues/2252
89
+ c = accumulator.to(x.dtype.element_ty)
90
+
91
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
92
+ offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
+ c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
94
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
95
+ tl.store(c_ptrs, c, mask=c_mask)
96
+
97
+ # transpose and copy
98
+ if pid_m < pid_n:
99
+ ct_ptrs = y + stride_ym * offs_cn[:,
100
+ None] + stride_yn * offs_cm[None, :]
101
+ ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
102
+ tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
+ def matmul_transpose_assign(d_in, d_out):
106
+ assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
+ assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
+ assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
+ assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
+ assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
+ assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
+ assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
+ "First dimension of `d_in` must match first and second dimension of `d_out`"
114
+
115
+ d_in = d_in.contiguous()
116
+ M, K = d_in.shape
117
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
118
+ M, META['BLOCK_SIZE_M']), )
119
+ with torch.cuda.device(d_in.device.index):
120
+ mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
+ d_out.stride(0), d_out.stride(1))
122
+
123
+
124
+ def matmul_transpose(d_in):
125
+ M, _ = d_in.shape
126
+ d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
127
+ matmul_transpose_assign(d_in, d_out)
128
+ return d_out
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -8,14 +8,19 @@ import torch
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
  # Muon's Newton–Schulz iteration causes high variance in singular values
17
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
 
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
21
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -27,13 +32,15 @@ def _zeropower_via_newtonschulz5(G, steps):
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
- assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
 
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
 
37
  # Perform the NS iterations
38
  for a, b, c in [
39
  (4.0848, -6.8946, 2.9270),
@@ -42,13 +49,10 @@ def _zeropower_via_newtonschulz5(G, steps):
42
  (2.8769, -3.1427, 1.2046),
43
  (2.8366, -3.0525, 1.2012),
44
  ]:
45
- A = X @ X.T
46
- # B = (
47
- # b * A + c * A @ A
48
- # )
49
- B = torch.addmm(A, A, A, alpha=c, beta=b)
50
- # X = a * X + B @ X
51
- X = torch.addmm(X, B, X, alpha=1.0, beta=a)
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
@@ -69,51 +73,142 @@ class _muon_state:
69
  qk_clip_state = None
70
 
71
 
 
 
 
 
 
 
 
 
72
  @torch.no_grad()
73
- def _gather(p, state, rank, comm_stream, none_grad):
74
  """
75
- Gather the gradients to worker_rank.
76
- If none_grad is True, free p.grad after the gather.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
  with torch.cuda.stream(comm_stream):
79
- g = p.grad
 
80
 
81
- if rank == state.worker_rank:
82
- num_ranks = dist.get_world_size(group=state.process_group)
83
- gather_list = [
84
- torch.empty_like(g.to_local(), dtype=torch.bfloat16)
85
- for _ in range(num_ranks)
86
- ]
87
- else:
88
- gather_list = None
89
-
90
- g = g.to(torch.bfloat16)
91
- torch.distributed.gather(
92
- g.to_local(),
93
- dst=state.worker_rank,
94
- gather_list=gather_list,
95
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  )
97
- if rank == state.worker_rank:
98
- if state.gathered_grad is not None:
99
- raise RuntimeError(
100
- "Gather event already exists, which should not happen.")
101
- state.gathered_grad = torch.cat(gather_list, dim=0)
102
- state.gather_event = torch.cuda.Event()
103
- state.gather_event.record()
104
- else:
105
- state.gathered_grad = None
106
- state.gather_event = None
107
- gather_list = None
108
- if none_grad:
109
- # We can safely free p.grad without calling record_stream:
110
- # p.grad.to_local().record_stream(comm_stream)
111
- # Explanation:
112
- # 1. p.grad is created on the default stream, but the default stream
113
- # is synchronized with the comm stream later.
114
- # 2. There is no further activity on the default stream before the optimizer finishes.
115
- # Therefore, it is safe to free p.grad directly on the comm stream.
116
- p.grad = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @torch.no_grad()
@@ -127,45 +222,145 @@ def _compute_u(p, state, steps, rank, compute_stream):
127
  raise RuntimeError("Gather event must be set before compute.")
128
  compute_stream.wait_event(state.gather_event)
129
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
 
130
  state.computed_u = u
131
- state.scattered_u = torch.empty_like(p.to_local(),
132
- dtype=torch.bfloat16)
133
- state.compute_event = torch.cuda.Event()
134
- state.compute_event.record()
135
- u = None
136
 
137
 
138
  @torch.no_grad()
139
- def _scatter(p, state, rank, comm_stream):
140
  """
141
- Scatter the computed_u from worker_rank to all ranks.
 
142
  """
 
 
 
 
 
 
 
 
 
 
143
 
 
 
 
 
144
  with torch.cuda.stream(comm_stream):
145
- if state.compute_event is None:
146
- raise RuntimeError("Compute event must be set before scatter.")
147
- comm_stream.wait_event(state.compute_event)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if rank == state.worker_rank:
150
- num_ranks = dist.get_world_size(group=state.process_group)
151
- # Clear the gathered gradient to free memory
152
- state.gathered_grad = None
 
 
 
 
153
 
154
- u = state.computed_u
155
- scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
156
- scatter_list = [s.contiguous() for s in scatter_list]
 
 
 
 
 
 
157
  else:
158
- scatter_list = None
 
 
159
 
160
- torch.distributed.scatter(
161
- state.scattered_u,
162
- scatter_list=scatter_list,
163
- src=state.worker_rank,
164
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- state.scatter_event = torch.cuda.Event()
167
- state.scatter_event.record()
168
- scatter_list = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -321,6 +516,11 @@ class Muon(torch.optim.Optimizer):
321
  "head_dim": 128,
322
  "threshold": 100
323
  }
 
 
 
 
 
324
  """
325
 
326
  def __init__(self,
@@ -339,7 +539,8 @@ class Muon(torch.optim.Optimizer):
339
  "k_indices": [],
340
  "head_dim": 128,
341
  "threshold": 100
342
- }):
 
343
  defaults = dict(
344
  lr=lr,
345
  weight_decay=weight_decay,
@@ -363,15 +564,13 @@ class Muon(torch.optim.Optimizer):
363
 
364
  super().__init__(params, defaults)
365
 
366
- if dist.is_initialized():
367
- self.rank = dist.get_rank()
368
- else:
369
- self.rank = None
370
 
371
  self.comm_stream = torch.cuda.Stream()
372
  self.compute_stream = torch.cuda.Stream()
373
  self.debug = debug
374
  self.clip_config = clip_config
 
375
 
376
  def _calc_flops(self, G, steps):
377
  assert len(G.shape) == 2
@@ -444,11 +643,18 @@ class Muon(torch.optim.Optimizer):
444
  if mesh is None:
445
  mesh = p.device_mesh
446
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
 
 
 
 
 
447
  elif mesh != p.device_mesh:
448
  raise ValueError("All parameters must be on the same mesh.")
449
 
 
450
  param_to_state[id(p)] = _muon_state()
451
- param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
 
452
  param_to_state[id(p)].process_group = process_group
453
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
454
  param_to_state[id(p)].qk_clip_state = qk_clip_state
@@ -478,7 +684,7 @@ class Muon(torch.optim.Optimizer):
478
  else:
479
  g = buf
480
 
481
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
482
  steps=group["ns_steps"])
483
 
484
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
@@ -493,15 +699,12 @@ class Muon(torch.optim.Optimizer):
493
  def _update_g(self, p, g, group, momentum):
494
  # calc update
495
  state = self.state[p]
496
- if "momentum_buffer" not in state:
497
- state["momentum_buffer"] = torch.zeros_like(g)
498
- buf = state["momentum_buffer"]
499
- buf.mul_(momentum).add_(g)
500
  if group["nesterov"]:
501
- g = g.add(buf, alpha=momentum)
502
- else:
503
- g = buf
504
- return g
505
 
506
  @staticmethod
507
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -585,11 +788,17 @@ class Muon(torch.optim.Optimizer):
585
  param_to_state, ordered_params = self.init_state_and_assign_params(
586
  names, params, group, qk_logits)
587
 
588
- def enqueue_gathers(start_idx, chunk_size):
589
- for p in ordered_params[start_idx:start_idx + chunk_size]:
590
- state = param_to_state[id(p)]
591
- _gather(p, state, self.rank, self.comm_stream,
592
- group["none_grad"])
 
 
 
 
 
 
593
 
594
  def enqueue_computes(start_idx, chunk_size):
595
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +806,14 @@ class Muon(torch.optim.Optimizer):
597
  _compute_u(p, state, group["ns_steps"], self.rank,
598
  self.compute_stream)
599
 
600
- def enqueue_scatters(start_idx, chunk_size):
601
- for p in ordered_params[start_idx:start_idx + chunk_size]:
602
- state = param_to_state[id(p)]
603
- _scatter(p, state, self.rank, self.comm_stream)
 
 
 
 
604
 
605
  def enqueue_update_param(start_idx, chunk_size):
606
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +828,16 @@ class Muon(torch.optim.Optimizer):
615
  # Wait grad update
616
  self.comm_stream.wait_stream(torch.cuda.current_stream())
617
 
618
- enqueue_gathers(0, chunk_size)
 
 
 
 
619
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
620
- enqueue_computes(i, chunk_size)
621
- if i > 0:
622
- enqueue_update_param(i - chunk_size, chunk_size)
623
- enqueue_gathers(i + chunk_size, chunk_size)
624
- enqueue_scatters(i, chunk_size)
625
- enqueue_update_param(i, chunk_size)
626
 
627
  # Wait the last update_param to finish
628
  torch.cuda.current_stream().wait_stream(self.compute_stream)
 
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
  # Muon's Newton–Schulz iteration causes high variance in singular values
21
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
  @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
  def _zeropower_via_newtonschulz5(G, steps):
25
  """
26
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
39
  X = X.T
40
  # Ensure spectral norm is at most 1
41
  X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
  # Perform the NS iterations
45
  for a, b, c in [
46
  (4.0848, -6.8946, 2.9270),
 
49
  (2.8769, -3.1427, 1.2046),
50
  (2.8366, -3.0525, 1.2012),
51
  ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
 
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
73
  qk_clip_state = None
74
 
75
 
76
+ def split_elems_for_src(param, src_rank, num_ranks) -> int:
77
+ rows = param.shape[0]
78
+ cols = int(param.numel() // rows)
79
+ base, rem = divmod(rows, num_ranks)
80
+ my_rows = base + (1 if src_rank < rem else 0)
81
+ return my_rows * cols
82
+
83
+
84
  @torch.no_grad()
85
+ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
86
  """
87
+ Pre-allocate gathered_grad buffer on compute_stream
88
+ before launching all2all gather
89
+ """
90
+ with torch.cuda.stream(compute_stream):
91
+ for p in params:
92
+ state = param_to_state[id(p)]
93
+ if rank == state.worker_rank:
94
+ num_ranks = dist.get_world_size(group=state.process_group)
95
+ state.gathered_grad = torch.empty(p.grad.numel(),
96
+ dtype=COMM_DTYPE,
97
+ device="cuda")
98
+ else:
99
+ state.gathered_grad = None
100
+
101
+ alloc_event = torch.cuda.Event()
102
+ alloc_event.record(compute_stream)
103
+ return alloc_event
104
+
105
+
106
+ @torch.no_grad()
107
+ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
108
+ alloc_event):
109
+ """
110
+ All2all gathers shards so each owner rank reconstructs its full gradient
111
  """
112
  with torch.cuda.stream(comm_stream):
113
+ process_group = param_to_state[id(params[0])].process_group
114
+ num_ranks = dist.get_world_size(group=process_group)
115
 
116
+ # Construct sending buffers
117
+ per_dst = [[] for _ in range(num_ranks)]
118
+ send_counts = [0] * num_ranks
119
+
120
+ for p in params:
121
+ state = param_to_state[id(p)]
122
+ dst = state.worker_rank
123
+ assert dst < num_ranks
124
+ shard_elems = split_elems_for_src(p, rank, num_ranks)
125
+ g = p.grad
126
+ g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
+ assert g.numel() == shard_elems
128
+ per_dst[dst].append(g)
129
+ send_counts[dst] += shard_elems
130
+
131
+ assert all(
132
+ len(v) > 0
133
+ for v in per_dst), "all params should be sharded to all devices"
134
+
135
+ send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
136
+ owned_params = [
137
+ p for p in params if param_to_state[id(p)].worker_rank == rank
138
+ ]
139
+
140
+ # Compute receive sizes and allocate receiving buffers
141
+ recv_counts = [0] * num_ranks
142
+
143
+ for src in range(num_ranks):
144
+ total = 0
145
+ for p in owned_params:
146
+ state = param_to_state[id(p)]
147
+ assert state.worker_rank == rank
148
+ total += split_elems_for_src(p, src, num_ranks)
149
+ recv_counts[src] = total
150
+
151
+ recv_total = sum(recv_counts)
152
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
153
+
154
+ #All2All
155
+ dist.all_to_all_single(
156
+ recv_buf,
157
+ send_buf,
158
+ output_split_sizes=recv_counts,
159
+ input_split_sizes=send_counts,
160
+ group=process_group,
161
  )
162
+
163
+ # Reconstructs gathered grad from the received buffer
164
+ #
165
+ # recv_buf (num ranks = 3)
166
+ #
167
+ # From rank 0 From rank 1 From rank 2
168
+ # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
169
+ #
170
+ # Outer loop:
171
+ # rank 0 -> rank 1 -> rank2
172
+ #
173
+ # Inner loop:
174
+ # p1_n -> p2_n -> p3_n
175
+
176
+ comm_stream.wait_event(alloc_event)
177
+
178
+ off = 0
179
+ write_offsets = {id(p): 0 for p in owned_params}
180
+ for src in range(num_ranks):
181
+ if recv_counts[src] == 0:
182
+ continue
183
+
184
+ block = recv_counts[src]
185
+ inner_off = 0
186
+ for p in owned_params:
187
+ state = param_to_state[id(p)]
188
+ assert state.worker_rank == rank
189
+ n = split_elems_for_src(p, src, num_ranks)
190
+ assert n > 0
191
+
192
+ sg = recv_buf.narrow(0, off + inner_off, n)
193
+ woff = write_offsets[id(p)]
194
+ dst = state.gathered_grad.narrow(0, woff, n)
195
+ dst.copy_(sg)
196
+
197
+ write_offsets[id(p)] += n
198
+ inner_off += n
199
+ off += block
200
+
201
+ for p in params:
202
+ state = param_to_state[id(p)]
203
+ if state.worker_rank == rank:
204
+ state.gathered_grad = state.gathered_grad.view_as(p)
205
+ state.gather_event = torch.cuda.Event()
206
+ state.gather_event.record(comm_stream)
207
+ else:
208
+ state.gathered_grad = None
209
+ state.gather_event = None
210
+ if none_grad:
211
+ p.grad = None
212
 
213
 
214
  @torch.no_grad()
 
222
  raise RuntimeError("Gather event must be set before compute.")
223
  compute_stream.wait_event(state.gather_event)
224
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
225
+ state.gathered_grad = None
226
  state.computed_u = u
227
+ state.compute_event = torch.cuda.Event()
228
+ state.compute_event.record()
229
+ else:
230
+ state.computed_u = None
231
+ state.compute_event = None
232
 
233
 
234
  @torch.no_grad()
235
+ def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
236
  """
237
+ Pre-allocate scattered_u buffer on compute_stream
238
+ before launching all2all gather
239
  """
240
+ with torch.cuda.stream(compute_stream):
241
+ for p in params:
242
+ state = param_to_state[id(p)]
243
+ state.scattered_u = torch.empty_like(p.to_local(),
244
+ dtype=COMM_DTYPE)
245
+
246
+ alloc_event = torch.cuda.Event()
247
+ alloc_event.record(compute_stream)
248
+ return alloc_event
249
+
250
 
251
+ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
252
+ """
253
+ All2all scatters full gradients to all ranks
254
+ """
255
  with torch.cuda.stream(comm_stream):
256
+ process_group = param_to_state[id(params[0])].process_group
257
+ num_ranks = dist.get_world_size(group=process_group)
258
+ owned_params = [
259
+ p for p in params if param_to_state[id(p)].worker_rank == rank
260
+ ]
261
+
262
+ # Construct sending buffer
263
+ per_dst = [[] for _ in range(num_ranks)]
264
+ send_counts = [0] * num_ranks
265
+
266
+ if owned_params:
267
+ for p in owned_params:
268
+ state = param_to_state[id(p)]
269
+ if state.compute_event is None:
270
+ raise RuntimeError(
271
+ "Compute event must be set before scatter.")
272
+ comm_stream.wait_event(state.compute_event)
273
+ state.gathered_grad = None
274
 
275
+ assert state.computed_u is not None
276
+
277
+ u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)
278
+
279
+ offset = 0
280
+ for dst in range(num_ranks):
281
+ n = split_elems_for_src(p, dst, num_ranks)
282
+ assert n > 0
283
 
284
+ su = u_full.narrow(0, offset, n)
285
+ per_dst[dst].append(su)
286
+ send_counts[dst] += n
287
+ offset += n
288
+
289
+ assert offset == u_full.numel()
290
+
291
+ if any(len(v) > 0 for v in per_dst):
292
+ send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
293
  else:
294
+ # all_to_all requires participation from all ranks
295
+ # Even non-owner ranks must join the collective call
296
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
297
 
298
+ # Compute receive sizes and allocate receiving buffers
299
+ recv_counts = [0] * num_ranks
300
+
301
+ for src in range(num_ranks):
302
+ total = 0
303
+ for p in params:
304
+ state = param_to_state[id(p)]
305
+ if state.worker_rank != src:
306
+ continue
307
+ total += split_elems_for_src(p, rank, num_ranks)
308
+ recv_counts[src] = total
309
+
310
+ recv_total = sum(recv_counts)
311
+ assert recv_total > 0
312
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
313
+
314
+ #All2All
315
+ dist.all_to_all_single(
316
+ recv_buf,
317
+ send_buf,
318
+ output_split_sizes=recv_counts,
319
+ input_split_sizes=send_counts,
320
+ group=process_group,
321
  )
322
+
323
+ # Copy to pre-allocated scattered_u buffer from the received buffer
324
+ #
325
+ # recv_buf (num ranks = 3, local_rank = 0)
326
+ #
327
+ # From rank 0 From rank 1 From rank 2
328
+ # | p1_0, p2_0, p3_0 | p4_0 | p5_0, p6_0 |
329
+ #
330
+ # Outer loop:
331
+ # rank 0 -> rank 1 -> rank2
332
+ #
333
+ # Inner loop:
334
+ # src(0) : p1_0 -> p2_0 -> p3_0
335
+ # src(1) : p4_0
336
+ # src(2) : p5_0 -> p6_0
337
+
338
+ comm_stream.wait_event(alloc_event)
339
+
340
+ off = 0
341
+ for src in range(num_ranks):
342
+ block = recv_counts[src]
343
+ if block == 0:
344
+ continue
345
+
346
+ inner_off = 0
347
+ for p in params:
348
+ state = param_to_state[id(p)]
349
+ if state.worker_rank != src:
350
+ continue
351
+ n = split_elems_for_src(p, rank, num_ranks)
352
+ assert n > 0
353
+
354
+ flat_local = recv_buf.narrow(0, off + inner_off,
355
+ n).view_as(p.to_local())
356
+ state.scattered_u.copy_(flat_local)
357
+
358
+ state.scatter_event = torch.cuda.Event()
359
+ state.scatter_event.record(comm_stream)
360
+ inner_off += n
361
+
362
+ assert inner_off == block
363
+ off += block
364
 
365
 
366
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
516
  "head_dim": 128,
517
  "threshold": 100
518
  }
519
+ overlap_step : How many all2all gather, compute operations are launched in advance
520
+ before the corresponding all2all scatter steps begin.
521
+ A higher overlap_step increases memory usage but can improve
522
+ performance by overlapping communication.
523
+ Parallel muon only.
524
  """
525
 
526
  def __init__(self,
 
539
  "k_indices": [],
540
  "head_dim": 128,
541
  "threshold": 100
542
+ },
543
+ overlap_step=5):
544
  defaults = dict(
545
  lr=lr,
546
  weight_decay=weight_decay,
 
564
 
565
  super().__init__(params, defaults)
566
 
567
+ self.rank = None
 
 
 
568
 
569
  self.comm_stream = torch.cuda.Stream()
570
  self.compute_stream = torch.cuda.Stream()
571
  self.debug = debug
572
  self.clip_config = clip_config
573
+ self.overlap_step = overlap_step
574
 
575
  def _calc_flops(self, G, steps):
576
  assert len(G.shape) == 2
 
643
  if mesh is None:
644
  mesh = p.device_mesh
645
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
646
+ local_rank = dist.get_rank(group=process_group)
647
+ if self.rank is None:
648
+ self.rank = dist.get_rank(group=process_group)
649
+ else:
650
+ assert self.rank == local_rank
651
  elif mesh != p.device_mesh:
652
  raise ValueError("All parameters must be on the same mesh.")
653
 
654
+ num_ranks = dist.get_world_size(group=process_group)
655
  param_to_state[id(p)] = _muon_state()
656
+ param_to_state[id(
657
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
658
  param_to_state[id(p)].process_group = process_group
659
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
660
  param_to_state[id(p)].qk_clip_state = qk_clip_state
 
684
  else:
685
  g = buf
686
 
687
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
688
  steps=group["ns_steps"])
689
 
690
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
699
  def _update_g(self, p, g, group, momentum):
700
  # calc update
701
  state = self.state[p]
702
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
703
+ torch.add(g, buf, alpha=momentum, out=buf)
 
 
704
  if group["nesterov"]:
705
+ g.add_(buf, alpha=momentum)
706
+ return g
707
+ return buf
 
708
 
709
  @staticmethod
710
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
788
  param_to_state, ordered_params = self.init_state_and_assign_params(
789
  names, params, group, qk_logits)
790
 
791
+ assert self.rank is not None
792
+
793
+ def enqueue_all2all_gather(start_idx, chunk_size):
794
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
795
+ if target_params:
796
+ alloc_event = _alloc_gathered_grad(target_params,
797
+ param_to_state, self.rank,
798
+ self.compute_stream)
799
+ _all2all_gather(target_params, param_to_state, self.rank,
800
+ self.comm_stream, group["none_grad"],
801
+ alloc_event)
802
 
803
  def enqueue_computes(start_idx, chunk_size):
804
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
806
  _compute_u(p, state, group["ns_steps"], self.rank,
807
  self.compute_stream)
808
 
809
+ def enqueue_all2all_scatter(start_idx, chunk_size):
810
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
811
+ if target_params:
812
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
813
+ self.rank,
814
+ self.compute_stream)
815
+ _all2all_scatter(target_params, param_to_state, self.rank,
816
+ self.comm_stream, alloc_event)
817
 
818
  def enqueue_update_param(start_idx, chunk_size):
819
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
828
  # Wait grad update
829
  self.comm_stream.wait_stream(torch.cuda.current_stream())
830
 
831
+ overlap_step = self.overlap_step
832
+ for i in range(0, overlap_step):
833
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
834
+ enqueue_computes(i * chunk_size, chunk_size)
835
+
836
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
837
+ enqueue_all2all_scatter(i, chunk_size)
838
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
839
+ enqueue_update_param(i, chunk_size)
840
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
 
 
841
 
842
  # Wait the last update_param to finish
843
  torch.cuda.current_stream().wait_stream(self.compute_stream)
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_9c21645_dirty
3
- ops = torch.ops._optimizer_9c21645_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_9c21645_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_15336dc_dirty
3
+ ops = torch.ops._optimizer_15336dc_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42ae6ac1cf967d7d23cac7930c8db635105f60631220a60b9cee060d082f40ae
3
  size 1824256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ca6ca8225dc9b7888566f5c7fd824234a3b4ac76718a5d18e6c75ca7acd488d
3
  size 1824256
build/torch28-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
+ def get_autotune_config():
29
+ return [
30
+ triton.Config(
31
+ {
32
+ 'BLOCK_SIZE_M': blk_m,
33
+ 'BLOCK_SIZE_K': blk_k,
34
+ 'GROUP_SIZE_M': grp_sz
35
+ },
36
+ num_stages=n_stages,
37
+ num_warps=n_warps) for blk_m in [32, 64, 128]
38
+ for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
+ for n_warps in [4, 8]
40
+ ]
41
+
42
+
43
+ @triton.autotune(
44
+ configs=get_autotune_config(),
45
+ key=['M', 'K'],
46
+ )
47
+ @triton.jit
48
+ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
49
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
50
+ GROUP_SIZE_M: tl.constexpr):
51
+ """
52
+ Core kernel jit function of matmul_transpose that computes y = x @ x.T
53
+ The code is a simple adaptation from the triton `matmul` tutorial:
54
+ https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
55
+ """
56
+ pid = tl.program_id(axis=0)
57
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
58
+ num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
59
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
60
+ group_id = pid // num_pid_in_group
61
+ first_pid_m = group_id * GROUP_SIZE_M
62
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
63
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
64
+ pid_n = (pid % num_pid_in_group) // group_size_m
65
+ if pid_m > pid_n:
66
+ return
67
+
68
+ offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
69
+ offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
71
+ # we use a & b ptrs to denote different rows of x.
72
+ a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
73
+ b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
+
75
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
76
+
77
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
78
+ a = tl.load(a_ptrs,
79
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
80
+ other=0.0)
81
+ b = tl.load(b_ptrs,
82
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
83
+ other=0.0)
84
+ accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
85
+ a_ptrs += BLOCK_SIZE_K * stride_xk
86
+ b_ptrs += BLOCK_SIZE_K * stride_xk
87
+ # use dtype.element_ty to accommodate different input datatypes as in cpp templates
88
+ # https://github.com/triton-lang/triton/issues/2252
89
+ c = accumulator.to(x.dtype.element_ty)
90
+
91
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
92
+ offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
+ c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
94
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
95
+ tl.store(c_ptrs, c, mask=c_mask)
96
+
97
+ # transpose and copy
98
+ if pid_m < pid_n:
99
+ ct_ptrs = y + stride_ym * offs_cn[:,
100
+ None] + stride_yn * offs_cm[None, :]
101
+ ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
102
+ tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
+ def matmul_transpose_assign(d_in, d_out):
106
+ assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
+ assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
+ assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
+ assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
+ assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
+ assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
+ assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
+ "First dimension of `d_in` must match first and second dimension of `d_out`"
114
+
115
+ d_in = d_in.contiguous()
116
+ M, K = d_in.shape
117
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
118
+ M, META['BLOCK_SIZE_M']), )
119
+ with torch.cuda.device(d_in.device.index):
120
+ mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
+ d_out.stride(0), d_out.stride(1))
122
+
123
+
124
+ def matmul_transpose(d_in):
125
+ M, _ = d_in.shape
126
+ d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
127
+ matmul_transpose_assign(d_in, d_out)
128
+ return d_out
build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -8,14 +8,19 @@ import torch
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
  # Muon's Newton–Schulz iteration causes high variance in singular values
17
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
 
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
21
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -27,13 +32,15 @@ def _zeropower_via_newtonschulz5(G, steps):
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
- assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
 
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
 
37
  # Perform the NS iterations
38
  for a, b, c in [
39
  (4.0848, -6.8946, 2.9270),
@@ -42,13 +49,10 @@ def _zeropower_via_newtonschulz5(G, steps):
42
  (2.8769, -3.1427, 1.2046),
43
  (2.8366, -3.0525, 1.2012),
44
  ]:
45
- A = X @ X.T
46
- # B = (
47
- # b * A + c * A @ A
48
- # )
49
- B = torch.addmm(A, A, A, alpha=c, beta=b)
50
- # X = a * X + B @ X
51
- X = torch.addmm(X, B, X, alpha=1.0, beta=a)
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
@@ -69,51 +73,142 @@ class _muon_state:
69
  qk_clip_state = None
70
 
71
 
 
 
 
 
 
 
 
 
72
  @torch.no_grad()
73
- def _gather(p, state, rank, comm_stream, none_grad):
74
  """
75
- Gather the gradients to worker_rank.
76
- If none_grad is True, free p.grad after the gather.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
  with torch.cuda.stream(comm_stream):
79
- g = p.grad
 
80
 
81
- if rank == state.worker_rank:
82
- num_ranks = dist.get_world_size(group=state.process_group)
83
- gather_list = [
84
- torch.empty_like(g.to_local(), dtype=torch.bfloat16)
85
- for _ in range(num_ranks)
86
- ]
87
- else:
88
- gather_list = None
89
-
90
- g = g.to(torch.bfloat16)
91
- torch.distributed.gather(
92
- g.to_local(),
93
- dst=state.worker_rank,
94
- gather_list=gather_list,
95
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  )
97
- if rank == state.worker_rank:
98
- if state.gathered_grad is not None:
99
- raise RuntimeError(
100
- "Gather event already exists, which should not happen.")
101
- state.gathered_grad = torch.cat(gather_list, dim=0)
102
- state.gather_event = torch.cuda.Event()
103
- state.gather_event.record()
104
- else:
105
- state.gathered_grad = None
106
- state.gather_event = None
107
- gather_list = None
108
- if none_grad:
109
- # We can safely free p.grad without calling record_stream:
110
- # p.grad.to_local().record_stream(comm_stream)
111
- # Explanation:
112
- # 1. p.grad is created on the default stream, but the default stream
113
- # is synchronized with the comm stream later.
114
- # 2. There is no further activity on the default stream before the optimizer finishes.
115
- # Therefore, it is safe to free p.grad directly on the comm stream.
116
- p.grad = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @torch.no_grad()
@@ -127,45 +222,145 @@ def _compute_u(p, state, steps, rank, compute_stream):
127
  raise RuntimeError("Gather event must be set before compute.")
128
  compute_stream.wait_event(state.gather_event)
129
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
 
130
  state.computed_u = u
131
- state.scattered_u = torch.empty_like(p.to_local(),
132
- dtype=torch.bfloat16)
133
- state.compute_event = torch.cuda.Event()
134
- state.compute_event.record()
135
- u = None
136
 
137
 
138
  @torch.no_grad()
139
- def _scatter(p, state, rank, comm_stream):
140
  """
141
- Scatter the computed_u from worker_rank to all ranks.
 
142
  """
 
 
 
 
 
 
 
 
 
 
143
 
 
 
 
 
144
  with torch.cuda.stream(comm_stream):
145
- if state.compute_event is None:
146
- raise RuntimeError("Compute event must be set before scatter.")
147
- comm_stream.wait_event(state.compute_event)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if rank == state.worker_rank:
150
- num_ranks = dist.get_world_size(group=state.process_group)
151
- # Clear the gathered gradient to free memory
152
- state.gathered_grad = None
 
 
 
 
153
 
154
- u = state.computed_u
155
- scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
156
- scatter_list = [s.contiguous() for s in scatter_list]
 
 
 
 
 
 
157
  else:
158
- scatter_list = None
 
 
159
 
160
- torch.distributed.scatter(
161
- state.scattered_u,
162
- scatter_list=scatter_list,
163
- src=state.worker_rank,
164
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- state.scatter_event = torch.cuda.Event()
167
- state.scatter_event.record()
168
- scatter_list = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -321,6 +516,11 @@ class Muon(torch.optim.Optimizer):
321
  "head_dim": 128,
322
  "threshold": 100
323
  }
 
 
 
 
 
324
  """
325
 
326
  def __init__(self,
@@ -339,7 +539,8 @@ class Muon(torch.optim.Optimizer):
339
  "k_indices": [],
340
  "head_dim": 128,
341
  "threshold": 100
342
- }):
 
343
  defaults = dict(
344
  lr=lr,
345
  weight_decay=weight_decay,
@@ -363,15 +564,13 @@ class Muon(torch.optim.Optimizer):
363
 
364
  super().__init__(params, defaults)
365
 
366
- if dist.is_initialized():
367
- self.rank = dist.get_rank()
368
- else:
369
- self.rank = None
370
 
371
  self.comm_stream = torch.cuda.Stream()
372
  self.compute_stream = torch.cuda.Stream()
373
  self.debug = debug
374
  self.clip_config = clip_config
 
375
 
376
  def _calc_flops(self, G, steps):
377
  assert len(G.shape) == 2
@@ -444,11 +643,18 @@ class Muon(torch.optim.Optimizer):
444
  if mesh is None:
445
  mesh = p.device_mesh
446
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
 
 
 
 
 
447
  elif mesh != p.device_mesh:
448
  raise ValueError("All parameters must be on the same mesh.")
449
 
 
450
  param_to_state[id(p)] = _muon_state()
451
- param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
 
452
  param_to_state[id(p)].process_group = process_group
453
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
454
  param_to_state[id(p)].qk_clip_state = qk_clip_state
@@ -478,7 +684,7 @@ class Muon(torch.optim.Optimizer):
478
  else:
479
  g = buf
480
 
481
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
482
  steps=group["ns_steps"])
483
 
484
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
@@ -493,15 +699,12 @@ class Muon(torch.optim.Optimizer):
493
  def _update_g(self, p, g, group, momentum):
494
  # calc update
495
  state = self.state[p]
496
- if "momentum_buffer" not in state:
497
- state["momentum_buffer"] = torch.zeros_like(g)
498
- buf = state["momentum_buffer"]
499
- buf.mul_(momentum).add_(g)
500
  if group["nesterov"]:
501
- g = g.add(buf, alpha=momentum)
502
- else:
503
- g = buf
504
- return g
505
 
506
  @staticmethod
507
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -585,11 +788,17 @@ class Muon(torch.optim.Optimizer):
585
  param_to_state, ordered_params = self.init_state_and_assign_params(
586
  names, params, group, qk_logits)
587
 
588
- def enqueue_gathers(start_idx, chunk_size):
589
- for p in ordered_params[start_idx:start_idx + chunk_size]:
590
- state = param_to_state[id(p)]
591
- _gather(p, state, self.rank, self.comm_stream,
592
- group["none_grad"])
 
 
 
 
 
 
593
 
594
  def enqueue_computes(start_idx, chunk_size):
595
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +806,14 @@ class Muon(torch.optim.Optimizer):
597
  _compute_u(p, state, group["ns_steps"], self.rank,
598
  self.compute_stream)
599
 
600
- def enqueue_scatters(start_idx, chunk_size):
601
- for p in ordered_params[start_idx:start_idx + chunk_size]:
602
- state = param_to_state[id(p)]
603
- _scatter(p, state, self.rank, self.comm_stream)
 
 
 
 
604
 
605
  def enqueue_update_param(start_idx, chunk_size):
606
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +828,16 @@ class Muon(torch.optim.Optimizer):
615
  # Wait grad update
616
  self.comm_stream.wait_stream(torch.cuda.current_stream())
617
 
618
- enqueue_gathers(0, chunk_size)
 
 
 
 
619
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
620
- enqueue_computes(i, chunk_size)
621
- if i > 0:
622
- enqueue_update_param(i - chunk_size, chunk_size)
623
- enqueue_gathers(i + chunk_size, chunk_size)
624
- enqueue_scatters(i, chunk_size)
625
- enqueue_update_param(i, chunk_size)
626
 
627
  # Wait the last update_param to finish
628
  torch.cuda.current_stream().wait_stream(self.compute_stream)
 
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
  # Muon's Newton–Schulz iteration causes high variance in singular values
21
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
  @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
  def _zeropower_via_newtonschulz5(G, steps):
25
  """
26
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
39
  X = X.T
40
  # Ensure spectral norm is at most 1
41
  X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
  # Perform the NS iterations
45
  for a, b, c in [
46
  (4.0848, -6.8946, 2.9270),
 
49
  (2.8769, -3.1427, 1.2046),
50
  (2.8366, -3.0525, 1.2012),
51
  ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
 
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
73
  qk_clip_state = None
74
 
75
 
76
+ def split_elems_for_src(param, src_rank, num_ranks) -> int:
77
+ rows = param.shape[0]
78
+ cols = int(param.numel() // rows)
79
+ base, rem = divmod(rows, num_ranks)
80
+ my_rows = base + (1 if src_rank < rem else 0)
81
+ return my_rows * cols
82
+
83
+
84
  @torch.no_grad()
85
+ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
86
  """
87
+ Pre-allocate gathered_grad buffer on compute_stream
88
+ before launching all2all gather
89
+ """
90
+ with torch.cuda.stream(compute_stream):
91
+ for p in params:
92
+ state = param_to_state[id(p)]
93
+ if rank == state.worker_rank:
94
+ num_ranks = dist.get_world_size(group=state.process_group)
95
+ state.gathered_grad = torch.empty(p.grad.numel(),
96
+ dtype=COMM_DTYPE,
97
+ device="cuda")
98
+ else:
99
+ state.gathered_grad = None
100
+
101
+ alloc_event = torch.cuda.Event()
102
+ alloc_event.record(compute_stream)
103
+ return alloc_event
104
+
105
+
106
+ @torch.no_grad()
107
+ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
108
+ alloc_event):
109
+ """
110
+ All2all gathers shards so each owner rank reconstructs its full gradient
111
  """
112
  with torch.cuda.stream(comm_stream):
113
+ process_group = param_to_state[id(params[0])].process_group
114
+ num_ranks = dist.get_world_size(group=process_group)
115
 
116
+ # Construct sending buffers
117
+ per_dst = [[] for _ in range(num_ranks)]
118
+ send_counts = [0] * num_ranks
119
+
120
+ for p in params:
121
+ state = param_to_state[id(p)]
122
+ dst = state.worker_rank
123
+ assert dst < num_ranks
124
+ shard_elems = split_elems_for_src(p, rank, num_ranks)
125
+ g = p.grad
126
+ g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
+ assert g.numel() == shard_elems
128
+ per_dst[dst].append(g)
129
+ send_counts[dst] += shard_elems
130
+
131
+ assert all(
132
+ len(v) > 0
133
+ for v in per_dst), "all params should be sharded to all devices"
134
+
135
+ send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
136
+ owned_params = [
137
+ p for p in params if param_to_state[id(p)].worker_rank == rank
138
+ ]
139
+
140
+ # Compute receive sizes and allocate receiving buffers
141
+ recv_counts = [0] * num_ranks
142
+
143
+ for src in range(num_ranks):
144
+ total = 0
145
+ for p in owned_params:
146
+ state = param_to_state[id(p)]
147
+ assert state.worker_rank == rank
148
+ total += split_elems_for_src(p, src, num_ranks)
149
+ recv_counts[src] = total
150
+
151
+ recv_total = sum(recv_counts)
152
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
153
+
154
+ #All2All
155
+ dist.all_to_all_single(
156
+ recv_buf,
157
+ send_buf,
158
+ output_split_sizes=recv_counts,
159
+ input_split_sizes=send_counts,
160
+ group=process_group,
161
  )
162
+
163
+ # Reconstructs gathered grad from the received buffer
164
+ #
165
+ # recv_buf (num ranks = 3)
166
+ #
167
+ # From rank 0 From rank 1 From rank 2
168
+ # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
169
+ #
170
+ # Outer loop:
171
+ # rank 0 -> rank 1 -> rank2
172
+ #
173
+ # Inner loop:
174
+ # p1_n -> p2_n -> p3_n
175
+
176
+ comm_stream.wait_event(alloc_event)
177
+
178
+ off = 0
179
+ write_offsets = {id(p): 0 for p in owned_params}
180
+ for src in range(num_ranks):
181
+ if recv_counts[src] == 0:
182
+ continue
183
+
184
+ block = recv_counts[src]
185
+ inner_off = 0
186
+ for p in owned_params:
187
+ state = param_to_state[id(p)]
188
+ assert state.worker_rank == rank
189
+ n = split_elems_for_src(p, src, num_ranks)
190
+ assert n > 0
191
+
192
+ sg = recv_buf.narrow(0, off + inner_off, n)
193
+ woff = write_offsets[id(p)]
194
+ dst = state.gathered_grad.narrow(0, woff, n)
195
+ dst.copy_(sg)
196
+
197
+ write_offsets[id(p)] += n
198
+ inner_off += n
199
+ off += block
200
+
201
+ for p in params:
202
+ state = param_to_state[id(p)]
203
+ if state.worker_rank == rank:
204
+ state.gathered_grad = state.gathered_grad.view_as(p)
205
+ state.gather_event = torch.cuda.Event()
206
+ state.gather_event.record(comm_stream)
207
+ else:
208
+ state.gathered_grad = None
209
+ state.gather_event = None
210
+ if none_grad:
211
+ p.grad = None
212
 
213
 
214
  @torch.no_grad()
 
222
  raise RuntimeError("Gather event must be set before compute.")
223
  compute_stream.wait_event(state.gather_event)
224
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
225
+ state.gathered_grad = None
226
  state.computed_u = u
227
+ state.compute_event = torch.cuda.Event()
228
+ state.compute_event.record()
229
+ else:
230
+ state.computed_u = None
231
+ state.compute_event = None
232
 
233
 
234
  @torch.no_grad()
235
+ def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
236
  """
237
+ Pre-allocate scattered_u buffer on compute_stream
238
+ before launching all2all gather
239
  """
240
+ with torch.cuda.stream(compute_stream):
241
+ for p in params:
242
+ state = param_to_state[id(p)]
243
+ state.scattered_u = torch.empty_like(p.to_local(),
244
+ dtype=COMM_DTYPE)
245
+
246
+ alloc_event = torch.cuda.Event()
247
+ alloc_event.record(compute_stream)
248
+ return alloc_event
249
+
250
 
251
+ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
252
+ """
253
+ All2all scatters full gradients to all ranks
254
+ """
255
  with torch.cuda.stream(comm_stream):
256
+ process_group = param_to_state[id(params[0])].process_group
257
+ num_ranks = dist.get_world_size(group=process_group)
258
+ owned_params = [
259
+ p for p in params if param_to_state[id(p)].worker_rank == rank
260
+ ]
261
+
262
+ # Construct sending buffer
263
+ per_dst = [[] for _ in range(num_ranks)]
264
+ send_counts = [0] * num_ranks
265
+
266
+ if owned_params:
267
+ for p in owned_params:
268
+ state = param_to_state[id(p)]
269
+ if state.compute_event is None:
270
+ raise RuntimeError(
271
+ "Compute event must be set before scatter.")
272
+ comm_stream.wait_event(state.compute_event)
273
+ state.gathered_grad = None
274
 
275
+ assert state.computed_u is not None
276
+
277
+ u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)
278
+
279
+ offset = 0
280
+ for dst in range(num_ranks):
281
+ n = split_elems_for_src(p, dst, num_ranks)
282
+ assert n > 0
283
 
284
+ su = u_full.narrow(0, offset, n)
285
+ per_dst[dst].append(su)
286
+ send_counts[dst] += n
287
+ offset += n
288
+
289
+ assert offset == u_full.numel()
290
+
291
+ if any(len(v) > 0 for v in per_dst):
292
+ send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
293
  else:
294
+ # all_to_all requires participation from all ranks
295
+ # Even non-owner ranks must join the collective call
296
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
297
 
298
+ # Compute receive sizes and allocate receiving buffers
299
+ recv_counts = [0] * num_ranks
300
+
301
+ for src in range(num_ranks):
302
+ total = 0
303
+ for p in params:
304
+ state = param_to_state[id(p)]
305
+ if state.worker_rank != src:
306
+ continue
307
+ total += split_elems_for_src(p, rank, num_ranks)
308
+ recv_counts[src] = total
309
+
310
+ recv_total = sum(recv_counts)
311
+ assert recv_total > 0
312
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
313
+
314
+ #All2All
315
+ dist.all_to_all_single(
316
+ recv_buf,
317
+ send_buf,
318
+ output_split_sizes=recv_counts,
319
+ input_split_sizes=send_counts,
320
+ group=process_group,
321
  )
322
+
323
+ # Copy to pre-allocated scattered_u buffer from the received buffer
324
+ #
325
+ # recv_buf (num ranks = 3, local_rank = 0)
326
+ #
327
+ # From rank 0 From rank 1 From rank 2
328
+ # | p1_0, p2_0, p3_0 | p4_0 | p5_0, p6_0 |
329
+ #
330
+ # Outer loop:
331
+ # rank 0 -> rank 1 -> rank2
332
+ #
333
+ # Inner loop:
334
+ # src(0) : p1_0 -> p2_0 -> p3_0
335
+ # src(1) : p4_0
336
+ # src(2) : p5_0 -> p6_0
337
+
338
+ comm_stream.wait_event(alloc_event)
339
+
340
+ off = 0
341
+ for src in range(num_ranks):
342
+ block = recv_counts[src]
343
+ if block == 0:
344
+ continue
345
+
346
+ inner_off = 0
347
+ for p in params:
348
+ state = param_to_state[id(p)]
349
+ if state.worker_rank != src:
350
+ continue
351
+ n = split_elems_for_src(p, rank, num_ranks)
352
+ assert n > 0
353
+
354
+ flat_local = recv_buf.narrow(0, off + inner_off,
355
+ n).view_as(p.to_local())
356
+ state.scattered_u.copy_(flat_local)
357
+
358
+ state.scatter_event = torch.cuda.Event()
359
+ state.scatter_event.record(comm_stream)
360
+ inner_off += n
361
+
362
+ assert inner_off == block
363
+ off += block
364
 
365
 
366
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
516
  "head_dim": 128,
517
  "threshold": 100
518
  }
519
+ overlap_step : How many all2all gather, compute operations are launched in advance
520
+ before the corresponding all2all scatter steps begin.
521
+ A higher overlap_step increases memory usage but can improve
522
+ performance by overlapping communication.
523
+ Parallel muon only.
524
  """
525
 
526
  def __init__(self,
 
539
  "k_indices": [],
540
  "head_dim": 128,
541
  "threshold": 100
542
+ },
543
+ overlap_step=5):
544
  defaults = dict(
545
  lr=lr,
546
  weight_decay=weight_decay,
 
564
 
565
  super().__init__(params, defaults)
566
 
567
+ self.rank = None
 
 
 
568
 
569
  self.comm_stream = torch.cuda.Stream()
570
  self.compute_stream = torch.cuda.Stream()
571
  self.debug = debug
572
  self.clip_config = clip_config
573
+ self.overlap_step = overlap_step
574
 
575
  def _calc_flops(self, G, steps):
576
  assert len(G.shape) == 2
 
643
  if mesh is None:
644
  mesh = p.device_mesh
645
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
646
+ local_rank = dist.get_rank(group=process_group)
647
+ if self.rank is None:
648
+ self.rank = dist.get_rank(group=process_group)
649
+ else:
650
+ assert self.rank == local_rank
651
  elif mesh != p.device_mesh:
652
  raise ValueError("All parameters must be on the same mesh.")
653
 
654
+ num_ranks = dist.get_world_size(group=process_group)
655
  param_to_state[id(p)] = _muon_state()
656
+ param_to_state[id(
657
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
658
  param_to_state[id(p)].process_group = process_group
659
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
660
  param_to_state[id(p)].qk_clip_state = qk_clip_state
 
684
  else:
685
  g = buf
686
 
687
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
688
  steps=group["ns_steps"])
689
 
690
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
699
  def _update_g(self, p, g, group, momentum):
700
  # calc update
701
  state = self.state[p]
702
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
703
+ torch.add(g, buf, alpha=momentum, out=buf)
 
 
704
  if group["nesterov"]:
705
+ g.add_(buf, alpha=momentum)
706
+ return g
707
+ return buf
 
708
 
709
  @staticmethod
710
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
788
  param_to_state, ordered_params = self.init_state_and_assign_params(
789
  names, params, group, qk_logits)
790
 
791
+ assert self.rank is not None
792
+
793
+ def enqueue_all2all_gather(start_idx, chunk_size):
794
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
795
+ if target_params:
796
+ alloc_event = _alloc_gathered_grad(target_params,
797
+ param_to_state, self.rank,
798
+ self.compute_stream)
799
+ _all2all_gather(target_params, param_to_state, self.rank,
800
+ self.comm_stream, group["none_grad"],
801
+ alloc_event)
802
 
803
  def enqueue_computes(start_idx, chunk_size):
804
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
806
  _compute_u(p, state, group["ns_steps"], self.rank,
807
  self.compute_stream)
808
 
809
+ def enqueue_all2all_scatter(start_idx, chunk_size):
810
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
811
+ if target_params:
812
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
813
+ self.rank,
814
+ self.compute_stream)
815
+ _all2all_scatter(target_params, param_to_state, self.rank,
816
+ self.comm_stream, alloc_event)
817
 
818
  def enqueue_update_param(start_idx, chunk_size):
819
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
828
  # Wait grad update
829
  self.comm_stream.wait_stream(torch.cuda.current_stream())
830
 
831
+ overlap_step = self.overlap_step
832
+ for i in range(0, overlap_step):
833
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
834
+ enqueue_computes(i * chunk_size, chunk_size)
835
+
836
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
837
+ enqueue_all2all_scatter(i, chunk_size)
838
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
839
+ enqueue_update_param(i, chunk_size)
840
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
 
 
841
 
842
  # Wait the last update_param to finish
843
  torch.cuda.current_stream().wait_stream(self.compute_stream)
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_9c21645_dirty
3
- ops = torch.ops._optimizer_9c21645_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_9c21645_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_15336dc_dirty
3
+ ops = torch.ops._optimizer_15336dc_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb40a06623bb3668b82ff248b5a3c1bcf41e7f3f860888b261505b3a71257bc7
3
  size 1883344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e06baa32b0950126ee192654bd9f7adc79cc05d8ec39d2078c70d62ee81fdcd5
3
  size 1883344
build/torch28-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
def get_autotune_config():
    """Build the autotune search space for the ``mmt_kernel``.

    Sweeps block sizes, pipeline stages and warp counts; triton picks the
    fastest config per (M, K) key at runtime.
    """
    configs = []
    for blk_m in (32, 64, 128):
        for blk_k in (32, 64):
            for grp_sz in (8,):
                for n_stages in (3, 4, 5):
                    for n_warps in (4, 8):
                        configs.append(
                            triton.Config(
                                {
                                    'BLOCK_SIZE_M': blk_m,
                                    'BLOCK_SIZE_K': blk_k,
                                    'GROUP_SIZE_M': grp_sz
                                },
                                num_stages=n_stages,
                                num_warps=n_warps))
    return configs
41
+
42
+
43
@triton.autotune(
    configs=get_autotune_config(),
    key=['M', 'K'],
)
@triton.jit
def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
               GROUP_SIZE_M: tl.constexpr):
    """
    Core kernel jit function of matmul_transpose that computes y = x @ x.T
    The code is a simple adaptation from the triton `matmul` tutorial:
    https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html

    Because the output is symmetric (M x M), each program only computes the
    block with pid_m <= pid_n and mirrors it into the lower triangle at the
    end, roughly halving the dot-product work.
    """
    pid = tl.program_id(axis=0)
    # Grouped (swizzled) mapping of the 1-D program id onto the 2-D block
    # grid, as in the tutorial; output is square so num_pid_n == num_pid_m.
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # Strictly-lower-triangle programs exit early; their block is produced
    # by the mirrored store of the transposed upper-triangle block below.
    if pid_m > pid_n:
        return

    # Row offsets are wrapped with % M so out-of-range rows load valid
    # memory; the final store mask discards those lanes.
    offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # we use a & b ptrs to denote different rows of x.
    a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
    b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)

    # Accumulate in float32 regardless of the input dtype.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)

    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Mask the tail of the K dimension on the last iteration.
        a = tl.load(a_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        b = tl.load(b_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        # a @ b.T accumulates this K-slice of x[m,:] . x[n,:].
        accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
        a_ptrs += BLOCK_SIZE_K * stride_xk
        b_ptrs += BLOCK_SIZE_K * stride_xk
    # use dtype.element_ty to accommodate different input datatypes as in cpp templates
    # https://github.com/triton-lang/triton/issues/2252
    c = accumulator.to(x.dtype.element_ty)

    # Unwrapped offsets for the output block plus bounds mask.
    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
    tl.store(c_ptrs, c, mask=c_mask)

    # transpose and copy: mirror this block into the lower triangle
    # (skipped on the diagonal, where pid_m == pid_n).
    if pid_m < pid_n:
        ct_ptrs = y + stride_ym * offs_cn[:,
                                          None] + stride_yn * offs_cm[None, :]
        ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
        tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
def matmul_transpose_assign(d_in, d_out):
    """Compute ``d_out = d_in @ d_in.T`` in place with the triton kernel.

    Parameters
    ----------
    d_in : 2-D CUDA tensor of shape (M, K).
    d_out : 2-D CUDA tensor of shape (M, M); same dtype and device as
        ``d_in``. Overwritten with the symmetric product.
    """
    assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
    assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
    assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
    assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
    assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
    assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
    # Bug fix: previously this compared d_out.size(0) with itself twice, so a
    # non-square d_out passed validation; check both output dimensions.
    assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
        "First dimension of `d_in` must match first and second dimension of `d_out`"

    # Kernel indexing assumes row-major contiguous input.
    d_in = d_in.contiguous()
    M, K = d_in.shape
    # One program per (block-row, block-col) pair of the M x M output.
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
        M, META['BLOCK_SIZE_M']), )
    # Launch on the device that owns the tensors.
    with torch.cuda.device(d_in.device.index):
        mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
                         d_out.stride(0), d_out.stride(1))
122
+
123
+
124
def matmul_transpose(d_in):
    """Allocate and return ``d_in @ d_in.T`` computed by the triton kernel."""
    num_rows, _ = d_in.shape
    result = torch.empty((num_rows, num_rows),
                         dtype=d_in.dtype,
                         device=d_in.device)
    matmul_transpose_assign(d_in, result)
    return result
build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -8,14 +8,19 @@ import torch
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
  # Muon's Newton–Schulz iteration causes high variance in singular values
17
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
 
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
21
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -27,13 +32,15 @@ def _zeropower_via_newtonschulz5(G, steps):
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
- assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
 
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
 
37
  # Perform the NS iterations
38
  for a, b, c in [
39
  (4.0848, -6.8946, 2.9270),
@@ -42,13 +49,10 @@ def _zeropower_via_newtonschulz5(G, steps):
42
  (2.8769, -3.1427, 1.2046),
43
  (2.8366, -3.0525, 1.2012),
44
  ]:
45
- A = X @ X.T
46
- # B = (
47
- # b * A + c * A @ A
48
- # )
49
- B = torch.addmm(A, A, A, alpha=c, beta=b)
50
- # X = a * X + B @ X
51
- X = torch.addmm(X, B, X, alpha=1.0, beta=a)
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
@@ -69,51 +73,142 @@ class _muon_state:
69
  qk_clip_state = None
70
 
71
 
 
 
 
 
 
 
 
 
72
  @torch.no_grad()
73
- def _gather(p, state, rank, comm_stream, none_grad):
74
  """
75
- Gather the gradients to worker_rank.
76
- If none_grad is True, free p.grad after the gather.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
  with torch.cuda.stream(comm_stream):
79
- g = p.grad
 
80
 
81
- if rank == state.worker_rank:
82
- num_ranks = dist.get_world_size(group=state.process_group)
83
- gather_list = [
84
- torch.empty_like(g.to_local(), dtype=torch.bfloat16)
85
- for _ in range(num_ranks)
86
- ]
87
- else:
88
- gather_list = None
89
-
90
- g = g.to(torch.bfloat16)
91
- torch.distributed.gather(
92
- g.to_local(),
93
- dst=state.worker_rank,
94
- gather_list=gather_list,
95
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  )
97
- if rank == state.worker_rank:
98
- if state.gathered_grad is not None:
99
- raise RuntimeError(
100
- "Gather event already exists, which should not happen.")
101
- state.gathered_grad = torch.cat(gather_list, dim=0)
102
- state.gather_event = torch.cuda.Event()
103
- state.gather_event.record()
104
- else:
105
- state.gathered_grad = None
106
- state.gather_event = None
107
- gather_list = None
108
- if none_grad:
109
- # We can safely free p.grad without calling record_stream:
110
- # p.grad.to_local().record_stream(comm_stream)
111
- # Explanation:
112
- # 1. p.grad is created on the default stream, but the default stream
113
- # is synchronized with the comm stream later.
114
- # 2. There is no further activity on the default stream before the optimizer finishes.
115
- # Therefore, it is safe to free p.grad directly on the comm stream.
116
- p.grad = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @torch.no_grad()
@@ -127,45 +222,145 @@ def _compute_u(p, state, steps, rank, compute_stream):
127
  raise RuntimeError("Gather event must be set before compute.")
128
  compute_stream.wait_event(state.gather_event)
129
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
 
130
  state.computed_u = u
131
- state.scattered_u = torch.empty_like(p.to_local(),
132
- dtype=torch.bfloat16)
133
- state.compute_event = torch.cuda.Event()
134
- state.compute_event.record()
135
- u = None
136
 
137
 
138
  @torch.no_grad()
139
- def _scatter(p, state, rank, comm_stream):
140
  """
141
- Scatter the computed_u from worker_rank to all ranks.
 
142
  """
 
 
 
 
 
 
 
 
 
 
143
 
 
 
 
 
144
  with torch.cuda.stream(comm_stream):
145
- if state.compute_event is None:
146
- raise RuntimeError("Compute event must be set before scatter.")
147
- comm_stream.wait_event(state.compute_event)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if rank == state.worker_rank:
150
- num_ranks = dist.get_world_size(group=state.process_group)
151
- # Clear the gathered gradient to free memory
152
- state.gathered_grad = None
 
 
 
 
153
 
154
- u = state.computed_u
155
- scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
156
- scatter_list = [s.contiguous() for s in scatter_list]
 
 
 
 
 
 
157
  else:
158
- scatter_list = None
 
 
159
 
160
- torch.distributed.scatter(
161
- state.scattered_u,
162
- scatter_list=scatter_list,
163
- src=state.worker_rank,
164
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- state.scatter_event = torch.cuda.Event()
167
- state.scatter_event.record()
168
- scatter_list = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -321,6 +516,11 @@ class Muon(torch.optim.Optimizer):
321
  "head_dim": 128,
322
  "threshold": 100
323
  }
 
 
 
 
 
324
  """
325
 
326
  def __init__(self,
@@ -339,7 +539,8 @@ class Muon(torch.optim.Optimizer):
339
  "k_indices": [],
340
  "head_dim": 128,
341
  "threshold": 100
342
- }):
 
343
  defaults = dict(
344
  lr=lr,
345
  weight_decay=weight_decay,
@@ -363,15 +564,13 @@ class Muon(torch.optim.Optimizer):
363
 
364
  super().__init__(params, defaults)
365
 
366
- if dist.is_initialized():
367
- self.rank = dist.get_rank()
368
- else:
369
- self.rank = None
370
 
371
  self.comm_stream = torch.cuda.Stream()
372
  self.compute_stream = torch.cuda.Stream()
373
  self.debug = debug
374
  self.clip_config = clip_config
 
375
 
376
  def _calc_flops(self, G, steps):
377
  assert len(G.shape) == 2
@@ -444,11 +643,18 @@ class Muon(torch.optim.Optimizer):
444
  if mesh is None:
445
  mesh = p.device_mesh
446
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
 
 
 
 
 
447
  elif mesh != p.device_mesh:
448
  raise ValueError("All parameters must be on the same mesh.")
449
 
 
450
  param_to_state[id(p)] = _muon_state()
451
- param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
 
452
  param_to_state[id(p)].process_group = process_group
453
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
454
  param_to_state[id(p)].qk_clip_state = qk_clip_state
@@ -478,7 +684,7 @@ class Muon(torch.optim.Optimizer):
478
  else:
479
  g = buf
480
 
481
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
482
  steps=group["ns_steps"])
483
 
484
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
@@ -493,15 +699,12 @@ class Muon(torch.optim.Optimizer):
493
  def _update_g(self, p, g, group, momentum):
494
  # calc update
495
  state = self.state[p]
496
- if "momentum_buffer" not in state:
497
- state["momentum_buffer"] = torch.zeros_like(g)
498
- buf = state["momentum_buffer"]
499
- buf.mul_(momentum).add_(g)
500
  if group["nesterov"]:
501
- g = g.add(buf, alpha=momentum)
502
- else:
503
- g = buf
504
- return g
505
 
506
  @staticmethod
507
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -585,11 +788,17 @@ class Muon(torch.optim.Optimizer):
585
  param_to_state, ordered_params = self.init_state_and_assign_params(
586
  names, params, group, qk_logits)
587
 
588
- def enqueue_gathers(start_idx, chunk_size):
589
- for p in ordered_params[start_idx:start_idx + chunk_size]:
590
- state = param_to_state[id(p)]
591
- _gather(p, state, self.rank, self.comm_stream,
592
- group["none_grad"])
 
 
 
 
 
 
593
 
594
  def enqueue_computes(start_idx, chunk_size):
595
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +806,14 @@ class Muon(torch.optim.Optimizer):
597
  _compute_u(p, state, group["ns_steps"], self.rank,
598
  self.compute_stream)
599
 
600
- def enqueue_scatters(start_idx, chunk_size):
601
- for p in ordered_params[start_idx:start_idx + chunk_size]:
602
- state = param_to_state[id(p)]
603
- _scatter(p, state, self.rank, self.comm_stream)
 
 
 
 
604
 
605
  def enqueue_update_param(start_idx, chunk_size):
606
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +828,16 @@ class Muon(torch.optim.Optimizer):
615
  # Wait grad update
616
  self.comm_stream.wait_stream(torch.cuda.current_stream())
617
 
618
- enqueue_gathers(0, chunk_size)
 
 
 
 
619
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
620
- enqueue_computes(i, chunk_size)
621
- if i > 0:
622
- enqueue_update_param(i - chunk_size, chunk_size)
623
- enqueue_gathers(i + chunk_size, chunk_size)
624
- enqueue_scatters(i, chunk_size)
625
- enqueue_update_param(i, chunk_size)
626
 
627
  # Wait the last update_param to finish
628
  torch.cuda.current_stream().wait_stream(self.compute_stream)
 
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
  # Muon's Newton–Schulz iteration causes high variance in singular values
21
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
  @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
  def _zeropower_via_newtonschulz5(G, steps):
25
  """
26
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
39
  X = X.T
40
  # Ensure spectral norm is at most 1
41
  X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
  # Perform the NS iterations
45
  for a, b, c in [
46
  (4.0848, -6.8946, 2.9270),
 
49
  (2.8769, -3.1427, 1.2046),
50
  (2.8366, -3.0525, 1.2012),
51
  ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
 
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
73
  qk_clip_state = None
74
 
75
 
76
def split_elems_for_src(param, src_rank, num_ranks) -> int:
    """Return how many elements of ``param``'s row-shard belong to ``src_rank``.

    Rows (dim 0) are split as evenly as possible across ``num_ranks``; the
    first ``rows % num_ranks`` ranks each take one extra row.
    """
    total_rows = param.shape[0]
    elems_per_row = int(param.numel() // total_rows)
    even_rows, leftover = divmod(total_rows, num_ranks)
    shard_rows = even_rows + (1 if src_rank < leftover else 0)
    return shard_rows * elems_per_row
82
+
83
+
84
  @torch.no_grad()
85
def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
    """Pre-allocate the full-gradient receive buffer for each owned param.

    Runs on ``compute_stream`` before the all2all gather so the allocations
    are ordered ahead of the communication that fills them. Returns a CUDA
    event recorded after the allocations, for the comm stream to wait on.
    """
    with torch.cuda.stream(compute_stream):
        for p in params:
            state = param_to_state[id(p)]
            if rank == state.worker_rank:
                # Owner rank reassembles the whole (flattened) gradient here.
                # Fix: dropped an unused `num_ranks = dist.get_world_size(...)`
                # local that was computed on every call.
                # NOTE(review): allocated via device="cuda" (current device) —
                # assumes one CUDA device per process; confirm.
                state.gathered_grad = torch.empty(p.grad.numel(),
                                                  dtype=COMM_DTYPE,
                                                  device="cuda")
            else:
                state.gathered_grad = None

    alloc_event = torch.cuda.Event()
    alloc_event.record(compute_stream)
    return alloc_event
104
+
105
+
106
+ @torch.no_grad()
107
def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
                    alloc_event):
    """
    All2all gathers shards so each owner rank reconstructs its full gradient.

    Each rank sends its local shard of every parameter's gradient to that
    parameter's owner (``state.worker_rank``); owners reassemble the full
    gradient into the pre-allocated ``state.gathered_grad`` (see
    ``_alloc_gathered_grad``) and record ``state.gather_event``. If
    ``none_grad`` is True, ``p.grad`` is dropped afterwards.
    All work is enqueued on ``comm_stream``.
    """
    with torch.cuda.stream(comm_stream):
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)

        # Construct sending buffers: one bucket of shard tensors per
        # destination (owner) rank, concatenated in `params` order.
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        for p in params:
            state = param_to_state[id(p)]
            dst = state.worker_rank
            assert dst < num_ranks
            shard_elems = split_elems_for_src(p, rank, num_ranks)
            g = p.grad
            # Flatten this rank's local shard, converted to the comm dtype.
            g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
            assert g.numel() == shard_elems
            per_dst[dst].append(g)
            send_counts[dst] += shard_elems

        assert all(
            len(v) > 0
            for v in per_dst), "all params should be sharded to all devices"

        send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Compute receive sizes and allocate receiving buffers: from each
        # source rank we receive that rank's shard of every param we own.
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in owned_params:
                state = param_to_state[id(p)]
                assert state.worker_rank == rank
                total += split_elems_for_src(p, src, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        #All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Reconstructs gathered grad from the received buffer
        #
        # recv_buf (num ranks = 3)
        #
        #   From rank 0         From rank 1         From rank 2
        # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank2
        #
        # Inner loop:
        #   p1_n -> p2_n -> p3_n

        # Do not write into gathered_grad before its allocation (on the
        # compute stream) is ordered before us.
        comm_stream.wait_event(alloc_event)

        off = 0
        write_offsets = {id(p): 0 for p in owned_params}
        for src in range(num_ranks):
            if recv_counts[src] == 0:
                continue

            block = recv_counts[src]
            inner_off = 0
            for p in owned_params:
                state = param_to_state[id(p)]
                assert state.worker_rank == rank
                n = split_elems_for_src(p, src, num_ranks)
                assert n > 0

                # Copy rank `src`'s shard of `p` into its slot of the
                # flattened full gradient.
                sg = recv_buf.narrow(0, off + inner_off, n)
                woff = write_offsets[id(p)]
                dst = state.gathered_grad.narrow(0, woff, n)
                dst.copy_(sg)

                write_offsets[id(p)] += n
                inner_off += n
            off += block

        for p in params:
            state = param_to_state[id(p)]
            if state.worker_rank == rank:
                # Reshape the flat buffer to the parameter's (global) shape
                # and publish the completion event for the compute stream.
                state.gathered_grad = state.gathered_grad.view_as(p)
                state.gather_event = torch.cuda.Event()
                state.gather_event.record(comm_stream)
            else:
                state.gathered_grad = None
                state.gather_event = None
            if none_grad:
                # Safe without record_stream: the default stream is synced
                # with the comm stream before the optimizer finishes.
                p.grad = None
212
 
213
 
214
  @torch.no_grad()
 
222
  raise RuntimeError("Gather event must be set before compute.")
223
  compute_stream.wait_event(state.gather_event)
224
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
225
+ state.gathered_grad = None
226
  state.computed_u = u
227
+ state.compute_event = torch.cuda.Event()
228
+ state.compute_event.record()
229
+ else:
230
+ state.computed_u = None
231
+ state.compute_event = None
232
 
233
 
234
  @torch.no_grad()
235
def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
    """Pre-allocate each parameter's local ``scattered_u`` receive buffer.

    Runs on ``compute_stream`` before the all2all scatter is launched;
    returns a CUDA event recorded after the allocations so the comm stream
    can order its write-back behind them.

    ``rank`` is accepted for signature symmetry with
    ``_alloc_gathered_grad`` but is unused: every rank receives a shard of
    every parameter's update.
    """
    with torch.cuda.stream(compute_stream):
        for p in params:
            param_to_state[id(p)].scattered_u = torch.empty_like(
                p.to_local(), dtype=COMM_DTYPE)

    done = torch.cuda.Event()
    done.record(compute_stream)
    return done
249
+
250
 
251
def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
    """
    All2all-scatters each owner's computed update ``u`` back to all ranks.

    Inverse of ``_all2all_gather``: the owner of each parameter splits its
    ``state.computed_u`` row-wise and sends every rank its shard; each rank
    copies its received shards into the pre-allocated ``state.scattered_u``
    buffers and records ``state.scatter_event`` per parameter. All work is
    enqueued on ``comm_stream``.

    NOTE(review): unlike ``_all2all_gather`` this is not decorated with
    ``@torch.no_grad()`` — presumably fine because the optimizer step runs
    under no-grad, but confirm the intent.
    """
    with torch.cuda.stream(comm_stream):
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Construct sending buffer: shards of each owned param's update,
        # bucketed by destination rank.
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        if owned_params:
            for p in owned_params:
                state = param_to_state[id(p)]
                if state.compute_event is None:
                    raise RuntimeError(
                        "Compute event must be set before scatter.")
                # The update must be finished on the compute stream first.
                comm_stream.wait_event(state.compute_event)
                state.gathered_grad = None

                assert state.computed_u is not None

                u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)

                # Slice the flat update row-shard by row-shard, one slice
                # per destination rank (same split as the gather).
                offset = 0
                for dst in range(num_ranks):
                    n = split_elems_for_src(p, dst, num_ranks)
                    assert n > 0

                    su = u_full.narrow(0, offset, n)
                    per_dst[dst].append(su)
                    send_counts[dst] += n
                    offset += n

                assert offset == u_full.numel()

        if any(len(v) > 0 for v in per_dst):
            send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        else:
            # all_to_all requires participation from all ranks
            # Even non-owner ranks must join the collective call
            send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")

        # Compute receive sizes and allocate receiving buffers: from each
        # owner rank we receive our shard of every param it owns.
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                total += split_elems_for_src(p, rank, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        assert recv_total > 0
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        #All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Copy to pre-allocated scattered_u buffer from the received buffer
        #
        # recv_buf (num ranks = 3, local_rank = 0)
        #
        #   From rank 0         From rank 1   From rank 2
        # | p1_0, p2_0, p3_0 | p4_0        | p5_0, p6_0 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank2
        #
        # Inner loop:
        #   src(0) : p1_0 -> p2_0 -> p3_0
        #   src(1) : p4_0
        #   src(2) : p5_0 -> p6_0

        # Do not write into scattered_u before its allocation (on the
        # compute stream) is ordered before us.
        comm_stream.wait_event(alloc_event)

        off = 0
        for src in range(num_ranks):
            block = recv_counts[src]
            if block == 0:
                continue

            inner_off = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                n = split_elems_for_src(p, rank, num_ranks)
                assert n > 0

                # Our local shard of p's update, reshaped to the local view.
                flat_local = recv_buf.narrow(0, off + inner_off,
                                             n).view_as(p.to_local())
                state.scattered_u.copy_(flat_local)

                # Per-parameter completion event for _update_param.
                state.scatter_event = torch.cuda.Event()
                state.scatter_event.record(comm_stream)
                inner_off += n

            assert inner_off == block
            off += block
364
 
365
 
366
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
516
  "head_dim": 128,
517
  "threshold": 100
518
  }
519
+ overlap_step : How many all2all gather, compute operations are launched in advance
520
+ before the corresponding all2all scatter steps begin.
521
+ A higher overlap_step increases memory usage but can improve
522
+ performance by overlapping communication.
523
+ Parallel muon only.
524
  """
525
 
526
  def __init__(self,
 
539
  "k_indices": [],
540
  "head_dim": 128,
541
  "threshold": 100
542
+ },
543
+ overlap_step=5):
544
  defaults = dict(
545
  lr=lr,
546
  weight_decay=weight_decay,
 
564
 
565
  super().__init__(params, defaults)
566
 
567
+ self.rank = None
 
 
 
568
 
569
  self.comm_stream = torch.cuda.Stream()
570
  self.compute_stream = torch.cuda.Stream()
571
  self.debug = debug
572
  self.clip_config = clip_config
573
+ self.overlap_step = overlap_step
574
 
575
  def _calc_flops(self, G, steps):
576
  assert len(G.shape) == 2
 
643
  if mesh is None:
644
  mesh = p.device_mesh
645
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
646
+ local_rank = dist.get_rank(group=process_group)
647
+ if self.rank is None:
648
+ self.rank = dist.get_rank(group=process_group)
649
+ else:
650
+ assert self.rank == local_rank
651
  elif mesh != p.device_mesh:
652
  raise ValueError("All parameters must be on the same mesh.")
653
 
654
+ num_ranks = dist.get_world_size(group=process_group)
655
  param_to_state[id(p)] = _muon_state()
656
+ param_to_state[id(
657
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
658
  param_to_state[id(p)].process_group = process_group
659
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
660
  param_to_state[id(p)].qk_clip_state = qk_clip_state
 
684
  else:
685
  g = buf
686
 
687
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
688
  steps=group["ns_steps"])
689
 
690
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
699
  def _update_g(self, p, g, group, momentum):
700
  # calc update
701
  state = self.state[p]
702
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
703
+ torch.add(g, buf, alpha=momentum, out=buf)
 
 
704
  if group["nesterov"]:
705
+ g.add_(buf, alpha=momentum)
706
+ return g
707
+ return buf
 
708
 
709
  @staticmethod
710
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
788
  param_to_state, ordered_params = self.init_state_and_assign_params(
789
  names, params, group, qk_logits)
790
 
791
+ assert self.rank is not None
792
+
793
+ def enqueue_all2all_gather(start_idx, chunk_size):
794
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
795
+ if target_params:
796
+ alloc_event = _alloc_gathered_grad(target_params,
797
+ param_to_state, self.rank,
798
+ self.compute_stream)
799
+ _all2all_gather(target_params, param_to_state, self.rank,
800
+ self.comm_stream, group["none_grad"],
801
+ alloc_event)
802
 
803
  def enqueue_computes(start_idx, chunk_size):
804
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
806
  _compute_u(p, state, group["ns_steps"], self.rank,
807
  self.compute_stream)
808
 
809
+ def enqueue_all2all_scatter(start_idx, chunk_size):
810
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
811
+ if target_params:
812
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
813
+ self.rank,
814
+ self.compute_stream)
815
+ _all2all_scatter(target_params, param_to_state, self.rank,
816
+ self.comm_stream, alloc_event)
817
 
818
  def enqueue_update_param(start_idx, chunk_size):
819
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
828
  # Wait grad update
829
  self.comm_stream.wait_stream(torch.cuda.current_stream())
830
 
831
+ overlap_step = self.overlap_step
832
+ for i in range(0, overlap_step):
833
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
834
+ enqueue_computes(i * chunk_size, chunk_size)
835
+
836
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
837
+ enqueue_all2all_scatter(i, chunk_size)
838
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
839
+ enqueue_update_param(i, chunk_size)
840
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
 
 
841
 
842
  # Wait the last update_param to finish
843
  torch.cuda.current_stream().wait_stream(self.compute_stream)
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_9c21645_dirty
3
- ops = torch.ops._optimizer_9c21645_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_9c21645_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_15336dc_dirty
3
+ ops = torch.ops._optimizer_15336dc_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dae71b7e998e72130093a86f8c983c3379510e23525e3cdcd4afe5c21bf4d3db
3
  size 1883344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6880c22f63ccd66e8ac62792a564d1ade58325b47369a1773c7753d4243893b9
3
  size 1883344
build/torch28-cxx11-cu129-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
+ def get_autotune_config():
29
+ return [
30
+ triton.Config(
31
+ {
32
+ 'BLOCK_SIZE_M': blk_m,
33
+ 'BLOCK_SIZE_K': blk_k,
34
+ 'GROUP_SIZE_M': grp_sz
35
+ },
36
+ num_stages=n_stages,
37
+ num_warps=n_warps) for blk_m in [32, 64, 128]
38
+ for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
+ for n_warps in [4, 8]
40
+ ]
41
+
42
+
43
+ @triton.autotune(
44
+ configs=get_autotune_config(),
45
+ key=['M', 'K'],
46
+ )
47
+ @triton.jit
48
+ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
49
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
50
+ GROUP_SIZE_M: tl.constexpr):
51
+ """
52
+ Core kernel jit function of matmul_transpose that computes y = x @ x.T
53
+ The code is a simple adaptation from the triton `matmul` tutorial:
54
+ https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
55
+ """
56
+ pid = tl.program_id(axis=0)
57
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
58
+ num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
59
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
60
+ group_id = pid // num_pid_in_group
61
+ first_pid_m = group_id * GROUP_SIZE_M
62
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
63
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
64
+ pid_n = (pid % num_pid_in_group) // group_size_m
65
+ if pid_m > pid_n:
66
+ return
67
+
68
+ offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
69
+ offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
71
+ # we use a & b ptrs to denote different rows of x.
72
+ a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
73
+ b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
+
75
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
76
+
77
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
78
+ a = tl.load(a_ptrs,
79
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
80
+ other=0.0)
81
+ b = tl.load(b_ptrs,
82
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
83
+ other=0.0)
84
+ accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
85
+ a_ptrs += BLOCK_SIZE_K * stride_xk
86
+ b_ptrs += BLOCK_SIZE_K * stride_xk
87
+ # use dtype.element_ty to accommodate different input datatypes as in cpp templates
88
+ # https://github.com/triton-lang/triton/issues/2252
89
+ c = accumulator.to(x.dtype.element_ty)
90
+
91
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
92
+ offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
+ c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
94
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
95
+ tl.store(c_ptrs, c, mask=c_mask)
96
+
97
+ # transpose and copy
98
+ if pid_m < pid_n:
99
+ ct_ptrs = y + stride_ym * offs_cn[:,
100
+ None] + stride_yn * offs_cm[None, :]
101
+ ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
102
+ tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
+ def matmul_transpose_assign(d_in, d_out):
106
+ assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
+ assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
+ assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
+ assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
+ assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
+ assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
+ assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
+ "First dimension of `d_in` must match first and second dimension of `d_out`"
114
+
115
+ d_in = d_in.contiguous()
116
+ M, K = d_in.shape
117
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
118
+ M, META['BLOCK_SIZE_M']), )
119
+ with torch.cuda.device(d_in.device.index):
120
+ mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
+ d_out.stride(0), d_out.stride(1))
122
+
123
+
124
+ def matmul_transpose(d_in):
125
+ M, _ = d_in.shape
126
+ d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
127
+ matmul_transpose_assign(d_in, d_out)
128
+ return d_out
build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py CHANGED
@@ -8,14 +8,19 @@ import torch
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
  # Muon's Newton–Schulz iteration causes high variance in singular values
17
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
 
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
21
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -27,13 +32,15 @@ def _zeropower_via_newtonschulz5(G, steps):
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
- assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
 
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
 
37
  # Perform the NS iterations
38
  for a, b, c in [
39
  (4.0848, -6.8946, 2.9270),
@@ -42,13 +49,10 @@ def _zeropower_via_newtonschulz5(G, steps):
42
  (2.8769, -3.1427, 1.2046),
43
  (2.8366, -3.0525, 1.2012),
44
  ]:
45
- A = X @ X.T
46
- # B = (
47
- # b * A + c * A @ A
48
- # )
49
- B = torch.addmm(A, A, A, alpha=c, beta=b)
50
- # X = a * X + B @ X
51
- X = torch.addmm(X, B, X, alpha=1.0, beta=a)
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
@@ -69,51 +73,142 @@ class _muon_state:
69
  qk_clip_state = None
70
 
71
 
 
 
 
 
 
 
 
 
72
  @torch.no_grad()
73
- def _gather(p, state, rank, comm_stream, none_grad):
74
  """
75
- Gather the gradients to worker_rank.
76
- If none_grad is True, free p.grad after the gather.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
  with torch.cuda.stream(comm_stream):
79
- g = p.grad
 
80
 
81
- if rank == state.worker_rank:
82
- num_ranks = dist.get_world_size(group=state.process_group)
83
- gather_list = [
84
- torch.empty_like(g.to_local(), dtype=torch.bfloat16)
85
- for _ in range(num_ranks)
86
- ]
87
- else:
88
- gather_list = None
89
-
90
- g = g.to(torch.bfloat16)
91
- torch.distributed.gather(
92
- g.to_local(),
93
- dst=state.worker_rank,
94
- gather_list=gather_list,
95
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  )
97
- if rank == state.worker_rank:
98
- if state.gathered_grad is not None:
99
- raise RuntimeError(
100
- "Gather event already exists, which should not happen.")
101
- state.gathered_grad = torch.cat(gather_list, dim=0)
102
- state.gather_event = torch.cuda.Event()
103
- state.gather_event.record()
104
- else:
105
- state.gathered_grad = None
106
- state.gather_event = None
107
- gather_list = None
108
- if none_grad:
109
- # We can safely free p.grad without calling record_stream:
110
- # p.grad.to_local().record_stream(comm_stream)
111
- # Explanation:
112
- # 1. p.grad is created on the default stream, but the default stream
113
- # is synchronized with the comm stream later.
114
- # 2. There is no further activity on the default stream before the optimizer finishes.
115
- # Therefore, it is safe to free p.grad directly on the comm stream.
116
- p.grad = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @torch.no_grad()
@@ -127,45 +222,145 @@ def _compute_u(p, state, steps, rank, compute_stream):
127
  raise RuntimeError("Gather event must be set before compute.")
128
  compute_stream.wait_event(state.gather_event)
129
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
 
130
  state.computed_u = u
131
- state.scattered_u = torch.empty_like(p.to_local(),
132
- dtype=torch.bfloat16)
133
- state.compute_event = torch.cuda.Event()
134
- state.compute_event.record()
135
- u = None
136
 
137
 
138
  @torch.no_grad()
139
- def _scatter(p, state, rank, comm_stream):
140
  """
141
- Scatter the computed_u from worker_rank to all ranks.
 
142
  """
 
 
 
 
 
 
 
 
 
 
143
 
 
 
 
 
144
  with torch.cuda.stream(comm_stream):
145
- if state.compute_event is None:
146
- raise RuntimeError("Compute event must be set before scatter.")
147
- comm_stream.wait_event(state.compute_event)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if rank == state.worker_rank:
150
- num_ranks = dist.get_world_size(group=state.process_group)
151
- # Clear the gathered gradient to free memory
152
- state.gathered_grad = None
 
 
 
 
153
 
154
- u = state.computed_u
155
- scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
156
- scatter_list = [s.contiguous() for s in scatter_list]
 
 
 
 
 
 
157
  else:
158
- scatter_list = None
 
 
159
 
160
- torch.distributed.scatter(
161
- state.scattered_u,
162
- scatter_list=scatter_list,
163
- src=state.worker_rank,
164
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- state.scatter_event = torch.cuda.Event()
167
- state.scatter_event.record()
168
- scatter_list = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -321,6 +516,11 @@ class Muon(torch.optim.Optimizer):
321
  "head_dim": 128,
322
  "threshold": 100
323
  }
 
 
 
 
 
324
  """
325
 
326
  def __init__(self,
@@ -339,7 +539,8 @@ class Muon(torch.optim.Optimizer):
339
  "k_indices": [],
340
  "head_dim": 128,
341
  "threshold": 100
342
- }):
 
343
  defaults = dict(
344
  lr=lr,
345
  weight_decay=weight_decay,
@@ -363,15 +564,13 @@ class Muon(torch.optim.Optimizer):
363
 
364
  super().__init__(params, defaults)
365
 
366
- if dist.is_initialized():
367
- self.rank = dist.get_rank()
368
- else:
369
- self.rank = None
370
 
371
  self.comm_stream = torch.cuda.Stream()
372
  self.compute_stream = torch.cuda.Stream()
373
  self.debug = debug
374
  self.clip_config = clip_config
 
375
 
376
  def _calc_flops(self, G, steps):
377
  assert len(G.shape) == 2
@@ -444,11 +643,18 @@ class Muon(torch.optim.Optimizer):
444
  if mesh is None:
445
  mesh = p.device_mesh
446
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
 
 
 
 
 
447
  elif mesh != p.device_mesh:
448
  raise ValueError("All parameters must be on the same mesh.")
449
 
 
450
  param_to_state[id(p)] = _muon_state()
451
- param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
 
452
  param_to_state[id(p)].process_group = process_group
453
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
454
  param_to_state[id(p)].qk_clip_state = qk_clip_state
@@ -478,7 +684,7 @@ class Muon(torch.optim.Optimizer):
478
  else:
479
  g = buf
480
 
481
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
482
  steps=group["ns_steps"])
483
 
484
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
@@ -493,15 +699,12 @@ class Muon(torch.optim.Optimizer):
493
  def _update_g(self, p, g, group, momentum):
494
  # calc update
495
  state = self.state[p]
496
- if "momentum_buffer" not in state:
497
- state["momentum_buffer"] = torch.zeros_like(g)
498
- buf = state["momentum_buffer"]
499
- buf.mul_(momentum).add_(g)
500
  if group["nesterov"]:
501
- g = g.add(buf, alpha=momentum)
502
- else:
503
- g = buf
504
- return g
505
 
506
  @staticmethod
507
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -585,11 +788,17 @@ class Muon(torch.optim.Optimizer):
585
  param_to_state, ordered_params = self.init_state_and_assign_params(
586
  names, params, group, qk_logits)
587
 
588
- def enqueue_gathers(start_idx, chunk_size):
589
- for p in ordered_params[start_idx:start_idx + chunk_size]:
590
- state = param_to_state[id(p)]
591
- _gather(p, state, self.rank, self.comm_stream,
592
- group["none_grad"])
 
 
 
 
 
 
593
 
594
  def enqueue_computes(start_idx, chunk_size):
595
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +806,14 @@ class Muon(torch.optim.Optimizer):
597
  _compute_u(p, state, group["ns_steps"], self.rank,
598
  self.compute_stream)
599
 
600
- def enqueue_scatters(start_idx, chunk_size):
601
- for p in ordered_params[start_idx:start_idx + chunk_size]:
602
- state = param_to_state[id(p)]
603
- _scatter(p, state, self.rank, self.comm_stream)
 
 
 
 
604
 
605
  def enqueue_update_param(start_idx, chunk_size):
606
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +828,16 @@ class Muon(torch.optim.Optimizer):
615
  # Wait grad update
616
  self.comm_stream.wait_stream(torch.cuda.current_stream())
617
 
618
- enqueue_gathers(0, chunk_size)
 
 
 
 
619
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
620
- enqueue_computes(i, chunk_size)
621
- if i > 0:
622
- enqueue_update_param(i - chunk_size, chunk_size)
623
- enqueue_gathers(i + chunk_size, chunk_size)
624
- enqueue_scatters(i, chunk_size)
625
- enqueue_update_param(i, chunk_size)
626
 
627
  # Wait the last update_param to finish
628
  torch.cuda.current_stream().wait_stream(self.compute_stream)
 
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
  # Muon's Newton–Schulz iteration causes high variance in singular values
21
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
  @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
  def _zeropower_via_newtonschulz5(G, steps):
25
  """
26
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
39
  X = X.T
40
  # Ensure spectral norm is at most 1
41
  X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
  # Perform the NS iterations
45
  for a, b, c in [
46
  (4.0848, -6.8946, 2.9270),
 
49
  (2.8769, -3.1427, 1.2046),
50
  (2.8366, -3.0525, 1.2012),
51
  ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
 
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
73
  qk_clip_state = None
74
 
75
 
76
+ def split_elems_for_src(param, src_rank, num_ranks) -> int:
77
+ rows = param.shape[0]
78
+ cols = int(param.numel() // rows)
79
+ base, rem = divmod(rows, num_ranks)
80
+ my_rows = base + (1 if src_rank < rem else 0)
81
+ return my_rows * cols
82
+
83
+
84
  @torch.no_grad()
85
+ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
86
  """
87
+ Pre-allocate gathered_grad buffer on compute_stream
88
+ before launching all2all gather
89
+ """
90
+ with torch.cuda.stream(compute_stream):
91
+ for p in params:
92
+ state = param_to_state[id(p)]
93
+ if rank == state.worker_rank:
94
+ num_ranks = dist.get_world_size(group=state.process_group)
95
+ state.gathered_grad = torch.empty(p.grad.numel(),
96
+ dtype=COMM_DTYPE,
97
+ device="cuda")
98
+ else:
99
+ state.gathered_grad = None
100
+
101
+ alloc_event = torch.cuda.Event()
102
+ alloc_event.record(compute_stream)
103
+ return alloc_event
104
+
105
+
106
+ @torch.no_grad()
107
+ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
108
+ alloc_event):
109
+ """
110
+ All2all gathers shards so each owner rank reconstructs its full gradient
111
  """
112
  with torch.cuda.stream(comm_stream):
113
+ process_group = param_to_state[id(params[0])].process_group
114
+ num_ranks = dist.get_world_size(group=process_group)
115
 
116
+ # Construct sending buffers
117
+ per_dst = [[] for _ in range(num_ranks)]
118
+ send_counts = [0] * num_ranks
119
+
120
+ for p in params:
121
+ state = param_to_state[id(p)]
122
+ dst = state.worker_rank
123
+ assert dst < num_ranks
124
+ shard_elems = split_elems_for_src(p, rank, num_ranks)
125
+ g = p.grad
126
+ g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
+ assert g.numel() == shard_elems
128
+ per_dst[dst].append(g)
129
+ send_counts[dst] += shard_elems
130
+
131
+ assert all(
132
+ len(v) > 0
133
+ for v in per_dst), "all params should be sharded to all devices"
134
+
135
+ send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
136
+ owned_params = [
137
+ p for p in params if param_to_state[id(p)].worker_rank == rank
138
+ ]
139
+
140
+ # Compute receive sizes and allocate receiving buffers
141
+ recv_counts = [0] * num_ranks
142
+
143
+ for src in range(num_ranks):
144
+ total = 0
145
+ for p in owned_params:
146
+ state = param_to_state[id(p)]
147
+ assert state.worker_rank == rank
148
+ total += split_elems_for_src(p, src, num_ranks)
149
+ recv_counts[src] = total
150
+
151
+ recv_total = sum(recv_counts)
152
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
153
+
154
+ #All2All
155
+ dist.all_to_all_single(
156
+ recv_buf,
157
+ send_buf,
158
+ output_split_sizes=recv_counts,
159
+ input_split_sizes=send_counts,
160
+ group=process_group,
161
  )
162
+
163
+ # Reconstructs gathered grad from the received buffer
164
+ #
165
+ # recv_buf (num ranks = 3)
166
+ #
167
+ # From rank 0 From rank 1 From rank 2
168
+ # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
169
+ #
170
+ # Outer loop:
171
+ # rank 0 -> rank 1 -> rank2
172
+ #
173
+ # Inner loop:
174
+ # p1_n -> p2_n -> p3_n
175
+
176
+ comm_stream.wait_event(alloc_event)
177
+
178
+ off = 0
179
+ write_offsets = {id(p): 0 for p in owned_params}
180
+ for src in range(num_ranks):
181
+ if recv_counts[src] == 0:
182
+ continue
183
+
184
+ block = recv_counts[src]
185
+ inner_off = 0
186
+ for p in owned_params:
187
+ state = param_to_state[id(p)]
188
+ assert state.worker_rank == rank
189
+ n = split_elems_for_src(p, src, num_ranks)
190
+ assert n > 0
191
+
192
+ sg = recv_buf.narrow(0, off + inner_off, n)
193
+ woff = write_offsets[id(p)]
194
+ dst = state.gathered_grad.narrow(0, woff, n)
195
+ dst.copy_(sg)
196
+
197
+ write_offsets[id(p)] += n
198
+ inner_off += n
199
+ off += block
200
+
201
+ for p in params:
202
+ state = param_to_state[id(p)]
203
+ if state.worker_rank == rank:
204
+ state.gathered_grad = state.gathered_grad.view_as(p)
205
+ state.gather_event = torch.cuda.Event()
206
+ state.gather_event.record(comm_stream)
207
+ else:
208
+ state.gathered_grad = None
209
+ state.gather_event = None
210
+ if none_grad:
211
+ p.grad = None
212
 
213
 
214
  @torch.no_grad()
 
222
  raise RuntimeError("Gather event must be set before compute.")
223
  compute_stream.wait_event(state.gather_event)
224
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
225
+ state.gathered_grad = None
226
  state.computed_u = u
227
+ state.compute_event = torch.cuda.Event()
228
+ state.compute_event.record()
229
+ else:
230
+ state.computed_u = None
231
+ state.compute_event = None
232
 
233
 
234
  @torch.no_grad()
235
+ def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
236
  """
237
+ Pre-allocate scattered_u buffer on compute_stream
238
+ before launching all2all gather
239
  """
240
+ with torch.cuda.stream(compute_stream):
241
+ for p in params:
242
+ state = param_to_state[id(p)]
243
+ state.scattered_u = torch.empty_like(p.to_local(),
244
+ dtype=COMM_DTYPE)
245
+
246
+ alloc_event = torch.cuda.Event()
247
+ alloc_event.record(compute_stream)
248
+ return alloc_event
249
+
250
 
251
+ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
252
+ """
253
+ All2all scatters full gradients to all ranks
254
+ """
255
  with torch.cuda.stream(comm_stream):
256
+ process_group = param_to_state[id(params[0])].process_group
257
+ num_ranks = dist.get_world_size(group=process_group)
258
+ owned_params = [
259
+ p for p in params if param_to_state[id(p)].worker_rank == rank
260
+ ]
261
+
262
+ # Construct sending buffer
263
+ per_dst = [[] for _ in range(num_ranks)]
264
+ send_counts = [0] * num_ranks
265
+
266
+ if owned_params:
267
+ for p in owned_params:
268
+ state = param_to_state[id(p)]
269
+ if state.compute_event is None:
270
+ raise RuntimeError(
271
+ "Compute event must be set before scatter.")
272
+ comm_stream.wait_event(state.compute_event)
273
+ state.gathered_grad = None
274
 
275
+ assert state.computed_u is not None
276
+
277
+ u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)
278
+
279
+ offset = 0
280
+ for dst in range(num_ranks):
281
+ n = split_elems_for_src(p, dst, num_ranks)
282
+ assert n > 0
283
 
284
+ su = u_full.narrow(0, offset, n)
285
+ per_dst[dst].append(su)
286
+ send_counts[dst] += n
287
+ offset += n
288
+
289
+ assert offset == u_full.numel()
290
+
291
+ if any(len(v) > 0 for v in per_dst):
292
+ send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
293
  else:
294
+ # all_to_all requires participation from all ranks
295
+ # Even non-owner ranks must join the collective call
296
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
297
 
298
+ # Compute receive sizes and allocate receiving buffers
299
+ recv_counts = [0] * num_ranks
300
+
301
+ for src in range(num_ranks):
302
+ total = 0
303
+ for p in params:
304
+ state = param_to_state[id(p)]
305
+ if state.worker_rank != src:
306
+ continue
307
+ total += split_elems_for_src(p, rank, num_ranks)
308
+ recv_counts[src] = total
309
+
310
+ recv_total = sum(recv_counts)
311
+ assert recv_total > 0
312
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
313
+
314
+ #All2All
315
+ dist.all_to_all_single(
316
+ recv_buf,
317
+ send_buf,
318
+ output_split_sizes=recv_counts,
319
+ input_split_sizes=send_counts,
320
+ group=process_group,
321
  )
322
+
323
+ # Copy to pre-allocated scattered_u buffer from the received buffer
324
+ #
325
+ # recv_buf (num ranks = 3, local_rank = 0)
326
+ #
327
+ # From rank 0 From rank 1 From rank 2
328
+ # | p1_0, p2_0, p3_0 | p4_0 | p5_0, p6_0 |
329
+ #
330
+ # Outer loop:
331
+ # rank 0 -> rank 1 -> rank2
332
+ #
333
+ # Inner loop:
334
+ # src(0) : p1_0 -> p2_0 -> p3_0
335
+ # src(1) : p4_0
336
+ # src(2) : p5_0 -> p6_0
337
+
338
+ comm_stream.wait_event(alloc_event)
339
+
340
+ off = 0
341
+ for src in range(num_ranks):
342
+ block = recv_counts[src]
343
+ if block == 0:
344
+ continue
345
+
346
+ inner_off = 0
347
+ for p in params:
348
+ state = param_to_state[id(p)]
349
+ if state.worker_rank != src:
350
+ continue
351
+ n = split_elems_for_src(p, rank, num_ranks)
352
+ assert n > 0
353
+
354
+ flat_local = recv_buf.narrow(0, off + inner_off,
355
+ n).view_as(p.to_local())
356
+ state.scattered_u.copy_(flat_local)
357
+
358
+ state.scatter_event = torch.cuda.Event()
359
+ state.scatter_event.record(comm_stream)
360
+ inner_off += n
361
+
362
+ assert inner_off == block
363
+ off += block
364
 
365
 
366
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
516
  "head_dim": 128,
517
  "threshold": 100
518
  }
519
+ overlap_step : How many all2all gather, compute operations are launched in advance
520
+ before the corresponding all2all scatter steps begin.
521
+ A higher overlap_step increases memory usage but can improve
522
+ performance by overlapping communication.
523
+ Parallel muon only.
524
  """
525
 
526
  def __init__(self,
 
539
  "k_indices": [],
540
  "head_dim": 128,
541
  "threshold": 100
542
+ },
543
+ overlap_step=5):
544
  defaults = dict(
545
  lr=lr,
546
  weight_decay=weight_decay,
 
564
 
565
  super().__init__(params, defaults)
566
 
567
+ self.rank = None
 
 
 
568
 
569
  self.comm_stream = torch.cuda.Stream()
570
  self.compute_stream = torch.cuda.Stream()
571
  self.debug = debug
572
  self.clip_config = clip_config
573
+ self.overlap_step = overlap_step
574
 
575
  def _calc_flops(self, G, steps):
576
  assert len(G.shape) == 2
 
643
  if mesh is None:
644
  mesh = p.device_mesh
645
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
646
+ local_rank = dist.get_rank(group=process_group)
647
+ if self.rank is None:
648
+ self.rank = dist.get_rank(group=process_group)
649
+ else:
650
+ assert self.rank == local_rank
651
  elif mesh != p.device_mesh:
652
  raise ValueError("All parameters must be on the same mesh.")
653
 
654
+ num_ranks = dist.get_world_size(group=process_group)
655
  param_to_state[id(p)] = _muon_state()
656
+ param_to_state[id(
657
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
658
  param_to_state[id(p)].process_group = process_group
659
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
660
  param_to_state[id(p)].qk_clip_state = qk_clip_state
 
684
  else:
685
  g = buf
686
 
687
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
688
  steps=group["ns_steps"])
689
 
690
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
699
  def _update_g(self, p, g, group, momentum):
700
  # calc update
701
  state = self.state[p]
702
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
703
+ torch.add(g, buf, alpha=momentum, out=buf)
 
 
704
  if group["nesterov"]:
705
+ g.add_(buf, alpha=momentum)
706
+ return g
707
+ return buf
 
708
 
709
  @staticmethod
710
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
788
  param_to_state, ordered_params = self.init_state_and_assign_params(
789
  names, params, group, qk_logits)
790
 
791
+ assert self.rank is not None
792
+
793
+ def enqueue_all2all_gather(start_idx, chunk_size):
794
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
795
+ if target_params:
796
+ alloc_event = _alloc_gathered_grad(target_params,
797
+ param_to_state, self.rank,
798
+ self.compute_stream)
799
+ _all2all_gather(target_params, param_to_state, self.rank,
800
+ self.comm_stream, group["none_grad"],
801
+ alloc_event)
802
 
803
  def enqueue_computes(start_idx, chunk_size):
804
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
806
  _compute_u(p, state, group["ns_steps"], self.rank,
807
  self.compute_stream)
808
 
809
+ def enqueue_all2all_scatter(start_idx, chunk_size):
810
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
811
+ if target_params:
812
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
813
+ self.rank,
814
+ self.compute_stream)
815
+ _all2all_scatter(target_params, param_to_state, self.rank,
816
+ self.comm_stream, alloc_event)
817
 
818
  def enqueue_update_param(start_idx, chunk_size):
819
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
828
  # Wait grad update
829
  self.comm_stream.wait_stream(torch.cuda.current_stream())
830
 
831
+ overlap_step = self.overlap_step
832
+ for i in range(0, overlap_step):
833
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
834
+ enqueue_computes(i * chunk_size, chunk_size)
835
+
836
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
837
+ enqueue_all2all_scatter(i, chunk_size)
838
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
839
+ enqueue_update_param(i, chunk_size)
840
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
 
 
841
 
842
  # Wait the last update_param to finish
843
  torch.cuda.current_stream().wait_stream(self.compute_stream)
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_9c21645_dirty
3
- ops = torch.ops._optimizer_9c21645_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_9c21645_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_15336dc_dirty
3
+ ops = torch.ops._optimizer_15336dc_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_15336dc_dirty::{op_name}"
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_9c21645_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8f845b8df6426eb5db57e4525b8dd3c80004c44759b01a3e39cc37a817813b5
3
  size 1749936
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae22a3afdffd54435c6e5b145fc0b7772d03eb8c8bad0d388d9b2d1c8d2f60d5
3
  size 1749936
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
def get_autotune_config():
    """
    Enumerate the candidate triton launch configurations for autotuning.

    Sweeps BLOCK_SIZE_M x BLOCK_SIZE_K x num_stages x num_warps with a
    fixed GROUP_SIZE_M of 8; the enumeration order matches the original
    nested-comprehension order.
    """
    configs = []
    for blk_m in (32, 64, 128):
        for blk_k in (32, 64):
            for grp_sz in (8,):
                for n_stages in (3, 4, 5):
                    for n_warps in (4, 8):
                        configs.append(
                            triton.Config(
                                {
                                    'BLOCK_SIZE_M': blk_m,
                                    'BLOCK_SIZE_K': blk_k,
                                    'GROUP_SIZE_M': grp_sz
                                },
                                num_stages=n_stages,
                                num_warps=n_warps))
    return configs
41
+
42
+
43
@triton.autotune(
    configs=get_autotune_config(),
    key=['M', 'K'],
)
@triton.jit
def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
               GROUP_SIZE_M: tl.constexpr):
    """
    Core kernel jit function of matmul_transpose that computes y = x @ x.T
    The code is a simple adaptation from the triton `matmul` tutorial:
    https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html

    x is (M, K); y is (M, M). Because the result is symmetric, only the
    upper triangle of output tiles (pid_m <= pid_n) is computed; the
    lower triangle is filled by transposing each finished tile.
    """
    pid = tl.program_id(axis=0)
    # Output is M x M, so both tile-grid extents use BLOCK_SIZE_M.
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
    # Grouped ordering of program ids (L2-locality trick from the tutorial).
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # Skip strictly-lower-triangle tiles; they are produced by the
    # transpose-store of the mirrored upper tile below.
    if pid_m > pid_n:
        return

    offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # we use a & b ptrs to denote different rows of x.
    a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
    b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)

    # Accumulate in fp32 regardless of input dtype.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)

    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Mask out-of-range K columns on the last iteration.
        a = tl.load(a_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        b = tl.load(b_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        # a @ b.T accumulated into fp32.
        accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
        a_ptrs += BLOCK_SIZE_K * stride_xk
        b_ptrs += BLOCK_SIZE_K * stride_xk
    # use dtype.element_ty to accommodate different input datatypes as in cpp templates
    # https://github.com/triton-lang/triton/issues/2252
    c = accumulator.to(x.dtype.element_ty)

    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
    tl.store(c_ptrs, c, mask=c_mask)

    # transpose and copy: mirror the tile into the lower triangle
    # (diagonal tiles pid_m == pid_n are stored only once above).
    if pid_m < pid_n:
        ct_ptrs = y + stride_ym * offs_cn[:,
                                          None] + stride_yn * offs_cm[None, :]
        ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
        tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
def matmul_transpose_assign(d_in, d_out):
    """
    Compute ``d_out = d_in @ d_in.T`` in place via the triton kernel.

    Args:
        d_in: 2D CUDA tensor of shape (M, K).
        d_out: 2D CUDA tensor of shape (M, M), same dtype/device as d_in.

    Raises:
        AssertionError: if device/dtype/shape preconditions are violated.
    """
    assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
    assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
    assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
    assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
    assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
    assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
    # Bug fix: the original compared d_out.size(0) to itself twice, so a
    # non-square d_out passed the check; verify both output dimensions.
    assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
        "First dimension of `d_in` must match first and second dimension of `d_out`"

    # Strides below are taken from the (possibly copied) contiguous tensor.
    d_in = d_in.contiguous()
    M, K = d_in.shape
    # One program per (M-tile, M-tile) pair of the square output.
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
        M, META['BLOCK_SIZE_M']), )
    with torch.cuda.device(d_in.device.index):
        mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
                         d_out.stride(0), d_out.stride(1))
122
+
123
+
124
def matmul_transpose(d_in):
    """
    Return a freshly allocated ``d_in @ d_in.T``.

    Allocates an (M, M) tensor on d_in's device/dtype and delegates the
    actual computation to :func:`matmul_transpose_assign`.
    """
    rows = d_in.shape[0]
    result = torch.empty((rows, rows), device=d_in.device, dtype=d_in.dtype)
    matmul_transpose_assign(d_in, result)
    return result
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -8,14 +8,19 @@ import torch
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
  # Muon's Newton–Schulz iteration causes high variance in singular values
17
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
 
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
21
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -27,13 +32,15 @@ def _zeropower_via_newtonschulz5(G, steps):
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
- assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
 
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
 
37
  # Perform the NS iterations
38
  for a, b, c in [
39
  (4.0848, -6.8946, 2.9270),
@@ -42,13 +49,10 @@ def _zeropower_via_newtonschulz5(G, steps):
42
  (2.8769, -3.1427, 1.2046),
43
  (2.8366, -3.0525, 1.2012),
44
  ]:
45
- A = X @ X.T
46
- # B = (
47
- # b * A + c * A @ A
48
- # )
49
- B = torch.addmm(A, A, A, alpha=c, beta=b)
50
- # X = a * X + B @ X
51
- X = torch.addmm(X, B, X, alpha=1.0, beta=a)
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
@@ -69,51 +73,142 @@ class _muon_state:
69
  qk_clip_state = None
70
 
71
 
 
 
 
 
 
 
 
 
72
  @torch.no_grad()
73
- def _gather(p, state, rank, comm_stream, none_grad):
74
  """
75
- Gather the gradients to worker_rank.
76
- If none_grad is True, free p.grad after the gather.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
  with torch.cuda.stream(comm_stream):
79
- g = p.grad
 
80
 
81
- if rank == state.worker_rank:
82
- num_ranks = dist.get_world_size(group=state.process_group)
83
- gather_list = [
84
- torch.empty_like(g.to_local(), dtype=torch.bfloat16)
85
- for _ in range(num_ranks)
86
- ]
87
- else:
88
- gather_list = None
89
-
90
- g = g.to(torch.bfloat16)
91
- torch.distributed.gather(
92
- g.to_local(),
93
- dst=state.worker_rank,
94
- gather_list=gather_list,
95
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  )
97
- if rank == state.worker_rank:
98
- if state.gathered_grad is not None:
99
- raise RuntimeError(
100
- "Gather event already exists, which should not happen.")
101
- state.gathered_grad = torch.cat(gather_list, dim=0)
102
- state.gather_event = torch.cuda.Event()
103
- state.gather_event.record()
104
- else:
105
- state.gathered_grad = None
106
- state.gather_event = None
107
- gather_list = None
108
- if none_grad:
109
- # We can safely free p.grad without calling record_stream:
110
- # p.grad.to_local().record_stream(comm_stream)
111
- # Explanation:
112
- # 1. p.grad is created on the default stream, but the default stream
113
- # is synchronized with the comm stream later.
114
- # 2. There is no further activity on the default stream before the optimizer finishes.
115
- # Therefore, it is safe to free p.grad directly on the comm stream.
116
- p.grad = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @torch.no_grad()
@@ -127,45 +222,145 @@ def _compute_u(p, state, steps, rank, compute_stream):
127
  raise RuntimeError("Gather event must be set before compute.")
128
  compute_stream.wait_event(state.gather_event)
129
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
 
130
  state.computed_u = u
131
- state.scattered_u = torch.empty_like(p.to_local(),
132
- dtype=torch.bfloat16)
133
- state.compute_event = torch.cuda.Event()
134
- state.compute_event.record()
135
- u = None
136
 
137
 
138
  @torch.no_grad()
139
- def _scatter(p, state, rank, comm_stream):
140
  """
141
- Scatter the computed_u from worker_rank to all ranks.
 
142
  """
 
 
 
 
 
 
 
 
 
 
143
 
 
 
 
 
144
  with torch.cuda.stream(comm_stream):
145
- if state.compute_event is None:
146
- raise RuntimeError("Compute event must be set before scatter.")
147
- comm_stream.wait_event(state.compute_event)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if rank == state.worker_rank:
150
- num_ranks = dist.get_world_size(group=state.process_group)
151
- # Clear the gathered gradient to free memory
152
- state.gathered_grad = None
 
 
 
 
153
 
154
- u = state.computed_u
155
- scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
156
- scatter_list = [s.contiguous() for s in scatter_list]
 
 
 
 
 
 
157
  else:
158
- scatter_list = None
 
 
159
 
160
- torch.distributed.scatter(
161
- state.scattered_u,
162
- scatter_list=scatter_list,
163
- src=state.worker_rank,
164
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- state.scatter_event = torch.cuda.Event()
167
- state.scatter_event.record()
168
- scatter_list = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -321,6 +516,11 @@ class Muon(torch.optim.Optimizer):
321
  "head_dim": 128,
322
  "threshold": 100
323
  }
 
 
 
 
 
324
  """
325
 
326
  def __init__(self,
@@ -339,7 +539,8 @@ class Muon(torch.optim.Optimizer):
339
  "k_indices": [],
340
  "head_dim": 128,
341
  "threshold": 100
342
- }):
 
343
  defaults = dict(
344
  lr=lr,
345
  weight_decay=weight_decay,
@@ -363,15 +564,13 @@ class Muon(torch.optim.Optimizer):
363
 
364
  super().__init__(params, defaults)
365
 
366
- if dist.is_initialized():
367
- self.rank = dist.get_rank()
368
- else:
369
- self.rank = None
370
 
371
  self.comm_stream = torch.cuda.Stream()
372
  self.compute_stream = torch.cuda.Stream()
373
  self.debug = debug
374
  self.clip_config = clip_config
 
375
 
376
  def _calc_flops(self, G, steps):
377
  assert len(G.shape) == 2
@@ -444,11 +643,18 @@ class Muon(torch.optim.Optimizer):
444
  if mesh is None:
445
  mesh = p.device_mesh
446
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
 
 
 
 
 
447
  elif mesh != p.device_mesh:
448
  raise ValueError("All parameters must be on the same mesh.")
449
 
 
450
  param_to_state[id(p)] = _muon_state()
451
- param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
 
452
  param_to_state[id(p)].process_group = process_group
453
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
454
  param_to_state[id(p)].qk_clip_state = qk_clip_state
@@ -478,7 +684,7 @@ class Muon(torch.optim.Optimizer):
478
  else:
479
  g = buf
480
 
481
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
482
  steps=group["ns_steps"])
483
 
484
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
@@ -493,15 +699,12 @@ class Muon(torch.optim.Optimizer):
493
  def _update_g(self, p, g, group, momentum):
494
  # calc update
495
  state = self.state[p]
496
- if "momentum_buffer" not in state:
497
- state["momentum_buffer"] = torch.zeros_like(g)
498
- buf = state["momentum_buffer"]
499
- buf.mul_(momentum).add_(g)
500
  if group["nesterov"]:
501
- g = g.add(buf, alpha=momentum)
502
- else:
503
- g = buf
504
- return g
505
 
506
  @staticmethod
507
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -585,11 +788,17 @@ class Muon(torch.optim.Optimizer):
585
  param_to_state, ordered_params = self.init_state_and_assign_params(
586
  names, params, group, qk_logits)
587
 
588
- def enqueue_gathers(start_idx, chunk_size):
589
- for p in ordered_params[start_idx:start_idx + chunk_size]:
590
- state = param_to_state[id(p)]
591
- _gather(p, state, self.rank, self.comm_stream,
592
- group["none_grad"])
 
 
 
 
 
 
593
 
594
  def enqueue_computes(start_idx, chunk_size):
595
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +806,14 @@ class Muon(torch.optim.Optimizer):
597
  _compute_u(p, state, group["ns_steps"], self.rank,
598
  self.compute_stream)
599
 
600
- def enqueue_scatters(start_idx, chunk_size):
601
- for p in ordered_params[start_idx:start_idx + chunk_size]:
602
- state = param_to_state[id(p)]
603
- _scatter(p, state, self.rank, self.comm_stream)
 
 
 
 
604
 
605
  def enqueue_update_param(start_idx, chunk_size):
606
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +828,16 @@ class Muon(torch.optim.Optimizer):
615
  # Wait grad update
616
  self.comm_stream.wait_stream(torch.cuda.current_stream())
617
 
618
- enqueue_gathers(0, chunk_size)
 
 
 
 
619
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
620
- enqueue_computes(i, chunk_size)
621
- if i > 0:
622
- enqueue_update_param(i - chunk_size, chunk_size)
623
- enqueue_gathers(i + chunk_size, chunk_size)
624
- enqueue_scatters(i, chunk_size)
625
- enqueue_update_param(i, chunk_size)
626
 
627
  # Wait the last update_param to finish
628
  torch.cuda.current_stream().wait_stream(self.compute_stream)
 
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
  # Muon's Newton–Schulz iteration causes high variance in singular values
21
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
  @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
  def _zeropower_via_newtonschulz5(G, steps):
25
  """
26
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
39
  X = X.T
40
  # Ensure spectral norm is at most 1
41
  X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
  # Perform the NS iterations
45
  for a, b, c in [
46
  (4.0848, -6.8946, 2.9270),
 
49
  (2.8769, -3.1427, 1.2046),
50
  (2.8366, -3.0525, 1.2012),
51
  ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
 
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
73
  qk_clip_state = None
74
 
75
 
76
def split_elems_for_src(param, src_rank, num_ranks) -> int:
    """
    Size (in elements) of the row-shard of ``param`` held by ``src_rank``.

    Rows are split as evenly as possible across ``num_ranks``; the first
    ``rows % num_ranks`` ranks receive one extra row each. All trailing
    dimensions stay whole, so the result is shard_rows * elems_per_row.
    """
    total_rows = param.shape[0]
    elems_per_row = int(param.numel() // total_rows)
    rows_each, extra = divmod(total_rows, num_ranks)
    shard_rows = rows_each + 1 if src_rank < extra else rows_each
    return shard_rows * elems_per_row
82
+
83
+
84
@torch.no_grad()
def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
    """
    Pre-allocate each owned param's flat ``gathered_grad`` buffer on
    compute_stream before launching the all2all gather.

    Only the owner rank (``state.worker_rank``) allocates; other ranks
    clear the field. Returns a CUDA event recorded on compute_stream so
    the comm stream can wait until the buffers exist before writing.
    """
    with torch.cuda.stream(compute_stream):
        for p in params:
            state = param_to_state[id(p)]
            if rank == state.worker_rank:
                # Flat COMM_DTYPE buffer; reshaped to p's shape after the
                # all2all completes. (Removed an unused dist.get_world_size
                # call the original made here.)
                # NOTE(review): allocates on the current CUDA device —
                # assumes one process per device; confirm for multi-GPU.
                state.gathered_grad = torch.empty(p.grad.numel(),
                                                  dtype=COMM_DTYPE,
                                                  device="cuda")
            else:
                state.gathered_grad = None

    alloc_event = torch.cuda.Event()
    alloc_event.record(compute_stream)
    return alloc_event
104
+
105
+
106
@torch.no_grad()
def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
                    alloc_event):
    """
    All2all gathers shards so each owner rank reconstructs its full gradient.

    Every rank sends its local shard of each param's gradient to that
    param's owner (``state.worker_rank``); owners receive one shard per
    source rank and copy them into the pre-allocated flat
    ``state.gathered_grad`` buffer (see ``_alloc_gathered_grad``).
    If ``none_grad`` is True, ``p.grad`` is freed after its shard is sent.
    All work is enqueued on ``comm_stream``.
    """
    with torch.cuda.stream(comm_stream):
        # All params in one chunk are assumed to share a process group.
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)

        # Construct sending buffers
        # per_dst[d] collects this rank's local shards destined for rank d.
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        for p in params:
            state = param_to_state[id(p)]
            dst = state.worker_rank
            assert dst < num_ranks
            # Expected element count of this rank's shard of p.
            shard_elems = split_elems_for_src(p, rank, num_ranks)
            g = p.grad
            # Flatten the local DTensor shard in the communication dtype.
            g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
            assert g.numel() == shard_elems
            per_dst[dst].append(g)
            send_counts[dst] += shard_elems

        assert all(
            len(v) > 0
            for v in per_dst), "all params should be sharded to all devices"

        # Concatenate per-destination groups into one flat send buffer,
        # ordered by destination rank (all_to_all_single contract).
        send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Compute receive sizes and allocate receiving buffers
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in owned_params:
                state = param_to_state[id(p)]
                assert state.worker_rank == rank
                # Size of src's shard of each param we own.
                total += split_elems_for_src(p, src, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        # All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Reconstructs gathered grad from the received buffer
        #
        # recv_buf (num ranks = 3)
        #
        #   From rank 0        From rank 1        From rank 2
        # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank2
        #
        # Inner loop:
        #   p1_n -> p2_n -> p3_n

        # Destination buffers are allocated on the compute stream; do not
        # write into them before that allocation is done.
        comm_stream.wait_event(alloc_event)

        off = 0
        # Per-param write cursor into its flat gathered_grad buffer.
        write_offsets = {id(p): 0 for p in owned_params}
        for src in range(num_ranks):
            if recv_counts[src] == 0:
                continue

            block = recv_counts[src]
            inner_off = 0
            for p in owned_params:
                state = param_to_state[id(p)]
                assert state.worker_rank == rank
                n = split_elems_for_src(p, src, num_ranks)
                assert n > 0

                sg = recv_buf.narrow(0, off + inner_off, n)
                woff = write_offsets[id(p)]
                # NOTE: `dst` here is a tensor view, shadowing the rank
                # index name used earlier in this function.
                dst = state.gathered_grad.narrow(0, woff, n)
                dst.copy_(sg)

                write_offsets[id(p)] += n
                inner_off += n
            off += block

        for p in params:
            state = param_to_state[id(p)]
            if state.worker_rank == rank:
                # Give the flat buffer the param's logical shape and
                # publish completion for the compute stream to wait on.
                state.gathered_grad = state.gathered_grad.view_as(p)
                state.gather_event = torch.cuda.Event()
                state.gather_event.record(comm_stream)
            else:
                state.gathered_grad = None
                state.gather_event = None
            if none_grad:
                # Safe to free here: the default stream is synchronized
                # with comm_stream before any further use (see caller).
                p.grad = None
212
 
213
 
214
  @torch.no_grad()
 
222
  raise RuntimeError("Gather event must be set before compute.")
223
  compute_stream.wait_event(state.gather_event)
224
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
225
+ state.gathered_grad = None
226
  state.computed_u = u
227
+ state.compute_event = torch.cuda.Event()
228
+ state.compute_event.record()
229
+ else:
230
+ state.computed_u = None
231
+ state.compute_event = None
232
 
233
 
234
@torch.no_grad()
def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
    """
    Pre-allocate scattered_u buffer on compute_stream
    before launching all2all scatter.

    (Doc fix: the original docstring said "all2all gather"; this buffer
    receives each rank's shard of the orthogonalized update during the
    all2all *scatter*.) Returns a CUDA event recorded on compute_stream
    so the comm stream can wait for the allocations.
    """
    with torch.cuda.stream(compute_stream):
        for p in params:
            state = param_to_state[id(p)]
            # One local-shard-shaped buffer per param, in the comm dtype.
            state.scattered_u = torch.empty_like(p.to_local(),
                                                 dtype=COMM_DTYPE)

    alloc_event = torch.cuda.Event()
    alloc_event.record(compute_stream)
    return alloc_event
249
+
250
 
251
def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
    """
    All2all scatters the computed orthogonalized updates (``computed_u``)
    back to all ranks: each owner rank splits every update it computed
    into per-rank row shards and sends them; every rank receives its own
    shard of every param into the pre-allocated ``scattered_u`` buffer.

    NOTE(review): unlike the gather path this function is not decorated
    with @torch.no_grad() — confirm whether that is intentional.
    """
    with torch.cuda.stream(comm_stream):
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Construct sending buffer
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        if owned_params:
            for p in owned_params:
                state = param_to_state[id(p)]
                if state.compute_event is None:
                    raise RuntimeError(
                        "Compute event must be set before scatter.")
                # Do not read computed_u before Newton-Schulz finished.
                comm_stream.wait_event(state.compute_event)
                # Free the full gradient; it is no longer needed.
                state.gathered_grad = None

                assert state.computed_u is not None

                u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)

                # Slice the flat update into per-destination row shards,
                # in destination-rank order.
                offset = 0
                for dst in range(num_ranks):
                    n = split_elems_for_src(p, dst, num_ranks)
                    assert n > 0

                    su = u_full.narrow(0, offset, n)
                    per_dst[dst].append(su)
                    send_counts[dst] += n
                    offset += n

                assert offset == u_full.numel()

        if any(len(v) > 0 for v in per_dst):
            send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        else:
            # all_to_all requires participation from all ranks
            # Even non-owner ranks must join the collective call
            send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")

        # Compute receive sizes and allocate receiving buffers
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                # We receive our own shard of each param owned by src.
                total += split_elems_for_src(p, rank, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        assert recv_total > 0
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        # All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Copy to pre-allocated scattered_u buffer from the received buffer
        #
        # recv_buf (num ranks = 3, local_rank = 0)
        #
        #   From rank 0        From rank 1   From rank 2
        # | p1_0, p2_0, p3_0 |     p4_0    | p5_0, p6_0 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank2
        #
        # Inner loop:
        #   src(0) : p1_0 -> p2_0 -> p3_0
        #   src(1) : p4_0
        #   src(2) : p5_0 -> p6_0

        # scattered_u buffers are allocated on the compute stream; wait
        # for the allocation before writing into them.
        comm_stream.wait_event(alloc_event)

        off = 0
        for src in range(num_ranks):
            block = recv_counts[src]
            if block == 0:
                continue

            inner_off = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                n = split_elems_for_src(p, rank, num_ranks)
                assert n > 0

                flat_local = recv_buf.narrow(0, off + inner_off,
                                             n).view_as(p.to_local())
                state.scattered_u.copy_(flat_local)

                # Publish per-param completion for _update_param to wait on.
                state.scatter_event = torch.cuda.Event()
                state.scatter_event.record(comm_stream)
                inner_off += n

            assert inner_off == block
            off += block
 
365
 
366
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
516
  "head_dim": 128,
517
  "threshold": 100
518
  }
519
+ overlap_step : How many all2all gather, compute operations are launched in advance
520
+ before the corresponding all2all scatter steps begin.
521
+ A higher overlap_step increases memory usage but can improve
522
+ performance by overlapping communication.
523
+ Parallel muon only.
524
  """
525
 
526
  def __init__(self,
 
539
  "k_indices": [],
540
  "head_dim": 128,
541
  "threshold": 100
542
+ },
543
+ overlap_step=5):
544
  defaults = dict(
545
  lr=lr,
546
  weight_decay=weight_decay,
 
564
 
565
  super().__init__(params, defaults)
566
 
567
+ self.rank = None
 
 
 
568
 
569
  self.comm_stream = torch.cuda.Stream()
570
  self.compute_stream = torch.cuda.Stream()
571
  self.debug = debug
572
  self.clip_config = clip_config
573
+ self.overlap_step = overlap_step
574
 
575
  def _calc_flops(self, G, steps):
576
  assert len(G.shape) == 2
 
643
  if mesh is None:
644
  mesh = p.device_mesh
645
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
646
+ local_rank = dist.get_rank(group=process_group)
647
+ if self.rank is None:
648
+ self.rank = dist.get_rank(group=process_group)
649
+ else:
650
+ assert self.rank == local_rank
651
  elif mesh != p.device_mesh:
652
  raise ValueError("All parameters must be on the same mesh.")
653
 
654
+ num_ranks = dist.get_world_size(group=process_group)
655
  param_to_state[id(p)] = _muon_state()
656
+ param_to_state[id(
657
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
658
  param_to_state[id(p)].process_group = process_group
659
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
660
  param_to_state[id(p)].qk_clip_state = qk_clip_state
 
684
  else:
685
  g = buf
686
 
687
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
688
  steps=group["ns_steps"])
689
 
690
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
699
  def _update_g(self, p, g, group, momentum):
700
  # calc update
701
  state = self.state[p]
702
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
703
+ torch.add(g, buf, alpha=momentum, out=buf)
 
 
704
  if group["nesterov"]:
705
+ g.add_(buf, alpha=momentum)
706
+ return g
707
+ return buf
 
708
 
709
  @staticmethod
710
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
788
  param_to_state, ordered_params = self.init_state_and_assign_params(
789
  names, params, group, qk_logits)
790
 
791
+ assert self.rank is not None
792
+
793
+ def enqueue_all2all_gather(start_idx, chunk_size):
794
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
795
+ if target_params:
796
+ alloc_event = _alloc_gathered_grad(target_params,
797
+ param_to_state, self.rank,
798
+ self.compute_stream)
799
+ _all2all_gather(target_params, param_to_state, self.rank,
800
+ self.comm_stream, group["none_grad"],
801
+ alloc_event)
802
 
803
  def enqueue_computes(start_idx, chunk_size):
804
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
806
  _compute_u(p, state, group["ns_steps"], self.rank,
807
  self.compute_stream)
808
 
809
+ def enqueue_all2all_scatter(start_idx, chunk_size):
810
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
811
+ if target_params:
812
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
813
+ self.rank,
814
+ self.compute_stream)
815
+ _all2all_scatter(target_params, param_to_state, self.rank,
816
+ self.comm_stream, alloc_event)
817
 
818
  def enqueue_update_param(start_idx, chunk_size):
819
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
828
  # Wait grad update
829
  self.comm_stream.wait_stream(torch.cuda.current_stream())
830
 
831
+ overlap_step = self.overlap_step
832
+ for i in range(0, overlap_step):
833
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
834
+ enqueue_computes(i * chunk_size, chunk_size)
835
+
836
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
837
+ enqueue_all2all_scatter(i, chunk_size)
838
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
839
+ enqueue_update_param(i, chunk_size)
840
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
 
 
841
 
842
  # Wait the last update_param to finish
843
  torch.cuda.current_stream().wait_stream(self.compute_stream)
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_9c21645_dirty
3
- ops = torch.ops._optimizer_9c21645_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_9c21645_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_15336dc_dirty
3
+ ops = torch.ops._optimizer_15336dc_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_15336dc_dirty::{op_name}"
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_9c21645_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a477575e3cc30e54d355b3e778240dc25fb0dab30362f3540dc5f925ac03ba1
3
  size 1750024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8092bc6ee3e353b2188f0874bc7f145e4eafd0366a40da9750c225732961f7c7
3
  size 1750024
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
def get_autotune_config():
    """Build the candidate Triton config sweep for autotuning ``mmt_kernel``.

    Sweeps BLOCK_SIZE_M x BLOCK_SIZE_K x num_stages x num_warps while
    keeping GROUP_SIZE_M fixed at 8 (the only value in the original sweep).
    """
    candidates = []
    for block_m in (32, 64, 128):
        for block_k in (32, 64):
            for stages in (3, 4, 5):
                for warps in (4, 8):
                    candidates.append(
                        triton.Config(
                            {
                                'BLOCK_SIZE_M': block_m,
                                'BLOCK_SIZE_K': block_k,
                                'GROUP_SIZE_M': 8,
                            },
                            num_stages=stages,
                            num_warps=warps))
    return candidates
41
+
42
+
43
@triton.autotune(
    configs=get_autotune_config(),
    key=['M', 'K'],
)
@triton.jit
def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
               GROUP_SIZE_M: tl.constexpr):
    """
    Core kernel jit function of matmul_transpose that computes y = x @ x.T
    The code is a simple adaptation from the triton `matmul` tutorial:
    https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html

    Since y is symmetric, only tiles with pid_m <= pid_n are computed;
    each off-diagonal tile is stored twice (once transposed) to fill the
    mirror position.
    """
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    # Output is square (M x M), so the n-axis uses the same tiling as m.
    num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # Strictly-lower-triangular tiles are produced by the transposed store
    # of the mirrored upper tile below, so these programs exit early.
    if pid_m > pid_n:
        return

    offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # we use a & b ptrs to denote different rows of x.
    a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
    b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)

    # Accumulate in fp32 regardless of the input dtype for accuracy.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)

    # March along K in BLOCK_SIZE_K steps; the mask zero-pads the ragged tail.
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        b = tl.load(b_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
        a_ptrs += BLOCK_SIZE_K * stride_xk
        b_ptrs += BLOCK_SIZE_K * stride_xk
    # use dtype.element_ty to accommodate different input datatypes as in cpp templates
    # https://github.com/triton-lang/triton/issues/2252
    c = accumulator.to(x.dtype.element_ty)

    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
    tl.store(c_ptrs, c, mask=c_mask)

    # transpose and copy: mirror the tile into the lower triangle
    # (diagonal tiles, pid_m == pid_n, are stored only once above).
    if pid_m < pid_n:
        ct_ptrs = y + stride_ym * offs_cn[:,
                                          None] + stride_yn * offs_cm[None, :]
        ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
        tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
def matmul_transpose_assign(d_in, d_out):
    """Compute ``d_out = d_in @ d_in.T`` in place with the Triton kernel.

    Args:
        d_in: 2-D CUDA tensor of shape (M, K).
        d_out: 2-D CUDA tensor of shape (M, M) that receives the symmetric
            product; must be on the same device and have the same dtype as
            ``d_in``.
    """
    assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
    assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
    assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
    assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
    assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
    assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
    # Fix: the original asserted d_out.size(0) twice and never validated
    # d_out's second dimension; the output must be square M x M.
    assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
        "First dimension of `d_in` must match first and second dimension of `d_out`"

    # Make the K dimension unit-stride for the kernel (no-op if already so).
    d_in = d_in.contiguous()
    M, K = d_in.shape
    # One program per output tile pair; the kernel itself skips the lower
    # triangle, so roughly half the programs exit immediately.
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
        M, META['BLOCK_SIZE_M']), )
    # Pin the launch to the input's device so multi-GPU callers are safe.
    with torch.cuda.device(d_in.device.index):
        mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
                         d_out.stride(0), d_out.stride(1))
122
+
123
+
124
def matmul_transpose(d_in):
    """Return a newly allocated (M, M) tensor holding ``d_in @ d_in.T``."""
    rows = d_in.shape[0]
    result = torch.empty((rows, rows), dtype=d_in.dtype, device=d_in.device)
    matmul_transpose_assign(d_in, result)
    return result
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py CHANGED
@@ -8,14 +8,19 @@ import torch
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
  # Muon's Newton–Schulz iteration causes high variance in singular values
17
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
 
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
21
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -27,13 +32,15 @@ def _zeropower_via_newtonschulz5(G, steps):
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
- assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
 
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
 
37
  # Perform the NS iterations
38
  for a, b, c in [
39
  (4.0848, -6.8946, 2.9270),
@@ -42,13 +49,10 @@ def _zeropower_via_newtonschulz5(G, steps):
42
  (2.8769, -3.1427, 1.2046),
43
  (2.8366, -3.0525, 1.2012),
44
  ]:
45
- A = X @ X.T
46
- # B = (
47
- # b * A + c * A @ A
48
- # )
49
- B = torch.addmm(A, A, A, alpha=c, beta=b)
50
- # X = a * X + B @ X
51
- X = torch.addmm(X, B, X, alpha=1.0, beta=a)
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
@@ -69,51 +73,142 @@ class _muon_state:
69
  qk_clip_state = None
70
 
71
 
 
 
 
 
 
 
 
 
72
  @torch.no_grad()
73
- def _gather(p, state, rank, comm_stream, none_grad):
74
  """
75
- Gather the gradients to worker_rank.
76
- If none_grad is True, free p.grad after the gather.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
  with torch.cuda.stream(comm_stream):
79
- g = p.grad
 
80
 
81
- if rank == state.worker_rank:
82
- num_ranks = dist.get_world_size(group=state.process_group)
83
- gather_list = [
84
- torch.empty_like(g.to_local(), dtype=torch.bfloat16)
85
- for _ in range(num_ranks)
86
- ]
87
- else:
88
- gather_list = None
89
-
90
- g = g.to(torch.bfloat16)
91
- torch.distributed.gather(
92
- g.to_local(),
93
- dst=state.worker_rank,
94
- gather_list=gather_list,
95
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  )
97
- if rank == state.worker_rank:
98
- if state.gathered_grad is not None:
99
- raise RuntimeError(
100
- "Gather event already exists, which should not happen.")
101
- state.gathered_grad = torch.cat(gather_list, dim=0)
102
- state.gather_event = torch.cuda.Event()
103
- state.gather_event.record()
104
- else:
105
- state.gathered_grad = None
106
- state.gather_event = None
107
- gather_list = None
108
- if none_grad:
109
- # We can safely free p.grad without calling record_stream:
110
- # p.grad.to_local().record_stream(comm_stream)
111
- # Explanation:
112
- # 1. p.grad is created on the default stream, but the default stream
113
- # is synchronized with the comm stream later.
114
- # 2. There is no further activity on the default stream before the optimizer finishes.
115
- # Therefore, it is safe to free p.grad directly on the comm stream.
116
- p.grad = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @torch.no_grad()
@@ -127,45 +222,145 @@ def _compute_u(p, state, steps, rank, compute_stream):
127
  raise RuntimeError("Gather event must be set before compute.")
128
  compute_stream.wait_event(state.gather_event)
129
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
 
130
  state.computed_u = u
131
- state.scattered_u = torch.empty_like(p.to_local(),
132
- dtype=torch.bfloat16)
133
- state.compute_event = torch.cuda.Event()
134
- state.compute_event.record()
135
- u = None
136
 
137
 
138
  @torch.no_grad()
139
- def _scatter(p, state, rank, comm_stream):
140
  """
141
- Scatter the computed_u from worker_rank to all ranks.
 
142
  """
 
 
 
 
 
 
 
 
 
 
143
 
 
 
 
 
144
  with torch.cuda.stream(comm_stream):
145
- if state.compute_event is None:
146
- raise RuntimeError("Compute event must be set before scatter.")
147
- comm_stream.wait_event(state.compute_event)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if rank == state.worker_rank:
150
- num_ranks = dist.get_world_size(group=state.process_group)
151
- # Clear the gathered gradient to free memory
152
- state.gathered_grad = None
 
 
 
 
153
 
154
- u = state.computed_u
155
- scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
156
- scatter_list = [s.contiguous() for s in scatter_list]
 
 
 
 
 
 
157
  else:
158
- scatter_list = None
 
 
159
 
160
- torch.distributed.scatter(
161
- state.scattered_u,
162
- scatter_list=scatter_list,
163
- src=state.worker_rank,
164
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- state.scatter_event = torch.cuda.Event()
167
- state.scatter_event.record()
168
- scatter_list = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -321,6 +516,11 @@ class Muon(torch.optim.Optimizer):
321
  "head_dim": 128,
322
  "threshold": 100
323
  }
 
 
 
 
 
324
  """
325
 
326
  def __init__(self,
@@ -339,7 +539,8 @@ class Muon(torch.optim.Optimizer):
339
  "k_indices": [],
340
  "head_dim": 128,
341
  "threshold": 100
342
- }):
 
343
  defaults = dict(
344
  lr=lr,
345
  weight_decay=weight_decay,
@@ -363,15 +564,13 @@ class Muon(torch.optim.Optimizer):
363
 
364
  super().__init__(params, defaults)
365
 
366
- if dist.is_initialized():
367
- self.rank = dist.get_rank()
368
- else:
369
- self.rank = None
370
 
371
  self.comm_stream = torch.cuda.Stream()
372
  self.compute_stream = torch.cuda.Stream()
373
  self.debug = debug
374
  self.clip_config = clip_config
 
375
 
376
  def _calc_flops(self, G, steps):
377
  assert len(G.shape) == 2
@@ -444,11 +643,18 @@ class Muon(torch.optim.Optimizer):
444
  if mesh is None:
445
  mesh = p.device_mesh
446
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
 
 
 
 
 
447
  elif mesh != p.device_mesh:
448
  raise ValueError("All parameters must be on the same mesh.")
449
 
 
450
  param_to_state[id(p)] = _muon_state()
451
- param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
 
452
  param_to_state[id(p)].process_group = process_group
453
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
454
  param_to_state[id(p)].qk_clip_state = qk_clip_state
@@ -478,7 +684,7 @@ class Muon(torch.optim.Optimizer):
478
  else:
479
  g = buf
480
 
481
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
482
  steps=group["ns_steps"])
483
 
484
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
@@ -493,15 +699,12 @@ class Muon(torch.optim.Optimizer):
493
  def _update_g(self, p, g, group, momentum):
494
  # calc update
495
  state = self.state[p]
496
- if "momentum_buffer" not in state:
497
- state["momentum_buffer"] = torch.zeros_like(g)
498
- buf = state["momentum_buffer"]
499
- buf.mul_(momentum).add_(g)
500
  if group["nesterov"]:
501
- g = g.add(buf, alpha=momentum)
502
- else:
503
- g = buf
504
- return g
505
 
506
  @staticmethod
507
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -585,11 +788,17 @@ class Muon(torch.optim.Optimizer):
585
  param_to_state, ordered_params = self.init_state_and_assign_params(
586
  names, params, group, qk_logits)
587
 
588
- def enqueue_gathers(start_idx, chunk_size):
589
- for p in ordered_params[start_idx:start_idx + chunk_size]:
590
- state = param_to_state[id(p)]
591
- _gather(p, state, self.rank, self.comm_stream,
592
- group["none_grad"])
 
 
 
 
 
 
593
 
594
  def enqueue_computes(start_idx, chunk_size):
595
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +806,14 @@ class Muon(torch.optim.Optimizer):
597
  _compute_u(p, state, group["ns_steps"], self.rank,
598
  self.compute_stream)
599
 
600
- def enqueue_scatters(start_idx, chunk_size):
601
- for p in ordered_params[start_idx:start_idx + chunk_size]:
602
- state = param_to_state[id(p)]
603
- _scatter(p, state, self.rank, self.comm_stream)
 
 
 
 
604
 
605
  def enqueue_update_param(start_idx, chunk_size):
606
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +828,16 @@ class Muon(torch.optim.Optimizer):
615
  # Wait grad update
616
  self.comm_stream.wait_stream(torch.cuda.current_stream())
617
 
618
- enqueue_gathers(0, chunk_size)
 
 
 
 
619
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
620
- enqueue_computes(i, chunk_size)
621
- if i > 0:
622
- enqueue_update_param(i - chunk_size, chunk_size)
623
- enqueue_gathers(i + chunk_size, chunk_size)
624
- enqueue_scatters(i, chunk_size)
625
- enqueue_update_param(i, chunk_size)
626
 
627
  # Wait the last update_param to finish
628
  torch.cuda.current_stream().wait_stream(self.compute_stream)
 
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
  # Muon's Newton–Schulz iteration causes high variance in singular values
21
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
  @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
  def _zeropower_via_newtonschulz5(G, steps):
25
  """
26
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
39
  X = X.T
40
  # Ensure spectral norm is at most 1
41
  X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
  # Perform the NS iterations
45
  for a, b, c in [
46
  (4.0848, -6.8946, 2.9270),
 
49
  (2.8769, -3.1427, 1.2046),
50
  (2.8366, -3.0525, 1.2012),
51
  ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
 
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
73
  qk_clip_state = None
74
 
75
 
76
def split_elems_for_src(param, src_rank, num_ranks) -> int:
    """Return how many elements of ``param`` rank ``src_rank`` owns.

    Rows are distributed as evenly as possible across ``num_ranks``; the
    first ``rows % num_ranks`` ranks each take one extra row. The result
    is that rank's row count times the elements per row.
    """
    total_rows = param.shape[0]
    elems_per_row = int(param.numel() // total_rows)
    base_rows, leftover = divmod(total_rows, num_ranks)
    owned_rows = base_rows + 1 if src_rank < leftover else base_rows
    return owned_rows * elems_per_row
82
+
83
+
84
@torch.no_grad()
def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
    """Pre-allocate flat ``gathered_grad`` buffers on ``compute_stream``.

    Only the owner rank (``state.worker_rank``) of each parameter allocates
    a buffer sized to hold the full flattened gradient in COMM_DTYPE;
    non-owners get ``None``. Returns a CUDA event recorded after the
    allocations so the comm stream can wait on it before the all2all
    gather writes into these buffers.

    Fix: removed the unused ``num_ranks = dist.get_world_size(...)`` local
    — its value was never read.
    """
    with torch.cuda.stream(compute_stream):
        for p in params:
            state = param_to_state[id(p)]
            if rank == state.worker_rank:
                # Flat buffer; reshaped with view_as(p) once the gather
                # has deposited every shard.
                state.gathered_grad = torch.empty(p.grad.numel(),
                                                  dtype=COMM_DTYPE,
                                                  device="cuda")
            else:
                state.gathered_grad = None

    alloc_event = torch.cuda.Event()
    alloc_event.record(compute_stream)
    return alloc_event
104
+
105
+
106
@torch.no_grad()
def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
                    alloc_event):
    """
    All2all gathers shards so each owner rank reconstructs its full gradient

    Every rank sends its local shard of each parameter's gradient to that
    parameter's owner (``state.worker_rank``). After the collective, owners
    hold the full flattened gradient in ``state.gathered_grad`` (reshaped
    to the parameter's shape) and ``state.gather_event`` is recorded on
    ``comm_stream``; non-owners get None for both.

    Args:
        params: chunk of parameters; all must share one process group.
        param_to_state: maps id(param) -> _muon_state.
        rank: this process's rank inside the shard process group.
        comm_stream: CUDA stream carrying all communication work.
        none_grad: if True, drop p.grad after its shard has been sent.
        alloc_event: recorded by _alloc_gathered_grad after the destination
            buffers exist; waited on before writing into them.
    """
    with torch.cuda.stream(comm_stream):
        # All params in a chunk share one process group (see caller).
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)

        # Construct sending buffers
        # per_dst[d] collects this rank's shards destined for rank d.
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        for p in params:
            state = param_to_state[id(p)]
            dst = state.worker_rank
            assert dst < num_ranks
            shard_elems = split_elems_for_src(p, rank, num_ranks)
            g = p.grad
            # Flatten the local DTensor shard and cast to the wire dtype.
            g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
            assert g.numel() == shard_elems
            per_dst[dst].append(g)
            send_counts[dst] += shard_elems

        assert all(
            len(v) > 0
            for v in per_dst), "all params should be sharded to all devices"

        send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Compute receive sizes and allocate receiving buffers
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in owned_params:
                state = param_to_state[id(p)]
                assert state.worker_rank == rank
                total += split_elems_for_src(p, src, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        # NOTE(review): device="cuda" assumes the current CUDA device is the
        # right one for this rank — confirm callers set it.
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        #All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Reconstructs gathered grad from the received buffer
        #
        # recv_buf (num ranks = 3)
        #
        #   From rank 0        From rank 1        From rank 2
        # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank2
        #
        # Inner loop:
        #   p1_n -> p2_n -> p3_n

        # The gathered_grad buffers were allocated on the compute stream;
        # do not write into them until that allocation is visible here.
        comm_stream.wait_event(alloc_event)

        off = 0
        # Per-parameter write cursor into its gathered_grad flat buffer.
        write_offsets = {id(p): 0 for p in owned_params}
        for src in range(num_ranks):
            if recv_counts[src] == 0:
                continue

            block = recv_counts[src]
            inner_off = 0
            for p in owned_params:
                state = param_to_state[id(p)]
                assert state.worker_rank == rank
                n = split_elems_for_src(p, src, num_ranks)
                assert n > 0

                sg = recv_buf.narrow(0, off + inner_off, n)
                woff = write_offsets[id(p)]
                dst = state.gathered_grad.narrow(0, woff, n)
                dst.copy_(sg)

                write_offsets[id(p)] += n
                inner_off += n
            off += block

        for p in params:
            state = param_to_state[id(p)]
            if state.worker_rank == rank:
                # Flat buffer now complete; expose it with the param's shape.
                state.gathered_grad = state.gathered_grad.view_as(p)
                state.gather_event = torch.cuda.Event()
                state.gather_event.record(comm_stream)
            else:
                state.gathered_grad = None
                state.gather_event = None
            if none_grad:
                # Shard already copied into send_buf above, so the original
                # grad can be released to reduce peak memory.
                p.grad = None
212
 
213
 
214
  @torch.no_grad()
 
222
  raise RuntimeError("Gather event must be set before compute.")
223
  compute_stream.wait_event(state.gather_event)
224
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
225
+ state.gathered_grad = None
226
  state.computed_u = u
227
+ state.compute_event = torch.cuda.Event()
228
+ state.compute_event.record()
229
+ else:
230
+ state.computed_u = None
231
+ state.compute_event = None
232
 
233
 
234
@torch.no_grad()
def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
    """Pre-allocate per-parameter ``scattered_u`` buffers on ``compute_stream``.

    Every rank allocates a COMM_DTYPE buffer shaped like its local shard of
    each parameter, then records an event so the comm stream can wait for
    the allocations before the all2all scatter writes into them.
    """
    with torch.cuda.stream(compute_stream):
        for param in params:
            st = param_to_state[id(param)]
            st.scattered_u = torch.empty_like(param.to_local(),
                                              dtype=COMM_DTYPE)

    ready_event = torch.cuda.Event()
    ready_event.record(compute_stream)
    return ready_event
249
+
250
 
251
def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
    """
    All2all scatters full gradients to all ranks

    Each owner rank splits its computed orthogonalized update
    (``state.computed_u``) into per-rank row shards and all2all's them out;
    every rank ends up with its local shard in ``state.scattered_u`` and a
    ``state.scatter_event`` recorded on ``comm_stream``.

    NOTE(review): unlike _all2all_gather this function carries no
    @torch.no_grad() decorator — confirm whether that asymmetry is
    intentional.
    """
    with torch.cuda.stream(comm_stream):
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Construct sending buffer
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        if owned_params:
            for p in owned_params:
                state = param_to_state[id(p)]
                if state.compute_event is None:
                    raise RuntimeError(
                        "Compute event must be set before scatter.")
                # computed_u is produced on the compute stream; wait for it
                # before reading it here on the comm stream.
                comm_stream.wait_event(state.compute_event)
                state.gathered_grad = None

                assert state.computed_u is not None

                u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)

                # Walk the flat update in row-shard order; dst ranks receive
                # the same split that split_elems_for_src defines.
                offset = 0
                for dst in range(num_ranks):
                    n = split_elems_for_src(p, dst, num_ranks)
                    assert n > 0

                    su = u_full.narrow(0, offset, n)
                    per_dst[dst].append(su)
                    send_counts[dst] += n
                    offset += n

                assert offset == u_full.numel()

        if any(len(v) > 0 for v in per_dst):
            send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        else:
            # all_to_all requires participation from all ranks
            # Even non-owner ranks must join the collective call
            send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")

        # Compute receive sizes and allocate receiving buffers
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                # This rank's shard of a parameter owned by `src`.
                total += split_elems_for_src(p, rank, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        assert recv_total > 0
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        #All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Copy to pre-allocated scattered_u buffer from the received buffer
        #
        # recv_buf (num ranks = 3, local_rank = 0)
        #
        #   From rank 0        From rank 1  From rank 2
        # | p1_0, p2_0, p3_0 | p4_0       | p5_0, p6_0 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank2
        #
        # Inner loop:
        #   src(0) : p1_0 -> p2_0 -> p3_0
        #   src(1) : p4_0
        #   src(2) : p5_0 -> p6_0

        # scattered_u buffers were allocated on the compute stream; make
        # sure they exist before copying into them.
        comm_stream.wait_event(alloc_event)

        off = 0
        for src in range(num_ranks):
            block = recv_counts[src]
            if block == 0:
                continue

            inner_off = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                n = split_elems_for_src(p, rank, num_ranks)
                assert n > 0

                flat_local = recv_buf.narrow(0, off + inner_off,
                                             n).view_as(p.to_local())
                state.scattered_u.copy_(flat_local)

                state.scatter_event = torch.cuda.Event()
                state.scatter_event.record(comm_stream)
                inner_off += n

            assert inner_off == block
            off += block
+ off += block
364
 
365
 
366
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
516
  "head_dim": 128,
517
  "threshold": 100
518
  }
519
+ overlap_step : How many all2all gather, compute operations are launched in advance
520
+ before the corresponding all2all scatter steps begin.
521
+ A higher overlap_step increases memory usage but can improve
522
+ performance by overlapping communication.
523
+ Parallel muon only.
524
  """
525
 
526
  def __init__(self,
 
539
  "k_indices": [],
540
  "head_dim": 128,
541
  "threshold": 100
542
+ },
543
+ overlap_step=5):
544
  defaults = dict(
545
  lr=lr,
546
  weight_decay=weight_decay,
 
564
 
565
  super().__init__(params, defaults)
566
 
567
+ self.rank = None
 
 
 
568
 
569
  self.comm_stream = torch.cuda.Stream()
570
  self.compute_stream = torch.cuda.Stream()
571
  self.debug = debug
572
  self.clip_config = clip_config
573
+ self.overlap_step = overlap_step
574
 
575
  def _calc_flops(self, G, steps):
576
  assert len(G.shape) == 2
 
643
  if mesh is None:
644
  mesh = p.device_mesh
645
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
646
+ local_rank = dist.get_rank(group=process_group)
647
+ if self.rank is None:
648
+ self.rank = dist.get_rank(group=process_group)
649
+ else:
650
+ assert self.rank == local_rank
651
  elif mesh != p.device_mesh:
652
  raise ValueError("All parameters must be on the same mesh.")
653
 
654
+ num_ranks = dist.get_world_size(group=process_group)
655
  param_to_state[id(p)] = _muon_state()
656
+ param_to_state[id(
657
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
658
  param_to_state[id(p)].process_group = process_group
659
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
660
  param_to_state[id(p)].qk_clip_state = qk_clip_state
 
684
  else:
685
  g = buf
686
 
687
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
688
  steps=group["ns_steps"])
689
 
690
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
699
  def _update_g(self, p, g, group, momentum):
700
  # calc update
701
  state = self.state[p]
702
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
703
+ torch.add(g, buf, alpha=momentum, out=buf)
 
 
704
  if group["nesterov"]:
705
+ g.add_(buf, alpha=momentum)
706
+ return g
707
+ return buf
 
708
 
709
  @staticmethod
710
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
788
  param_to_state, ordered_params = self.init_state_and_assign_params(
789
  names, params, group, qk_logits)
790
 
791
+ assert self.rank is not None
792
+
793
+ def enqueue_all2all_gather(start_idx, chunk_size):
794
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
795
+ if target_params:
796
+ alloc_event = _alloc_gathered_grad(target_params,
797
+ param_to_state, self.rank,
798
+ self.compute_stream)
799
+ _all2all_gather(target_params, param_to_state, self.rank,
800
+ self.comm_stream, group["none_grad"],
801
+ alloc_event)
802
 
803
  def enqueue_computes(start_idx, chunk_size):
804
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
806
  _compute_u(p, state, group["ns_steps"], self.rank,
807
  self.compute_stream)
808
 
809
+ def enqueue_all2all_scatter(start_idx, chunk_size):
810
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
811
+ if target_params:
812
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
813
+ self.rank,
814
+ self.compute_stream)
815
+ _all2all_scatter(target_params, param_to_state, self.rank,
816
+ self.comm_stream, alloc_event)
817
 
818
  def enqueue_update_param(start_idx, chunk_size):
819
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
828
  # Wait grad update
829
  self.comm_stream.wait_stream(torch.cuda.current_stream())
830
 
831
+ overlap_step = self.overlap_step
832
+ for i in range(0, overlap_step):
833
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
834
+ enqueue_computes(i * chunk_size, chunk_size)
835
+
836
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
837
+ enqueue_all2all_scatter(i, chunk_size)
838
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
839
+ enqueue_update_param(i, chunk_size)
840
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
 
 
841
 
842
  # Wait the last update_param to finish
843
  torch.cuda.current_stream().wait_stream(self.compute_stream)
test/test_muon/muon.py DELETED
@@ -1 +0,0 @@
1
- ../../torch-ext/optimizer/muon.py
 
 
test/test_muon/optimizer ADDED
@@ -0,0 +1 @@
 
 
1
+ ../../torch-ext/optimizer/
test/test_muon/test.py CHANGED
@@ -2,7 +2,7 @@ import logging
2
 
3
  import torch
4
  import torch.distributed as dist
5
- from muon import Muon, get_default_muon_param_groups
6
  from torch.distributed.fsdp import FSDPModule, fully_shard
7
  from torch.distributed.tensor import DTensor
8
  from torch.distributed.tensor.placement_types import Replicate
 
2
 
3
  import torch
4
  import torch.distributed as dist
5
+ from optimizer.muon import Muon, get_default_muon_param_groups
6
  from torch.distributed.fsdp import FSDPModule, fully_shard
7
  from torch.distributed.tensor import DTensor
8
  from torch.distributed.tensor.placement_types import Replicate
torch-ext/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
def get_autotune_config():
    """Return the Triton config sweep used to autotune ``mmt_kernel``.

    Cartesian product of block sizes, pipeline stages, and warp counts;
    GROUP_SIZE_M is fixed at 8.
    """
    return [
        triton.Config(
            {
                'BLOCK_SIZE_M': blk_m,
                'BLOCK_SIZE_K': blk_k,
                'GROUP_SIZE_M': grp_sz
            },
            num_stages=n_stages,
            num_warps=n_warps) for blk_m in [32, 64, 128]
        for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
        for n_warps in [4, 8]
    ]
41
+
42
+
43
@triton.autotune(
    configs=get_autotune_config(),
    key=['M', 'K'],
)
@triton.jit
def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
               GROUP_SIZE_M: tl.constexpr):
    """
    Core kernel jit function of matmul_transpose that computes y = x @ x.T
    The code is a simple adaptation from the triton `matmul` tutorial:
    https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html

    x is (M, K) and y is (M, M). Because the result is symmetric, each
    program instance computes only an upper-triangular output tile
    (pid_m <= pid_n) and mirrors it into the lower triangle at the end.
    """
    pid = tl.program_id(axis=0)
    # The output is square (M, M), so both tile-grid axes use BLOCK_SIZE_M.
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
    # Grouped program-id ordering for better L2 reuse (as in the tutorial).
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # Skip strictly-lower-triangular tiles: they are produced by the mirror
    # store of the corresponding upper tile below.
    if pid_m > pid_n:
        return

    offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # we use a & b ptrs to denote different rows of x.
    a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
    b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)

    # Accumulate in fp32 regardless of the input dtype.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)

    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Mask out-of-range K columns on the final (partial) iteration.
        a = tl.load(a_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        b = tl.load(b_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        # accumulator += a @ b.T
        accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
        a_ptrs += BLOCK_SIZE_K * stride_xk
        b_ptrs += BLOCK_SIZE_K * stride_xk
    # use dtype.element_ty to accommodate different input datatypes as in cpp templates
    # https://github.com/triton-lang/triton/issues/2252
    c = accumulator.to(x.dtype.element_ty)

    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
    tl.store(c_ptrs, c, mask=c_mask)

    # transpose and copy: mirror the tile into the lower triangle.
    # Diagonal tiles (pid_m == pid_n) need no mirror store.
    if pid_m < pid_n:
        ct_ptrs = y + stride_ym * offs_cn[:,
                                          None] + stride_yn * offs_cm[None, :]
        ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
        tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
def matmul_transpose_assign(d_in, d_out):
    """Compute ``d_out = d_in @ d_in.T`` in place using the Triton kernel.

    Args:
        d_in: 2D CUDA tensor of shape (M, K).
        d_out: pre-allocated square CUDA tensor of shape (M, M) with the
            same dtype and device as ``d_in``; overwritten with the result.
    """
    assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
    assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
    assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
    assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
    assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
    assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
    # Bug fix: the original compared `d_out.size(0)` with itself, so a
    # non-square `d_out` slipped through. The output must be square with
    # both dimensions equal to d_in.size(0).
    assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
        "First dimension of `d_in` must match first and second dimension of `d_out`"

    d_in = d_in.contiguous()
    M, K = d_in.shape
    # One program instance per (BLOCK_SIZE_M x BLOCK_SIZE_M) output tile;
    # the kernel itself skips the redundant lower-triangular tiles.
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
        M, META['BLOCK_SIZE_M']), )
    # Select the right device so the kernel launches where the data lives.
    with torch.cuda.device(d_in.device.index):
        mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
                         d_out.stride(0), d_out.stride(1))
122
+
123
+
124
def matmul_transpose(d_in):
    """Allocate and return ``d_in @ d_in.T`` for a 2D CUDA tensor."""
    n_rows, _ = d_in.shape
    result = torch.empty((n_rows, n_rows),
                         device=d_in.device,
                         dtype=d_in.dtype)
    matmul_transpose_assign(d_in, result)
    return result
torch-ext/optimizer/muon.py CHANGED
@@ -8,14 +8,19 @@ import torch
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
  # Muon's Newton–Schulz iteration causes high variance in singular values
17
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
 
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
21
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -27,13 +32,15 @@ def _zeropower_via_newtonschulz5(G, steps):
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
- assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
 
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
 
37
  # Perform the NS iterations
38
  for a, b, c in [
39
  (4.0848, -6.8946, 2.9270),
@@ -42,13 +49,10 @@ def _zeropower_via_newtonschulz5(G, steps):
42
  (2.8769, -3.1427, 1.2046),
43
  (2.8366, -3.0525, 1.2012),
44
  ]:
45
- A = X @ X.T
46
- # B = (
47
- # b * A + c * A @ A
48
- # )
49
- B = torch.addmm(A, A, A, alpha=c, beta=b)
50
- # X = a * X + B @ X
51
- X = torch.addmm(X, B, X, alpha=1.0, beta=a)
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
@@ -69,51 +73,142 @@ class _muon_state:
69
  qk_clip_state = None
70
 
71
 
 
 
 
 
 
 
 
 
72
  @torch.no_grad()
73
- def _gather(p, state, rank, comm_stream, none_grad):
74
  """
75
- Gather the gradients to worker_rank.
76
- If none_grad is True, free p.grad after the gather.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
  with torch.cuda.stream(comm_stream):
79
- g = p.grad
 
80
 
81
- if rank == state.worker_rank:
82
- num_ranks = dist.get_world_size(group=state.process_group)
83
- gather_list = [
84
- torch.empty_like(g.to_local(), dtype=torch.bfloat16)
85
- for _ in range(num_ranks)
86
- ]
87
- else:
88
- gather_list = None
89
-
90
- g = g.to(torch.bfloat16)
91
- torch.distributed.gather(
92
- g.to_local(),
93
- dst=state.worker_rank,
94
- gather_list=gather_list,
95
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  )
97
- if rank == state.worker_rank:
98
- if state.gathered_grad is not None:
99
- raise RuntimeError(
100
- "Gather event already exists, which should not happen.")
101
- state.gathered_grad = torch.cat(gather_list, dim=0)
102
- state.gather_event = torch.cuda.Event()
103
- state.gather_event.record()
104
- else:
105
- state.gathered_grad = None
106
- state.gather_event = None
107
- gather_list = None
108
- if none_grad:
109
- # We can safely free p.grad without calling record_stream:
110
- # p.grad.to_local().record_stream(comm_stream)
111
- # Explanation:
112
- # 1. p.grad is created on the default stream, but the default stream
113
- # is synchronized with the comm stream later.
114
- # 2. There is no further activity on the default stream before the optimizer finishes.
115
- # Therefore, it is safe to free p.grad directly on the comm stream.
116
- p.grad = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @torch.no_grad()
@@ -127,45 +222,145 @@ def _compute_u(p, state, steps, rank, compute_stream):
127
  raise RuntimeError("Gather event must be set before compute.")
128
  compute_stream.wait_event(state.gather_event)
129
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
 
130
  state.computed_u = u
131
- state.scattered_u = torch.empty_like(p.to_local(),
132
- dtype=torch.bfloat16)
133
- state.compute_event = torch.cuda.Event()
134
- state.compute_event.record()
135
- u = None
136
 
137
 
138
  @torch.no_grad()
139
- def _scatter(p, state, rank, comm_stream):
140
  """
141
- Scatter the computed_u from worker_rank to all ranks.
 
142
  """
 
 
 
 
 
 
 
 
 
 
143
 
 
 
 
 
144
  with torch.cuda.stream(comm_stream):
145
- if state.compute_event is None:
146
- raise RuntimeError("Compute event must be set before scatter.")
147
- comm_stream.wait_event(state.compute_event)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if rank == state.worker_rank:
150
- num_ranks = dist.get_world_size(group=state.process_group)
151
- # Clear the gathered gradient to free memory
152
- state.gathered_grad = None
 
 
 
 
153
 
154
- u = state.computed_u
155
- scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
156
- scatter_list = [s.contiguous() for s in scatter_list]
 
 
 
 
 
 
157
  else:
158
- scatter_list = None
 
 
159
 
160
- torch.distributed.scatter(
161
- state.scattered_u,
162
- scatter_list=scatter_list,
163
- src=state.worker_rank,
164
- group=state.process_group,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- state.scatter_event = torch.cuda.Event()
167
- state.scatter_event.record()
168
- scatter_list = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -321,6 +516,11 @@ class Muon(torch.optim.Optimizer):
321
  "head_dim": 128,
322
  "threshold": 100
323
  }
 
 
 
 
 
324
  """
325
 
326
  def __init__(self,
@@ -339,7 +539,8 @@ class Muon(torch.optim.Optimizer):
339
  "k_indices": [],
340
  "head_dim": 128,
341
  "threshold": 100
342
- }):
 
343
  defaults = dict(
344
  lr=lr,
345
  weight_decay=weight_decay,
@@ -363,15 +564,13 @@ class Muon(torch.optim.Optimizer):
363
 
364
  super().__init__(params, defaults)
365
 
366
- if dist.is_initialized():
367
- self.rank = dist.get_rank()
368
- else:
369
- self.rank = None
370
 
371
  self.comm_stream = torch.cuda.Stream()
372
  self.compute_stream = torch.cuda.Stream()
373
  self.debug = debug
374
  self.clip_config = clip_config
 
375
 
376
  def _calc_flops(self, G, steps):
377
  assert len(G.shape) == 2
@@ -444,11 +643,18 @@ class Muon(torch.optim.Optimizer):
444
  if mesh is None:
445
  mesh = p.device_mesh
446
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
 
 
 
 
 
447
  elif mesh != p.device_mesh:
448
  raise ValueError("All parameters must be on the same mesh.")
449
 
 
450
  param_to_state[id(p)] = _muon_state()
451
- param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
 
452
  param_to_state[id(p)].process_group = process_group
453
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
454
  param_to_state[id(p)].qk_clip_state = qk_clip_state
@@ -478,7 +684,7 @@ class Muon(torch.optim.Optimizer):
478
  else:
479
  g = buf
480
 
481
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
482
  steps=group["ns_steps"])
483
 
484
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
@@ -493,15 +699,12 @@ class Muon(torch.optim.Optimizer):
493
  def _update_g(self, p, g, group, momentum):
494
  # calc update
495
  state = self.state[p]
496
- if "momentum_buffer" not in state:
497
- state["momentum_buffer"] = torch.zeros_like(g)
498
- buf = state["momentum_buffer"]
499
- buf.mul_(momentum).add_(g)
500
  if group["nesterov"]:
501
- g = g.add(buf, alpha=momentum)
502
- else:
503
- g = buf
504
- return g
505
 
506
  @staticmethod
507
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -585,11 +788,17 @@ class Muon(torch.optim.Optimizer):
585
  param_to_state, ordered_params = self.init_state_and_assign_params(
586
  names, params, group, qk_logits)
587
 
588
- def enqueue_gathers(start_idx, chunk_size):
589
- for p in ordered_params[start_idx:start_idx + chunk_size]:
590
- state = param_to_state[id(p)]
591
- _gather(p, state, self.rank, self.comm_stream,
592
- group["none_grad"])
 
 
 
 
 
 
593
 
594
  def enqueue_computes(start_idx, chunk_size):
595
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +806,14 @@ class Muon(torch.optim.Optimizer):
597
  _compute_u(p, state, group["ns_steps"], self.rank,
598
  self.compute_stream)
599
 
600
- def enqueue_scatters(start_idx, chunk_size):
601
- for p in ordered_params[start_idx:start_idx + chunk_size]:
602
- state = param_to_state[id(p)]
603
- _scatter(p, state, self.rank, self.comm_stream)
 
 
 
 
604
 
605
  def enqueue_update_param(start_idx, chunk_size):
606
  for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +828,16 @@ class Muon(torch.optim.Optimizer):
615
  # Wait grad update
616
  self.comm_stream.wait_stream(torch.cuda.current_stream())
617
 
618
- enqueue_gathers(0, chunk_size)
 
 
 
 
619
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
620
- enqueue_computes(i, chunk_size)
621
- if i > 0:
622
- enqueue_update_param(i - chunk_size, chunk_size)
623
- enqueue_gathers(i + chunk_size, chunk_size)
624
- enqueue_scatters(i, chunk_size)
625
- enqueue_update_param(i, chunk_size)
626
 
627
  # Wait the last update_param to finish
628
  torch.cuda.current_stream().wait_stream(self.compute_stream)
 
8
  import torch.distributed as dist
9
  from torch.distributed._tensor import DTensor, Replicate, Shard
10
 
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
  # Muon's Newton–Schulz iteration causes high variance in singular values
21
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
  @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
  def _zeropower_via_newtonschulz5(G, steps):
25
  """
26
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
39
  X = X.T
40
  # Ensure spectral norm is at most 1
41
  X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
  # Perform the NS iterations
45
  for a, b, c in [
46
  (4.0848, -6.8946, 2.9270),
 
49
  (2.8769, -3.1427, 1.2046),
50
  (2.8366, -3.0525, 1.2012),
51
  ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
 
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
73
  qk_clip_state = None
74
 
75
 
76
def split_elems_for_src(param, src_rank, num_ranks) -> int:
    """Return the number of elements of ``param`` held by rank ``src_rank``.

    The parameter is sharded row-wise across ``num_ranks`` ranks; when the
    row count does not divide evenly, the first ``rows % num_ranks`` ranks
    each carry one extra row.
    """
    total_rows = param.shape[0]
    elems_per_row = int(param.numel() // total_rows)
    rows_each, leftover = divmod(total_rows, num_ranks)
    if src_rank < leftover:
        rows_each += 1
    return rows_each * elems_per_row
82
+
83
+
84
@torch.no_grad()
def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
    """
    Pre-allocate each owned parameter's full-size ``gathered_grad`` buffer on
    ``compute_stream`` before launching the all2all gather.

    Only the owner (``state.worker_rank``) of a parameter reconstructs the
    full gradient; non-owner ranks get ``None``. Returns a CUDA event
    recorded on ``compute_stream`` that the comm stream must wait on before
    writing into these buffers.
    """
    with torch.cuda.stream(compute_stream):
        for p in params:
            state = param_to_state[id(p)]
            if rank == state.worker_rank:
                # Fix: dropped the unused `num_ranks = dist.get_world_size(...)`
                # query the original computed here and never read.
                # Flat buffer in the communication dtype; it is viewed back
                # to the parameter's shape once the gather completes.
                state.gathered_grad = torch.empty(p.grad.numel(),
                                                  dtype=COMM_DTYPE,
                                                  device="cuda")
            else:
                state.gathered_grad = None

    alloc_event = torch.cuda.Event()
    alloc_event.record(compute_stream)
    return alloc_event
104
+
105
+
106
@torch.no_grad()
def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
                    alloc_event):
    """
    All2all gathers shards so each owner rank reconstructs its full gradient.

    Every rank sends its local shard of every parameter's gradient to that
    parameter's owner (``state.worker_rank``). After the collective, each
    owner unpacks the received blocks into the pre-allocated
    ``state.gathered_grad`` buffer and records ``state.gather_event``.
    If ``none_grad`` is True, ``p.grad`` is freed afterwards.
    """
    with torch.cuda.stream(comm_stream):
        # All parameters in this batch share one process group.
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)

        # Construct sending buffers: one list of flat shards per destination.
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        for p in params:
            state = param_to_state[id(p)]
            dst = state.worker_rank
            assert dst < num_ranks
            # Size of this rank's row-shard of p (see split_elems_for_src).
            shard_elems = split_elems_for_src(p, rank, num_ranks)
            g = p.grad
            g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
            assert g.numel() == shard_elems
            per_dst[dst].append(g)
            send_counts[dst] += shard_elems

        assert all(
            len(v) > 0
            for v in per_dst), "all params should be sharded to all devices"

        # Concatenate into a single flat send buffer ordered by destination.
        send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Compute receive sizes and allocate receiving buffers
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in owned_params:
                state = param_to_state[id(p)]
                assert state.worker_rank == rank
                total += split_elems_for_src(p, src, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        # All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Reconstructs gathered grad from the received buffer
        #
        # recv_buf (num ranks = 3)
        #
        #   From rank 0        From rank 1        From rank 2
        # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank 2
        #
        # Inner loop:
        #   p1_n -> p2_n -> p3_n

        # The gathered_grad buffers were allocated on compute_stream; wait
        # until that allocation is visible before writing into them.
        comm_stream.wait_event(alloc_event)

        off = 0
        # Per-parameter write cursor into its flat gathered_grad buffer.
        write_offsets = {id(p): 0 for p in owned_params}
        for src in range(num_ranks):
            if recv_counts[src] == 0:
                continue

            block = recv_counts[src]
            inner_off = 0
            for p in owned_params:
                state = param_to_state[id(p)]
                assert state.worker_rank == rank
                n = split_elems_for_src(p, src, num_ranks)
                assert n > 0

                sg = recv_buf.narrow(0, off + inner_off, n)
                woff = write_offsets[id(p)]
                dst = state.gathered_grad.narrow(0, woff, n)
                dst.copy_(sg)

                write_offsets[id(p)] += n
                inner_off += n
            off += block

        for p in params:
            state = param_to_state[id(p)]
            if state.worker_rank == rank:
                # Reshape the flat buffer to the parameter's full shape and
                # signal consumers (_compute_u) via gather_event.
                state.gathered_grad = state.gathered_grad.view_as(p)
                state.gather_event = torch.cuda.Event()
                state.gather_event.record(comm_stream)
            else:
                state.gathered_grad = None
                state.gather_event = None
            if none_grad:
                # Safe to free here: the shard has already been copied into
                # send_buf on this stream.
                p.grad = None
212
 
213
 
214
  @torch.no_grad()
 
222
  raise RuntimeError("Gather event must be set before compute.")
223
  compute_stream.wait_event(state.gather_event)
224
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
225
+ state.gathered_grad = None
226
  state.computed_u = u
227
+ state.compute_event = torch.cuda.Event()
228
+ state.compute_event.record()
229
+ else:
230
+ state.computed_u = None
231
+ state.compute_event = None
232
 
233
 
234
@torch.no_grad()
def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
    """
    Pre-allocate each parameter's local ``scattered_u`` buffer on
    ``compute_stream`` before launching the all2all scatter.

    Every rank receives its shard of the computed update, so the buffer is
    allocated for all parameters (sized like the local shard). Returns a
    CUDA event recorded on ``compute_stream`` that the comm stream waits on
    before writing into the buffers.
    """
    with torch.cuda.stream(compute_stream):
        for p in params:
            state = param_to_state[id(p)]
            state.scattered_u = torch.empty_like(p.to_local(),
                                                 dtype=COMM_DTYPE)

    alloc_event = torch.cuda.Event()
    alloc_event.record(compute_stream)
    return alloc_event
249
+
250
 
251
def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
    """
    All2all scatters each owner's computed update ``u`` back to all ranks.

    Each owner rank splits every ``state.computed_u`` row-wise into per-rank
    shards and sends them; every rank copies the shards it receives into its
    pre-allocated ``state.scattered_u`` buffers and records
    ``state.scatter_event``.
    """
    with torch.cuda.stream(comm_stream):
        process_group = param_to_state[id(params[0])].process_group
        num_ranks = dist.get_world_size(group=process_group)
        owned_params = [
            p for p in params if param_to_state[id(p)].worker_rank == rank
        ]

        # Construct sending buffer
        per_dst = [[] for _ in range(num_ranks)]
        send_counts = [0] * num_ranks

        if owned_params:
            for p in owned_params:
                state = param_to_state[id(p)]
                if state.compute_event is None:
                    raise RuntimeError(
                        "Compute event must be set before scatter.")
                # The update must be fully computed before we read it.
                comm_stream.wait_event(state.compute_event)
                state.gathered_grad = None

                assert state.computed_u is not None

                u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)

                # Split the flat update into one shard per destination rank.
                offset = 0
                for dst in range(num_ranks):
                    n = split_elems_for_src(p, dst, num_ranks)
                    assert n > 0

                    su = u_full.narrow(0, offset, n)
                    per_dst[dst].append(su)
                    send_counts[dst] += n
                    offset += n

                assert offset == u_full.numel()

        if any(len(v) > 0 for v in per_dst):
            send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
        else:
            # all_to_all requires participation from all ranks
            # Even non-owner ranks must join the collective call
            send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")

        # Compute receive sizes and allocate receiving buffers
        recv_counts = [0] * num_ranks

        for src in range(num_ranks):
            total = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                # This rank receives its own shard of every param owned by src.
                total += split_elems_for_src(p, rank, num_ranks)
            recv_counts[src] = total

        recv_total = sum(recv_counts)
        assert recv_total > 0
        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")

        # All2All
        dist.all_to_all_single(
            recv_buf,
            send_buf,
            output_split_sizes=recv_counts,
            input_split_sizes=send_counts,
            group=process_group,
        )

        # Copy to pre-allocated scattered_u buffer from the received buffer
        #
        # recv_buf (num ranks = 3, local_rank = 0)
        #
        #   From rank 0        From rank 1   From rank 2
        # | p1_0, p2_0, p3_0 | p4_0        | p5_0, p6_0 |
        #
        # Outer loop:
        #   rank 0 -> rank 1 -> rank 2
        #
        # Inner loop:
        #   src(0) : p1_0 -> p2_0 -> p3_0
        #   src(1) : p4_0
        #   src(2) : p5_0 -> p6_0

        # scattered_u buffers were allocated on compute_stream; wait for the
        # allocation before writing into them.
        comm_stream.wait_event(alloc_event)

        off = 0
        for src in range(num_ranks):
            block = recv_counts[src]
            if block == 0:
                continue

            inner_off = 0
            for p in params:
                state = param_to_state[id(p)]
                if state.worker_rank != src:
                    continue
                n = split_elems_for_src(p, rank, num_ranks)
                assert n > 0

                flat_local = recv_buf.narrow(0, off + inner_off,
                                             n).view_as(p.to_local())
                state.scattered_u.copy_(flat_local)

                # Signal _update_param that this parameter's shard is ready.
                state.scatter_event = torch.cuda.Event()
                state.scatter_event.record(comm_stream)
                inner_off += n

            assert inner_off == block
            off += block
364
 
365
 
366
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
516
  "head_dim": 128,
517
  "threshold": 100
518
  }
519
+ overlap_step : How many all2all gather, compute operations are launched in advance
520
+ before the corresponding all2all scatter steps begin.
521
+ A higher overlap_step increases memory usage but can improve
522
+ performance by overlapping communication.
523
+ Parallel muon only.
524
  """
525
 
526
  def __init__(self,
 
539
  "k_indices": [],
540
  "head_dim": 128,
541
  "threshold": 100
542
+ },
543
+ overlap_step=5):
544
  defaults = dict(
545
  lr=lr,
546
  weight_decay=weight_decay,
 
564
 
565
  super().__init__(params, defaults)
566
 
567
+ self.rank = None
 
 
 
568
 
569
  self.comm_stream = torch.cuda.Stream()
570
  self.compute_stream = torch.cuda.Stream()
571
  self.debug = debug
572
  self.clip_config = clip_config
573
+ self.overlap_step = overlap_step
574
 
575
  def _calc_flops(self, G, steps):
576
  assert len(G.shape) == 2
 
643
  if mesh is None:
644
  mesh = p.device_mesh
645
  shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
646
+ local_rank = dist.get_rank(group=process_group)
647
+ if self.rank is None:
648
+ self.rank = dist.get_rank(group=process_group)
649
+ else:
650
+ assert self.rank == local_rank
651
  elif mesh != p.device_mesh:
652
  raise ValueError("All parameters must be on the same mesh.")
653
 
654
+ num_ranks = dist.get_world_size(group=process_group)
655
  param_to_state[id(p)] = _muon_state()
656
+ param_to_state[id(
657
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
658
  param_to_state[id(p)].process_group = process_group
659
  qk_clip_state = self.get_qk_clip_info(n, qk_logits)
660
  param_to_state[id(p)].qk_clip_state = qk_clip_state
 
684
  else:
685
  g = buf
686
 
687
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
688
  steps=group["ns_steps"])
689
 
690
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
699
  def _update_g(self, p, g, group, momentum):
700
  # calc update
701
  state = self.state[p]
702
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
703
+ torch.add(g, buf, alpha=momentum, out=buf)
 
 
704
  if group["nesterov"]:
705
+ g.add_(buf, alpha=momentum)
706
+ return g
707
+ return buf
 
708
 
709
  @staticmethod
710
  def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
788
  param_to_state, ordered_params = self.init_state_and_assign_params(
789
  names, params, group, qk_logits)
790
 
791
+ assert self.rank is not None
792
+
793
+ def enqueue_all2all_gather(start_idx, chunk_size):
794
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
795
+ if target_params:
796
+ alloc_event = _alloc_gathered_grad(target_params,
797
+ param_to_state, self.rank,
798
+ self.compute_stream)
799
+ _all2all_gather(target_params, param_to_state, self.rank,
800
+ self.comm_stream, group["none_grad"],
801
+ alloc_event)
802
 
803
  def enqueue_computes(start_idx, chunk_size):
804
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
806
  _compute_u(p, state, group["ns_steps"], self.rank,
807
  self.compute_stream)
808
 
809
+ def enqueue_all2all_scatter(start_idx, chunk_size):
810
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
811
+ if target_params:
812
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
813
+ self.rank,
814
+ self.compute_stream)
815
+ _all2all_scatter(target_params, param_to_state, self.rank,
816
+ self.comm_stream, alloc_event)
817
 
818
  def enqueue_update_param(start_idx, chunk_size):
819
  for p in ordered_params[start_idx:start_idx + chunk_size]:
 
828
  # Wait grad update
829
  self.comm_stream.wait_stream(torch.cuda.current_stream())
830
 
831
+ overlap_step = self.overlap_step
832
+ for i in range(0, overlap_step):
833
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
834
+ enqueue_computes(i * chunk_size, chunk_size)
835
+
836
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
837
+ enqueue_all2all_scatter(i, chunk_size)
838
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
839
+ enqueue_update_param(i, chunk_size)
840
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
 
 
841
 
842
  # Wait the last update_param to finish
843
  torch.cuda.current_stream().wait_stream(self.compute_stream)